Commit b2a21973 authored by marius david's avatar marius david
Browse files

preparation to work with lemmy crawling

parent 4abf5aca
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@ combined.js
node_modules/
/dist*/
/result
/web/atlas.json

# Cache folders
*.pyc

entries/some_test.json

0 → 100644
+154 −0
Original line number Diff line number Diff line
{
	"id": -1,
	"name": "Some test",
	"description": "Look! It’s the fan favorite Rainbow Dash",
	"links": {
		"website": [
			"https://mlp.fandom.com/fr/wiki/Rainbow_Dash"
		],
		"discord": [
			"20percentcooler"
		],
		"subreddit": [
			"lol"
		]
	},
	"path": {
		"0": [
			[
				569,
				386
			],
			[
				570,
				388
			],
			[
				576,
				390
			],
			[
				577,
				400
			],
			[
				583,
				403
			],
			[
				585,
				402
			],
			[
				586,
				398
			],
			[
				594,
				398
			],
			[
				597,
				401
			],
			[
				602,
				401
			],
			[
				603,
				397
			],
			[
				612,
				401
			],
			[
				618,
				400
			],
			[
				626,
				388
			],
			[
				624,
				379
			],
			[
				618,
				370
			],
			[
				614,
				373
			],
			[
				610,
				384
			],
			[
				606,
				387
			],
			[
				599,
				383
			],
			[
				595,
				383
			],
			[
				591,
				375
			],
			[
				588,
				376
			],
			[
				592,
				374
			],
			[
				605,
				382
			],
			[
				609,
				381
			],
			[
				620,
				367
			],
			[
				602,
				356
			],
			[
				585,
				371
			],
			[
				572,
				366
			],
			[
				560,
				374
			],
			[
				566,
				382
			]
		]
	},
	"center": {
		"0": [
			577,
			379
		]
	}
}
 No newline at end of file
+2 −1
Original line number Diff line number Diff line
@@ -16,7 +16,8 @@

					installPhase = ''
						mkdir -p $out
						cp -r ${./web}/* $out/
						cp -r ${./.}/web/* $out/
						${pkgs.python3}/bin/python ${./.}/tools/merge_data.py ${./.}/entries $out/atlas.json
					'';
				};
				default = website;

tools/lemmycrawl.py

0 → 100644
+98 −0
Original line number Diff line number Diff line
import json
import os
import random
import re

# Adapted from https://www.30secondsofcode.org/python/s/slugify/
def slugify(s):
	"""Turn *s* into a lowercase, underscore-separated slug.

	The text is lowercased and stripped of surrounding whitespace. Every
	character that is not a word character, whitespace or a hyphen is
	treated as a separator, every run of separators collapses into a single
	underscore, and separators at either end of the result are dropped.

	>>> slugify("Hello World!")
	'hello_world'

	Args:
		s: the text to convert.

	Returns:
		The slug; an empty string when *s* holds no word characters.
	"""
	s = s.lower().strip()
	# Punctuation and other symbols become separators.
	s = re.sub(r'[^\w\s-]', '-', s)
	# Collapse each run of whitespace, underscores and hyphens into one "_".
	s = re.sub(r'[\s_-]+', '_', s)
	# After the collapse above no hyphen is left, so the separators to trim
	# from the ends are underscores.
	return s.strip('_')

class LemmyPostProcessor:
	"""Store entries received as Lemmy posts as JSON files in ``self.folder``.

	``id_to_name`` maps the ``id`` of every known entry to the name of the
	file that holds it, so an entry that is pushed again is written back to
	the file it was loaded from.
	"""

	def __init__(self):
		self.folder = "entries/"
		self.load_data()

	def load_data(self):
		"""Rebuild ``id_to_name`` from the JSON files found in ``self.folder``.

		Raises:
			ValueError: if two files declare the same ``id``.
		"""
		self.id_to_name = {}
		for filename in os.listdir(self.folder):
			path = os.path.join(self.folder, filename)
			# Entries may contain non-ASCII text, so do not depend on the
			# platform's default encoding.
			with open(path, "r", encoding="utf-8") as f:
				post = json.load(f)
			if post["id"] in self.id_to_name:
				raise ValueError("Duplicate ID: " + str(post["id"]))
			self.id_to_name[post["id"]] = filename

	def push_entry(self, content):
		"""Write the entry *content* to its JSON file in ``self.folder``.

		An ``id`` of -1 marks a new entry: it receives a random id that is
		not in use yet, and that id (not -1) is what is stored in the file.
		An entry whose id is already known overwrites the file recorded for
		it; any other entry goes to a file named after the slug of its
		``name``.

		Args:
			content: the entry as a dict with at least ``id`` and ``name``.
				It is not modified.

		Raises:
			TypeError: if the entry id is not an integer.
		"""
		entry_id = content["id"]
		if entry_id == -1:
			entry_id = random.randint(0, 9999999999)
			# Draw again in the unlikely case the id is already taken.
			while entry_id in self.id_to_name:
				entry_id = random.randint(0, 9999999999)

		# bool is a subclass of int but is not a valid id.
		if isinstance(entry_id, bool) or not isinstance(entry_id, int):
			raise TypeError("Entry id must be an integer, got " + repr(entry_id))

		name = self.id_to_name.get(entry_id)
		if name is None:
			slug = slugify(content["name"])
			name = slug + ".json"
			# Never take over a file that belongs to another entry, and
			# never produce a bare ".json" name from an empty slug.
			if not slug or name in self.id_to_name.values():
				name = slug + "_" + str(entry_id) + ".json"

		self.id_to_name[entry_id] = name

		# Store the resolved id: writing -1 back would make every new entry
		# collide on the next load_data().
		entry = dict(content)
		entry["id"] = entry_id

		with open(os.path.join(self.folder, name), "w", encoding="utf-8") as f:
			json.dump(entry, f, indent=4)

	def process_post(self, post_body):
		"""Parse the body of a post as a JSON entry and store it.

		A post whose body is missing, is not valid JSON or is not a JSON
		object is reported on standard output and skipped.

		Args:
			post_body: dict describing the post; its ``id`` is used in
				messages and its ``body`` is expected to hold the JSON text.
		"""
		post_id = post_body["id"]

		try:
			# A missing body yields None, which json.loads rejects with
			# TypeError; malformed JSON raises a ValueError subclass.
			post_json = json.loads(post_body.get("body"))
		except (TypeError, ValueError, RecursionError):
			print("Post does not contain valid JSON: " + str(post_id))
			return

		if not isinstance(post_json, dict):
			print("Post does not contain a JSON object: " + str(post_id))
			return

		self.push_entry(post_json)


# Manual test run: push the first post of tools/test_post.json through the
# processor. NOTE(review): the file is presumably a saved post-list response
# (it is indexed as ["posts"][0]["post"]) - confirm.
post_processor = LemmyPostProcessor()
# A context manager closes the sample file instead of leaking the handle.
with open("tools/test_post.json", "r", encoding="utf-8") as f:
	a = json.load(f)["posts"][0]["post"]

post_processor.process_post(a)

# Disabled draft of the crawl loop, kept inside a string literal so that it is
# never executed. NOTE(review): the draft is unfinished (the `if post["i"]`
# line is incomplete) and it uses `requests`, which is not among the imports
# visible at the top of this file.
"""read_ids = set()
with open("../data/lemmy_read_ids.txt", "r") as f:
	for line in f.readlines():
		if line.strip() == "":
			continue
		read_ids.add(int(line.strip()))

highest_post_id = max(read_ids) or 0

next_page = None

while True:
	req = requests.get("https://toast.ooo/api/v3/post/list", params={
		"next_page": next_page,
		"community_name": "2024lemmycanvasatlas",
		"type_": "All",
		"sort": "New",
	})
	if req.status_code != 200:
		raise BaseException("Request failed with code " + str(req.status_code))

	data = req.json()

	for post in data["posts"]:
		print(post)

		if post["i"]
	
	next_page = data["next_page"] """
 No newline at end of file

tools/merge_data.py

0 → 100644
+26 −0
Original line number Diff line number Diff line
import json
import sys
import os


def usage():
	"""Print the command-line synopsis of this script to standard output."""
	help_lines = (
		"Usage: python merge_data.py SOURCE_DIR TARGET_FILE",
		"Merge data from multiple JSON files in SOURCE_DIR into TARGET_FILE",
	)
	for help_line in help_lines:
		print(help_line)

# Exactly two arguments are required: the directory holding the entry files
# and the path of the merged file to write.
if len(sys.argv) != 3:
	usage()
	sys.exit(1)

SOURCE_DIR = sys.argv[1]
TARGET_FILE = sys.argv[2]

# os.listdir() returns names in arbitrary order; sorting them keeps the merged
# file identical from one run to the next for the same input.
result = []
for filename in sorted(os.listdir(SOURCE_DIR)):
	path = os.path.join(SOURCE_DIR, filename)
	# Entries may contain non-ASCII text, so do not depend on the platform's
	# default encoding.
	with open(path, "r", encoding="utf-8") as f:
		result.append(json.load(f))

# The merged file is a single JSON array with one element per source file.
with open(TARGET_FILE, "w", encoding="utf-8") as f:
	json.dump(result, f)

print("Done")
 No newline at end of file
Loading