preparation to work with lemmy crawling (b2a21973) · Commits · Jens Bannmann / Fediverse Canvas Atlas 2024 · GitLab

.gitignore

+1 −0

Original line number	Diff line number	Diff line
		@@ -18,6 +18,7 @@ combined.js
		node_modules/
		/dist*/
		/result
		/web/atlas.json

		# Cache folders
		*.pyc

entries/some_test.json

0 → 100644

+154 −0

Original line number	Diff line number	Diff line
		{
		"id": -1,
		"name": "Some test",
		"description": "Look! It’s the fan favorite Rainbow Dash",
		"links": {
		"website": [
		"https://mlp.fandom.com/fr/wiki/Rainbow_Dash"
		],
		"discord": [
		"20percentcooler"
		],
		"subreddit": [
		"lol"
		]
		},
		"path": {
		"0": [
		[
		569,
		386
		],
		[
		570,
		388
		],
		[
		576,
		390
		],
		[
		577,
		400
		],
		[
		583,
		403
		],
		[
		585,
		402
		],
		[
		586,
		398
		],
		[
		594,
		398
		],
		[
		597,
		401
		],
		[
		602,
		401
		],
		[
		603,
		397
		],
		[
		612,
		401
		],
		[
		618,
		400
		],
		[
		626,
		388
		],
		[
		624,
		379
		],
		[
		618,
		370
		],
		[
		614,
		373
		],
		[
		610,
		384
		],
		[
		606,
		387
		],
		[
		599,
		383
		],
		[
		595,
		383
		],
		[
		591,
		375
		],
		[
		588,
		376
		],
		[
		592,
		374
		],
		[
		605,
		382
		],
		[
		609,
		381
		],
		[
		620,
		367
		],
		[
		602,
		356
		],
		[
		585,
		371
		],
		[
		572,
		366
		],
		[
		560,
		374
		],
		[
		566,
		382
		]
		]
		},
		"center": {
		"0": [
		577,
		379
		]
		}
		}
		No newline at end of file

flake.nix

+2 −1

Original line number	Diff line number	Diff line
		@@ -16,7 +16,8 @@

		installPhase = ''
		mkdir -p $out
		cp -r ${./web}/* $out/
		cp -r ${./.}/web/* $out/
		${pkgs.python3}/bin/python ${./.}/tools/merge_data.py ${./.}/entries $out/atlas.json
		'';
		};
		default = website;

tools/lemmycrawl.py

0 → 100644

+98 −0

Original line number	Diff line number	Diff line
		import json
		import os
		import random
		import re

		# Adapted from https://www.30secondsofcode.org/python/s/slugify/
		def slugify(s):
		    """Turn free-form text into a lowercase, underscore-separated slug.

		    Characters other than word characters, whitespace and hyphens are
		    treated as separators; every run of separators collapses into a single
		    underscore, and separators at either end are dropped.

		    Args:
		        s: The text to convert.

		    Returns:
		        The slug; an empty string when ``s`` holds no word characters.
		    """
		    s = s.lower().strip()
		    # Punctuation becomes a hyphen so that it counts as a separator below.
		    s = re.sub(r'[^\w\s-]', '-', s)
		    s = re.sub(r'[\s_-]+', '_', s)
		    # The previous trim step used r'^-+\|-+$': the escaped bar matched a
		    # literal "|" and no hyphens are left at this point anyway, so it never
		    # fired. Strip the separator that is really present instead.
		    return s.strip('_')

		class LemmyPostProcessor:
		    """Store atlas entries taken from Lemmy posts as one JSON file per entry.

		    ``folder`` is the directory holding the entry files and ``id_to_name``
		    maps every known entry id to the file name that stores it.
		    """

		    def __init__(self):
		        self.folder = "entries/"
		        self.load_data()

		    def load_data(self):
		        """Rebuild ``id_to_name`` by reading every file in ``folder``.

		        Raises:
		            ValueError: If two files declare the same ``id``.
		        """
		        self.id_to_name = {}
		        for filename in os.listdir(self.folder):
		            path = os.path.join(self.folder, filename)
		            with open(path, "r", encoding="utf-8") as f:
		                post = json.load(f)
		            if post["id"] in self.id_to_name:
		                raise ValueError("Duplicate ID: " + str(post["id"]))
		            self.id_to_name[post["id"]] = filename

		    def push_entry(self, content):
		        """Write one entry to its JSON file, creating or overwriting it.

		        An ``id`` of -1 marks a new entry: it receives a random unused id,
		        and that id is stored in the written file. An entry whose id is
		        already known is written back to its existing file; otherwise the
		        file name is derived from the entry's ``name``.

		        Args:
		            content: The entry as a dict with at least ``id`` and ``name``.
		                The passed dict itself is not modified.

		        Raises:
		            TypeError: If the resulting id is not a plain ``int``.
		        """
		        entry_id = content["id"]
		        if entry_id == -1:
		            entry_id = random.randint(0, 9999999999)
		            while entry_id in self.id_to_name:
		                entry_id = random.randint(0, 9999999999)
		            # Persist the allocated id; leaving -1 in the file would make
		            # every new entry collide the next time load_data() runs.
		            content = dict(content, id=entry_id)

		        # Explicit check instead of assert, which is stripped under -O.
		        if type(entry_id) is not int:
		            raise TypeError("Entry id must be an int, got " + repr(entry_id))

		        name = self.id_to_name.get(entry_id)
		        if name is None:
		            # Fall back to the id when the name yields an empty slug.
		            # NOTE(review): a new entry whose slug equals an existing file
		            # name still overwrites that file - confirm whether intended.
		            name = (slugify(content["name"]) or str(entry_id)) + ".json"
		            self.id_to_name[entry_id] = name

		        with open(os.path.join(self.folder, name), "w", encoding="utf-8") as f:
		            json.dump(content, f, indent=4)

		    def process_post(self, post_body):
		        """Parse the body of a post as JSON and store it as an entry.

		        Posts whose body is not valid JSON are reported on stdout and
		        skipped.

		        Args:
		            post_body: Mapping with the post's ``id`` and ``body``.
		        """
		        post_id = post_body["id"]
		        raw_body = post_body["body"]

		        try:
		            post_json = json.loads(raw_body)
		        except (TypeError, ValueError, RecursionError):
		            # Only parse failures are swallowed; KeyboardInterrupt and
		            # SystemExit now propagate.
		            print("Post does not contain valid JSON: " + str(post_id))
		            return

		        self.push_entry(post_json)


		# Manual run: push the first post stored in tools/test_post.json through
		# the processor.
		post_processor = LemmyPostProcessor()
		# Use a context manager so the sample file is closed right after parsing.
		with open("tools/test_post.json", "r", encoding="utf-8") as sample_file:
		    a = json.load(sample_file)["posts"][0]["post"]

		post_processor.process_post(a)

		# NOTE: everything below is a bare string literal, not executed code. It is
		# an unfinished draft of the crawl loop: `if post["i"]` has no body, and
		# `requests` is not imported at the top of this file.
		"""read_ids = set()
		with open("../data/lemmy_read_ids.txt", "r") as f:
		for line in f.readlines():
		if line.strip() == "":
		continue
		read_ids.add(int(line.strip()))

		highest_post_id = max(read_ids) or 0

		next_page = None

		while True:
		req = requests.get("https://toast.ooo/api/v3/post/list", params={
		"next_page": next_page,
		"community_name": "2024lemmycanvasatlas",
		"type_": "All",
		"sort": "New",
		})
		if req.status_code != 200:
		raise BaseException("Request failed with code " + str(req.status_code))

		data = req.json()

		for post in data["posts"]:
		print(post)

		if post["i"]

		next_page = data["next_page"] """
		No newline at end of file

tools/merge_data.py

0 → 100644

+26 −0

Original line number	Diff line number	Diff line
		import json
		import sys
		import os


		def usage():
		    """Print the command synopsis and a one-line description to stdout."""
		    help_lines = (
		        "Usage: python merge_data.py SOURCE_DIR TARGET_FILE",
		        "Merge data from multiple JSON files in SOURCE_DIR into TARGET_FILE",
		    )
		    for help_line in help_lines:
		        print(help_line)

		# Exactly two positional arguments are required: the directory to read and
		# the file to write.
		if len(sys.argv) != 3:
		    usage()
		    sys.exit(1)

		SOURCE_DIR = sys.argv[1]
		TARGET_FILE = sys.argv[2]

		# os.listdir() returns names in arbitrary order; sorting them keeps the
		# merged file identical from run to run and from machine to machine.
		result = []
		for filename in sorted(os.listdir(SOURCE_DIR)):
		    path = os.path.join(SOURCE_DIR, filename)
		    # Read as UTF-8 explicitly instead of relying on the platform default.
		    with open(path, "r", encoding="utf-8") as f:
		        result.append(json.load(f))

		# The merged output is a single JSON array with one element per input file.
		with open(TARGET_FILE, "w", encoding="utf-8") as f:
		    json.dump(result, f)

		print("Done")
		No newline at end of file