Loading .gitignore +1 −0 Original line number Diff line number Diff line Loading @@ -18,6 +18,7 @@ combined.js node_modules/ /dist*/ /result /web/atlas.json # Cache folders *.pyc Loading entries/some_test.json 0 → 100644 +154 −0 Original line number Diff line number Diff line { "id": -1, "name": "Some test", "description": "Look! It’s the fan favorite Rainbow Dash", "links": { "website": [ "https://mlp.fandom.com/fr/wiki/Rainbow_Dash" ], "discord": [ "20percentcooler" ], "subreddit": [ "lol" ] }, "path": { "0": [ [ 569, 386 ], [ 570, 388 ], [ 576, 390 ], [ 577, 400 ], [ 583, 403 ], [ 585, 402 ], [ 586, 398 ], [ 594, 398 ], [ 597, 401 ], [ 602, 401 ], [ 603, 397 ], [ 612, 401 ], [ 618, 400 ], [ 626, 388 ], [ 624, 379 ], [ 618, 370 ], [ 614, 373 ], [ 610, 384 ], [ 606, 387 ], [ 599, 383 ], [ 595, 383 ], [ 591, 375 ], [ 588, 376 ], [ 592, 374 ], [ 605, 382 ], [ 609, 381 ], [ 620, 367 ], [ 602, 356 ], [ 585, 371 ], [ 572, 366 ], [ 560, 374 ], [ 566, 382 ] ] }, "center": { "0": [ 577, 379 ] } } No newline at end of file flake.nix +2 −1 Original line number Diff line number Diff line Loading @@ -16,7 +16,8 @@ installPhase = '' mkdir -p $out cp -r ${./web}/* $out/ cp -r ${./.}/web/* $out/ ${pkgs.python3}/bin/python ${./.}/tools/merge_data.py ${./.}/entries $out/atlas.json ''; }; default = website; Loading tools/lemmycrawl.py 0 → 100644 +98 −0 Original line number Diff line number Diff line import json import os import random import re #from https://www.30secondsofcode.org/python/s/slugify/ def slugify(s): s = s.lower().strip() s = re.sub(r'[^\w\s-]', '-', s) s = re.sub(r'[\s_-]+', '_', s) s = re.sub(r'^-+|-+$', '-', s) return s class LemmyPostProcessor: def __init__(self): self.folder = "entries/" self.load_data() def load_data(self): self.id_to_name = {} for filename in os.listdir(self.folder): path = os.path.join(self.folder, filename) post = None with open(path, "r") as f: post = json.load(f) if post["id"] in self.id_to_name: raise BaseException("Duplicate ID: " + 
str(post["id"])) self.id_to_name[post["id"]] = filename def push_entry(self, content): id = None if content["id"] == -1: id = random.randint(0, 9999999999) else: id = content["id"] assert type(id) == int name = None if id in self.id_to_name: name = self.id_to_name[id] else: name = slugify(content["name"]) + ".json" self.id_to_name[id] = name with open(os.path.join(self.folder, name), "w") as f: json.dump(content, f, indent=4) def process_post(self, post_body): post_id = post_body["id"] post_body = post_body["body"] post_json = None try: post_json = json.loads(post_body) except: print("Post does not contain valid JSON: " + str(post_id)) return self.push_entry(post_json) post_processor = LemmyPostProcessor() a = json.load(open("tools/test_post.json", "r"))["posts"][0]["post"] post_processor.process_post(a) """read_ids = set() with open("../data/lemmy_read_ids.txt", "r") as f: for line in f.readlines(): if line.strip() == "": continue read_ids.add(int(line.strip())) highest_post_id = max(read_ids) or 0 next_page = None while True: req = requests.get("https://toast.ooo/api/v3/post/list", params={ "next_page": next_page, "community_name": "2024lemmycanvasatlas", "type_": "All", "sort": "New", }) if req.status_code != 200: raise BaseException("Request failed with code " + str(req.status_code)) data = req.json() for post in data["posts"]: print(post) if post["i"] next_page = data["next_page"] """ No newline at end of file tools/merge_data.py 0 → 100644 +26 −0 Original line number Diff line number Diff line import json import sys import os def usage(): print("Usage: python merge_data.py SOURCE_DIR TARGET_FILE") print("Merge data from multiple JSON files in SOURCE_DIR into TARGET_FILE") if len(sys.argv) != 3: usage() sys.exit(1) SOURCE_DIR = sys.argv[1] TARGET_FILE = sys.argv[2] result = [] for filename in os.listdir(SOURCE_DIR): path = os.path.join(SOURCE_DIR, filename) with open(path, "r") as f: result.append(json.load(f)) with open(TARGET_FILE, "w") as f: 
json.dump(result, f) print("Done") No newline at end of file Loading
.gitignore +1 −0 Original line number Diff line number Diff line Loading @@ -18,6 +18,7 @@ combined.js node_modules/ /dist*/ /result /web/atlas.json # Cache folders *.pyc Loading
entries/some_test.json 0 → 100644 +154 −0 Original line number Diff line number Diff line { "id": -1, "name": "Some test", "description": "Look! It’s the fan favorite Rainbow Dash", "links": { "website": [ "https://mlp.fandom.com/fr/wiki/Rainbow_Dash" ], "discord": [ "20percentcooler" ], "subreddit": [ "lol" ] }, "path": { "0": [ [ 569, 386 ], [ 570, 388 ], [ 576, 390 ], [ 577, 400 ], [ 583, 403 ], [ 585, 402 ], [ 586, 398 ], [ 594, 398 ], [ 597, 401 ], [ 602, 401 ], [ 603, 397 ], [ 612, 401 ], [ 618, 400 ], [ 626, 388 ], [ 624, 379 ], [ 618, 370 ], [ 614, 373 ], [ 610, 384 ], [ 606, 387 ], [ 599, 383 ], [ 595, 383 ], [ 591, 375 ], [ 588, 376 ], [ 592, 374 ], [ 605, 382 ], [ 609, 381 ], [ 620, 367 ], [ 602, 356 ], [ 585, 371 ], [ 572, 366 ], [ 560, 374 ], [ 566, 382 ] ] }, "center": { "0": [ 577, 379 ] } } No newline at end of file
flake.nix +2 −1 Original line number Diff line number Diff line Loading @@ -16,7 +16,8 @@ installPhase = '' mkdir -p $out cp -r ${./web}/* $out/ cp -r ${./.}/web/* $out/ ${pkgs.python3}/bin/python ${./.}/tools/merge_data.py ${./.}/entries $out/atlas.json ''; }; default = website; Loading
# tools/lemmycrawl.py
"""Store atlas entries found in post bodies as JSON files in an entries folder.

Each post body is expected to be a JSON document with at least an ``id`` and
a ``name`` key.  An ``id`` of ``-1`` marks a new entry that still needs a real
id; any other id either updates the file already holding that id or creates a
new file named after the entry.
"""
import json
import os
import random
import re

# Inclusive upper bound for randomly generated entry ids.
MAX_RANDOM_ID = 9999999999


# Adapted from https://www.30secondsofcode.org/python/s/slugify/
def slugify(s):
    """Return a lowercase, underscore-separated slug for *s*.

    Characters that are neither word characters, whitespace nor hyphens are
    treated as separators; every run of separators collapses into a single
    ``_`` and separators at either end are dropped.  The result may be empty
    when *s* holds no word characters at all.
    """
    s = s.lower().strip()
    s = re.sub(r'[^\w\s-]', '-', s)
    s = re.sub(r'[\s_-]+', '_', s)
    # After the previous step only "_" can remain as a separator, so that is
    # what has to be trimmed (the former "^-+|-+$" pattern never matched).
    return s.strip('_')


class LemmyPostProcessor:
    """Keep an id -> filename index of an entries folder and write entries to it."""

    def __init__(self, folder="entries/"):
        """Index the JSON files found in *folder* (default: ``entries/``)."""
        self.folder = folder
        self.load_data()

    def load_data(self):
        """(Re)build ``self.id_to_name`` from every file in ``self.folder``.

        Raises:
            ValueError: if two files carry the same ``id``.
        """
        self.id_to_name = {}
        # Sorted so that the file reported in a duplicate error is deterministic.
        for filename in sorted(os.listdir(self.folder)):
            path = os.path.join(self.folder, filename)
            with open(path, "r", encoding="utf-8") as f:
                post = json.load(f)
            if post["id"] in self.id_to_name:
                raise ValueError("Duplicate ID: " + str(post["id"]))
            self.id_to_name[post["id"]] = filename

    def _new_id(self):
        """Return a random id that is not used by any indexed entry."""
        while True:
            candidate = random.randint(0, MAX_RANDOM_ID)
            if candidate not in self.id_to_name:
                return candidate

    def push_entry(self, content):
        """Write *content* to the entries folder and index it.

        Args:
            content: mapping with an ``id`` key and, for entries that are not
                indexed yet, a ``name`` key.  ``id == -1`` requests a fresh id.

        The entry is saved with its final id, so a placeholder ``-1`` never
        reaches the disk (it would collide with the next placeholder entry on
        reload).  *content* itself is left unmodified.

        Raises:
            TypeError: if the id is not a plain ``int``.
        """
        entry_id = content["id"]
        if entry_id == -1:
            entry_id = self._new_id()
        if type(entry_id) is not int:
            raise TypeError("Entry id must be an int, got " + repr(entry_id))

        if entry_id in self.id_to_name:
            # Known entry: overwrite the file it already lives in.
            name = self.id_to_name[entry_id]
        else:
            slug = slugify(content["name"]) or "entry"
            name = slug + ".json"
            # Never clobber the file of a different entry with the same name.
            if name in self.id_to_name.values():
                name = slug + "_" + str(entry_id) + ".json"
            self.id_to_name[entry_id] = name

        entry = dict(content)
        entry["id"] = entry_id  # replaces the value in place, key order is kept
        with open(os.path.join(self.folder, name), "w", encoding="utf-8") as f:
            json.dump(entry, f, indent=4)

    def process_post(self, post_body):
        """Parse the JSON body of one post and store it as an entry.

        Args:
            post_body: mapping with the post ``id`` and its ``body`` text.

        Posts whose body is missing or is not valid JSON are reported on
        stdout and skipped.
        """
        post_id = post_body["id"]
        try:
            # TypeError covers a missing/None body, ValueError covers
            # json.JSONDecodeError.
            post_json = json.loads(post_body.get("body"))
        except (TypeError, ValueError):
            print("Post does not contain valid JSON: " + str(post_id))
            return
        self.push_entry(post_json)


def main():
    """Process the first post of the local sample file ``tools/test_post.json``."""
    post_processor = LemmyPostProcessor()
    with open("tools/test_post.json", "r", encoding="utf-8") as f:
        post = json.load(f)["posts"][0]["post"]
    post_processor.process_post(post)


if __name__ == "__main__":
    main()

# TODO: unfinished crawler loop, kept verbatim (as an inert string) for reference.
"""read_ids = set()
with open("../data/lemmy_read_ids.txt", "r") as f:
    for line in f.readlines():
        if line.strip() == "":
            continue
        read_ids.add(int(line.strip()))

highest_post_id = max(read_ids) or 0

next_page = None
while True:
    req = requests.get("https://toast.ooo/api/v3/post/list", params={
        "next_page": next_page,
        "community_name": "2024lemmycanvasatlas",
        "type_": "All",
        "sort": "New",
    })
    if req.status_code != 200:
        raise BaseException("Request failed with code " + str(req.status_code))
    data = req.json()
    for post in data["posts"]:
        print(post)
        if post["i"]
    next_page = data["next_page"]
"""
# tools/merge_data.py
"""Merge every JSON file of a directory into a single JSON array file."""
import json
import sys
import os


def usage():
    """Print the command-line help."""
    print("Usage: python merge_data.py SOURCE_DIR TARGET_FILE")
    print("Merge data from multiple JSON files in SOURCE_DIR into TARGET_FILE")


def merge_entries(source_dir, target_file):
    """Load each file in *source_dir* as JSON and dump the list to *target_file*.

    Files are read in sorted filename order: ``os.listdir`` returns names in
    arbitrary order, which would make the generated file differ from one run
    to the next for identical input.

    Returns:
        The list of parsed documents that was written.
    """
    result = []
    for filename in sorted(os.listdir(source_dir)):
        path = os.path.join(source_dir, filename)
        # Explicit encoding: entries may hold non-ASCII text and the default
        # depends on the locale of the environment running the script.
        with open(path, "r", encoding="utf-8") as f:
            result.append(json.load(f))

    with open(target_file, "w", encoding="utf-8") as f:
        json.dump(result, f)
    return result


def main(argv=None):
    """Run the command-line interface and return the process exit status.

    Args:
        argv: full argument vector including the program name; defaults to
            ``sys.argv``.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 3:
        usage()
        return 1

    merge_entries(argv[1], argv[2])
    print("Done")
    return 0


if __name__ == "__main__":
    sys.exit(main())