Loading tools/lemmycrawl.py→tools/entry_manager.py +37 −28 Original line number Diff line number Diff line Loading @@ -11,23 +11,30 @@ def slugify(s): s = re.sub(r'^-+|-+$', '-', s) return s class LemmyPostProcessor: class EntryManager: def __init__(self): self.folder = "entries/" self.load_data() def load_data(self): self.id_to_name = {} self.combined_source_list = {} for filename in os.listdir(self.folder): path = os.path.join(self.folder, filename) post = None entry = None with open(path, "r") as f: post = json.load(f) if post["id"] in self.id_to_name: raise BaseException("Duplicate ID: " + str(post["id"])) self.id_to_name[post["id"]] = filename entry = json.load(f) if entry["id"] in self.id_to_name: raise BaseException("Duplicate ID: " + str(entry["id"])) self.id_to_name[entry["id"]] = filename if "source_list" in entry: for (source_name, source_list) in entry["source_list"].items(): if source_name in self.combined_source_list: self.combined_source_list[source_name].extend(source_list) else: self.combined_source_list[source_name] = source_list def push_entry(self, content): def push_entry(self, content, source_name = None, source_id = None): id = None if content["id"] == -1: id = None Loading @@ -47,28 +54,30 @@ def push_entry(self, content): self.id_to_name[id] = name with open(os.path.join(self.folder, name), "w") as f: json.dump(content, f, indent=4) def process_post(self, post_body): post_id = post_body["id"] post_body = post_body["body"] if source_name is not None and source_id is not None: if "source_list" not in content: content["source_list"] = {} post_json = None try: post_json = json.loads(post_body) except: print("Post does not contain valid JSON: " + str(post_id)) return if source_name in content["source_list"]: content["source_list"][source_name].append(source_id) else: content["source_list"][source_name] = [source_id] self.push_entry(post_json) if source_name in self.combined_source_list: self.combined_source_list[source_name].append(source_id) 
else: self.combined_source_list[source_name] = [source_id] with open(os.path.join(self.folder, name), "w") as f: json.dump(content, f, indent=4) post_processor = LemmyPostProcessor() a = json.load(open("tools/test_post.json", "r"))["posts"][0]["post"] def entry_by_id_if_exist(self, id): return self.id_to_name.get(id) post_processor.process_post(a) def is_post_processed(self, source_name, source_id): if source_name in self.combined_source_list: return source_id in self.combined_source_list[source_name] return False """read_ids = set() with open("../data/lemmy_read_ids.txt", "r") as f: Loading tools/lemmy_fetcher.py 0 → 100644 +67 −0 Original line number Diff line number Diff line import json import subprocess from entry_manager import EntryManager from lib.github_forge import GithubForge class LemmyFetcher: def __init__(self, entry_manager): self.entry_manager = entry_manager self.forge = GithubForge() self.forge.from_env() def process_post(self, post_body, make_pr = False): post_id = post_body["id"] if self.entry_manager.is_post_processed("lemmy", post_id): print("Post already processed: " + str(post_id)) return post_body = post_body["body"] post_json = None try: post_json = json.loads(post_body) except: print("Post does not contain valid JSON: " + str(post_id)) return new_branch_name = "lemmy-" + str(post_id) if make_pr: if self.forge.does_pr_already_exist(new_branch_name): raise BaseException("PR already exists for this branch, refusing to continue: " + new_branch_name) # Yes, I used AI assisted tooling. And they work pretty well. Thanks codeium. 
# get git branch name branch = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"], text=True).strip() if branch != "main": raise BaseException("Not on main branch, refusing to continue: " + branch) # check if any content in entries is dirty if "entries/" in subprocess.check_output(["git", "status", "--porcelain"], text=True): raise BaseException("Some change in entries are uncommited, refusing to continue") # check new branch does not already exist if new_branch_name in subprocess.check_output(["git", "branch", "--list", new_branch_name], text=True): print("Branch " + new_branch_name + " already exists, deleting it") subprocess.check_call(["git", "branch", "-D", new_branch_name]) # create new branch subprocess.check_call(["git", "checkout", "-b", new_branch_name]) self.entry_manager.push_entry(post_json, "lemmy", post_id) if make_pr: subprocess.check_call(["git", "add", "entries/"]) subprocess.check_call(["git", "commit", "-m", "Add submission from Lemmy post " + str(post_id)]) subprocess.check_call(["git", "push", "--force", "origin", new_branch_name]) print("branch pushed, making PR") self.forge.make_pr_between_branches(new_branch_name, "main", "Add submission from Lemmy post " + str(post_id), "Add submission from Lemmy post " + str(post_id)) subprocess.check_call(["git", "checkout", "main"]) em = EntryManager() lm = LemmyFetcher(em) lm.process_post(json.load(open("tools/test_post.json", "r"))["posts"][0]["post"], False) print("done") No newline at end of file tools/lib/forge_base.py 0 → 100644 +11 −0 Original line number Diff line number Diff line class ForgeBase: """Limited API for interability between diverse software forge (basically just opening a PR)""" def __init__(self): pass def make_pr_between_branches(self, from_branch, to_branch, title, message): raise NotImplementedError() def does_pr_already_exist(self, from_branch): raise NotImplementedError() No newline at end of file tools/lib/github_forge.py 0 → 100644 +44 −0 Original line 
number Diff line number Diff line from github import Auth from github import Github from lib.forge_base import ForgeBase import os class GithubForge(ForgeBase): def __init__(self): self.github = None self.repo = None pass def from_env(self): self.github_token = os.environ.get("GITHUB_TOKEN") self.owner_name = os.environ.get("GITHUB_OWNER") self.repo_name = os.environ.get("GITHUB_REPO_NAME") def load(self): auth = Auth.Token(self.github_token) self.github = Github(auth = auth) self.repo = self.github.get_repo(self.owner_name + "/" + self.repo_name) def load_if_needed(self): if self.github is None: self.load() def does_pr_already_exist(self, from_branch): self.load_if_needed() print(self.repo_name + ":" + from_branch) for pr in self.repo.get_pulls(state="open", head=from_branch): return True return False def make_pr_between_branches(self, from_branch, to_branch, title, body): self.load_if_needed() self.repo.create_pull( title = title, base = to_branch, head = from_branch, body = body ) No newline at end of file Loading
tools/lemmycrawl.py→tools/entry_manager.py +37 −28 Original line number Diff line number Diff line Loading @@ -11,23 +11,30 @@ def slugify(s): s = re.sub(r'^-+|-+$', '-', s) return s class LemmyPostProcessor: class EntryManager: def __init__(self): self.folder = "entries/" self.load_data() def load_data(self): self.id_to_name = {} self.combined_source_list = {} for filename in os.listdir(self.folder): path = os.path.join(self.folder, filename) post = None entry = None with open(path, "r") as f: post = json.load(f) if post["id"] in self.id_to_name: raise BaseException("Duplicate ID: " + str(post["id"])) self.id_to_name[post["id"]] = filename entry = json.load(f) if entry["id"] in self.id_to_name: raise BaseException("Duplicate ID: " + str(entry["id"])) self.id_to_name[entry["id"]] = filename if "source_list" in entry: for (source_name, source_list) in entry["source_list"].items(): if source_name in self.combined_source_list: self.combined_source_list[source_name].extend(source_list) else: self.combined_source_list[source_name] = source_list def push_entry(self, content): def push_entry(self, content, source_name = None, source_id = None): id = None if content["id"] == -1: id = None Loading @@ -47,28 +54,30 @@ def push_entry(self, content): self.id_to_name[id] = name with open(os.path.join(self.folder, name), "w") as f: json.dump(content, f, indent=4) def process_post(self, post_body): post_id = post_body["id"] post_body = post_body["body"] if source_name is not None and source_id is not None: if "source_list" not in content: content["source_list"] = {} post_json = None try: post_json = json.loads(post_body) except: print("Post does not contain valid JSON: " + str(post_id)) return if source_name in content["source_list"]: content["source_list"][source_name].append(source_id) else: content["source_list"][source_name] = [source_id] self.push_entry(post_json) if source_name in self.combined_source_list: self.combined_source_list[source_name].append(source_id) else: 
self.combined_source_list[source_name] = [source_id] with open(os.path.join(self.folder, name), "w") as f: json.dump(content, f, indent=4) post_processor = LemmyPostProcessor() a = json.load(open("tools/test_post.json", "r"))["posts"][0]["post"] def entry_by_id_if_exist(self, id): return self.id_to_name.get(id) post_processor.process_post(a) def is_post_processed(self, source_name, source_id): if source_name in self.combined_source_list: return source_id in self.combined_source_list[source_name] return False """read_ids = set() with open("../data/lemmy_read_ids.txt", "r") as f: Loading
# tools/lemmy_fetcher.py
import json
import subprocess

from entry_manager import EntryManager
from lib.github_forge import GithubForge


class LemmyFetcher:
    """Turn Lemmy posts whose body is a JSON entry into stored entries.

    Optionally wraps the new entry in a dedicated git branch and opens a
    pull request for it through the configured forge.
    """

    def __init__(self, entry_manager):
        """Keep the entry manager and set up a GitHub forge from the environment."""
        self.entry_manager = entry_manager
        self.forge = GithubForge()
        self.forge.from_env()

    def process_post(self, post_body, make_pr = False):
        """Import a single Lemmy post.

        Args:
            post_body: mapping with at least an ``"id"`` key and a ``"body"``
                key; ``"body"`` is expected to hold the entry as JSON text.
            make_pr: when true, the entry is committed on a fresh
                ``lemmy-<post id>`` branch, pushed, and a pull request
                towards ``main`` is opened.

        Returns:
            None. Posts that were already processed, or whose body is not
            valid JSON, are reported on stdout and skipped.

        Raises:
            RuntimeError: when ``make_pr`` is set and a PR already exists for
                the branch, the current branch is not ``main``, or there are
                uncommitted changes under ``entries/``.
            subprocess.CalledProcessError: when a git command fails.
        """
        post_id = post_body["id"]

        if self.entry_manager.is_post_processed("lemmy", post_id):
            print("Post already processed: " + str(post_id))
            return

        try:
            post_json = json.loads(post_body["body"])
        except (TypeError, ValueError):
            # ValueError covers json.JSONDecodeError; TypeError covers a body
            # that is not text at all (e.g. None). Anything else propagates.
            print("Post does not contain valid JSON: " + str(post_id))
            return

        new_branch_name = "lemmy-" + str(post_id)

        if make_pr:
            self._prepare_branch(new_branch_name)

        self.entry_manager.push_entry(post_json, "lemmy", post_id)

        if make_pr:
            self._publish_branch(new_branch_name, post_id)

    def _prepare_branch(self, new_branch_name):
        """Check the repository state and switch to a fresh working branch."""
        if self.forge.does_pr_already_exist(new_branch_name):
            raise RuntimeError("PR already exists for this branch, refusing to continue: " + new_branch_name)

        # Yes, I used AI assisted tooling. And they work pretty well. Thanks codeium.
        # get git branch name
        branch = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"], text=True).strip()
        if branch != "main":
            raise RuntimeError("Not on main branch, refusing to continue: " + branch)

        # check if any content in entries is dirty
        if "entries/" in subprocess.check_output(["git", "status", "--porcelain"], text=True):
            raise RuntimeError("Some change in entries are uncommited, refusing to continue")

        # check new branch does not already exist
        if new_branch_name in subprocess.check_output(["git", "branch", "--list", new_branch_name], text=True):
            print("Branch " + new_branch_name + " already exists, deleting it")
            subprocess.check_call(["git", "branch", "-D", new_branch_name])

        # create new branch
        subprocess.check_call(["git", "checkout", "-b", new_branch_name])

    def _publish_branch(self, new_branch_name, post_id):
        """Commit ``entries/``, push the branch, open the PR, return to main."""
        description = "Add submission from Lemmy post " + str(post_id)

        subprocess.check_call(["git", "add", "entries/"])
        subprocess.check_call(["git", "commit", "-m", description])
        subprocess.check_call(["git", "push", "--force", "origin", new_branch_name])
        print("branch pushed, making PR")

        # The same text is used as both the PR title and the PR body.
        self.forge.make_pr_between_branches(new_branch_name, "main", description, description)
        subprocess.check_call(["git", "checkout", "main"])


em = EntryManager()
lm = LemmyFetcher(em)
# Read the sample post through a context manager so the file handle is closed.
with open("tools/test_post.json", "r") as f:
    test_post = json.load(f)["posts"][0]["post"]
lm.process_post(test_post, False)
print("done")
# tools/lib/forge_base.py


class ForgeBase:
    """Limited API for interoperability between diverse software forges.

    Basically just opening a PR. Both operations raise
    ``NotImplementedError`` here; a concrete forge is expected to override
    them.
    """

    def __init__(self):
        pass

    def make_pr_between_branches(self, from_branch, to_branch, title, message):
        """Open a pull request from ``from_branch`` towards ``to_branch``.

        Args:
            from_branch: name of the branch the PR is made from.
            to_branch: name of the branch the PR is made to.
            title: title of the pull request.
            message: text of the pull request.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("make_pr_between_branches is not implemented by " + type(self).__name__)

    def does_pr_already_exist(self, from_branch):
        """Tell whether a pull request already exists for ``from_branch``.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("does_pr_already_exist is not implemented by " + type(self).__name__)
# tools/lib/github_forge.py
import os

from github import Auth
from github import Github

from lib.forge_base import ForgeBase


class GithubForge(ForgeBase):
    """``ForgeBase`` implementation backed by GitHub through PyGithub.

    The GitHub client and repository handle are created lazily, on the first
    call that needs them, so building the object and reading the environment
    never touches the network.
    """

    def __init__(self):
        self.github = None
        self.repo = None
        # Filled in by from_env(); defined here so load() can report missing
        # configuration instead of failing on an undefined attribute.
        self.github_token = None
        self.owner_name = None
        self.repo_name = None

    def from_env(self):
        """Read token, owner and repository name from the environment.

        Uses ``GITHUB_TOKEN``, ``GITHUB_OWNER`` and ``GITHUB_REPO_NAME``.
        Unset variables are stored as ``None``; they are only rejected once
        ``load()`` actually needs them.
        """
        self.github_token = os.environ.get("GITHUB_TOKEN")
        self.owner_name = os.environ.get("GITHUB_OWNER")
        self.repo_name = os.environ.get("GITHUB_REPO_NAME")

    def load(self):
        """Create the GitHub client and fetch the ``owner/repo`` repository.

        Raises:
            RuntimeError: when the token, owner or repository name is not set.
        """
        settings = (
            ("GITHUB_TOKEN", self.github_token),
            ("GITHUB_OWNER", self.owner_name),
            ("GITHUB_REPO_NAME", self.repo_name),
        )
        missing = [name for (name, value) in settings if not value]
        if missing:
            raise RuntimeError("Missing GitHub configuration: " + ", ".join(missing))

        auth = Auth.Token(self.github_token)
        self.github = Github(auth = auth)
        self.repo = self.github.get_repo(self.owner_name + "/" + self.repo_name)

    def load_if_needed(self):
        """Call ``load()`` unless a client has already been created."""
        if self.github is None:
            self.load()

    def does_pr_already_exist(self, from_branch):
        """Return True when an open pull request has ``from_branch`` as head.

        Args:
            from_branch: branch name, or an already qualified
                ``user:branch`` reference.
        """
        self.load_if_needed()
        # GitHub documents the `head` filter as "user:ref-name"; qualify a
        # bare branch name with the repository owner so the filter matches.
        head = from_branch
        if ":" not in head:
            head = self.owner_name + ":" + head
        for _pr in self.repo.get_pulls(state="open", head=head):
            return True
        return False

    def make_pr_between_branches(self, from_branch, to_branch, title, body):
        """Open a pull request with ``from_branch`` as head and ``to_branch`` as base.

        Args:
            from_branch: branch passed to GitHub as ``head``.
            to_branch: branch passed to GitHub as ``base``.
            title: pull request title.
            body: pull request description.
        """
        self.load_if_needed()
        self.repo.create_pull(
            title = title,
            base = to_branch,
            head = from_branch,
            body = body
        )