Commit f6b0b940 authored by marius david's avatar marius david
Browse files

progress on lemmy fetcher tool

parent 0870970c
Loading
Loading
Loading
Loading
+37 −28
Original line number Diff line number Diff line
@@ -11,23 +11,30 @@ def slugify(s):
	s = re.sub(r'^-+|-+$', '-', s)
	return s

class LemmyPostProcessor:
class EntryManager:
	def __init__(self):
		self.folder = "entries/"
		self.load_data()
	
	def load_data(self):
		self.id_to_name = {}
		self.combined_source_list = {}
		for filename in os.listdir(self.folder):
			path = os.path.join(self.folder, filename)
			post = None
			entry = None
			with open(path, "r") as f:
				post = json.load(f)
			if post["id"] in self.id_to_name:
				raise BaseException("Duplicate ID: " + str(post["id"]))
			self.id_to_name[post["id"]] = filename
				entry = json.load(f)
			if entry["id"] in self.id_to_name:
				raise BaseException("Duplicate ID: " + str(entry["id"]))
			self.id_to_name[entry["id"]] = filename
			if "source_list" in entry:
				for (source_name, source_list) in entry["source_list"].items():
					if source_name in self.combined_source_list:
						self.combined_source_list[source_name].extend(source_list)
					else:
						self.combined_source_list[source_name] = source_list

	def push_entry(self, content):
	def push_entry(self, content, source_name = None, source_id = None):
		id = None
		if content["id"] == -1:
			id = None
@@ -47,28 +54,30 @@ def push_entry(self, content):
		
		self.id_to_name[id] = name

		with open(os.path.join(self.folder, name), "w") as f:
			json.dump(content, f, indent=4)


	def process_post(self, post_body):
		post_id = post_body["id"]
		post_body = post_body["body"]
		if source_name is not None and source_id is not None:
			if "source_list" not in content:
				content["source_list"] = {}

		post_json = None
		try:
			post_json = json.loads(post_body)
		except:
			print("Post does not contain valid JSON: " + str(post_id))
			return
			if source_name in content["source_list"]:
				content["source_list"][source_name].append(source_id)
			else:
				content["source_list"][source_name] = [source_id]
			
		self.push_entry(post_json)
			if source_name in self.combined_source_list:
				self.combined_source_list[source_name].append(source_id)
			else:
				self.combined_source_list[source_name] = [source_id]

		with open(os.path.join(self.folder, name), "w") as f:
			json.dump(content, f, indent=4)
	
post_processor = LemmyPostProcessor()
a = json.load(open("tools/test_post.json", "r"))["posts"][0]["post"]
	def entry_by_id_if_exist(self, id):
		return self.id_to_name.get(id)
	
post_processor.process_post(a)
	def is_post_processed(self, source_name, source_id):
		if source_name in self.combined_source_list:
			return source_id in self.combined_source_list[source_name]
		return False

"""read_ids = set()
with open("../data/lemmy_read_ids.txt", "r") as f:

tools/lemmy_fetcher.py

0 → 100644
+67 −0
Original line number Diff line number Diff line
import json
import subprocess
from entry_manager import EntryManager
from lib.github_forge import GithubForge

class LemmyFetcher:
	"""Import entries submitted as JSON in the body of Lemmy posts.

	Each accepted post is pushed to the entry manager under the "lemmy" source.
	Optionally the resulting change is committed on a dedicated git branch and a
	pull request toward "main" is opened through the forge.
	"""

	def __init__(self, entry_manager):
		"""Keep the entry manager and create a GitHub forge configured from the environment."""
		self.entry_manager = entry_manager
		self.forge = GithubForge()
		self.forge.from_env()

	def process_post(self, post_body, make_pr = False):
		"""Store the JSON entry carried by a Lemmy post, optionally opening a PR for it.

		post_body: post dict; "id" is required, "body" is expected to hold a JSON document.
		make_pr: when true, the entry is committed on a new "lemmy-<id>" branch,
			pushed to origin and a pull request toward "main" is created.

		A post already recorded for the "lemmy" source, or whose body is missing or
		is not valid JSON, is reported with a message and skipped.

		Raises RuntimeError when make_pr is set and the repository is not in a safe
		state (PR already open, not on main, uncommitted changes in entries/), and
		subprocess.CalledProcessError when a git command fails.
		"""
		post_id = post_body["id"]
		if self.entry_manager.is_post_processed("lemmy", post_id):
			print("Post already processed: " + str(post_id))
			return

		# A post dict may come without a "body" key; treat that like any other
		# body that cannot be decoded instead of crashing with a KeyError.
		raw_body = post_body.get("body")
		try:
			post_json = json.loads(raw_body)
		except (ValueError, TypeError, RecursionError):
			# ValueError covers json.JSONDecodeError, TypeError a missing/non-text body,
			# RecursionError an absurdly nested document. Unlike a bare "except",
			# this lets KeyboardInterrupt and SystemExit through.
			print("Post does not contain valid JSON: " + str(post_id))
			return

		new_branch_name = "lemmy-" + str(post_id)
		if make_pr:
			self._prepare_branch(new_branch_name)

		self.entry_manager.push_entry(post_json, "lemmy", post_id)

		if make_pr:
			self._publish_branch(new_branch_name, post_id)

	def _prepare_branch(self, new_branch_name):
		"""Verify the repository is in a safe state, then create and check out `new_branch_name`."""
		if self.forge.does_pr_already_exist(new_branch_name):
			raise RuntimeError("PR already exists for this branch, refusing to continue: " + new_branch_name)

		# only ever branch off main
		branch = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"], text=True).strip()
		if branch != "main":
			raise RuntimeError("Not on main branch, refusing to continue: " + branch)

		# refuse to mix the new entry with pending changes under entries/
		if "entries/" in subprocess.check_output(["git", "status", "--porcelain"], text=True):
			raise RuntimeError("Some change in entries are uncommited, refusing to continue")

		# a stale local branch of the same name is discarded and recreated
		if new_branch_name in subprocess.check_output(["git", "branch", "--list", new_branch_name], text=True):
			print("Branch " + new_branch_name + " already exists, deleting it")
			subprocess.check_call(["git", "branch", "-D", new_branch_name])

		subprocess.check_call(["git", "checkout", "-b", new_branch_name])

	def _publish_branch(self, new_branch_name, post_id):
		"""Commit entries/, force-push the branch, open the PR and switch back to main."""
		title = "Add submission from Lemmy post " + str(post_id)
		subprocess.check_call(["git", "add", "entries/"])
		subprocess.check_call(["git", "commit", "-m", title])
		subprocess.check_call(["git", "push", "--force", "origin", new_branch_name])
		print("branch pushed, making PR")
		self.forge.make_pr_between_branches(new_branch_name, "main", title, title)
		subprocess.check_call(["git", "checkout", "main"])



	

em = EntryManager()
lm = LemmyFetcher(em)

# Smoke run against the bundled sample: process the first post of the test
# file without opening a PR. The file is read inside a "with" block so the
# handle is closed deterministically.
with open("tools/test_post.json", "r") as f:
	test_post = json.load(f)["posts"][0]["post"]

lm.process_post(test_post, False)

print("done")
 No newline at end of file
+11 −0
Original line number Diff line number Diff line
class ForgeBase:
	"""Minimal common API for interoperability between different software forges.

	In practice it only covers what is needed to open a pull request; every
	operation raises NotImplementedError here and is meant to be overridden.
	"""

	def __init__(self):
		"""Nothing to set up at this level."""

	def make_pr_between_branches(self, from_branch, to_branch, title, message):
		"""Open a pull request from `from_branch` into `to_branch` with the given title and message."""
		raise NotImplementedError

	def does_pr_already_exist(self, from_branch):
		"""Tell whether a pull request already exists for `from_branch`."""
		raise NotImplementedError
 No newline at end of file
+44 −0
Original line number Diff line number Diff line
from github import Auth
from github import Github

from lib.forge_base import ForgeBase
import os

class GithubForge(ForgeBase):
	"""ForgeBase implementation that talks to GitHub through PyGithub.

	Configuration is read with from_env(); the API client and repository handle
	are created lazily on the first call that needs them.
	"""

	def __init__(self):
		super().__init__()
		# API handles, created on demand by load_if_needed()
		self.github = None
		self.repo = None
		# Configuration, filled in by from_env(). Defined here so that load()
		# can report what is missing instead of failing with AttributeError.
		self.github_token = None
		self.owner_name = None
		self.repo_name = None

	def from_env(self):
		"""Read the token, owner and repository name from GITHUB_TOKEN, GITHUB_OWNER and GITHUB_REPO_NAME."""
		self.github_token = os.environ.get("GITHUB_TOKEN")
		self.owner_name = os.environ.get("GITHUB_OWNER")
		self.repo_name = os.environ.get("GITHUB_REPO_NAME")

	def load(self):
		"""Authenticate and fetch the "<owner>/<repo>" repository handle.

		Raises RuntimeError when any of the three configuration values is unset.
		"""
		settings = (
			("GITHUB_TOKEN", self.github_token),
			("GITHUB_OWNER", self.owner_name),
			("GITHUB_REPO_NAME", self.repo_name),
		)
		missing = [name for name, value in settings if not value]
		if missing:
			raise RuntimeError("Missing GitHub configuration: " + ", ".join(missing))

		auth = Auth.Token(self.github_token)
		self.github = Github(auth = auth)
		self.repo = self.github.get_repo(self.owner_name + "/" + self.repo_name)

	def load_if_needed(self):
		"""Call load() unless the API client has already been created."""
		if self.github is None:
			self.load()

	def does_pr_already_exist(self, from_branch):
		"""Return True when at least one open pull request has `from_branch` as its head."""
		self.load_if_needed()

		# GitHub documents the "head" filter as "owner:branch", so the branch name
		# is qualified with the configured owner.
		# NOTE(review): assumes the branch lives in the configured owner's
		# repository rather than in a fork -- confirm.
		qualified_head = self.owner_name + ":" + from_branch
		return any(True for _ in self.repo.get_pulls(state="open", head=qualified_head))

	def make_pr_between_branches(self, from_branch, to_branch, title, body):
		"""Create a pull request from `from_branch` into `to_branch` with the given title and body."""
		self.load_if_needed()

		self.repo.create_pull(
			title = title,
			base = to_branch,
			head = from_branch,
			body = body
		)
 No newline at end of file