Version 2.0, with vastly improved toot fetching capabilities!

2018-10-25 12:37:11 +10:00 · 2018-10-25 12:37:11 +10:00 · 19899cafee
parent 3d059d0b9b
commit 19899cafee
1 changed file with 112 additions and 69 deletions
--- a/main.py
+++ b/main.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
 # toot downloader version two!!
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
@@ -6,97 +7,86 @@
 from mastodon import Mastodon
 from os import path
 from bs4 import BeautifulSoup
-import shutil, os, sqlite3, signal, sys, json
+import os, sqlite3, signal, sys, json, re
-# import re
+import requests
 scopes = ["read:statuses", "read:accounts", "read:follows", "write:statuses"]
 cfg = json.load(open('config.json', 'r'))
-if not path.exists("clientcred.secret"):
+if os.path.exists("clientcred.secret"):
    print("Upgrading to new storage method")
    cc = open("clientcred.secret").read().split("\n")
    cfg['client'] = {
        "id": cc[0],
        "secret": cc[1]
    }
    cfg['secret'] = open("usercred.secret").read().rstrip("\n")
    os.remove("clientcred.secret")
    os.remove("usercred.secret")
    print("No clientcred.secret, registering application")
    Mastodon.create_app("lynnesbian_mastodon_ebooks", api_base_url=cfg['site'], to_file="clientcred.secret", scopes=scopes, website="https://github.com/Lynnesbian/mastodon-ebooks")
-if not path.exists("usercred.secret"):
+if "client" not in cfg:
-    print("No usercred.secret, registering application")
+	print("No client credentials, registering application")
-    client = Mastodon(client_id="clientcred.secret", api_base_url=cfg['site'])
+	client_id, client_secret = Mastodon.create_app("mstdn-ebooks",
-    print("Visit this url:")
+		api_base_url=cfg['site'],
-    print(client.auth_request_url(scopes=scopes))
+		scopes=scopes,
-    client.log_in(code=input("Secret: "), to_file="usercred.secret", scopes=scopes)
+		website="https://github.com/Lynnesbian/mstdn-ebooks")
-def parse_toot(toot):
+	cfg['client'] = {
-	if toot.spoiler_text != "": return
+		"id": client_id,
-	if toot.reblog is not None: return
+		"secret": client_secret
-	if toot.visibility not in ["public", "unlisted"]: return
+	}
-	soup = BeautifulSoup(toot.content, "html.parser")
+if "secret" not in cfg:
 	print("No user credentials, logging in")
 	client = Mastodon(client_id = cfg['client']['id'],
 		client_secret = cfg['client']['secret'],
 		api_base_url=cfg['site'])
-	# pull the mentions out
+	print("Open this URL: {}".format(client.auth_request_url(scopes=scopes)))
-	# for mention in soup.select("span.h-card"):
+	cfg['secret'] = client.log_in(code=input("Secret: "), scopes=scopes)
 	#     mention.unwrap()
-	# for mention in soup.select("a.u-url.mention"):
+json.dump(cfg, open("config.json", "w+"))
-	#     mention.unwrap()
+
 def extract_toot(toot):
 	toot = toot.replace("&apos;", "'")
 	toot = toot.replace("&quot;", '"')
 	soup = BeautifulSoup(toot, "html.parser")
 	# this is the code that removes all mentions
 	# TODO: make it so that it removes the @ and instance but keeps the name
 	for mention in soup.select("span.h-card"):
-		mention.decompose()
+		mention.a.unwrap()
 		mention.span.unwrap()
-	# make all linebreaks actual linebreaks
+	# replace <br> with linebreak
 	for lb in soup.select("br"):
 		lb.insert_after("\n")
 		lb.decompose()
-	# make each p element its own line because sometimes they decide not to be
+	# replace <p> with linebreak
 	for p in soup.select("p"):
 		p.insert_after("\n")
 		p.unwrap()
-	# keep hashtags in the toots
+	# fix hashtags
 	for ht in soup.select("a.hashtag"):
 		ht.unwrap()
-	# unwrap all links (i like the bots posting links)
+	# fix links
 	for link in soup.select("a"):
 		link.insert_after(link["href"])
 		link.decompose()
-	text = map(lambda a: a.strip(), soup.get_text().strip().split("\n"))
+	toot = soup.get_text()
-
+	toot = toot.rstrip("\n") #remove trailing newline
-	# next up: store this and patch markovify to take it
+	toot = toot.replace("@", "@\u202B") #put a zws between @ and username to avoid mentioning
-	# return {"text": text, "mentions": mentions, "links": links}
+	return(toot)
 	# it's 4am though so we're not doing that now, but i still want the parser updates
 	#todo: we split above and join now, which is dumb, but i don't wanna mess with the map code bc i don't understand it uwu
 	text = "\n".join(list(text)) 
 	text = text.replace("&apos;", "'")
 	return text
 def get_toots(client, id, since_id):
 	i = 0
 	toots = client.account_statuses(id, since_id = since_id)
 	while toots is not None and len(toots) > 0:
 		for toot in toots:
 			t = parse_toot(toot)
 			if t != None:
 				yield {
 					"content": t,
 					"id": toot.id
 				}
 		try:
 			toots = client.fetch_next(toots)
 		except TimeoutError:
 			print("Operation timed out, committing to database and exiting.")
 			db.commit()
 			db.close()
 			sys.exit(1)
 		i += 1
 		if i%10 == 0:
 			print(i)
 client = Mastodon(
-		client_id="clientcred.secret", 
+	client_id=cfg['client']['id'],
-		access_token="usercred.secret", 
+	client_secret = cfg['client']['secret'], 
 	access_token=cfg['secret'], 
 	api_base_url=cfg['site'])
 me = client.account_verify_credentials()
@@ -105,7 +95,7 @@ following = client.account_following(me.id)
 db = sqlite3.connect("toots.db")
 db.text_factory=str
 c = db.cursor()
-c.execute("CREATE TABLE IF NOT EXISTS `toots` (id INT NOT NULL UNIQUE PRIMARY KEY, userid INT NOT NULL, content VARCHAR NOT NULL) WITHOUT ROWID")
+c.execute("CREATE TABLE IF NOT EXISTS `toots` (id INT NOT NULL UNIQUE PRIMARY KEY, userid INT NOT NULL, uri VARCHAR NOT NULL, content VARCHAR NOT NULL) WITHOUT ROWID")
 db.commit()
 def handleCtrlC(signal, frame):
@@ -121,12 +111,65 @@ for f in following:
 		last_toot = last_toot[0]
 	else:
 		last_toot = 0
-	print("Downloading toots for user @{}, starting from {}".format(f.username, last_toot))
+	print("Harvesting toots for user @{}, starting from {}".format(f.acct, last_toot))
-	for t in get_toots(client, f.id, last_toot):
+
-		# try:
+	#find the user's activitypub outbox
-		c.execute("REPLACE INTO toots (id, userid, content) VALUES (?, ?, ?)", (t['id'], f.id, t['content']))
+	print("WebFingering...")
-		# except:
+	instance = re.search(r"^.*@(.+)", f.acct)
-		# 	pass #ignore toots that can't be encoded properly
+	if instance == None:
 		instance = re.search(r"https?:\/\/(.*)", cfg['site']).group(1)
 	else:
 		instance = instance.group(1)
 	# print("{} is on {}".format(f.acct, instance))
 	try:
 		r = requests.get("https://{}/.well-known/host-meta".format(instance))
 		uri = re.search(r'template="([^"]+)"', r.text).group(1)
 		uri = uri.format(uri = "{}@{}".format(f.username, instance))
 		r = requests.get(uri)
 		uri = r.json()['aliases'][1] #TODO: find out if it's safe to rely on this
 		uri = "{}/outbox?page=true&min_id={}".format(uri, last_toot)
 		r = requests.get(uri)
 		j = r.json()
 	except Exception:
 		print("oopsy woopsy!! we made a fucky wucky!!!\n(we're probably rate limited, please hang up and try again)")
 		sys.exit(1)
 	print("Downloading and parsing toots", end='', flush=True)
 	current = None
 	try:
 		while len(j['orderedItems']) > 0:
 			for oi in j['orderedItems']:
 				if oi['type'] == "Create":
 					# its a toost baby
 					content = oi['object']['content']
 					if oi['object']['summary'] != None:
 						#don't download CW'd toots
 						continue
 					toot = extract_toot(content)
 					# print(toot)
 					try:
 						c.execute("REPLACE INTO toots (id, userid, uri, content) VALUES (?, ?, ?, ?)",
 							(re.search(r"[^\/]+$", oi['object']['id']).group(0),
 							f.id,
 							oi['object']['id'],
 							toot
 							)
 						)
 						pass
 					except:
 						pass #ignore any toots that don't go into the DB
 			# sys.exit(0)
 			r = requests.get(j['prev'])
 			j = r.json()
 			print('.', end='', flush=True)
 		print(" Done!")
 		db.commit()
 	except:
 		print("Encountered an error! Saving toots to database and exiting.")
 		db.commit()
 		db.close()
 		sys.exit(1)
 db.commit()
 db.execute("VACUUM") #compact db