Version 2.0, with vastly improved toot fetching capabilities!
This commit is contained in:
		
							parent
							
								
									3d059d0b9b
								
							
						
					
					
						commit
						19899cafee
					
				
							
								
								
									
										181
									
								
								main.py
								
								
								
								
							
							
						
						
									
										181
									
								
								main.py
								
								
								
								
							| 
						 | 
					@ -1,4 +1,5 @@
 | 
				
			||||||
#!/usr/bin/env python3
 | 
					#!/usr/bin/env python3
 | 
				
			||||||
 | 
					# toot downloader version two!!
 | 
				
			||||||
# This Source Code Form is subject to the terms of the Mozilla Public
 | 
					# This Source Code Form is subject to the terms of the Mozilla Public
 | 
				
			||||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
 | 
					# License, v. 2.0. If a copy of the MPL was not distributed with this
 | 
				
			||||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
 | 
					# file, You can obtain one at http://mozilla.org/MPL/2.0/.
 | 
				
			||||||
| 
						 | 
					@ -6,98 +7,87 @@
 | 
				
			||||||
from mastodon import Mastodon
 | 
					from mastodon import Mastodon
 | 
				
			||||||
from os import path
 | 
					from os import path
 | 
				
			||||||
from bs4 import BeautifulSoup
 | 
					from bs4 import BeautifulSoup
 | 
				
			||||||
import shutil, os, sqlite3, signal, sys, json
 | 
					import os, sqlite3, signal, sys, json, re
 | 
				
			||||||
# import re
 | 
					import requests
 | 
				
			||||||
 | 
					
 | 
				
			||||||
scopes = ["read:statuses", "read:accounts", "read:follows", "write:statuses"]
 | 
					scopes = ["read:statuses", "read:accounts", "read:follows", "write:statuses"]
 | 
				
			||||||
cfg = json.load(open('config.json', 'r'))
 | 
					cfg = json.load(open('config.json', 'r'))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if not path.exists("clientcred.secret"):
 | 
					if os.path.exists("clientcred.secret"):
 | 
				
			||||||
 | 
					    print("Upgrading to new storage method")
 | 
				
			||||||
 | 
					    cc = open("clientcred.secret").read().split("\n")
 | 
				
			||||||
 | 
					    cfg['client'] = {
 | 
				
			||||||
 | 
					        "id": cc[0],
 | 
				
			||||||
 | 
					        "secret": cc[1]
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    cfg['secret'] = open("usercred.secret").read().rstrip("\n")
 | 
				
			||||||
 | 
					    os.remove("clientcred.secret")
 | 
				
			||||||
 | 
					    os.remove("usercred.secret")
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    print("No clientcred.secret, registering application")
 | 
					if "client" not in cfg:
 | 
				
			||||||
    Mastodon.create_app("lynnesbian_mastodon_ebooks", api_base_url=cfg['site'], to_file="clientcred.secret", scopes=scopes, website="https://github.com/Lynnesbian/mastodon-ebooks")
 | 
						print("No client credentials, registering application")
 | 
				
			||||||
 | 
						client_id, client_secret = Mastodon.create_app("mstdn-ebooks",
 | 
				
			||||||
 | 
							api_base_url=cfg['site'],
 | 
				
			||||||
 | 
							scopes=scopes,
 | 
				
			||||||
 | 
							website="https://github.com/Lynnesbian/mstdn-ebooks")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if not path.exists("usercred.secret"):
 | 
						cfg['client'] = {
 | 
				
			||||||
    print("No usercred.secret, registering application")
 | 
							"id": client_id,
 | 
				
			||||||
    client = Mastodon(client_id="clientcred.secret", api_base_url=cfg['site'])
 | 
							"secret": client_secret
 | 
				
			||||||
    print("Visit this url:")
 | 
						}
 | 
				
			||||||
    print(client.auth_request_url(scopes=scopes))
 | 
					 | 
				
			||||||
    client.log_in(code=input("Secret: "), to_file="usercred.secret", scopes=scopes)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
def parse_toot(toot):
 | 
					if "secret" not in cfg:
 | 
				
			||||||
	if toot.spoiler_text != "": return
 | 
						print("No user credentials, logging in")
 | 
				
			||||||
	if toot.reblog is not None: return
 | 
						client = Mastodon(client_id = cfg['client']['id'],
 | 
				
			||||||
	if toot.visibility not in ["public", "unlisted"]: return
 | 
							client_secret = cfg['client']['secret'],
 | 
				
			||||||
 | 
							api_base_url=cfg['site'])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	soup = BeautifulSoup(toot.content, "html.parser")
 | 
						print("Open this URL: {}".format(client.auth_request_url(scopes=scopes)))
 | 
				
			||||||
 | 
						cfg['secret'] = client.log_in(code=input("Secret: "), scopes=scopes)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					json.dump(cfg, open("config.json", "w+"))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def extract_toot(toot):
 | 
				
			||||||
 | 
						toot = toot.replace("&apos;", "'")
 | 
				
			||||||
 | 
						toot = toot.replace("&quot;", '"')
 | 
				
			||||||
 | 
						soup = BeautifulSoup(toot, "html.parser")
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
	# pull the mentions out
 | 
					 | 
				
			||||||
	# for mention in soup.select("span.h-card"):
 | 
					 | 
				
			||||||
	#     mention.unwrap()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	# for mention in soup.select("a.u-url.mention"):
 | 
					 | 
				
			||||||
	#     mention.unwrap()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	# this is the code that removes all mentions
 | 
						# this is the code that removes all mentions
 | 
				
			||||||
	# TODO: make it so that it removes the @ and instance but keeps the name
 | 
						# TODO: make it so that it removes the @ and instance but keeps the name
 | 
				
			||||||
	for mention in soup.select("span.h-card"):
 | 
						for mention in soup.select("span.h-card"):
 | 
				
			||||||
		mention.decompose()
 | 
							mention.a.unwrap()
 | 
				
			||||||
 | 
							mention.span.unwrap()
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
	# make all linebreaks actual linebreaks
 | 
						# replace <br> with linebreak
 | 
				
			||||||
	for lb in soup.select("br"):
 | 
						for lb in soup.select("br"):
 | 
				
			||||||
		lb.insert_after("\n")
 | 
							lb.insert_after("\n")
 | 
				
			||||||
		lb.decompose()
 | 
							lb.decompose()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	# make each p element its own line because sometimes they decide not to be
 | 
						# replace <p> with linebreak
 | 
				
			||||||
	for p in soup.select("p"):
 | 
						for p in soup.select("p"):
 | 
				
			||||||
		p.insert_after("\n")
 | 
							p.insert_after("\n")
 | 
				
			||||||
		p.unwrap()
 | 
							p.unwrap()
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
	# keep hashtags in the toots
 | 
						# fix hashtags
 | 
				
			||||||
	for ht in soup.select("a.hashtag"):
 | 
						for ht in soup.select("a.hashtag"):
 | 
				
			||||||
		ht.unwrap()
 | 
							ht.unwrap()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	# unwrap all links (i like the bots posting links)
 | 
						# fix links
 | 
				
			||||||
	for link in soup.select("a"):
 | 
						for link in soup.select("a"):
 | 
				
			||||||
		link.insert_after(link["href"])
 | 
							link.insert_after(link["href"])
 | 
				
			||||||
		link.decompose()
 | 
							link.decompose()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	text = map(lambda a: a.strip(), soup.get_text().strip().split("\n"))
 | 
						toot = soup.get_text()
 | 
				
			||||||
 | 
						toot = toot.rstrip("\n") #remove trailing newline
 | 
				
			||||||
	# next up: store this and patch markovify to take it
 | 
						toot = toot.replace("@", "@\u202B") #put a zws between @ and username to avoid mentioning
 | 
				
			||||||
	# return {"text": text, "mentions": mentions, "links": links}
 | 
						return(toot)
 | 
				
			||||||
	# it's 4am though so we're not doing that now, but i still want the parser updates
 | 
					 | 
				
			||||||
	#todo: we split above and join now, which is dumb, but i don't wanna mess with the map code bc i don't understand it uwu
 | 
					 | 
				
			||||||
	text = "\n".join(list(text)) 
 | 
					 | 
				
			||||||
	text = text.replace("&apos;", "'")
 | 
					 | 
				
			||||||
	return text
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def get_toots(client, id, since_id):
 | 
					 | 
				
			||||||
	i = 0
 | 
					 | 
				
			||||||
	toots = client.account_statuses(id, since_id = since_id)
 | 
					 | 
				
			||||||
	while toots is not None and len(toots) > 0:
 | 
					 | 
				
			||||||
		for toot in toots:
 | 
					 | 
				
			||||||
			t = parse_toot(toot)
 | 
					 | 
				
			||||||
			if t != None:
 | 
					 | 
				
			||||||
				yield {
 | 
					 | 
				
			||||||
					"content": t,
 | 
					 | 
				
			||||||
					"id": toot.id
 | 
					 | 
				
			||||||
				}
 | 
					 | 
				
			||||||
		try:
 | 
					 | 
				
			||||||
			toots = client.fetch_next(toots)
 | 
					 | 
				
			||||||
		except TimeoutError:
 | 
					 | 
				
			||||||
			print("Operation timed out, committing to database and exiting.")
 | 
					 | 
				
			||||||
			db.commit()
 | 
					 | 
				
			||||||
			db.close()
 | 
					 | 
				
			||||||
			sys.exit(1)
 | 
					 | 
				
			||||||
		i += 1
 | 
					 | 
				
			||||||
		if i%10 == 0:
 | 
					 | 
				
			||||||
			print(i)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
client = Mastodon(
 | 
					client = Mastodon(
 | 
				
			||||||
		client_id="clientcred.secret", 
 | 
						client_id=cfg['client']['id'],
 | 
				
			||||||
		access_token="usercred.secret", 
 | 
						client_secret = cfg['client']['secret'], 
 | 
				
			||||||
		api_base_url=cfg['site'])
 | 
						access_token=cfg['secret'], 
 | 
				
			||||||
 | 
						api_base_url=cfg['site'])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
me = client.account_verify_credentials()
 | 
					me = client.account_verify_credentials()
 | 
				
			||||||
following = client.account_following(me.id)
 | 
					following = client.account_following(me.id)
 | 
				
			||||||
| 
						 | 
					@ -105,7 +95,7 @@ following = client.account_following(me.id)
 | 
				
			||||||
db = sqlite3.connect("toots.db")
 | 
					db = sqlite3.connect("toots.db")
 | 
				
			||||||
db.text_factory=str
 | 
					db.text_factory=str
 | 
				
			||||||
c = db.cursor()
 | 
					c = db.cursor()
 | 
				
			||||||
c.execute("CREATE TABLE IF NOT EXISTS `toots` (id INT NOT NULL UNIQUE PRIMARY KEY, userid INT NOT NULL, content VARCHAR NOT NULL) WITHOUT ROWID")
 | 
					c.execute("CREATE TABLE IF NOT EXISTS `toots` (id INT NOT NULL UNIQUE PRIMARY KEY, userid INT NOT NULL, uri VARCHAR NOT NULL, content VARCHAR NOT NULL) WITHOUT ROWID")
 | 
				
			||||||
db.commit()
 | 
					db.commit()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def handleCtrlC(signal, frame):
 | 
					def handleCtrlC(signal, frame):
 | 
				
			||||||
| 
						 | 
					@ -121,12 +111,65 @@ for f in following:
 | 
				
			||||||
		last_toot = last_toot[0]
 | 
							last_toot = last_toot[0]
 | 
				
			||||||
	else:
 | 
						else:
 | 
				
			||||||
		last_toot = 0
 | 
							last_toot = 0
 | 
				
			||||||
	print("Downloading toots for user @{}, starting from {}".format(f.username, last_toot))
 | 
						print("Harvesting toots for user @{}, starting from {}".format(f.acct, last_toot))
 | 
				
			||||||
	for t in get_toots(client, f.id, last_toot):
 | 
					
 | 
				
			||||||
		# try:
 | 
						#find the user's activitypub outbox
 | 
				
			||||||
		c.execute("REPLACE INTO toots (id, userid, content) VALUES (?, ?, ?)", (t['id'], f.id, t['content']))
 | 
						print("WebFingering...")
 | 
				
			||||||
		# except:
 | 
						instance = re.search(r"^.*@(.+)", f.acct)
 | 
				
			||||||
		# 	pass #ignore toots that can't be encoded properly
 | 
						if instance == None:
 | 
				
			||||||
 | 
							instance = re.search(r"https?:\/\/(.*)", cfg['site']).group(1)
 | 
				
			||||||
 | 
						else:
 | 
				
			||||||
 | 
							instance = instance.group(1)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						# print("{} is on {}".format(f.acct, instance))
 | 
				
			||||||
 | 
						try:
 | 
				
			||||||
 | 
							r = requests.get("https://{}/.well-known/host-meta".format(instance))
 | 
				
			||||||
 | 
							uri = re.search(r'template="([^"]+)"', r.text).group(1)
 | 
				
			||||||
 | 
							uri = uri.format(uri = "{}@{}".format(f.username, instance))
 | 
				
			||||||
 | 
							r = requests.get(uri)
 | 
				
			||||||
 | 
							uri = r.json()['aliases'][1] #TODO: find out if it's safe to rely on this
 | 
				
			||||||
 | 
							uri = "{}/outbox?page=true&min_id={}".format(uri, last_toot)
 | 
				
			||||||
 | 
							r = requests.get(uri)
 | 
				
			||||||
 | 
							j = r.json()
 | 
				
			||||||
 | 
						except Exception:
 | 
				
			||||||
 | 
							print("oopsy woopsy!! we made a fucky wucky!!!\n(we're probably rate limited, please hang up and try again)")
 | 
				
			||||||
 | 
							sys.exit(1)
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						print("Downloading and parsing toots", end='', flush=True)
 | 
				
			||||||
 | 
						current = None
 | 
				
			||||||
 | 
						try:
 | 
				
			||||||
 | 
							while len(j['orderedItems']) > 0:
 | 
				
			||||||
 | 
								for oi in j['orderedItems']:
 | 
				
			||||||
 | 
									if oi['type'] == "Create":
 | 
				
			||||||
 | 
										# its a toost baby
 | 
				
			||||||
 | 
										content = oi['object']['content']
 | 
				
			||||||
 | 
										if oi['object']['summary'] != None:
 | 
				
			||||||
 | 
											#don't download CW'd toots
 | 
				
			||||||
 | 
											continue
 | 
				
			||||||
 | 
										toot = extract_toot(content)
 | 
				
			||||||
 | 
										# print(toot)
 | 
				
			||||||
 | 
										try:
 | 
				
			||||||
 | 
											c.execute("REPLACE INTO toots (id, userid, uri, content) VALUES (?, ?, ?, ?)",
 | 
				
			||||||
 | 
												(re.search(r"[^\/]+$", oi['object']['id']).group(0),
 | 
				
			||||||
 | 
												f.id,
 | 
				
			||||||
 | 
												oi['object']['id'],
 | 
				
			||||||
 | 
												toot
 | 
				
			||||||
 | 
												)
 | 
				
			||||||
 | 
											)
 | 
				
			||||||
 | 
											pass
 | 
				
			||||||
 | 
										except:
 | 
				
			||||||
 | 
											pass #ignore any toots that don't go into the DB
 | 
				
			||||||
 | 
								# sys.exit(0)
 | 
				
			||||||
 | 
								r = requests.get(j['prev'])
 | 
				
			||||||
 | 
								j = r.json()
 | 
				
			||||||
 | 
								print('.', end='', flush=True)
 | 
				
			||||||
 | 
							print(" Done!")
 | 
				
			||||||
 | 
							db.commit()
 | 
				
			||||||
 | 
						except:
 | 
				
			||||||
 | 
							print("Encountered an error! Saving toots to database and exiting.")
 | 
				
			||||||
 | 
							db.commit()
 | 
				
			||||||
 | 
							db.close()
 | 
				
			||||||
 | 
							sys.exit(1)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
db.commit()
 | 
					db.commit()
 | 
				
			||||||
db.execute("VACUUM") #compact db
 | 
					db.execute("VACUUM") #compact db
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Reference in New Issue