From 19899cafee7a146c259854f87857f178d4ddb2aa Mon Sep 17 00:00:00 2001 From: Lynne Date: Thu, 25 Oct 2018 12:37:11 +1000 Subject: [PATCH] Version 2.0, with vastly improved toot fetching capabilities! --- main.py | 181 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 112 insertions(+), 69 deletions(-) diff --git a/main.py b/main.py index 5f6872d..cb9d2cd 100755 --- a/main.py +++ b/main.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +# toot downloader version two!! # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. @@ -6,98 +7,87 @@ from mastodon import Mastodon from os import path from bs4 import BeautifulSoup -import shutil, os, sqlite3, signal, sys, json -# import re +import os, sqlite3, signal, sys, json, re +import requests scopes = ["read:statuses", "read:accounts", "read:follows", "write:statuses"] cfg = json.load(open('config.json', 'r')) -if not path.exists("clientcred.secret"): +if os.path.exists("clientcred.secret"): + print("Upgrading to new storage method") + cc = open("clientcred.secret").read().split("\n") + cfg['client'] = { + "id": cc[0], + "secret": cc[1] + } + cfg['secret'] = open("usercred.secret").read().rstrip("\n") + os.remove("clientcred.secret") + os.remove("usercred.secret") + - print("No clientcred.secret, registering application") - Mastodon.create_app("lynnesbian_mastodon_ebooks", api_base_url=cfg['site'], to_file="clientcred.secret", scopes=scopes, website="https://github.com/Lynnesbian/mastodon-ebooks") +if "client" not in cfg: + print("No client credentials, registering application") + client_id, client_secret = Mastodon.create_app("mstdn-ebooks", + api_base_url=cfg['site'], + scopes=scopes, + website="https://github.com/Lynnesbian/mstdn-ebooks") -if not path.exists("usercred.secret"): - print("No usercred.secret, registering application") - client = Mastodon(client_id="clientcred.secret", api_base_url=cfg['site']) - print("Visit this url:") - print(client.auth_request_url(scopes=scopes)) - client.log_in(code=input("Secret: "), to_file="usercred.secret", scopes=scopes) + cfg['client'] = { + "id": client_id, + "secret": client_secret + } -def parse_toot(toot): - if toot.spoiler_text != "": return - if toot.reblog is not None: return - if toot.visibility not in ["public", "unlisted"]: return +if "secret" not in cfg: + print("No user credentials, logging in") + client = Mastodon(client_id = cfg['client']['id'], + client_secret = cfg['client']['secret'], + api_base_url=cfg['site']) - soup = BeautifulSoup(toot.content, "html.parser") + print("Open this URL: {}".format(client.auth_request_url(scopes=scopes))) + cfg['secret'] = client.log_in(code=input("Secret: "), scopes=scopes) + +json.dump(cfg, open("config.json", "w+")) + +def extract_toot(toot): + toot = toot.replace("'", "'") + toot = toot.replace(""", '"') + soup = BeautifulSoup(toot, "html.parser") - # pull the mentions out - # for mention in soup.select("span.h-card"): - # mention.unwrap() - - # for mention in soup.select("a.u-url.mention"): - # mention.unwrap() - # this is the code that removes all mentions # TODO: make it so that it removes the @ and instance but keeps the name for mention in soup.select("span.h-card"): - mention.decompose() + mention.a.unwrap() + mention.span.unwrap() - # make all linebreaks actual linebreaks + # replace
with linebreak for lb in soup.select("br"): lb.insert_after("\n") lb.decompose() - # make each p element its own line because sometimes they decide not to be + # replace

with linebreak for p in soup.select("p"): p.insert_after("\n") p.unwrap() - # keep hashtags in the toots + # fix hashtags for ht in soup.select("a.hashtag"): ht.unwrap() - # unwrap all links (i like the bots posting links) + # fix links for link in soup.select("a"): link.insert_after(link["href"]) link.decompose() - text = map(lambda a: a.strip(), soup.get_text().strip().split("\n")) - - # next up: store this and patch markovify to take it - # return {"text": text, "mentions": mentions, "links": links} - # it's 4am though so we're not doing that now, but i still want the parser updates - #todo: we split above and join now, which is dumb, but i don't wanna mess with the map code bc i don't understand it uwu - text = "\n".join(list(text)) - text = text.replace("'", "'") - return text - -def get_toots(client, id, since_id): - i = 0 - toots = client.account_statuses(id, since_id = since_id) - while toots is not None and len(toots) > 0: - for toot in toots: - t = parse_toot(toot) - if t != None: - yield { - "content": t, - "id": toot.id - } - try: - toots = client.fetch_next(toots) - except TimeoutError: - print("Operation timed out, committing to database and exiting.") - db.commit() - db.close() - sys.exit(1) - i += 1 - if i%10 == 0: - print(i) + toot = soup.get_text() + toot = toot.rstrip("\n") #remove trailing newline + toot = toot.replace("@", "@\u202B") #put a zws between @ and username to avoid mentioning + return(toot) client = Mastodon( - client_id="clientcred.secret", - access_token="usercred.secret", - api_base_url=cfg['site']) + client_id=cfg['client']['id'], + client_secret = cfg['client']['secret'], + access_token=cfg['secret'], + api_base_url=cfg['site']) me = client.account_verify_credentials() following = client.account_following(me.id) @@ -105,7 +95,7 @@ following = client.account_following(me.id) db = sqlite3.connect("toots.db") db.text_factory=str c = db.cursor() -c.execute("CREATE TABLE IF NOT EXISTS `toots` (id INT NOT NULL UNIQUE PRIMARY KEY, userid INT NOT NULL, content VARCHAR NOT NULL) WITHOUT ROWID") +c.execute("CREATE TABLE IF NOT EXISTS `toots` (id INT NOT NULL UNIQUE PRIMARY KEY, userid INT NOT NULL, uri VARCHAR NOT NULL, content VARCHAR NOT NULL) WITHOUT ROWID") db.commit() def handleCtrlC(signal, frame): @@ -121,12 +111,65 @@ for f in following: last_toot = last_toot[0] else: last_toot = 0 - print("Downloading toots for user @{}, starting from {}".format(f.username, last_toot)) - for t in get_toots(client, f.id, last_toot): - # try: - c.execute("REPLACE INTO toots (id, userid, content) VALUES (?, ?, ?)", (t['id'], f.id, t['content'])) - # except: - # pass #ignore toots that can't be encoded properly + print("Harvesting toots for user @{}, starting from {}".format(f.acct, last_toot)) + + #find the user's activitypub outbox + print("WebFingering...") + instance = re.search(r"^.*@(.+)", f.acct) + if instance == None: + instance = re.search(r"https?:\/\/(.*)", cfg['site']).group(1) + else: + instance = instance.group(1) + + # print("{} is on {}".format(f.acct, instance)) + try: + r = requests.get("https://{}/.well-known/host-meta".format(instance)) + uri = re.search(r'template="([^"]+)"', r.text).group(1) + uri = uri.format(uri = "{}@{}".format(f.username, instance)) + r = requests.get(uri) + uri = r.json()['aliases'][1] #TODO: find out if it's safe to rely on this + uri = "{}/outbox?page=true&min_id={}".format(uri, last_toot) + r = requests.get(uri) + j = r.json() + except Exception: + print("oopsy woopsy!! we made a fucky wucky!!!\n(we're probably rate limited, please hang up and try again)") + sys.exit(1) + + print("Downloading and parsing toots", end='', flush=True) + current = None + try: + while len(j['orderedItems']) > 0: + for oi in j['orderedItems']: + if oi['type'] == "Create": + # its a toost baby + content = oi['object']['content'] + if oi['object']['summary'] != None: + #don't download CW'd toots + continue + toot = extract_toot(content) + # print(toot) + try: + c.execute("REPLACE INTO toots (id, userid, uri, content) VALUES (?, ?, ?, ?)", + (re.search(r"[^\/]+$", oi['object']['id']).group(0), + f.id, + oi['object']['id'], + toot + ) + ) + pass + except: + pass #ignore any toots that don't go into the DB + # sys.exit(0) + r = requests.get(j['prev']) + j = r.json() + print('.', end='', flush=True) + print(" Done!") + db.commit() + except: + print("Encountered an error! Saving toots to database and exiting.") + db.commit() + db.close() + sys.exit(1) db.commit() db.execute("VACUUM") #compact db