Version 2.0, with vastly improved toot fetching capabilities!
This commit is contained in:
parent
3d059d0b9b
commit
19899cafee
181
main.py
181
main.py
|
@ -1,4 +1,5 @@
|
|||
#!/usr/bin/env python3
|
||||
# toot downloader version two!!
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
@ -6,98 +7,87 @@
|
|||
from mastodon import Mastodon
|
||||
from os import path
|
||||
from bs4 import BeautifulSoup
|
||||
import shutil, os, sqlite3, signal, sys, json
|
||||
# import re
|
||||
import os, sqlite3, signal, sys, json, re
|
||||
import requests
|
||||
|
||||
scopes = ["read:statuses", "read:accounts", "read:follows", "write:statuses"]
|
||||
cfg = json.load(open('config.json', 'r'))
|
||||
|
||||
if not path.exists("clientcred.secret"):
|
||||
if os.path.exists("clientcred.secret"):
|
||||
print("Upgrading to new storage method")
|
||||
cc = open("clientcred.secret").read().split("\n")
|
||||
cfg['client'] = {
|
||||
"id": cc[0],
|
||||
"secret": cc[1]
|
||||
}
|
||||
cfg['secret'] = open("usercred.secret").read().rstrip("\n")
|
||||
os.remove("clientcred.secret")
|
||||
os.remove("usercred.secret")
|
||||
|
||||
|
||||
print("No clientcred.secret, registering application")
|
||||
Mastodon.create_app("lynnesbian_mastodon_ebooks", api_base_url=cfg['site'], to_file="clientcred.secret", scopes=scopes, website="https://github.com/Lynnesbian/mastodon-ebooks")
|
||||
if "client" not in cfg:
|
||||
print("No client credentials, registering application")
|
||||
client_id, client_secret = Mastodon.create_app("mstdn-ebooks",
|
||||
api_base_url=cfg['site'],
|
||||
scopes=scopes,
|
||||
website="https://github.com/Lynnesbian/mstdn-ebooks")
|
||||
|
||||
if not path.exists("usercred.secret"):
|
||||
print("No usercred.secret, registering application")
|
||||
client = Mastodon(client_id="clientcred.secret", api_base_url=cfg['site'])
|
||||
print("Visit this url:")
|
||||
print(client.auth_request_url(scopes=scopes))
|
||||
client.log_in(code=input("Secret: "), to_file="usercred.secret", scopes=scopes)
|
||||
cfg['client'] = {
|
||||
"id": client_id,
|
||||
"secret": client_secret
|
||||
}
|
||||
|
||||
def parse_toot(toot):
|
||||
if toot.spoiler_text != "": return
|
||||
if toot.reblog is not None: return
|
||||
if toot.visibility not in ["public", "unlisted"]: return
|
||||
if "secret" not in cfg:
|
||||
print("No user credentials, logging in")
|
||||
client = Mastodon(client_id = cfg['client']['id'],
|
||||
client_secret = cfg['client']['secret'],
|
||||
api_base_url=cfg['site'])
|
||||
|
||||
soup = BeautifulSoup(toot.content, "html.parser")
|
||||
print("Open this URL: {}".format(client.auth_request_url(scopes=scopes)))
|
||||
cfg['secret'] = client.log_in(code=input("Secret: "), scopes=scopes)
|
||||
|
||||
json.dump(cfg, open("config.json", "w+"))
|
||||
|
||||
def extract_toot(toot):
|
||||
toot = toot.replace("'", "'")
|
||||
toot = toot.replace(""", '"')
|
||||
soup = BeautifulSoup(toot, "html.parser")
|
||||
|
||||
# pull the mentions out
|
||||
# for mention in soup.select("span.h-card"):
|
||||
# mention.unwrap()
|
||||
|
||||
# for mention in soup.select("a.u-url.mention"):
|
||||
# mention.unwrap()
|
||||
|
||||
# this is the code that removes all mentions
|
||||
# TODO: make it so that it removes the @ and instance but keeps the name
|
||||
for mention in soup.select("span.h-card"):
|
||||
mention.decompose()
|
||||
mention.a.unwrap()
|
||||
mention.span.unwrap()
|
||||
|
||||
# make all linebreaks actual linebreaks
|
||||
# replace <br> with linebreak
|
||||
for lb in soup.select("br"):
|
||||
lb.insert_after("\n")
|
||||
lb.decompose()
|
||||
|
||||
# make each p element its own line because sometimes they decide not to be
|
||||
# replace <p> with linebreak
|
||||
for p in soup.select("p"):
|
||||
p.insert_after("\n")
|
||||
p.unwrap()
|
||||
|
||||
# keep hashtags in the toots
|
||||
# fix hashtags
|
||||
for ht in soup.select("a.hashtag"):
|
||||
ht.unwrap()
|
||||
|
||||
# unwrap all links (i like the bots posting links)
|
||||
# fix links
|
||||
for link in soup.select("a"):
|
||||
link.insert_after(link["href"])
|
||||
link.decompose()
|
||||
|
||||
text = map(lambda a: a.strip(), soup.get_text().strip().split("\n"))
|
||||
|
||||
# next up: store this and patch markovify to take it
|
||||
# return {"text": text, "mentions": mentions, "links": links}
|
||||
# it's 4am though so we're not doing that now, but i still want the parser updates
|
||||
#todo: we split above and join now, which is dumb, but i don't wanna mess with the map code bc i don't understand it uwu
|
||||
text = "\n".join(list(text))
|
||||
text = text.replace("'", "'")
|
||||
return text
|
||||
|
||||
def get_toots(client, id, since_id):
|
||||
i = 0
|
||||
toots = client.account_statuses(id, since_id = since_id)
|
||||
while toots is not None and len(toots) > 0:
|
||||
for toot in toots:
|
||||
t = parse_toot(toot)
|
||||
if t != None:
|
||||
yield {
|
||||
"content": t,
|
||||
"id": toot.id
|
||||
}
|
||||
try:
|
||||
toots = client.fetch_next(toots)
|
||||
except TimeoutError:
|
||||
print("Operation timed out, committing to database and exiting.")
|
||||
db.commit()
|
||||
db.close()
|
||||
sys.exit(1)
|
||||
i += 1
|
||||
if i%10 == 0:
|
||||
print(i)
|
||||
toot = soup.get_text()
|
||||
toot = toot.rstrip("\n") #remove trailing newline
|
||||
toot = toot.replace("@", "@\u202B") #put a zws between @ and username to avoid mentioning
|
||||
return(toot)
|
||||
|
||||
client = Mastodon(
|
||||
client_id="clientcred.secret",
|
||||
access_token="usercred.secret",
|
||||
api_base_url=cfg['site'])
|
||||
client_id=cfg['client']['id'],
|
||||
client_secret = cfg['client']['secret'],
|
||||
access_token=cfg['secret'],
|
||||
api_base_url=cfg['site'])
|
||||
|
||||
me = client.account_verify_credentials()
|
||||
following = client.account_following(me.id)
|
||||
|
@ -105,7 +95,7 @@ following = client.account_following(me.id)
|
|||
db = sqlite3.connect("toots.db")
|
||||
db.text_factory=str
|
||||
c = db.cursor()
|
||||
c.execute("CREATE TABLE IF NOT EXISTS `toots` (id INT NOT NULL UNIQUE PRIMARY KEY, userid INT NOT NULL, content VARCHAR NOT NULL) WITHOUT ROWID")
|
||||
c.execute("CREATE TABLE IF NOT EXISTS `toots` (id INT NOT NULL UNIQUE PRIMARY KEY, userid INT NOT NULL, uri VARCHAR NOT NULL, content VARCHAR NOT NULL) WITHOUT ROWID")
|
||||
db.commit()
|
||||
|
||||
def handleCtrlC(signal, frame):
|
||||
|
@ -121,12 +111,65 @@ for f in following:
|
|||
last_toot = last_toot[0]
|
||||
else:
|
||||
last_toot = 0
|
||||
print("Downloading toots for user @{}, starting from {}".format(f.username, last_toot))
|
||||
for t in get_toots(client, f.id, last_toot):
|
||||
# try:
|
||||
c.execute("REPLACE INTO toots (id, userid, content) VALUES (?, ?, ?)", (t['id'], f.id, t['content']))
|
||||
# except:
|
||||
# pass #ignore toots that can't be encoded properly
|
||||
print("Harvesting toots for user @{}, starting from {}".format(f.acct, last_toot))
|
||||
|
||||
#find the user's activitypub outbox
|
||||
print("WebFingering...")
|
||||
instance = re.search(r"^.*@(.+)", f.acct)
|
||||
if instance == None:
|
||||
instance = re.search(r"https?:\/\/(.*)", cfg['site']).group(1)
|
||||
else:
|
||||
instance = instance.group(1)
|
||||
|
||||
# print("{} is on {}".format(f.acct, instance))
|
||||
try:
|
||||
r = requests.get("https://{}/.well-known/host-meta".format(instance))
|
||||
uri = re.search(r'template="([^"]+)"', r.text).group(1)
|
||||
uri = uri.format(uri = "{}@{}".format(f.username, instance))
|
||||
r = requests.get(uri)
|
||||
uri = r.json()['aliases'][1] #TODO: find out if it's safe to rely on this
|
||||
uri = "{}/outbox?page=true&min_id={}".format(uri, last_toot)
|
||||
r = requests.get(uri)
|
||||
j = r.json()
|
||||
except Exception:
|
||||
print("oopsy woopsy!! we made a fucky wucky!!!\n(we're probably rate limited, please hang up and try again)")
|
||||
sys.exit(1)
|
||||
|
||||
print("Downloading and parsing toots", end='', flush=True)
|
||||
current = None
|
||||
try:
|
||||
while len(j['orderedItems']) > 0:
|
||||
for oi in j['orderedItems']:
|
||||
if oi['type'] == "Create":
|
||||
# its a toost baby
|
||||
content = oi['object']['content']
|
||||
if oi['object']['summary'] != None:
|
||||
#don't download CW'd toots
|
||||
continue
|
||||
toot = extract_toot(content)
|
||||
# print(toot)
|
||||
try:
|
||||
c.execute("REPLACE INTO toots (id, userid, uri, content) VALUES (?, ?, ?, ?)",
|
||||
(re.search(r"[^\/]+$", oi['object']['id']).group(0),
|
||||
f.id,
|
||||
oi['object']['id'],
|
||||
toot
|
||||
)
|
||||
)
|
||||
pass
|
||||
except:
|
||||
pass #ignore any toots that don't go into the DB
|
||||
# sys.exit(0)
|
||||
r = requests.get(j['prev'])
|
||||
j = r.json()
|
||||
print('.', end='', flush=True)
|
||||
print(" Done!")
|
||||
db.commit()
|
||||
except:
|
||||
print("Encountered an error! Saving toots to database and exiting.")
|
||||
db.commit()
|
||||
db.close()
|
||||
sys.exit(1)
|
||||
|
||||
db.commit()
|
||||
db.execute("VACUUM") #compact db
|
||||
|
|
Reference in New Issue