171 lines
5.5 KiB
Python
Executable File
171 lines
5.5 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# toot downloader version two!!
|
|
# This Source Code Form is subject to the terms of the Mozilla Public
|
|
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
|
|
from mastodon import Mastodon
|
|
from os import path
|
|
import os
|
|
import sqlite3
|
|
import signal
|
|
import sys
|
|
import json
|
|
import re
|
|
import requests
|
|
from util import get_config, extract_toot
|
|
|
|
# Load the bot's settings and open an authenticated API client from the
# 'secret' (access token) and 'site' (instance base URL) entries.
cfg = get_config()
client = Mastodon(access_token=cfg['secret'], api_base_url=cfg['site'])

# Look up the bot's own account and the accounts it follows; the harvest
# loop further down iterates over `following`.
me = client.account_verify_credentials()
following = client.account_following(me.id)

# Open the local toot archive and make sure its single table exists.
db = sqlite3.connect("toots.db")
db.text_factory = str
c = db.cursor()
c.execute(
    "CREATE TABLE IF NOT EXISTS `toots` ("
    "id INT NOT NULL UNIQUE PRIMARY KEY, "
    "userid INT NOT NULL, "
    "uri VARCHAR NOT NULL, "
    "content VARCHAR NOT NULL"
    ") WITHOUT ROWID"
)
db.commit()
|
|
|
|
|
|
def handleCtrlC(signal, frame):
    """Handle SIGINT: commit pending rows to the database, then exit.

    Both arguments are passed in by the signal machinery and are not used.
    Inside this function the ``signal`` parameter shadows the ``signal``
    module.  The process ends with exit status 1.
    """
    print("\nPREMATURE EVACUATION - Saving chunks")
    db.commit()
    raise SystemExit(1)


# Make Ctrl-C go through the handler above instead of the default
# KeyboardInterrupt.
signal.signal(signal.SIGINT, handleCtrlC)
|
|
|
|
|
|
def get_toots_legacy(client, id):
    """Yield the usable toots of one account, fetched through the client API.

    Starts from ``client.account_statuses(id)`` and keeps requesting further
    pages with ``client.fetch_next`` until a page is empty or ``None``.
    A toot is skipped when it has a non-empty ``spoiler_text``, is a reblog,
    has a visibility other than ``public``/``unlisted``, or when
    ``extract_toot`` returns ``None`` for its content.

    Args:
        client: API client exposing ``account_statuses`` and ``fetch_next``.
        id: Account id handed to ``account_statuses``.  The name shadows the
            builtin but is kept so existing callers keep working.

    Yields:
        dict: ``{"toot": <extracted text>, "id": <toot id>, "uri": <toot uri>}``.
    """
    pages_fetched = 0
    toots = client.account_statuses(id)
    # A missing or empty page marks the end of the account's history.
    while toots:
        for toot in toots:
            if toot.spoiler_text != "":
                continue
            if toot.reblog is not None:
                continue
            if toot.visibility not in ("public", "unlisted"):
                continue
            t = extract_toot(toot.content)
            if t is not None:
                yield {
                    "toot": t,
                    "id": toot.id,
                    "uri": toot.uri,
                }
        toots = client.fetch_next(toots)
        pages_fetched += 1
        # Progress indicator: one dot for every 20 pages fetched.
        if pages_fetched % 20 == 0:
            print('.', end='', flush=True)
|
|
|
|
|
|
# Harvest new toots for every followed account by walking the account's
# ActivityPub outbox, then compact the database.
for f in following:
    # Resume from the highest toot id already stored for this account
    # (0 when nothing has been stored yet).
    newest_row = c.execute(
        "SELECT id FROM `toots` WHERE userid = ? ORDER BY id DESC LIMIT 1",
        (f.id,)).fetchone()
    last_toot = newest_row[0] if newest_row is not None else 0
    print("Harvesting toots for user @{}, starting from {}".format(f.acct, last_toot))

    # find the user's activitypub outbox
    print("WebFingering...")
    instance = re.search(r"^.*@(.+)", f.acct)
    if instance is None:
        # No "@host" part in the handle: use the host of our own instance.
        instance = re.search(r"https?:\/\/(.*)", cfg['site']).group(1)
    else:
        instance = instance.group(1)

    if instance == "bofa.lol":
        print("rest in piece bofa, skipping")
        continue

    try:
        # host-meta supplies the WebFinger URL template for this instance.
        r = requests.get(
            "https://{}/.well-known/host-meta".format(instance), timeout=10)
        uri = re.search(r'template="([^"]+)"', r.text).group(1)
        uri = uri.format(uri="{}@{}".format(f.username, instance))
        r = requests.get(
            uri, headers={"Accept": "application/json"}, timeout=10)
        j = r.json()
        aliases = j['aliases']
        # TODO: this is a hack on top of a hack, fix it
        uri = aliases[0] if len(aliases) == 1 else aliases[1]
        uri = "{}/outbox?page=true".format(uri)
        r = requests.get(uri, timeout=10)
        j = r.json()
    except Exception:
        print("oopsy woopsy!! we made a fucky wucky!!!\n(we're probably rate limited, please hang up and try again)")
        sys.exit(1)

    # The script tells the two server flavours apart by the outbox shape: an
    # embedded (non-string) 'first' page is treated as Pleroma.
    pleroma = False
    if 'first' in j and not isinstance(j['first'], str):
        print("Pleroma instance detected")
        pleroma = True
        j = j['first']
    else:
        print("Mastodon instance detected")
        uri = "{}&min_id={}".format(uri, last_toot)
        # Same timeout as the other discovery requests, so a stalled server
        # cannot hang the whole run.
        r = requests.get(uri, timeout=10)
        j = r.json()

    print("Downloading and parsing toots", end='', flush=True)
    done = False
    # Mastodon pages are followed through 'prev', Pleroma pages through 'next'.
    page_link = 'next' if pleroma else 'prev'
    try:
        while not done and j.get('orderedItems'):
            for oi in j['orderedItems']:
                if oi['type'] != "Create":
                    continue  # only "Create" activities carry a toot

                obj = oi['object']
                # don't download CW'd toots. An empty summary counts as "no
                # CW", matching the spoiler_text != "" test used by
                # get_toots_legacy.
                if obj.get('summary'):
                    continue

                toot = extract_toot(obj['content'])
                if toot is None:
                    # Nothing to store; the content column is NOT NULL.
                    continue

                try:
                    if pleroma:
                        # The full object id is stored in the `uri` column
                        # (the `id` column only holds its last path segment),
                        # so that is the column to look it up in.
                        already_stored = c.execute(
                            "SELECT COUNT(*) FROM toots WHERE uri = ?",
                            (obj['id'],)).fetchone()[0]
                        if already_stored > 0:
                            # we've caught up to the notices we've already
                            # downloaded, so we can stop now
                            done = True
                            break
                    pid = re.search(r"[^\/]+$", obj['id']).group(0)
                    c.execute(
                        "REPLACE INTO toots (id, userid, uri, content) VALUES (?, ?, ?, ?)",
                        (pid, f.id, obj['id'], toot))
                except Exception:
                    # Best effort: ignore any toot that doesn't go into the
                    # DB.  `Exception` rather than a bare except, so the
                    # SystemExit raised by the Ctrl-C handler still ends the
                    # script.
                    pass

            if done:
                break
            next_page = j.get(page_link)
            if next_page is None:
                # No further page is linked: treat it as the end of the outbox.
                break
            r = requests.get(next_page, timeout=15)
            j = r.json()
            print('.', end='', flush=True)
        print(" Done!")
        db.commit()
    except Exception:
        # Keep what was collected for this account and move on to the next.
        print("Encountered an error! Saving toots to database and continuing.")
        db.commit()

print("Done!")

db.commit()
db.execute("VACUUM")  # compact db
db.commit()
db.close()
|