amber-ebooks-archived/main.py

#!/usr/bin/env python3
# toot downloader version two!!
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

from mastodon import Mastodon
from os import path
import os
import sqlite3
import signal
import sys
import json
import re
import requests
from util import get_config, extract_toot

# Load the bot's settings; 'secret' is used as the API access token and
# 'site' as the base URL of the home instance.
cfg = get_config()

client = Mastodon(
    access_token=cfg['secret'],
    api_base_url=cfg['site'])

me = client.account_verify_credentials()
# account_following() is a paginated endpoint and only returns the first page
# of results; fetch_remaining() walks the remaining pages so that every
# followed account is harvested, not just the first batch.
following = client.fetch_remaining(client.account_following(me.id))

# Local archive of harvested toots. `id` is the toot's own identifier,
# `userid` the id of the followed account it belongs to, `uri` the full
# object URI and `content` the text returned by extract_toot().
db = sqlite3.connect("toots.db")
db.text_factory = str
c = db.cursor()
c.execute("CREATE TABLE IF NOT EXISTS `toots` (id INT NOT NULL UNIQUE PRIMARY KEY, userid INT NOT NULL, uri VARCHAR NOT NULL, content VARCHAR NOT NULL) WITHOUT ROWID")
db.commit()


def handleCtrlC(signal, frame):
    """Handle SIGINT: commit whatever has been written so far, then exit.

    Both arguments are supplied by the signal machinery and are unused here.
    Note that the `signal` parameter shadows the `signal` module inside this
    function.

    Raises:
        SystemExit: always, with exit status 1.
    """
    print("\nPREMATURE EVACUATION - Saving chunks")
    # Persist the rows inserted since the last commit before bailing out.
    db.commit()
    raise SystemExit(1)


# Install handleCtrlC as the SIGINT (Ctrl-C) handler so an interrupted run
# still commits the toots downloaded so far before exiting.
signal.signal(signal.SIGINT, handleCtrlC)


def get_toots_legacy(client, id):
    """Yield an account's harvestable toots via the client's statuses API.

    Walks every page returned by ``client.account_statuses(id)`` /
    ``client.fetch_next(...)`` and skips toots that carry a content warning,
    are boosts, or are not public/unlisted. Toots for which ``extract_toot``
    returns ``None`` are skipped as well.

    Args:
        client: API client exposing ``account_statuses`` and ``fetch_next``.
        id: identifier of the account whose statuses are fetched.

    Yields:
        dict with keys ``"toot"`` (extracted text), ``"id"`` and ``"uri"``.
    """
    pages_seen = 0
    toots = client.account_statuses(id)
    while toots is not None and len(toots) > 0:
        for toot in toots:
            if toot.spoiler_text != "":
                continue
            if toot.reblog is not None:
                continue
            if toot.visibility not in ["public", "unlisted"]:
                continue
            t = extract_toot(toot.content)
            if t is not None:
                yield {
                    "toot": t,
                    "id": toot.id,
                    "uri": toot.uri
                }
        # Advance exactly once per page. Doing this inside the per-toot loop
        # skipped a page for every accepted toot and never advanced at all
        # (infinite loop) when every toot on a page was filtered out.
        toots = client.fetch_next(toots)
        pages_seen += 1
        if pages_seen % 20 == 0:
            # Progress indicator: one dot per 20 pages fetched.
            print('.', end='', flush=True)


# Shown whenever a request needed to locate an account's outbox fails.
FETCH_FAILED_MESSAGE = "oopsy woopsy!! we made a fucky wucky!!!\n(we're probably rate limited, please hang up and try again)"

# Harvest new toots for every followed account by reading its ActivityPub
# outbox directly, and store them in the `toots` table.
for f in following:
    # Newest toot id already stored for this account (0 if none yet).
    row = c.execute(
        "SELECT id FROM `toots` WHERE userid = ? ORDER BY id DESC LIMIT 1", (f.id,)).fetchone()
    last_toot = row[0] if row is not None else 0
    print("Harvesting toots for user @{}, starting from {}".format(f.acct, last_toot))

    # find the user's activitypub outbox
    print("WebFingering...")
    # Remote accounts look like "user@host"; local ones have no "@host" part,
    # in which case the host is taken from the configured site URL.
    remote = re.search(r"^.*@(.+)", f.acct)
    if remote is None:
        instance = re.search(r"https?:\/\/(.*)", cfg['site']).group(1)
    else:
        instance = remote.group(1)

    if instance == "bofa.lol":
        print("rest in piece bofa, skipping")
        continue

    try:
        # host-meta gives the WebFinger URL template for this instance.
        r = requests.get(
            "https://{}/.well-known/host-meta".format(instance), timeout=10)
        uri = re.search(r'template="([^"]+)"', r.text).group(1)
        uri = uri.format(uri="{}@{}".format(f.username, instance))
        r = requests.get(
            uri, headers={"Accept": "application/json"}, timeout=10)
        j = r.json()
        if len(j['aliases']) == 1:  # TODO: this is a hack on top of a hack, fix it
            uri = j['aliases'][0]
        else:
            uri = j['aliases'][1]
        uri = "{}/outbox?page=true".format(uri)
        r = requests.get(uri, timeout=10)
        j = r.json()
    except Exception:
        print(FETCH_FAILED_MESSAGE)
        sys.exit(1)

    # An outbox whose 'first' entry is an embedded object (not a URL string)
    # is treated as Pleroma; anything else is treated as Mastodon.
    pleroma = 'first' in j and not isinstance(j['first'], str)
    if pleroma:
        print("Pleroma instance detected")
        j = j['first']
    else:
        print("Mastodon instance detected")
        uri = "{}&min_id={}".format(uri, last_toot)
        try:
            # Same timeout and failure handling as the requests above, so a
            # stalled server cannot hang the run or crash it with a traceback.
            r = requests.get(uri, timeout=10)
            j = r.json()
        except Exception:
            print(FETCH_FAILED_MESSAGE)
            sys.exit(1)

    print("Downloading and parsing toots", end='', flush=True)
    done = False
    # `except Exception` (not a bare `except`) is used below so that the
    # SystemExit raised by the Ctrl-C handler is not swallowed.
    try:
        while not done and len(j['orderedItems']) > 0:
            for oi in j['orderedItems']:
                obj = oi.get('object')
                if oi.get('type') != "Create" or not isinstance(obj, dict):
                    continue  # not a newly created post with an embedded object

                if obj.get('summary'):
                    # don't download CW'd toots; an empty or missing summary
                    # means there is no content warning
                    continue
                toot = extract_toot(obj['content'])
                if toot is None:
                    continue  # nothing usable was extracted from this post
                try:
                    if pleroma:
                        # Compare against `uri`, which stores the full object
                        # id; the `id` column only holds its last path segment.
                        if c.execute("SELECT COUNT(*) FROM toots WHERE uri = ?", (obj['id'],)).fetchone()[0] > 0:
                            # we've caught up to the notices we've already downloaded, so we can stop now
                            done = True
                            break
                    pid = re.search(r"[^\/]+$", obj['id']).group(0)
                    c.execute("REPLACE INTO toots (id, userid, uri, content) VALUES (?, ?, ?, ?)",
                              (pid,
                               f.id,
                               obj['id'],
                               toot
                               )
                              )
                except (sqlite3.Error, KeyError, TypeError, AttributeError):
                    pass  # ignore any toots that don't successfully go into the DB
            if done:
                break
            # The Mastodon branch pages via 'prev', the Pleroma branch via 'next'.
            next_page = j.get('next' if pleroma else 'prev')
            if not next_page:
                break  # no further page advertised: finished normally
            r = requests.get(next_page, timeout=15)
            j = r.json()
            print('.', end='', flush=True)
        print(" Done!")
    except Exception:
        print("Encountered an error! Saving toots to database and continuing.")
    finally:
        # Keep whatever was inserted for this account, on success or failure.
        db.commit()

# All followed accounts have been processed: persist any remaining changes,
# compact the database file, and close the connection.
print("Done!")

db.commit()  # VACUUM is run only after pending changes are committed
db.execute("VACUUM")  # compact db
db.commit()
db.close()