From 767911757bcbd8ef2d0f895356fe7c7c82300e63 Mon Sep 17 00:00:00 2001 From: "Andi N. Fiziks" Date: Wed, 5 Dec 2018 04:15:19 +0000 Subject: [PATCH] First pass at attempting to pull login values from env vars --- .gitignore | 1 + Procfile | 3 + app.json | 17 ++- create.py | 106 ++++++++++-------- gen.py | 60 +++++----- main.py | 324 ++++++++++++++++++++++------------------------------- util.py | 97 ++++++++++++++++ 7 files changed, 335 insertions(+), 273 deletions(-) create mode 100644 Procfile create mode 100644 util.py diff --git a/.gitignore b/.gitignore index d23632f..4c3ed36 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ toots.db toots.db-journal toots.db-wal __pycache__/* +public \ No newline at end of file diff --git a/Procfile b/Procfile new file mode 100644 index 0000000..dc056c1 --- /dev/null +++ b/Procfile @@ -0,0 +1,3 @@ +generate: python gen.py +train: python main.py +web: mkdir public && cd public && python -m http.server $PORT \ No newline at end of file diff --git a/app.json b/app.json index 6f11d0e..68441ef 100644 --- a/app.json +++ b/app.json @@ -1,8 +1,13 @@ { - "name": "mstdn-ebooks", - "description": "An ebooks bot for Mastodon (and compatible) users", - "repository": "https://github.com/Lynnesbian/mstdn-ebooks", - "keywords": ["python", "mastodon"], - "website":"https://fedi.lynnesbian.space/@lynnesbian", - "image":"heroku/heroku" + "name": "mstdn-ebooks", + "description": "An ebooks bot for Mastodon (and compatible) users", + "repository": "https://github.com/Lynnesbian/mstdn-ebooks", + "keywords": ["python", "mastodon"], + "website": "https://fedi.lynnesbian.space/@lynnesbian", + "image": "heroku/heroku", + "env": { + "MASTODON_API_TOKEN": "", + "MASTODON_BASE_URL": "" + }, + "addons": ["scheduler"] } diff --git a/create.py b/create.py index caa62a3..050c16a 100755 --- a/create.py +++ b/create.py @@ -4,59 +4,67 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
import markovify -import json -import re, random, multiprocessing, time, sqlite3, shutil, os +import re +import random +import multiprocessing +import time +import sqlite3 +import shutil +import os + def make_sentence(output): - class nlt_fixed(markovify.NewlineText): - def test_sentence_input(self, sentence): - return True #all sentences are valid <3 + class nlt_fixed(markovify.NewlineText): + def test_sentence_input(self, sentence): + return True # all sentences are valid <3 - # with open("corpus.txt", encoding="utf-8") as fp: - # model = nlt_fixed(fp.read()) + # with open("corpus.txt", encoding="utf-8") as fp: + # model = nlt_fixed(fp.read()) - shutil.copyfile("toots.db", "toots-copy.db") - db = sqlite3.connect("toots-copy.db") - db.text_factory=str - c = db.cursor() - toots = c.execute("SELECT content FROM `toots`").fetchall() - toots_str = "" - for toot in toots: - toots_str += "\n{}".format(toot[0]) - model = nlt_fixed(toots_str) - toots_str = None - db.close() - os.remove("toots-copy.db") + shutil.copyfile("toots.db", "toots-copy.db") + db = sqlite3.connect("toots-copy.db") + db.text_factory = str + c = db.cursor() + toots = c.execute("SELECT content FROM `toots`").fetchall() + toots_str = "" + for toot in toots: + toots_str += "\n{}".format(toot[0]) + model = nlt_fixed(toots_str) + toots_str = None + db.close() + os.remove("toots-copy.db") - sentence = None - tries = 0 - while sentence is None and tries < 10: - sentence = model.make_short_sentence(500, tries=10000) - tries = tries + 1 - sentence = re.sub("^@\u202B[^ ]* ", "", sentence) - output.send(sentence) + sentence = None + tries = 0 + while sentence is None and tries < 10: + sentence = model.make_short_sentence(500, tries=10000) + tries = tries + 1 + sentence = re.sub("^@\u202B[^ ]* ", "", sentence) + output.send(sentence) -def make_toot(force_markov = False, args = None): - return make_toot_markov() -def make_toot_markov(query = None): - tries = 0 - toot = None - while toot == None and tries < 25: - pin, pout = multiprocessing.Pipe(False) - p = multiprocessing.Process(target = make_sentence, args = [pout]) - p.start() - p.join(10) - if p.is_alive(): - p.terminate() - p.join() - toot = None - tries = tries + 1 - else: - toot = pin.recv() - if toot == None: - toot = "Toot generation failed! Contact Lynne for assistance." - return { - "toot":toot, - "media":None - } +def make_toot(force_markov=False, args=None): + return make_toot_markov() + + +def make_toot_markov(query=None): + tries = 0 + toot = None + while toot == None and tries < 25: + pin, pout = multiprocessing.Pipe(False) + p = multiprocessing.Process(target=make_sentence, args=[pout]) + p.start() + p.join(10) + if p.is_alive(): + p.terminate() + p.join() + toot = None + tries = tries + 1 + else: + toot = pin.recv() + if toot == None: + toot = "Toot generation failed! Contact Lynne for assistance." + return { + "toot": toot, + "media": None + } diff --git a/gen.py b/gen.py index a07dcce..8c7e72c 100755 --- a/gen.py +++ b/gen.py @@ -4,41 +4,47 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
from mastodon import Mastodon -import argparse, sys, traceback, json +import argparse +import sys +import traceback +import json import create +from util import get_config parser = argparse.ArgumentParser(description='Generate and post a toot.') -parser.add_argument('reply', metavar='reply', type=str, nargs='?', - help='ID of the status to reply to') +parser.add_argument('reply', metavar='reply', type=str, nargs='?', + help='ID of the status to reply to') parser.add_argument('-s', '--simulate', dest='simulate', action='store_true', - help="Print the toot to stdout without posting it") + help="Print the toot to stdout without posting it") args = parser.parse_args() -cfg = json.load(open('config.json')) +cfg = get_config() client = Mastodon( - client_id=cfg['client']['id'], - client_secret=cfg['client']['secret'], - access_token=cfg['secret'], - api_base_url=cfg['site']) + access_token=cfg['secret'], + api_base_url=cfg['site']) if __name__ == '__main__': - toot = create.make_toot() - if not args.simulate: - try: - if toot['media'] != None: - mediaID = client.media_post(toot['media'], description = toot['toot']) - client.status_post(toot['toot'].replace("\n", " "), - media_ids = [mediaID], visibility = "unlisted", spoiler_text = cfg['cw']) - else: - client.status_post(toot['toot'], visibility = 'unlisted', spoiler_text = cfg['cw']) - except Exception as err: - toot = { - "toot": - "Mistress @lynnesbian@fedi.lynnesbian.space, something has gone terribly" \ - + " wrong! While attempting to post a toot, I received the following" \ - + " error:\n" + "\n".join(traceback.format_tb(sys.exc_info()[2])) - } - client.status_post(toot['toot'], visibility = 'unlisted', spoiler_text = "Error!") - print(toot['toot']) + toot = create.make_toot() + if not args.simulate: + try: + if toot['media'] != None: + mediaID = client.media_post( + toot['media'], description=toot['toot']) + client.status_post(toot['toot'].replace("\n", " "), + media_ids=[mediaID], visibility="unlisted", spoiler_text=cfg['cw']) + else: + client.status_post( + toot['toot'], visibility='unlisted', spoiler_text=cfg['cw']) + except Exception as err: + toot = { + "toot": + "Mistress @lynnesbian@fedi.lynnesbian.space, something has gone terribly" + + " wrong! 
While attempting to post a toot, I received the following" + + " error:\n" + + "\n".join(traceback.format_tb(sys.exc_info()[2])) + } + client.status_post( + toot['toot'], visibility='unlisted', spoiler_text="Error!") + print(toot['toot']) diff --git a/main.py b/main.py index b167a8e..00b14af 100755 --- a/main.py +++ b/main.py @@ -6,223 +6,165 @@ from mastodon import Mastodon from os import path -from bs4 import BeautifulSoup -import os, sqlite3, signal, sys, json, re +import os +import sqlite3 +import signal +import sys +import json +import re import requests +from util import get_config, extract_toot -scopes = ["read:statuses", "read:accounts", "read:follows", "write:statuses", "read:notifications"] -cfg = json.load(open('config.json', 'r')) - -if os.path.exists("clientcred.secret"): - print("Upgrading to new storage method") - cc = open("clientcred.secret").read().split("\n") - cfg['client'] = { - "id": cc[0], - "secret": cc[1] - } - cfg['secret'] = open("usercred.secret").read().rstrip("\n") - os.remove("clientcred.secret") - os.remove("usercred.secret") - - -if "client" not in cfg: - print("No client credentials, registering application") - client_id, client_secret = Mastodon.create_app("mstdn-ebooks", - api_base_url=cfg['site'], - scopes=scopes, - website="https://github.com/Lynnesbian/mstdn-ebooks") - - cfg['client'] = { - "id": client_id, - "secret": client_secret - } - -if "secret" not in cfg: - print("No user credentials, logging in") - client = Mastodon(client_id = cfg['client']['id'], - client_secret = cfg['client']['secret'], - api_base_url=cfg['site']) - - print("Open this URL: {}".format(client.auth_request_url(scopes=scopes))) - cfg['secret'] = client.log_in(code=input("Secret: "), scopes=scopes) - -json.dump(cfg, open("config.json", "w+")) - -def extract_toot(toot): - toot = toot.replace("'", "'") - toot = toot.replace(""", '"') - soup = BeautifulSoup(toot, "html.parser") - - # this is the code that removes all mentions - # TODO: make it so that it removes the @ and instance but keeps the name - for mention in soup.select("span.h-card"): - mention.a.unwrap() - mention.span.unwrap() - - # replace
<br> with linebreak
-    for lb in soup.select("br"):
-        lb.insert_after("\n")
-        lb.decompose()
-
-    # replace <p> </p>
with linebreak - for p in soup.select("p"): - p.insert_after("\n") - p.unwrap() - - # fix hashtags - for ht in soup.select("a.hashtag"): - ht.unwrap() - - # fix links - for link in soup.select("a"): - link.insert_after(link["href"]) - link.decompose() - - toot = soup.get_text() - toot = toot.rstrip("\n") #remove trailing newline - toot = toot.replace("@", "@\u200B") #put a zws between @ and username to avoid mentioning - return(toot) +cfg = get_config() client = Mastodon( - client_id=cfg['client']['id'], - client_secret = cfg['client']['secret'], - access_token=cfg['secret'], - api_base_url=cfg['site']) + access_token=cfg['secret'], + api_base_url=cfg['site']) me = client.account_verify_credentials() following = client.account_following(me.id) db = sqlite3.connect("toots.db") -db.text_factory=str +db.text_factory = str c = db.cursor() c.execute("CREATE TABLE IF NOT EXISTS `toots` (id INT NOT NULL UNIQUE PRIMARY KEY, userid INT NOT NULL, uri VARCHAR NOT NULL, content VARCHAR NOT NULL) WITHOUT ROWID") db.commit() + def handleCtrlC(signal, frame): - print("\nPREMATURE EVACUATION - Saving chunks") - db.commit() - sys.exit(1) + print("\nPREMATURE EVACUATION - Saving chunks") + db.commit() + sys.exit(1) + signal.signal(signal.SIGINT, handleCtrlC) + def get_toots_legacy(client, id): - i = 0 - toots = client.account_statuses(id) - while toots is not None and len(toots) > 0: - for toot in toots: - if toot.spoiler_text != "": continue - if toot.reblog is not None: continue - if toot.visibility not in ["public", "unlisted"]: continue - t = extract_toot(toot.content) - if t != None: - yield { - "toot": t, - "id": toot.id, - "uri": toot.uri - } - toots = client.fetch_next(toots) - i += 1 - if i%20 == 0: - print('.', end='', flush=True) + i = 0 + toots = client.account_statuses(id) + while toots is not None and len(toots) > 0: + for toot in toots: + if toot.spoiler_text != "": + continue + if toot.reblog is not None: + continue + if toot.visibility not in ["public", "unlisted"]: + continue + t = extract_toot(toot.content) + if t != None: + yield { + "toot": t, + "id": toot.id, + "uri": toot.uri + } + toots = client.fetch_next(toots) + i += 1 + if i % 20 == 0: + print('.', end='', flush=True) + for f in following: - last_toot = c.execute("SELECT id FROM `toots` WHERE userid LIKE ? ORDER BY id DESC LIMIT 1", (f.id,)).fetchone() - if last_toot != None: - last_toot = last_toot[0] - else: - last_toot = 0 - print("Harvesting toots for user @{}, starting from {}".format(f.acct, last_toot)) + last_toot = c.execute( + "SELECT id FROM `toots` WHERE userid LIKE ? 
ORDER BY id DESC LIMIT 1", (f.id,)).fetchone() + if last_toot != None: + last_toot = last_toot[0] + else: + last_toot = 0 + print("Harvesting toots for user @{}, starting from {}".format(f.acct, last_toot)) - #find the user's activitypub outbox - print("WebFingering...") - instance = re.search(r"^.*@(.+)", f.acct) - if instance == None: - instance = re.search(r"https?:\/\/(.*)", cfg['site']).group(1) - else: - instance = instance.group(1) + # find the user's activitypub outbox + print("WebFingering...") + instance = re.search(r"^.*@(.+)", f.acct) + if instance == None: + instance = re.search(r"https?:\/\/(.*)", cfg['site']).group(1) + else: + instance = instance.group(1) - if instance == "bofa.lol": - print("rest in piece bofa, skipping") - continue - - # print("{} is on {}".format(f.acct, instance)) - try: - r = requests.get("https://{}/.well-known/host-meta".format(instance), timeout=10) - uri = re.search(r'template="([^"]+)"', r.text).group(1) - uri = uri.format(uri = "{}@{}".format(f.username, instance)) - r = requests.get(uri, headers={"Accept": "application/json"}, timeout=10) - j = r.json() - if len(j['aliases']) == 1: #TODO: this is a hack on top of a hack, fix it - uri = j['aliases'][0] - else: - uri = j['aliases'][1] - uri = "{}/outbox?page=true".format(uri) - r = requests.get(uri, timeout=10) - j = r.json() - except Exception: - print("oopsy woopsy!! we made a fucky wucky!!!\n(we're probably rate limited, please hang up and try again)") - sys.exit(1) + if instance == "bofa.lol": + print("rest in piece bofa, skipping") + continue - pleroma = False - if 'first' in j and type(j['first']) != str: - print("Pleroma instance detected") - pleroma = True - j = j['first'] - else: - print("Mastodon instance detected") - uri = "{}&min_id={}".format(uri, last_toot) - r = requests.get(uri) - j = r.json() + # print("{} is on {}".format(f.acct, instance)) + try: + r = requests.get( + "https://{}/.well-known/host-meta".format(instance), timeout=10) + uri = re.search(r'template="([^"]+)"', r.text).group(1) + uri = uri.format(uri="{}@{}".format(f.username, instance)) + r = requests.get( + uri, headers={"Accept": "application/json"}, timeout=10) + j = r.json() + if len(j['aliases']) == 1: # TODO: this is a hack on top of a hack, fix it + uri = j['aliases'][0] + else: + uri = j['aliases'][1] + uri = "{}/outbox?page=true".format(uri) + r = requests.get(uri, timeout=10) + j = r.json() + except Exception: + print("oopsy woopsy!! we made a fucky wucky!!!\n(we're probably rate limited, please hang up and try again)") + sys.exit(1) - print("Downloading and parsing toots", end='', flush=True) - done = False - try: - while not done and len(j['orderedItems']) > 0: - for oi in j['orderedItems']: - if oi['type'] != "Create": - continue #not a toost. 
fuck outta here - - # its a toost baby - content = oi['object']['content'] - if oi['object']['summary'] != None: - #don't download CW'd toots - continue - toot = extract_toot(content) - # print(toot) - try: - if pleroma: - if c.execute("SELECT COUNT(*) FROM toots WHERE id LIKE ?", (oi['object']['id'],)).fetchone()[0] > 0: - #we've caught up to the notices we've already downloaded, so we can stop now - done = True - break - pid = re.search(r"[^\/]+$", oi['object']['id']).group(0) - c.execute("REPLACE INTO toots (id, userid, uri, content) VALUES (?, ?, ?, ?)", - (pid, - f.id, - oi['object']['id'], - toot - ) - ) - pass - except: - pass #ignore any toots that don't successfully go into the DB - # sys.exit(0) - if not pleroma: - r = requests.get(j['prev'], timeout=15) - else: - r = requests.get(j['next'], timeout=15) - j = r.json() - print('.', end='', flush=True) - print(" Done!") - db.commit() - except: - print("Encountered an error! Saving toots to database and continuing.") - db.commit() - # db.close() + pleroma = False + if 'first' in j and type(j['first']) != str: + print("Pleroma instance detected") + pleroma = True + j = j['first'] + else: + print("Mastodon instance detected") + uri = "{}&min_id={}".format(uri, last_toot) + r = requests.get(uri) + j = r.json() + + print("Downloading and parsing toots", end='', flush=True) + done = False + try: + while not done and len(j['orderedItems']) > 0: + for oi in j['orderedItems']: + if oi['type'] != "Create": + continue # not a toost. fuck outta here + + # its a toost baby + content = oi['object']['content'] + if oi['object']['summary'] != None: + # don't download CW'd toots + continue + toot = extract_toot(content) + # print(toot) + try: + if pleroma: + if c.execute("SELECT COUNT(*) FROM toots WHERE id LIKE ?", (oi['object']['id'],)).fetchone()[0] > 0: + # we've caught up to the notices we've already downloaded, so we can stop now + done = True + break + pid = re.search(r"[^\/]+$", oi['object']['id']).group(0) + c.execute("REPLACE INTO toots (id, userid, uri, content) VALUES (?, ?, ?, ?)", + (pid, + f.id, + oi['object']['id'], + toot + ) + ) + pass + except: + pass # ignore any toots that don't successfully go into the DB + # sys.exit(0) + if not pleroma: + r = requests.get(j['prev'], timeout=15) + else: + r = requests.get(j['next'], timeout=15) + j = r.json() + print('.', end='', flush=True) + print(" Done!") + db.commit() + except: + print("Encountered an error! 
Saving toots to database and continuing.")
+        db.commit()
+        # db.close()
 
 print("Done!")
 db.commit()
-db.execute("VACUUM") #compact db
+db.execute("VACUUM")  # compact db
 db.commit()
-db.close()
\ No newline at end of file
+db.close()
diff --git a/util.py b/util.py
new file mode 100644
index 0000000..8ebcbfc
--- /dev/null
+++ b/util.py
@@ -0,0 +1,98 @@
+"""
+Various utility tools
+"""
+
+import json
+import os
+from mastodon import Mastodon
+from bs4 import BeautifulSoup
+
+
+def get_config():
+    access_token = os.getenv("MASTODON_API_TOKEN")
+    api_base_url = os.getenv("MASTODON_API_BASE_URL")
+
+    if (access_token and api_base_url):  # Heroku mode; use preset token
+        return {
+            "secret": access_token,
+            "site": api_base_url,
+            "is_heroku": True
+        }
+    else:  # Local mode; do OAuth login dance
+        scopes = ["read:statuses", "read:accounts",
+                  "read:follows", "write:statuses", "read:notifications"]
+        cfg = json.load(open('config.json', 'r'))
+
+        if os.path.exists("clientcred.secret"):
+            print("Upgrading to new storage method")
+            cc = open("clientcred.secret").read().split("\n")
+            cfg['client'] = {
+                "id": cc[0],
+                "secret": cc[1]
+            }
+            cfg['secret'] = open("usercred.secret").read().rstrip("\n")
+            os.remove("clientcred.secret")
+            os.remove("usercred.secret")
+
+        if "client" not in cfg:
+            print("No client credentials, registering application")
+            client_id, client_secret = Mastodon.create_app("mstdn-ebooks",
+                                                           api_base_url=cfg['site'],
+                                                           scopes=scopes,
+                                                           website="https://github.com/Lynnesbian/mstdn-ebooks")
+
+            cfg['client'] = {
+                "id": client_id,
+                "secret": client_secret
+            }
+
+        if "secret" not in cfg:
+            print("No user credentials, logging in")
+            client = Mastodon(client_id=cfg['client']['id'],
+                              client_secret=cfg['client']['secret'],
+                              api_base_url=cfg['site'])
+
+            print("Open this URL: {}".format(
+                client.auth_request_url(scopes=scopes)))
+            cfg['secret'] = client.log_in(
+                code=input("Secret: "), scopes=scopes)
+
+        json.dump(cfg, open("config.json", "w+"))
+        return cfg  # local mode must also hand the config back to callers
+
+
+def extract_toot(toot):
+    toot = toot.replace("&apos;", "'")
+    toot = toot.replace("&quot;", '"')
+    soup = BeautifulSoup(toot, "html.parser")
+
+    # this is the code that removes all mentions
+    # TODO: make it so that it removes the @ and instance but keeps the name
+    for mention in soup.select("span.h-card"):
+        mention.a.unwrap()
+        mention.span.unwrap()
+
+    # replace <br> with linebreak
+    for lb in soup.select("br"):
+        lb.insert_after("\n")
+        lb.decompose()
+
+    # replace <p> </p>
with linebreak + for p in soup.select("p"): + p.insert_after("\n") + p.unwrap() + + # fix hashtags + for ht in soup.select("a.hashtag"): + ht.unwrap() + + # fix links + for link in soup.select("a"): + link.insert_after(link["href"]) + link.decompose() + + toot = soup.get_text() + toot = toot.rstrip("\n") # remove trailing newline + # put a zws between @ and username to avoid mentioning + toot = toot.replace("@", "@\u200B") + return(toot)
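
For reference, a minimal sketch of how the new env-var path in util.get_config() is expected to behave, assuming the variable names the code reads (MASTODON_API_TOKEN and MASTODON_API_BASE_URL; app.json currently declares MASTODON_BASE_URL instead, so the names may still need reconciling). The values here are placeholders, not a recommended setup:

import os

# Hypothetical smoke test for the env-var branch of get_config().
# With both variables set, get_config() should return the preset credentials
# instead of starting the interactive OAuth login dance.
os.environ["MASTODON_API_TOKEN"] = "example-access-token"       # placeholder
os.environ["MASTODON_API_BASE_URL"] = "https://example.social"  # placeholder

from util import get_config  # needs Mastodon.py and beautifulsoup4 installed

cfg = get_config()
assert cfg["is_heroku"] is True
print(cfg["site"])  # -> https://example.social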