added extract_toot function to functions.py

This commit is contained in:
Lynne 2019-01-11 22:55:31 +10:00
parent 85fec32c83
commit de3449ae56
No known key found for this signature in database
GPG Key ID: FB7B970303ACE499
1 changed file with 23 additions and 0 deletions

View File

@ -60,3 +60,26 @@ def make_toot_markov(query = None):
"toot": toot, "toot": toot,
"media": None "media": None
} }
def extract_toot(toot):
    """Convert a toot's HTML content into plain text.

    Line breaks and paragraphs become newlines, hashtag links are reduced to
    their visible text, and every other link is replaced by its ``href``
    target. Profile URLs of the form ``https://host/@user`` (Mastodon-style)
    and ``https://host/users/user`` (Pleroma-style) are then rewritten as
    ``@user@host`` mentions.

    Args:
        toot: HTML markup of the toot.

    Returns:
        The plain-text rendering, with trailing newlines removed.
    """
    soup = BeautifulSoup(toot, "html.parser")

    for lb in soup.select("br"):  # replace <br> with a linebreak
        lb.insert_after("\n")
        lb.decompose()

    for p in soup.select("p"):  # ditto for <p>: keep the content, add a newline
        p.insert_after("\n")
        p.unwrap()

    for ht in soup.select("a.hashtag"):  # make hashtags plain text, not links
        ht.unwrap()

    # Convert <a href='https://example.com'>example.com</a> to just
    # https://example.com. Hashtag anchors were unwrapped above, so they are
    # no longer selected here.
    for link in soup.select("a"):
        href = link.get("href")
        if href is None:
            # An anchor without a target has no URL to substitute; keep its
            # text instead of raising KeyError.
            link.unwrap()
            continue
        link.insert_after(href)
        link.decompose()

    text = soup.get_text()

    # The patterns exclude all whitespace (not just spaces) so that a mention
    # at the end of a line does not swallow the start of the next line.
    # Put mastodon-style mentions back in.
    text = re.sub(r"https://([^/\s]+)/(@\S+)", r"\2@\1", text)
    # Put pleroma-style mentions back in.
    text = re.sub(r"https://([^/\s]+)/users/(\S+)", r"@\2@\1", text)

    return text.rstrip("\n")  # remove trailing newlines