Make HTML documents sexier

This commit is contained in:
blank X 2020-09-04 17:20:09 +07:00
parent b3ee8bcece
commit 5fe3809c4e
1 changed file with 41 additions and 6 deletions

View File

@ -15,6 +15,7 @@ import yaml
import praw
import aiohttp
import aiocron
from bs4 import BeautifulSoup
from telethon import TelegramClient, events
from telethon.utils import chunks
@ -189,7 +190,8 @@ async def main():
splitted = os.path.splitext(parsed[2])
domain = getattr(random_post, 'domain', parsed[1])
preview = getattr(random_post, 'preview', None)
if domain.endswith('imgur.com'):
if domain.endswith(
'imgur.com'):
parsed[1] = 'i.imgur.com'
if parsed[2].startswith('/a/'):
albumid = os.path.split(parsed[2])[1]
@ -204,11 +206,16 @@ async def main():
files = []
captions = []
for a, i in enumerate(apidata['images']):
to_append = f'{text}\n#{a + 1}'
to_append = f'#{a + 1}'
desc = i['description']
if desc:
to_append += ': ' + html.escape(desc)
captions.append(to_append)
to_append += ': ' + desc.strip()
caplength = 2047 - len(client.parse_mode.parse(text)[0])
captext = to_append[:caplength]
if len(captext) >= caplength:
captext = captext[:-1]
captext += ''
captions.append(text + '\n' + html.escape(captext))
filename = os.path.join(tempdir, str(time.time()))
await _download_file(filename, f'https://i.imgur.com/{i["hash"]}{i["ext"]}')
files.append(filename)
@ -243,8 +250,36 @@ async def main():
await _download_file(filename, url)
ext = await _get_file_ext(filename)
if ext.startswith('.htm'):
with open(filename) as file:
soup = BeautifulSoup(file.read())
ptitle = soup.find(lambda tag: tag.name == 'meta' and tag.attrs.get('property') == 'og:title' and tag.attrs.get('content')) or soup.find('title')
if ptitle:
ptitle = ptitle.attrs.get('content', ptitle.text).strip()
pdesc = soup.find(lambda tag: tag.name == 'meta' and tag.attrs.get('property') == 'og:description' and tag.attrs.get('content')) or soup.find(lambda tag: tag.name == 'meta' and tag.attrs.get('name') == 'description' and tag.attrs.get('content'))
if pdesc:
pdesc = pdesc.attrs.get('content', pdesc.text).strip()
pimg = soup.find(lambda tag: tag.name == 'meta' and tag.attrs.get('property') == 'og:image' and tag.attrs.get('content'))
if pimg:
pimg = pimg.attrs.get('content', '').strip()
tat = f'{text}\n\nURL: '
if ptitle:
tat += f'<a href="{url}">{html.escape(ptitle)}</a>'
else:
tat += url
files = []
captions = [f'<a href="{url}">{zws}</a>{text}\n\nURL: {url}']
if pimg:
await _download_file(filename, pimg)
files.append(filename)
tat = f'<a href="{url}">{zws}</a>{tat}'
if pdesc:
caplength = 2047 if pimg else 4095
caplength -= len(client.parse_mode.parse(tat)[0])
captext = pdesc[:caplength]
if len(captext) >= caplength:
captext = captext[:-1]
captext += ''
tat += '\n' + captext
captions = [tat]
for a, i in enumerate(files):
ext = await _get_file_ext(i)
os.rename(i, i + ext)
@ -254,7 +289,7 @@ async def main():
if getattr(random_post, 'selftext', None):
caplength = 4094 - len(client.parse_mode.parse(text)[0])
text += '\n\n'
captext = random_post.selftext[:caplength]
captext = random_post.selftext.strip()[:caplength]
if len(captext) >= caplength:
captext = captext[:-1]
captext += ''