Make HTML documents sexier

2020-09-04 17:20:09 +07:00 · 2020-09-04 17:20:09 +07:00 · 5fe3809c4e
parent b3ee8bcece
commit 5fe3809c4e
1 changed file with 41 additions and 6 deletions
--- a/redditbot.py
+++ b/redditbot.py
@@ -15,6 +15,7 @@ import yaml
 import praw
 import aiohttp
 import aiocron
+from bs4 import BeautifulSoup
 from telethon import TelegramClient, events
 from telethon.utils import chunks

@@ -189,7 +190,8 @@ async def main():
                    splitted = os.path.splitext(parsed[2])
                    domain = getattr(random_post, 'domain', parsed[1])
                    preview = getattr(random_post, 'preview', None)
-                    if domain.endswith('imgur.com'):
+                    if domain.endswith(
+                    'imgur.com'):
                        parsed[1] = 'i.imgur.com'
                        if parsed[2].startswith('/a/'):
                            albumid = os.path.split(parsed[2])[1]
@@ -204,11 +206,16 @@ async def main():
                                files = []
                                captions = []
                                for a, i in enumerate(apidata['images']):
-                                    to_append = f'{text}\n#{a + 1}'
+                                    to_append = f'#{a + 1}'
                                    desc = i['description']
                                    if desc:
-                                        to_append += ': ' + html.escape(desc)
-                                    captions.append(to_append)
+                                        to_append += ': ' + desc.strip()
+                                    caplength = 2047 - len(client.parse_mode.parse(text)[0])
+                                    captext = to_append[:caplength]
+                                    if len(captext) >= caplength:
+                                        captext = captext[:-1]
+                                        captext += '…'
+                                    captions.append(text + '\n' + html.escape(captext))
                                    filename = os.path.join(tempdir, str(time.time()))
                                    await _download_file(filename, f'https://i.imgur.com/{i["hash"]}{i["ext"]}')
                                    files.append(filename)
@@ -243,8 +250,36 @@ async def main():
                            await _download_file(filename, url)
                    ext = await _get_file_ext(filename)
                    if ext.startswith('.htm'):
+                        with open(filename) as file:
+                            soup = BeautifulSoup(file.read())
+                        ptitle = soup.find(lambda tag: tag.name == 'meta' and tag.attrs.get('property') == 'og:title' and tag.attrs.get('content')) or soup.find('title')
+                        if ptitle:
+                            ptitle = ptitle.attrs.get('content', ptitle.text).strip()
+                        pdesc = soup.find(lambda tag: tag.name == 'meta' and tag.attrs.get('property') == 'og:description' and tag.attrs.get('content')) or soup.find(lambda tag: tag.name == 'meta' and tag.attrs.get('name') == 'description' and tag.attrs.get('content'))
+                        if pdesc:
+                            pdesc = pdesc.attrs.get('content', pdesc.text).strip()
+                        pimg = soup.find(lambda tag: tag.name == 'meta' and tag.attrs.get('property') == 'og:image' and tag.attrs.get('content'))
+                        if pimg:
+                            pimg = pimg.attrs.get('content', '').strip()
+                        tat = f'{text}\n\nURL: '
+                        if ptitle:
+                            tat += f'<a href="{url}">{html.escape(ptitle)}</a>'
+                        else:
+                            tat += url
                        files = []
-                        captions = [f'<a href="{url}">{zws}</a>{text}\n\nURL: {url}']
+                        if pimg:
+                            await _download_file(filename, pimg)
+                            files.append(filename)
+                            tat = f'<a href="{url}">{zws}</a>{tat}'
+                        if pdesc:
+                            caplength = 2047 if pimg else 4095
+                            caplength -= len(client.parse_mode.parse(tat)[0])
+                            captext = pdesc[:caplength]
+                            if len(captext) >= caplength:
+                                captext = captext[:-1]
+                                captext += '…'
+                            tat += '\n' + captext
+                        captions = [tat]
                for a, i in enumerate(files):
                    ext = await _get_file_ext(i)
                    os.rename(i, i + ext)
@@ -254,7 +289,7 @@ async def main():
            if getattr(random_post, 'selftext', None):
                caplength = 4094 - len(client.parse_mode.parse(text)[0])
                text += '\n\n'
-                captext = random_post.selftext[:caplength]
+                captext = random_post.selftext.strip()[:caplength]
                if len(captext) >= caplength:
                    captext = captext[:-1]
                    captext += '…'