# blankx.gitlab.io/genbloglist.py
import sys
import html
import shlex
from xml.etree import ElementTree
from urllib.request import urlopen, Request
from urllib.parse import urlparse, urlunparse, urljoin
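
# Build an HTML list of blogs from a feed list. The input looks like a
# newsboat-style urls file (an assumption: feed URL first, then tags,
# shell-style '#' comments, an optional quoted "~Title" as the last token,
# and exec: lines that run a command to produce the feed). Every line tagged
# 'blog' is fetched, parsed as RSS or Atom, and emitted as an HTML <li>.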
def find_ignore_ns(element, tag):
    # Return the first `tag` element in the subtree, preferring an exact
    # (un-namespaced) match over a '{namespace}tag' one. Scanning stops at
    # the first <item>/<entry> so elements inside posts are never matched.
    for i in element.iter():
        if i.tag == tag:
            return i
        if i.tag in ('item', 'entry') or i.tag.endswith('}item') or i.tag.endswith('}entry'):
            break
    for i in element.iter():
        if i.tag.endswith('}' + tag):
            return i
        if i.tag in ('item', 'entry') or i.tag.endswith('}item') or i.tag.endswith('}entry'):
            break
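
# A <link> is taken to point at the blog's main page when it is not the
# feed's self-reference (rel="self") and its type is text/html, which is
# also the default when no type attribute is present.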
is_link = lambda i: i.attrib.get('rel') != 'self' and i.attrib.get('type', 'text/html') == 'text/html'
def find_link(element):
    # Like find_ignore_ns, but return the first <link> that satisfies
    # is_link, i.e. that plausibly points at the blog's main page.
    for i in element.iter():
        if i.tag == 'link' and is_link(i):
            return i
        if i.tag in ('item', 'entry') or i.tag.endswith('}item') or i.tag.endswith('}entry'):
            break
    for i in element.iter():
        if i.tag.endswith('}link') and is_link(i):
            return i
        if i.tag in ('item', 'entry') or i.tag.endswith('}item') or i.tag.endswith('}entry'):
            break
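
# The feed list to read comes from sys.argv[1]; the HTML fragment is written
# to sys.argv[2]. Each generated <li> is also echoed to stdout.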
with open(sys.argv[1]) as file, open(sys.argv[2], 'w+') as out:
    for i in file:
        inp = shlex.split(i, comments=True)
        if 'blog' in inp[1:]:
            feedurl = inp[0]
            # exec: lines run konbata, which (apparently) builds a feed from
            # the page given as its argument, so link that page directly.
            # The slice strips the 26-character 'exec:~/.local/bin/konbata '
            # prefix; inp[-1][1:] drops the ~ from the trailing "~Title".
            if feedurl.startswith('exec:~/.local/bin/konbata '):
                page = html.escape(feedurl[26:])
                text = f'<li><b><a href="{page}">{html.escape(inp[-1][1:])}</a> (<a href="{page}">feed</a>)</b></li>'
                print(text)
                out.write(text + '\n')
                continue
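            # Hostnames like rss.example.com are taken to be RSS-Bridge (or
            # similar) instances whose feeds are auto-generated from the site.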
            rss_bridge = urlparse(feedurl).hostname.startswith('rss.')
            try:
                # Custom User-Agent: some hosts block urllib's default one.
                req = Request(feedurl, headers={'User-Agent': "stop being so fucking obsessed that i'm using urllib ffs"})
                with urlopen(req, timeout=60) as resp:
                    if resp.status != 200:
                        print(feedurl, 'returned', resp.status, file=sys.stderr)
                        continue
                    # parse() returns the root element of the document
                    tree = ElementTree.ElementTree().parse(resp)
            except Exception as e:
                print(feedurl, 'raised', e, file=sys.stderr)
                continue
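            # A feed document has <rss> (RSS) or <feed> (Atom) as its root,
            # possibly behind a namespace prefix.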
            if tree.tag not in ('rss', 'feed') and not tree.tag.endswith('}rss') and not tree.tag.endswith('}feed'):
                print(f'{feedurl} is not a feed (root tag is {tree.tag})', file=sys.stderr)
                continue
            # RSS nests feed metadata in <channel>; Atom keeps it on <feed>.
            channel = find_ignore_ns(tree, 'channel')
            if channel is None:
                channel = tree
            if find_ignore_ns(channel, 'item') is None and find_ignore_ns(channel, 'entry') is None:
                print(feedurl, 'has no entries, skipping', file=sys.stderr)
                continue
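            # Work out the blog's main page: preferably a <link> from the
            # feed itself, otherwise guess the site root and let the operator
            # confirm or type a replacement on stdin.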
            url = find_link(channel)
            if url is None:
                url = list(urlparse(feedurl))
                url[2] = ''  # clear the path, keeping scheme and host
                url = urlunparse(url)
                print(f'No mention of main page on {feedurl}, please see {url} or enter main page url: ', file=sys.stderr, end='', flush=True)
                url = input().strip() or url
            else:
                # RSS carries the URL as element text, Atom as an href attribute.
                url = url.text or url.attrib['href']
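            # Emit the entry: linked blog title, then a link to the feed
            # itself, labelled auto-generated when it came from an RSS bridge.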
            text = f'<li><b><a href="{html.escape(urljoin(feedurl, url))}">{html.escape(find_ignore_ns(channel, "title").text)}</a> (<a href="{html.escape(feedurl)}">'
            if rss_bridge:
                text += 'auto-generated '
            text += 'feed</a>)'
            # Append the feed's own blurb: RSS <description>, falling back to
            # Atom <subtitle>. RSS-Bridge feeds carry only boilerplate text,
            # so they get none; either way the <b> must be closed.
            desc = None
            if not rss_bridge:
                node = find_ignore_ns(channel, 'description')
                if node is None or not node.text:
                    node = find_ignore_ns(channel, 'subtitle')
                if node is not None and node.text:
                    desc = node.text.strip()
            if desc:
                text += f':</b> {html.escape(desc)}'
            else:
                text += '</b>'
            print(text, '</li>', sep='')
            out.write(text)
            out.write('</li>\n')
            out.flush()
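
# Example run (hypothetical file names and urls-file line):
#
#   $ cat urls
#   https://example.com/feed.xml blog "~Example Blog"
#   $ python3 genbloglist.py urls bloglist.html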