diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4c2e0df..61232d9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -3,7 +3,7 @@ image: python:3.9-alpine pages: stage: deploy script: - - pip install -r requirements.txt && cd public/sbbs && python3 ../../gensbbsrss.py + - cd public/sbbs && python3 ../../gensbbsrss.py artifacts: paths: - public diff --git a/genbloglist.py b/genbloglist.py index cd9c63f..8199cd1 100644 --- a/genbloglist.py +++ b/genbloglist.py @@ -1,36 +1,85 @@ import sys import html import shlex -from urllib.parse import urlparse, urlunparse -import feedparser +from xml.etree import ElementTree +from urllib.request import urlopen, Request +from urllib.parse import urlparse, urlunparse, urljoin + +def find_ignore_ns(element, tag): + for i in element.iter(): + if i.tag == tag: + return i + if i.tag in ('item', 'entry') or i.tag.endswith('}item') or i.tag.endswith('}entry'): + break + for i in element.iter(): + if i.tag.endswith('}' + tag): + return i + if i.tag in ('item', 'entry') or i.tag.endswith('}item') or i.tag.endswith('}entry'): + break + +is_link = lambda i: i.attrib.get('rel') != 'self' and i.attrib.get('type', 'text/html') == 'text/html' + +def find_link(element): + for i in element.iter(): + if i.tag == 'link' and is_link(i): + return i + if i.tag in ('item', 'entry') or i.tag.endswith('}item') or i.tag.endswith('}entry'): + break + for i in element.iter(): + if i.tag.endswith('}link') and is_link(i): + return i + if i.tag in ('item', 'entry') or i.tag.endswith('}item') or i.tag.endswith('}entry'): + break with open(sys.argv[1]) as file, open(sys.argv[2], 'w+') as out: for i in file: inp = shlex.split(i, comments=True) if 'blog' in inp[1:]: + feedurl = inp[0] try: - d = feedparser.parse(inp[0]) + with urlopen(Request(feedurl, headers={'User-Agent': "stop being so fucking obsessed that i'm using urllib ffs"}), timeout=60) as resp: + if resp.status != 200: + print(feedurl, 'returned', resp.status, file=sys.stderr) + continue + tree = ElementTree.ElementTree().parse(resp) except Exception as e: - print(inp[0], 'raised', e, file=sys.stderr) + print(feedurl, 'raised', e, file=sys.stderr) continue - feedurl = d['href'] - if not d['entries']: + if tree.tag not in ('rss', 'feed') and not tree.tag.endswith('}rss') and not tree.tag.endswith('}feed'): + print(f'{feedurl} is not a feed (root tag is {tree.tag})', file=sys.stderr) + continue + channel = find_ignore_ns(tree, 'channel') + if channel is None: + channel = tree + if find_ignore_ns(channel, 'item') is None and find_ignore_ns(channel, 'entry') is None: print(feedurl, 'has no entries, skipping', file=sys.stderr) continue - if 'links' in d['feed']: - url = next(filter(lambda i: i['type'] == 'text/html', d['feed']['links']))['href'] - else: + url = find_link(channel) + if url is None: url = list(urlparse(feedurl)) url[2] = '' url = urlunparse(url) print(f'No mention of main page on {feedurl}, please see {url} or enter main page url: ', file=sys.stderr, end='', flush=True) url = input().strip() or url - desc = d['feed'].get('description') - text = f'
  • {html.escape(d["feed"]["title"])} (feed)' - if desc := d['feed'].get('description'): - text += f': {html.escape(desc)}' else: - text += '' - text += '
  • \n' - print(text, end='') + url = url.text or url.attrib['href'] + text = f'
  • {html.escape(find_ignore_ns(channel, "title").text)} (feed)' + desc = find_ignore_ns(channel, 'description') + if desc is not None and desc.text: + if desc := desc.text.strip(): + text += f': {html.escape(desc)}' + else: + text += '' + else: + desc = find_ignore_ns(channel, 'subtitle') + if desc is not None and desc.text: + if desc := desc.text.strip(): + text += f': {html.escape(desc)}' + else: + text += '' + else: + text += '' + print(text, '
  • ', sep='') out.write(text) + out.write('\n') + out.flush() diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 1b25361..0000000 --- a/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -feedparser