import sys
import html
import shlex
from xml.etree import ElementTree
from urllib.request import urlopen, Request
from urllib.parse import urlparse, urlunparse, urljoin

# Usage: python <script> <feed-list-file> <output-file>
# Each line of the input file is "<feed-url> [tags...]" in shell syntax
# (shlex, so # comments are allowed); only lines tagged "blog" are processed.


def find_ignore_ns(element, tag):
    """Find the first descendant whose tag matches `tag`, namespaced or not,
    stopping at the first <item>/<entry> so only channel-level metadata is
    searched. Exact matches win over namespaced ones, hence two passes."""
    for i in element.iter():
        if i.tag == tag:
            return i
        if i.tag in ('item', 'entry') or i.tag.endswith('}item') or i.tag.endswith('}entry'):
            break
    for i in element.iter():
        if i.tag.endswith('}' + tag):
            return i
        if i.tag in ('item', 'entry') or i.tag.endswith('}item') or i.tag.endswith('}entry'):
            break


# A <link> that points at the human-readable site: not rel="self", and either
# no type attribute (RSS) or type="text/html" (Atom).
is_link = lambda i: i.attrib.get('rel') != 'self' and i.attrib.get('type', 'text/html') == 'text/html'


def find_link(element):
    """Like find_ignore_ns, but only for <link> elements that pass is_link."""
    for i in element.iter():
        if i.tag == 'link' and is_link(i):
            return i
        if i.tag in ('item', 'entry') or i.tag.endswith('}item') or i.tag.endswith('}entry'):
            break
    for i in element.iter():
        if i.tag.endswith('}link') and is_link(i):
            return i
        if i.tag in ('item', 'entry') or i.tag.endswith('}item') or i.tag.endswith('}entry'):
            break


with open(sys.argv[1]) as file, open(sys.argv[2], 'w+') as out:
    for line in file:
        inp = shlex.split(line, comments=True)
        if 'blog' in inp[1:]:
            feedurl = inp[0]
            try:
                with urlopen(Request(feedurl, headers={'User-Agent': "stop being so fucking obsessed that i'm using urllib ffs"}), timeout=60) as resp:
                    if resp.status != 200:
                        print(feedurl, 'returned', resp.status, file=sys.stderr)
                        continue
                    tree = ElementTree.ElementTree().parse(resp)
            except Exception as e:
                print(feedurl, 'raised', e, file=sys.stderr)
                continue
            # Accept RSS (<rss>) and Atom (<feed>) roots, namespaced or not.
            if tree.tag not in ('rss', 'feed') and not tree.tag.endswith('}rss') and not tree.tag.endswith('}feed'):
                print(f'{feedurl} is not a feed (root tag is {tree.tag})', file=sys.stderr)
                continue
            # RSS nests metadata under <channel>; Atom keeps it on the root.
            channel = find_ignore_ns(tree, 'channel')
            if channel is None:
                channel = tree
            if find_ignore_ns(channel, 'item') is None and find_ignore_ns(channel, 'entry') is None:
                print(feedurl, 'has no entries, skipping', file=sys.stderr)
                continue
            url = find_link(channel)
            if url is None:
                # No usable <link>: guess the site root by stripping the path,
                # params, query and fragment from the feed URL, then let the
                # user confirm or override it interactively.
                url = list(urlparse(feedurl))
                url[2:] = [''] * 4
                url = urlunparse(url)
                print(f'No mention of main page on {feedurl}, please see {url} or enter main page url: ', file=sys.stderr, end='', flush=True)
                url = input().strip() or url
            else:
                # RSS <link> carries the URL as text content; Atom uses href=.
                url = url.text or url.attrib['href']
            text = f'