Remove feedparser

blank X 2021-07-22 15:10:23 +07:00
parent 1111e0f115
commit da7c5618e2
Signed by: blankie
GPG Key ID: CC15FC822C7F61F5
3 changed files with 66 additions and 18 deletions

.gitlab-ci.yml

@@ -3,7 +3,7 @@ image: python:3.9-alpine
 pages:
   stage: deploy
   script:
-    - pip install -r requirements.txt && cd public/sbbs && python3 ../../gensbbsrss.py
+    - cd public/sbbs && python3 ../../gensbbsrss.py
   artifacts:
     paths:
       - public

gensbbsrss.py

@@ -1,36 +1,85 @@
 import sys
 import html
 import shlex
-from urllib.parse import urlparse, urlunparse
-import feedparser
+from xml.etree import ElementTree
+from urllib.request import urlopen, Request
+from urllib.parse import urlparse, urlunparse, urljoin
+
+def find_ignore_ns(element, tag):
+    for i in element.iter():
+        if i.tag == tag:
+            return i
+        if i.tag in ('item', 'entry') or i.tag.endswith('}item') or i.tag.endswith('}entry'):
+            break
+    for i in element.iter():
+        if i.tag.endswith('}' + tag):
+            return i
+        if i.tag in ('item', 'entry') or i.tag.endswith('}item') or i.tag.endswith('}entry'):
+            break
+
+is_link = lambda i: i.attrib.get('rel') != 'self' and i.attrib.get('type', 'text/html') == 'text/html'
+def find_link(element):
+    for i in element.iter():
+        if i.tag == 'link' and is_link(i):
+            return i
+        if i.tag in ('item', 'entry') or i.tag.endswith('}item') or i.tag.endswith('}entry'):
+            break
+    for i in element.iter():
+        if i.tag.endswith('}link') and is_link(i):
+            return i
+        if i.tag in ('item', 'entry') or i.tag.endswith('}item') or i.tag.endswith('}entry'):
+            break
 
 with open(sys.argv[1]) as file, open(sys.argv[2], 'w+') as out:
     for i in file:
         inp = shlex.split(i, comments=True)
         if 'blog' in inp[1:]:
+            feedurl = inp[0]
             try:
-                d = feedparser.parse(inp[0])
+                with urlopen(Request(feedurl, headers={'User-Agent': "stop being so fucking obsessed that i'm using urllib ffs"}), timeout=60) as resp:
+                    if resp.status != 200:
+                        print(feedurl, 'returned', resp.status, file=sys.stderr)
+                        continue
+                    tree = ElementTree.ElementTree().parse(resp)
             except Exception as e:
-                print(inp[0], 'raised', e, file=sys.stderr)
+                print(feedurl, 'raised', e, file=sys.stderr)
                 continue
-            feedurl = d['href']
-            if not d['entries']:
+            if tree.tag not in ('rss', 'feed') and not tree.tag.endswith('}rss') and not tree.tag.endswith('}feed'):
+                print(f'{feedurl} is not a feed (root tag is {tree.tag})', file=sys.stderr)
+                continue
+            channel = find_ignore_ns(tree, 'channel')
+            if channel is None:
+                channel = tree
+            if find_ignore_ns(channel, 'item') is None and find_ignore_ns(channel, 'entry') is None:
                 print(feedurl, 'has no entries, skipping', file=sys.stderr)
                 continue
-            if 'links' in d['feed']:
-                url = next(filter(lambda i: i['type'] == 'text/html', d['feed']['links']))['href']
-            else:
+            url = find_link(channel)
+            if url is None:
                 url = list(urlparse(feedurl))
                 url[2] = ''
                 url = urlunparse(url)
                 print(f'No mention of main page on {feedurl}, please see {url} or enter main page url: ', file=sys.stderr, end='', flush=True)
                 url = input().strip() or url
-            desc = d['feed'].get('description')
-            text = f'<li><b><a href="{html.escape(url)}">{html.escape(d["feed"]["title"])}</a> (<a href="{html.escape(feedurl)}">feed</a>)'
-            if desc := d['feed'].get('description'):
-                text += f':</b> {html.escape(desc)}'
-            else:
-                text += '</b>'
-            text += '</li>\n'
-            print(text, end='')
+            else:
+                url = url.text or url.attrib['href']
+            text = f'<li><b><a href="{html.escape(urljoin(feedurl, url))}">{html.escape(find_ignore_ns(channel, "title").text)}</a> (<a href="{html.escape(feedurl)}">feed</a>)'
+            desc = find_ignore_ns(channel, 'description')
+            if desc is not None and desc.text:
+                if desc := desc.text.strip():
+                    text += f':</b> {html.escape(desc)}'
+                else:
+                    text += '</b>'
+            else:
+                desc = find_ignore_ns(channel, 'subtitle')
+                if desc is not None and desc.text:
+                    if desc := desc.text.strip():
+                        text += f':</b> {html.escape(desc)}'
+                    else:
+                        text += '</b>'
+                else:
+                    text += '</b>'
+            print(text, '</li>', sep='')
             out.write(text)
+            out.write('</li>\n')
+            out.flush()
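
A note on the namespace handling above: xml.etree.ElementTree bakes the namespace URI into every tag name, so an Atom feed's <title> parses as '{http://www.w3.org/2005/Atom}title' and a plain tag comparison misses it. find_ignore_ns and find_link therefore scan twice (exact tag first, then a '}tag' suffix match) and bail out at the first item/entry so only feed-level elements match. A minimal sketch of this, not part of the commit, with the two helpers copied from the diff and a made-up sample feed:

from xml.etree import ElementTree

# Copied from the diff above: two-pass lookup that ignores namespaces and
# stops at the first item/entry so only feed-level elements match.
def find_ignore_ns(element, tag):
    for i in element.iter():
        if i.tag == tag:
            return i
        if i.tag in ('item', 'entry') or i.tag.endswith('}item') or i.tag.endswith('}entry'):
            break
    for i in element.iter():
        if i.tag.endswith('}' + tag):
            return i
        if i.tag in ('item', 'entry') or i.tag.endswith('}item') or i.tag.endswith('}entry'):
            break

# Copied from the diff above: same scan, but only accepts a text/html link
# that is not the feed's own rel="self" link.
is_link = lambda i: i.attrib.get('rel') != 'self' and i.attrib.get('type', 'text/html') == 'text/html'
def find_link(element):
    for i in element.iter():
        if i.tag == 'link' and is_link(i):
            return i
        if i.tag in ('item', 'entry') or i.tag.endswith('}item') or i.tag.endswith('}entry'):
            break
    for i in element.iter():
        if i.tag.endswith('}link') and is_link(i):
            return i
        if i.tag in ('item', 'entry') or i.tag.endswith('}item') or i.tag.endswith('}entry'):
            break

# Made-up Atom feed for demonstration only.
atom = ElementTree.fromstring(
    '<feed xmlns="http://www.w3.org/2005/Atom">'
    '<title>Example Blog</title>'
    '<link rel="self" type="application/atom+xml" href="https://example.com/feed.xml"/>'
    '<link rel="alternate" type="text/html" href="https://example.com/"/>'
    '<entry><title>A post</title></entry>'
    '</feed>'
)
print(atom.find('title'))                  # None: the real tag is namespaced
print(find_ignore_ns(atom, 'title').text)  # Example Blog (feed title, not the entry's)
print(find_link(atom).attrib['href'])      # https://example.com/ (rel="self" skipped)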

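The <a href> in the new code also goes through urljoin because a feed's <link> value may be relative; urljoin resolves it against the feed's own URL and leaves absolute URLs untouched. A quick illustration with made-up URLs:

from urllib.parse import urljoin

feedurl = 'https://example.com/feeds/blog.xml'
print(urljoin(feedurl, '/blog/'))                  # https://example.com/blog/
print(urljoin(feedurl, 'index.html'))              # https://example.com/feeds/index.html
print(urljoin(feedurl, 'https://other.example/'))  # https://other.example/ (already absolute)
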
requirements.txt

@@ -1 +0,0 @@
-feedparser