Remove feedparser
This commit is contained in:
parent
1111e0f115
commit
da7c5618e2
|
@ -3,7 +3,7 @@ image: python:3.9-alpine
|
||||||
pages:
|
pages:
|
||||||
stage: deploy
|
stage: deploy
|
||||||
script:
|
script:
|
||||||
- pip install -r requirements.txt && cd public/sbbs && python3 ../../gensbbsrss.py
|
- cd public/sbbs && python3 ../../gensbbsrss.py
|
||||||
artifacts:
|
artifacts:
|
||||||
paths:
|
paths:
|
||||||
- public
|
- public
|
||||||
|
|
|
@ -1,36 +1,85 @@
|
||||||
import sys
|
import sys
|
||||||
import html
|
import html
|
||||||
import shlex
|
import shlex
|
||||||
from urllib.parse import urlparse, urlunparse
|
from xml.etree import ElementTree
|
||||||
import feedparser
|
from urllib.request import urlopen, Request
|
||||||
|
from urllib.parse import urlparse, urlunparse, urljoin
|
||||||
|
|
||||||
|
def find_ignore_ns(element, tag):
    """Return the first feed-level element called *tag*, ignoring namespaces.

    *element* and its descendants are visited in ``Element.iter()`` order.
    The search never goes past the first ``item``/``entry`` element (that
    element itself is still considered), so fields that belong to individual
    posts are not mistaken for feed-level fields.

    An element whose tag is exactly *tag* (no namespace) wins over an element
    whose tag is ``{namespace}tag``, even if the namespaced one comes first.

    Args:
        element: ``xml.etree.ElementTree.Element`` to search from.
        tag: local tag name to look for, e.g. ``'title'``.

    Returns:
        The matching element, or ``None`` if there is none before the search
        stops.
    """
    namespaced_suffix = '}' + tag
    namespaced_match = None
    for node in element.iter():
        name = node.tag
        # Comment and processing-instruction nodes carry a callable as their
        # tag rather than a string; they can never match, so skip them
        # instead of crashing on ``.endswith``.
        if not isinstance(name, str):
            continue
        if name == tag:
            return node
        # Remember only the first namespaced candidate; it is used as a
        # fallback when no exact match shows up before the first item/entry.
        if namespaced_match is None and name.endswith(namespaced_suffix):
            namespaced_match = node
        if name in ('item', 'entry') or name.endswith(('}item', '}entry')):
            break
    return namespaced_match
|
||||||
|
|
||||||
|
def is_link(i):
    """Tell whether link element *i* can point at the site's main page.

    A link qualifies when its ``rel`` attribute is not ``'self'`` (which
    would be the feed's own URL) and its ``type`` attribute is either absent
    or ``'text/html'``.
    """
    return i.attrib.get('rel') != 'self' and i.attrib.get('type', 'text/html') == 'text/html'


def find_link(element):
    """Return the first feed-level ``link`` element accepted by ``is_link``.

    *element* and its descendants are visited in ``Element.iter()`` order.
    The search never goes past the first ``item``/``entry`` element, so links
    of individual posts are not returned.

    A qualifying un-namespaced ``link`` wins over a qualifying namespaced
    ``{namespace}link``, even if the namespaced one comes first.

    Args:
        element: ``xml.etree.ElementTree.Element`` to search from.

    Returns:
        The matching element, or ``None`` if no qualifying link is found
        before the search stops.
    """
    namespaced_match = None
    for node in element.iter():
        name = node.tag
        # Comment and processing-instruction nodes carry a callable as their
        # tag rather than a string; skip them instead of crashing on
        # ``.endswith``.
        if not isinstance(name, str):
            continue
        if name == 'link':
            if is_link(node):
                return node
        elif namespaced_match is None and name.endswith('}link') and is_link(node):
            # Keep the first namespaced candidate as a fallback only.
            namespaced_match = node
        if name in ('item', 'entry') or name.endswith(('}item', '}entry')):
            break
    return namespaced_match
|
||||||
|
|
||||||
with open(sys.argv[1]) as file, open(sys.argv[2], 'w+') as out:
|
with open(sys.argv[1]) as file, open(sys.argv[2], 'w+') as out:
|
||||||
for i in file:
|
for i in file:
|
||||||
inp = shlex.split(i, comments=True)
|
inp = shlex.split(i, comments=True)
|
||||||
if 'blog' in inp[1:]:
|
if 'blog' in inp[1:]:
|
||||||
|
feedurl = inp[0]
|
||||||
try:
|
try:
|
||||||
d = feedparser.parse(inp[0])
|
with urlopen(Request(feedurl, headers={'User-Agent': "stop being so fucking obsessed that i'm using urllib ffs"}), timeout=60) as resp:
|
||||||
except Exception as e:
|
if resp.status != 200:
|
||||||
print(inp[0], 'raised', e, file=sys.stderr)
|
print(feedurl, 'returned', resp.status, file=sys.stderr)
|
||||||
continue
|
continue
|
||||||
feedurl = d['href']
|
tree = ElementTree.ElementTree().parse(resp)
|
||||||
if not d['entries']:
|
except Exception as e:
|
||||||
|
print(feedurl, 'raised', e, file=sys.stderr)
|
||||||
|
continue
|
||||||
|
if tree.tag not in ('rss', 'feed') and not tree.tag.endswith('}rss') and not tree.tag.endswith('}feed'):
|
||||||
|
print(f'{feedurl} is not a feed (root tag is {tree.tag})', file=sys.stderr)
|
||||||
|
continue
|
||||||
|
channel = find_ignore_ns(tree, 'channel')
|
||||||
|
if channel is None:
|
||||||
|
channel = tree
|
||||||
|
if find_ignore_ns(channel, 'item') is None and find_ignore_ns(channel, 'entry') is None:
|
||||||
print(feedurl, 'has no entries, skipping', file=sys.stderr)
|
print(feedurl, 'has no entries, skipping', file=sys.stderr)
|
||||||
continue
|
continue
|
||||||
if 'links' in d['feed']:
|
url = find_link(channel)
|
||||||
url = next(filter(lambda i: i['type'] == 'text/html', d['feed']['links']))['href']
|
if url is None:
|
||||||
else:
|
|
||||||
url = list(urlparse(feedurl))
|
url = list(urlparse(feedurl))
|
||||||
url[2] = ''
|
url[2] = ''
|
||||||
url = urlunparse(url)
|
url = urlunparse(url)
|
||||||
print(f'No mention of main page on {feedurl}, please see {url} or enter main page url: ', file=sys.stderr, end='', flush=True)
|
print(f'No mention of main page on {feedurl}, please see {url} or enter main page url: ', file=sys.stderr, end='', flush=True)
|
||||||
url = input().strip() or url
|
url = input().strip() or url
|
||||||
desc = d['feed'].get('description')
|
else:
|
||||||
text = f'<li><b><a href="{html.escape(url)}">{html.escape(d["feed"]["title"])}</a> (<a href="{html.escape(feedurl)}">feed</a>)'
|
url = url.text or url.attrib['href']
|
||||||
if desc := d['feed'].get('description'):
|
text = f'<li><b><a href="{html.escape(urljoin(feedurl, url))}">{html.escape(find_ignore_ns(channel, "title").text)}</a> (<a href="{html.escape(feedurl)}">feed</a>)'
|
||||||
|
desc = find_ignore_ns(channel, 'description')
|
||||||
|
if desc is not None and desc.text:
|
||||||
|
if desc := desc.text.strip():
|
||||||
text += f':</b> {html.escape(desc)}'
|
text += f':</b> {html.escape(desc)}'
|
||||||
else:
|
else:
|
||||||
text += '</b>'
|
text += '</b>'
|
||||||
text += '</li>\n'
|
else:
|
||||||
print(text, end='')
|
desc = find_ignore_ns(channel, 'subtitle')
|
||||||
|
if desc is not None and desc.text:
|
||||||
|
if desc := desc.text.strip():
|
||||||
|
text += f':</b> {html.escape(desc)}'
|
||||||
|
else:
|
||||||
|
text += '</b>'
|
||||||
|
else:
|
||||||
|
text += '</b>'
|
||||||
|
print(text, '</li>', sep='')
|
||||||
out.write(text)
|
out.write(text)
|
||||||
|
out.write('</li>\n')
|
||||||
|
out.flush()
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
feedparser
|
|
Loading…
Reference in New Issue