diff --git a/gensbbsrss.py b/gensbbsrss.py index 3cf0df8..33f5b8f 100644 --- a/gensbbsrss.py +++ b/gensbbsrss.py @@ -1,11 +1,10 @@ import os +from io import BytesIO from datetime import datetime from xml.etree import ElementTree -from bs4 import BeautifulSoup -with open('index.html') as file: - soup = BeautifulSoup(file.read()) - sbbsbase = soup.find('link', rel='canonical')['href'] +tree = ElementTree.ElementTree().parse('index.html') +sbbsbase = tree.find('./head/link[@rel="canonical"]').attrib['href'] feed = ElementTree.Element('rss', version='2.0') channel = ElementTree.SubElement(feed, 'channel') @@ -17,23 +16,26 @@ description = ElementTree.SubElement(channel, 'description') description.text = 'just a showcase of software being broken :)' language = ElementTree.SubElement(channel, 'language') language.text = 'en-us' +file = BytesIO() def parse_file(i): - with open(i) as file: - soup = BeautifulSoup(file.read()) + tree = ElementTree.ElementTree().parse(i) item = ElementTree.Element('item') title = ElementTree.SubElement(item, 'title') - title.text = soup.h1.string + title.text = tree.find('body').find('h1').text link = ElementTree.SubElement(item, 'link') - link.text = soup.find('link', rel='canonical')['href'] + link.text = tree.find('./head/link[@rel="canonical"]').attrib['href'] guid = ElementTree.SubElement(item, 'guid') guid.text = f'sbbs/{os.path.splitext(i)[0]}' description = ElementTree.SubElement(item, 'description') - description.text = str(soup.find('div', class_='content')) - return (datetime.fromisoformat(soup.find('meta', {'name': 'created'})['content']), item) + ElementTree.ElementTree(tree.find('./body/div[@class="content"]')).write(file, 'utf-8', method='html') + description.text = file.getvalue().decode() + file.seek(0) + file.truncate(0) + return (datetime.fromisoformat(tree.find('./head/meta[@name="created"]').attrib['content']), item) for date, i in sorted(map(parse_file, filter(lambda i: i not in ('index.html', 'index.xml'), os.listdir('.'))), reverse=True, key=lambda i: i[0]): # http://johnbokma.com/blog/2019/10/09/rfc-822-and-rfc-3339-dates-in-python.html ctime = date.ctime() pubdate = ElementTree.SubElement(i, 'pubDate') pubdate.text = f"{ctime[0:3]}, {date.day:02d} {ctime[4:7]} {date.strftime(' %Y %H:%M:%S %z')}" channel.append(i) -ElementTree.ElementTree(feed).write('index.xml', 'utf-8') +ElementTree.ElementTree(feed).write('index.xml', 'utf-8', True) diff --git a/requirements.txt b/requirements.txt index 2620f05..1b25361 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1 @@ feedparser -beautifulsoup4