Remove beautifulsoup4

2021-07-21 15:14:30 +07:00 · 2021-07-21 15:14:30 +07:00 · e7bd1c80e5
parent 17e046bf2f
commit e7bd1c80e5
2 changed files with 13 additions and 12 deletions
--- a/gensbbsrss.py
+++ b/gensbbsrss.py
@@ -1,11 +1,10 @@
 import os
+from io import BytesIO
 from datetime import datetime
 from xml.etree import ElementTree
-from bs4 import BeautifulSoup

-with open('index.html') as file:
-    soup = BeautifulSoup(file.read())
-    sbbsbase = soup.find('link', rel='canonical')['href']
+tree = ElementTree.ElementTree().parse('index.html')
+sbbsbase = tree.find('./head/link[@rel="canonical"]').attrib['href']

 feed = ElementTree.Element('rss', version='2.0')
 channel = ElementTree.SubElement(feed, 'channel')
@@ -17,23 +16,26 @@ description = ElementTree.SubElement(channel, 'description')
 description.text = 'just a showcase of software being broken :)'
 language = ElementTree.SubElement(channel, 'language')
 language.text = 'en-us'
+file = BytesIO()
 def parse_file(i):
-    with open(i) as file:
-        soup = BeautifulSoup(file.read())
+    tree = ElementTree.ElementTree().parse(i)
    item = ElementTree.Element('item')
    title = ElementTree.SubElement(item, 'title')
-    title.text = soup.h1.string
+    title.text = tree.find('body').find('h1').text
    link = ElementTree.SubElement(item, 'link')
-    link.text = soup.find('link', rel='canonical')['href']
+    link.text = tree.find('./head/link[@rel="canonical"]').attrib['href']
    guid = ElementTree.SubElement(item, 'guid')
    guid.text = f'sbbs/{os.path.splitext(i)[0]}'
    description = ElementTree.SubElement(item, 'description')
-    description.text = str(soup.find('div', class_='content'))
-    return (datetime.fromisoformat(soup.find('meta', {'name': 'created'})['content']), item)
+    ElementTree.ElementTree(tree.find('./body/div[@class="content"]')).write(file, 'utf-8', method='html')
+    description.text = file.getvalue().decode()
+    file.seek(0)
+    file.truncate(0)
+    return (datetime.fromisoformat(tree.find('./head/meta[@name="created"]').attrib['content']), item)
 for date, i in sorted(map(parse_file, filter(lambda i: i not in ('index.html', 'index.xml'), os.listdir('.'))), reverse=True, key=lambda i: i[0]):
    # http://johnbokma.com/blog/2019/10/09/rfc-822-and-rfc-3339-dates-in-python.html
    ctime = date.ctime()
    pubdate = ElementTree.SubElement(i, 'pubDate')
    pubdate.text = f"{ctime[0:3]}, {date.day:02d} {ctime[4:7]} {date.strftime(' %Y %H:%M:%S %z')}"
    channel.append(i)
-ElementTree.ElementTree(feed).write('index.xml', 'utf-8')
+ElementTree.ElementTree(feed).write('index.xml', 'utf-8', True)
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1 @@
 feedparser
-beautifulsoup4