about | summary | refs | log | tree | commit | diff | stats
path: root/xml2static_rst.py
diff options
context:
space:
mode:
author Matěj Cepl <mcepl@cepl.eu> 2024-01-08 18:05:24 +0100
committer Matěj Cepl <mcepl@cepl.eu> 2024-01-08 18:16:11 +0100
commit 144133677ec6c1f6dc6f4de6298d84e8759c6aa4 (patch)
tree 5b0831e5588ec3315b67b982fba5421a3f3157d8 /xml2static_rst.py
parent 862ecd1e98a825a39720f9a04f1ce1fe96d8fab3 (diff)
download disqus_xml2static_rst-144133677ec6c1f6dc6f4de6298d84e8759c6aa4.tar.gz
Reorganize the script to have at least some structure.
Diffstat (limited to 'xml2static_rst.py')
-rw-r--r-- xml2static_rst.py 76
1 files changed, 46 insertions, 30 deletions
diff --git a/xml2static_rst.py b/xml2static_rst.py
index 5824efc..f2d26af 100644
--- a/xml2static_rst.py
+++ b/xml2static_rst.py
@@ -2,42 +2,58 @@
from os.path import basename, exists, normpath, splitext
from pprint import pprint
+from typing import Dict, List
from urllib.parse import urlparse
from xml.etree import ElementTree as ET
ns = {'dflt': 'http://disqus.com',
- 'xsi': 'http://disqus.com/disqus-internals',
+ 'dsi': 'http://disqus.com/disqus-internals',
'dsq': 'http://www.w3.org/2001/XMLSchema-instance'}
-tree = ET.parse("comments.xml")
-root = tree.getroot()
-threads = {}
-for thread in root.findall("dflt:thread", ns):
- thread_id = thread.attrib[f"{{{ns['xsi']}}}id"]
- thread_link = thread.find("dflt:link", ns).text.strip()
- if thread_id not in threads:
- threads[thread_id] = {}
- threads[thread_id]['link'] = thread_link
- threads[thread_id]['blurb'] = splitext(basename(normpath(urlparse(thread_link).path)))[0].strip()
- threads[thread_id]['msgs'] = []
-
-out = {}
-for com_post in root.findall("dflt:post", ns):
- is_spam = com_post.find('dflt:isSpam', ns).text == 'true'
- is_del = com_post.find('dflt:isDeleted', ns).text == 'true'
- if not (is_spam or is_del):
- thread = com_post.find('dflt:thread', ns)
- thread_id = thread.attrib[f"{{{ns['xsi']}}}id"]
- blurb = threads[thread_id]['blurb']
- if blurb not in out:
- out[blurb] = ""
- msg = com_post.find('dflt:message', ns).text
- if msg is not None:
- out[blurb] += msg + '\n'
-
-for post in out:
- if len(out[post].strip()) > 0:
- print(post + '\n' + out[post])
+def init(filename: str) -> ET.Element:
+ tree = ET.parse(filename)
+ return tree.getroot()
+
+
+def collect_threads(root: ET.Element) -> Dict[int, Dict[str, str|List]]:
+ threads: Dict[int, Dict[str, str|List]] = {}
+ for thread in root.findall("dflt:thread", ns):
+ thread_id = int(thread.get(f"{{{ns['dsi']}}}id", 0))
+ thread_link = thread.findtext("dflt:link", '', ns).strip()
+ if thread_id not in threads:
+ threads[thread_id] = {}
+ threads[thread_id]['link'] = thread_link
+ if thread_link is not None:
+ threads[thread_id]['blurb'] = splitext(basename(normpath(urlparse(thread_link).path)))[0].strip()
+ threads[thread_id]['msgs'] = []
+ return threads
+
+
+def collect_posts(root: ET.Element, threads: dict) -> dict:
+ out = {}
+ for com_post in root.findall("dflt:post", ns):
+ is_spam = com_post.findtext('dflt:isSpam', '', ns) == 'true'
+ is_del = com_post.findtext('dflt:isDeleted', '', ns) == 'true'
+ if not (is_spam or is_del):
+ thread = com_post.find('dflt:thread', ns)
+ thread_id = int(thread.get(f"{{{ns['dsi']}}}id", 0))
+ blurb = threads[thread_id]['blurb']
+ if blurb not in out:
+ out[blurb] = ""
+ html_msg = com_post.find('dflt:message', ns)
+ if html_msg is not None:
+ out[blurb] += html_msg.text + '\n'
+
+ return out
# if not os.path.exists('comments/'):
# os.mkdir('comments')
+
+if __name__=='__main__':
+ root = init("comments.xml")
+ threads = collect_threads(root)
+ posts = collect_posts(root, threads)
+ for post in posts:
+ if len(posts[post].strip()) > 0:
+ print(post + '\n' + posts[post])
+