Reorganize the script to have at least some structure.

author: Matěj Cepl <mcepl@cepl.eu> 2024-01-08 18:05:24 +0100
committer: Matěj Cepl <mcepl@cepl.eu> 2024-01-08 18:16:11 +0100
commit: 144133677ec6c1f6dc6f4de6298d84e8759c6aa4 (patch)
tree: 5b0831e5588ec3315b67b982fba5421a3f3157d8 /xml2static_rst.py
parent: 862ecd1e98a825a39720f9a04f1ce1fe96d8fab3 (diff)
download: disqus_xml2static_rst-144133677ec6c1f6dc6f4de6298d84e8759c6aa4.tar.gz
1 file changed, 46 insertions, 30 deletions
diff --git a/xml2static_rst.py b/xml2static_rst.py
index 5824efc..f2d26af 100644
--- a/xml2static_rst.py
+++ b/xml2static_rst.py
@@ -2,42 +2,58 @@
 
 from os.path import basename, exists, normpath, splitext
 from pprint import pprint
+from typing import Dict, List
 from urllib.parse import urlparse
 from xml.etree import ElementTree as ET
 
 ns = {'dflt': 'http://disqus.com',
-      'xsi': 'http://disqus.com/disqus-internals',
+      'dsi': 'http://disqus.com/disqus-internals',
       'dsq': 'http://www.w3.org/2001/XMLSchema-instance'}
 
-tree = ET.parse("comments.xml")
-root = tree.getroot()
-threads = {}
-for thread in root.findall("dflt:thread", ns):
-    thread_id = thread.attrib[f"{{{ns['xsi']}}}id"]
-    thread_link = thread.find("dflt:link", ns).text.strip()
-    if thread_id not in threads:
-        threads[thread_id] = {}
-    threads[thread_id]['link'] = thread_link
-    threads[thread_id]['blurb'] = splitext(basename(normpath(urlparse(thread_link).path)))[0].strip()
-    threads[thread_id]['msgs'] = []
-
-out = {}
-for com_post in root.findall("dflt:post", ns):
-    is_spam = com_post.find('dflt:isSpam', ns).text == 'true'
-    is_del = com_post.find('dflt:isDeleted', ns).text == 'true'
-    if not (is_spam or is_del):
-        thread = com_post.find('dflt:thread', ns)
-        thread_id = thread.attrib[f"{{{ns['xsi']}}}id"]
-        blurb = threads[thread_id]['blurb']
-        if blurb not in out:
-            out[blurb] = ""
-        msg = com_post.find('dflt:message', ns).text
-        if msg is not None:
-            out[blurb] += msg  + '\n'
-    
-for post in out:
-    if len(out[post].strip()) > 0:
-        print(post + '\n' + out[post])
+def init(filename: str) -> ET.Element:
+    """Parse the XML file *filename* and return its root element."""
+    return ET.parse(filename).getroot()
+
+
+def collect_threads(root: ET.Element) -> Dict[int, Dict[str, str|List]]:
+    """Map each thread id to its link, its URL-derived blurb and an empty msgs list."""
+    threads: Dict[int, Dict[str, str|List]] = {}
+    id_attr = f"{{{ns['dsi']}}}id"
+    for thread in root.findall("dflt:thread", ns):
+        entry = threads.setdefault(int(thread.get(id_attr, 0)), {})
+        link = thread.findtext("dflt:link", '', ns).strip()
+        # blurb: last component of the link's URL path, extension removed
+        url_path = urlparse(link).path
+        blurb = splitext(basename(normpath(url_path)))[0].strip()
+        entry.update(link=link, blurb=blurb, msgs=[])
+    return threads
+
+
+def collect_posts(root: ET.Element, threads: dict) -> dict:
+    """Concatenate the messages of non-spam, non-deleted posts, keyed by thread blurb."""
+    out = {}
+    for com_post in root.findall("dflt:post", ns):
+        is_spam = com_post.findtext('dflt:isSpam', '', ns) == 'true'
+        is_del = com_post.findtext('dflt:isDeleted', '', ns) == 'true'
+        if not (is_spam or is_del):
+            thread = com_post.find('dflt:thread', ns)
+            thread_id = int(thread.get(f"{{{ns['dsi']}}}id", 0))
+            blurb = threads[thread_id]['blurb']
+            out.setdefault(blurb, "")
+            # findtext() yields '' (never None) when <message> is missing or has no text
+            html_msg = com_post.findtext('dflt:message', '', ns)
+            if html_msg:
+                out[blurb] += html_msg + '\n'
+    return out
     
 # if not os.path.exists('comments/'):
 #     os.mkdir('comments')
+
+if __name__ == '__main__':  # print each blurb followed by its comment text
+    root = init("comments.xml")
+    threads = collect_threads(root)
+    posts = collect_posts(root, threads)
+    for blurb, text in posts.items():
+        if text.strip():
+            print(blurb + '\n' + text)
+
author	Matěj Cepl <mcepl@cepl.eu>	2024-01-08 18:05:24 +0100
committer	Matěj Cepl <mcepl@cepl.eu>	2024-01-08 18:16:11 +0100
commit	144133677ec6c1f6dc6f4de6298d84e8759c6aa4 (patch)
tree	5b0831e5588ec3315b67b982fba5421a3f3157d8 /xml2static_rst.py
parent	862ecd1e98a825a39720f9a04f1ce1fe96d8fab3 (diff)
download	disqus_xml2static_rst-144133677ec6c1f6dc6f4de6298d84e8759c6aa4.tar.gz