diff options
author | Matěj Cepl <mcepl@cepl.eu> | 2024-01-08 18:05:24 +0100 |
---|---|---|
committer | Matěj Cepl <mcepl@cepl.eu> | 2024-01-08 18:16:11 +0100 |
commit | 144133677ec6c1f6dc6f4de6298d84e8759c6aa4 (patch) | |
tree | 5b0831e5588ec3315b67b982fba5421a3f3157d8 /xml2static_rst.py | |
parent | 862ecd1e98a825a39720f9a04f1ce1fe96d8fab3 (diff) | |
download | disqus_xml2static_rst-144133677ec6c1f6dc6f4de6298d84e8759c6aa4.tar.gz |
Reorganize the script to have at least some structure.
Diffstat (limited to 'xml2static_rst.py')
-rw-r--r-- | xml2static_rst.py | 76 |
1 file changed, 46 insertions, 30 deletions
diff --git a/xml2static_rst.py b/xml2static_rst.py index 5824efc..f2d26af 100644 --- a/xml2static_rst.py +++ b/xml2static_rst.py @@ -2,42 +2,58 @@ from os.path import basename, exists, normpath, splitext from pprint import pprint +from typing import Dict, List from urllib.parse import urlparse from xml.etree import ElementTree as ET ns = {'dflt': 'http://disqus.com', - 'xsi': 'http://disqus.com/disqus-internals', + 'dsi': 'http://disqus.com/disqus-internals', 'dsq': 'http://www.w3.org/2001/XMLSchema-instance'} -tree = ET.parse("comments.xml") -root = tree.getroot() -threads = {} -for thread in root.findall("dflt:thread", ns): - thread_id = thread.attrib[f"{{{ns['xsi']}}}id"] - thread_link = thread.find("dflt:link", ns).text.strip() - if thread_id not in threads: - threads[thread_id] = {} - threads[thread_id]['link'] = thread_link - threads[thread_id]['blurb'] = splitext(basename(normpath(urlparse(thread_link).path)))[0].strip() - threads[thread_id]['msgs'] = [] - -out = {} -for com_post in root.findall("dflt:post", ns): - is_spam = com_post.find('dflt:isSpam', ns).text == 'true' - is_del = com_post.find('dflt:isDeleted', ns).text == 'true' - if not (is_spam or is_del): - thread = com_post.find('dflt:thread', ns) - thread_id = thread.attrib[f"{{{ns['xsi']}}}id"] - blurb = threads[thread_id]['blurb'] - if blurb not in out: - out[blurb] = "" - msg = com_post.find('dflt:message', ns).text - if msg is not None: - out[blurb] += msg + '\n' - -for post in out: - if len(out[post].strip()) > 0: - print(post + '\n' + out[post]) +def init(filename: str) -> ET.Element: + tree = ET.parse(filename) + return tree.getroot() + + +def collect_threads(root: ET.Element) -> Dict[int, Dict[str, str|List]]: + threads: Dict[int, Dict[str, str|List]] = {} + for thread in root.findall("dflt:thread", ns): + thread_id = int(thread.get(f"{{{ns['dsi']}}}id", 0)) + thread_link = thread.findtext("dflt:link", '', ns).strip() + if thread_id not in threads: + threads[thread_id] = {} + 
threads[thread_id]['link'] = thread_link + if thread_link is not None: + threads[thread_id]['blurb'] = splitext(basename(normpath(urlparse(thread_link).path)))[0].strip() + threads[thread_id]['msgs'] = [] + return threads + + +def collect_posts(root: ET.Element, threads: dict) -> dict: + out = {} + for com_post in root.findall("dflt:post", ns): + is_spam = com_post.findtext('dflt:isSpam', '', ns) == 'true' + is_del = com_post.findtext('dflt:isDeleted', '', ns) == 'true' + if not (is_spam or is_del): + thread = com_post.find('dflt:thread', ns) + thread_id = int(thread.get(f"{{{ns['dsi']}}}id", 0)) + blurb = threads[thread_id]['blurb'] + if blurb not in out: + out[blurb] = "" + html_msg = com_post.find('dflt:message', ns) + if html_msg is not None: + out[blurb] += html_msg.text + '\n' + + return out # if not os.path.exists('comments/'): # os.mkdir('comments') + +if __name__=='__main__': + root = init("comments.xml") + threads = collect_threads(root) + posts = collect_posts(root, threads) + for post in posts: + if len(posts[post].strip()) > 0: + print(post + '\n' + posts[post]) + |