From b99d275ae0cdd159465e8284fb8a92780a27b4b4 Mon Sep 17 00:00:00 2001 From: Matěj Cepl Date: Mon, 8 Jan 2024 20:09:23 +0100 Subject: Collect all available information. Fixes: https://todo.sr.ht/~mcepl/devel/23 --- xml2static_rst.py | 44 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/xml2static_rst.py b/xml2static_rst.py index be936a2..54989a8 100644 --- a/xml2static_rst.py +++ b/xml2static_rst.py @@ -1,5 +1,6 @@ #!/usr/bin/python3 +from datetime import datetime import logging import sys from os.path import basename, exists, normpath, splitext @@ -11,10 +12,10 @@ from urllib.parse import urlparse from xml.etree import ElementTree as ET ns = {'dflt': 'http://disqus.com', - 'dsi': 'http://disqus.com/disqus-internals', - 'dsq': 'http://www.w3.org/2001/XMLSchema-instance'} + 'dsq': 'http://disqus.com/disqus-internals', + 'xsi': 'http://www.w3.org/2001/XMLSchema-instance'} logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s', - level=logging.INFO) + level=logging.DEBUG) log = logging.getLogger() def init(filename: str) -> ET.Element: @@ -30,7 +31,7 @@ def init(filename: str) -> ET.Element: def collect_threads(root: ET.Element) -> Dict[int, Dict[str, str|List]]: threads: Dict[int, Dict[str, str|List]] = {} for thread in root.findall("dflt:thread", ns): - thread_id = int(thread.get(f"{{{ns['dsi']}}}id", 0)) + thread_id = int(thread.get(f"{{{ns['dsq']}}}id", 0)) thread_link = thread.findtext("dflt:link", '', ns).strip() if thread_id not in threads: threads[thread_id] = {} @@ -49,11 +50,26 @@ def collect_posts(root: ET.Element, threads: dict) -> dict: is_del = com_post.findtext('dflt:isDeleted', '', ns) == 'true' if not (is_spam or is_del): print('.', end='', file=sys.stderr, flush=True) + comment_id = int(com_post.get(f"{{{ns['dsq']}}}id", 0)) thread = com_post.find('dflt:thread', ns) - thread_id = int(thread.get(f"{{{ns['dsi']}}}id", 0)) + thread_id = int(thread.get(f"{{{ns['dsq']}}}id", 0)) blurb = threads[thread_id]['blurb'] if blurb not in out: - out[blurb] = "" + out[blurb] = {} + out[blurb]['msg'] = '' + + parent = com_post.find('dflt:parent', ns) + if parent is not None: + parent_id = int(parent.get(f"{{{ns['dsq']}}}id", 0)) + else: + parent_id = 0 + + created_str = com_post.findtext('dflt:createdAt', '', ns).strip() + if created_str: + created_at = datetime.strptime(created_str, "%Y-%m-%dT%H:%M:%SZ") + else: + created_at = datetime.fromtimestamp(0) + html_msg = com_post.find('dflt:message', ns) if html_msg is not None: html_msg = html_msg.text @@ -61,10 +77,18 @@ def collect_posts(root: ET.Element, threads: dict) -> dict: stderr=DEVNULL, stdout=PIPE, stdin=PIPE, encoding='utf-8') html_msg = pd_proc.communicate(html_msg)[0] - out[blurb] += html_msg + '\n' + if created_at: + html_msg = f":date: {created_at:%d.%m.%Y}\n\n{html_msg}" + if parent_id: + html_msg = f":parentID#: {parent_id}\n{html_msg}" + if comment_id: + html_msg = f":commentID#: {comment_id}\n{html_msg}" + out[blurb]['msg'] += html_msg + '\n' + + print('', file=sys.stderr) return out - + # if not os.path.exists('comments/'): # os.mkdir('comments') @@ -73,6 +97,6 @@ if __name__=='__main__': threads = collect_threads(root) posts = collect_posts(root, threads) for post in posts: - if len(posts[post].strip()) > 0: - print(post + '\n' + posts[post]) + print('\n' + post) + pprint(posts[post]) -- cgit