#!/usr/bin/python3 from datetime import datetime import logging import sys from os import mkdir from os.path import basename, exists, normpath, splitext from pprint import pprint from shutil import which from subprocess import DEVNULL, Popen, run, STDOUT, PIPE from typing import Dict, List from urllib.parse import urlparse from xml.etree import ElementTree as ET ns = {'dflt': 'http://disqus.com', 'dsq': 'http://disqus.com/disqus-internals', 'xsi': 'http://www.w3.org/2001/XMLSchema-instance'} logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s', level=logging.DEBUG) log = logging.getLogger() def init(filename: str) -> ET.Element: try: pandoc_ver = run(['pandoc', '--version'], stderr=STDOUT, stdout=PIPE) except FileNotFoundError: print('Requires pandoc', file=sys.stderr) sys.exit(1) tree = ET.parse(filename) return tree.getroot() def collect_threads(root: ET.Element) -> Dict[int, Dict[str, str|List]]: threads: Dict[int, Dict[str, str|List]] = {} for thread in root.findall("dflt:thread", ns): thread_id = int(thread.get(f"{{{ns['dsq']}}}id", 0)) thread_link = thread.findtext("dflt:link", '', ns).strip() if thread_id not in threads: threads[thread_id] = {} threads[thread_id]['link'] = thread_link if thread_link is not None: threads[thread_id]['blurb'] = splitext(basename(normpath(urlparse(thread_link).path)))[0].strip() threads[thread_id]['msgs'] = [] return threads def collect_posts(root: ET.Element, threads: dict) -> dict: out = {} print('', file=sys.stderr, flush=True) for com_post in root.findall("dflt:post", ns): is_spam = com_post.findtext('dflt:isSpam', '', ns) == 'true' is_del = com_post.findtext('dflt:isDeleted', '', ns) == 'true' if not (is_spam or is_del): print('.', end='', file=sys.stderr, flush=True) comment_id = int(com_post.get(f"{{{ns['dsq']}}}id", 0)) thread = com_post.find('dflt:thread', ns) thread_id = int(thread.get(f"{{{ns['dsq']}}}id", 0)) blurb = threads[thread_id]['blurb'] if blurb not in out: out[blurb] = {} out[blurb]['msg'] = '' parent = com_post.find('dflt:parent', ns) if parent is not None: parent_id = int(parent.get(f"{{{ns['dsq']}}}id", 0)) else: parent_id = 0 created_str = com_post.findtext('dflt:createdAt', '', ns).strip() if created_str: created_at = datetime.strptime(created_str, "%Y-%m-%dT%H:%M:%SZ") else: created_at = datetime.fromtimestamp(0) html_msg = com_post.find('dflt:message', ns) if html_msg is not None: html_msg = html_msg.text pd_proc = Popen([which('pandoc'), '-f', 'html', '-t', 'rst'], stderr=DEVNULL, stdout=PIPE, stdin=PIPE, encoding='utf-8') html_msg = pd_proc.communicate(html_msg)[0] if created_at: html_msg = f":date: {created_at:%d.%m.%Y}\n\n{html_msg}" if parent_id: html_msg = f":parentID#: {parent_id}\n{html_msg}" if comment_id: html_msg = f":commentID#: {comment_id}\n{html_msg}" out[blurb]['msg'] += html_msg + '\n' print('', file=sys.stderr) return out if __name__=='__main__': root = init("comments.xml") threads = collect_threads(root) posts = collect_posts(root, threads) if not exists('comments/'): mkdir('comments') for post in posts: with open(f"comments/{post}.rst", "w") as out_rst: print(posts[post]['msg'].strip(), file=out_rst)