#!/usr/bin/python3 import logging import sys from os.path import basename, exists, normpath, splitext from pprint import pprint from shutil import which from subprocess import DEVNULL, Popen, run, STDOUT, PIPE from typing import Dict, List from urllib.parse import urlparse from xml.etree import ElementTree as ET ns = {'dflt': 'http://disqus.com', 'dsi': 'http://disqus.com/disqus-internals', 'dsq': 'http://www.w3.org/2001/XMLSchema-instance'} logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s', level=logging.INFO) log = logging.getLogger() def init(filename: str) -> ET.Element: try: pandoc_ver = run(['pandoc', '--version'], stderr=STDOUT, stdout=PIPE) except FileNotFoundError: print('Requires pandoc', file=sys.stderr) sys.exit(1) tree = ET.parse(filename) return tree.getroot() def collect_threads(root: ET.Element) -> Dict[int, Dict[str, str|List]]: threads: Dict[int, Dict[str, str|List]] = {} for thread in root.findall("dflt:thread", ns): thread_id = int(thread.get(f"{{{ns['dsi']}}}id", 0)) thread_link = thread.findtext("dflt:link", '', ns).strip() if thread_id not in threads: threads[thread_id] = {} threads[thread_id]['link'] = thread_link if thread_link is not None: threads[thread_id]['blurb'] = splitext(basename(normpath(urlparse(thread_link).path)))[0].strip() threads[thread_id]['msgs'] = [] return threads def collect_posts(root: ET.Element, threads: dict) -> dict: out = {} print('', file=sys.stderr, flush=True) for com_post in root.findall("dflt:post", ns): is_spam = com_post.findtext('dflt:isSpam', '', ns) == 'true' is_del = com_post.findtext('dflt:isDeleted', '', ns) == 'true' if not (is_spam or is_del): print('.', end='', file=sys.stderr, flush=True) thread = com_post.find('dflt:thread', ns) thread_id = int(thread.get(f"{{{ns['dsi']}}}id", 0)) blurb = threads[thread_id]['blurb'] if blurb not in out: out[blurb] = "" html_msg = com_post.find('dflt:message', ns) if html_msg is not None: html_msg = html_msg.text pd_proc = Popen([which('pandoc'), '-f', 'html', '-t', 'rst'], stderr=DEVNULL, stdout=PIPE, stdin=PIPE, encoding='utf-8') html_msg = pd_proc.communicate(html_msg)[0] out[blurb] += html_msg + '\n' print('', file=sys.stderr) return out # if not os.path.exists('comments/'): # os.mkdir('comments') if __name__=='__main__': root = init("comments.xml") threads = collect_threads(root) posts = collect_posts(root, threads) for post in posts: if len(posts[post].strip()) > 0: print(post + '\n' + posts[post])