xml2static_rst.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78

#!/usr/bin/python3

import logging
import sys
from os.path import basename, exists, normpath, splitext
from pprint import pprint
from shutil import which
from subprocess import DEVNULL, Popen, run, STDOUT, PIPE
from typing import Dict, List
from urllib.parse import urlparse
from xml.etree import ElementTree as ET

# Prefix -> namespace URI table handed to ElementTree's find*/findtext calls.
# The prefixes are local aliases: 'dflt' qualifies element paths such as
# "dflt:thread", and ns['dsi'] is used to build the qualified ``id``
# attribute name.  'dsq' is not referenced by the code in this file.
ns = {
    'dflt': 'http://disqus.com',
    'dsi': 'http://disqus.com/disqus-internals',
    'dsq': 'http://www.w3.org/2001/XMLSchema-instance',
}

# Root logger at INFO level; records are prefixed with level and function name.
logging.basicConfig(
    level=logging.INFO,
    format='%(levelname)s:%(funcName)s:%(message)s',
)
log = logging.getLogger()

def init(filename: str) -> ET.Element:
    """Verify that pandoc is installed and parse the XML export.

    Args:
        filename: Path of the XML file to parse.

    Returns:
        The root element of the parsed document.

    Raises:
        SystemExit: with status 1, after printing ``Requires pandoc`` to
            stderr, when no ``pandoc`` executable is found on ``PATH``.

    Errors raised by ``ElementTree.parse`` (missing file, malformed XML)
    propagate unchanged.
    """
    # A PATH lookup is enough to establish availability; there is no need to
    # spawn ``pandoc --version`` and discard its output.
    if which('pandoc') is None:
        print('Requires pandoc', file=sys.stderr)
        sys.exit(1)
    return ET.parse(filename).getroot()


def collect_threads(root: ET.Element) -> Dict[int, Dict[str, str|List]]:
    threads: Dict[int, Dict[str, str|List]] = {}
    for thread in root.findall("dflt:thread", ns):
        thread_id = int(thread.get(f"{{{ns['dsi']}}}id", 0))
        thread_link = thread.findtext("dflt:link", '', ns).strip()
        if thread_id not in threads:
            threads[thread_id] = {}
        threads[thread_id]['link'] = thread_link
        if thread_link is not None:
            threads[thread_id]['blurb'] = splitext(basename(normpath(urlparse(thread_link).path)))[0].strip()
        threads[thread_id]['msgs'] = []
    return threads


def collect_posts(root: ET.Element, threads: dict) -> dict:
    """Convert every live comment to reStructuredText, grouped by thread.

    Posts whose ``isSpam`` or ``isDeleted`` child reads ``true`` are ignored.
    The text of each remaining post's ``message`` child is piped through
    ``pandoc -f html -t rst``; the converted pieces are concatenated in
    document order, each followed by a newline.

    Args:
        root: Element whose direct ``post`` children are processed.
        threads: Mapping of thread id to a dict providing at least a
            ``'blurb'`` entry.

    Returns:
        Mapping of thread blurb to the concatenated RST text of its posts.
        A blurb whose posts carry no ``message`` element maps to ``''``.

    A post without a ``thread`` child, or one pointing at an id missing from
    *threads*, is skipped with a logged warning instead of aborting the run.
    One dot per live post is written to stderr as a progress indicator.
    """
    id_attr = f"{{{ns['dsi']}}}id"
    # Resolve the converter once rather than once per post.  Falling back to
    # the bare name lets a missing pandoc surface as FileNotFoundError from
    # subprocess instead of a TypeError about a None argument.
    pandoc_cmd = [which('pandoc') or 'pandoc', '-f', 'html', '-t', 'rst']
    chunks: Dict[str, List[str]] = {}
    print('', file=sys.stderr, flush=True)
    for com_post in root.findall("dflt:post", ns):
        if com_post.findtext('dflt:isSpam', '', ns) == 'true':
            continue
        if com_post.findtext('dflt:isDeleted', '', ns) == 'true':
            continue
        print('.', end='', file=sys.stderr, flush=True)
        thread_ref = com_post.find('dflt:thread', ns)
        thread_id = (int(thread_ref.get(id_attr, 0))
                     if thread_ref is not None else None)
        thread_info = threads.get(thread_id)
        if thread_info is None:
            log.warning('skipping post that references unknown thread %r',
                        thread_id)
            continue
        # Register the blurb even when the post turns out to have no message.
        parts = chunks.setdefault(thread_info['blurb'], [])
        html_msg = com_post.find('dflt:message', ns)
        if html_msg is None:
            continue
        # ``or ''`` guarantees pandoc gets a closed pipe on stdin (not the
        # parent's stdin) when the message element is empty.
        converted = run(pandoc_cmd, input=html_msg.text or '',
                        stdout=PIPE, stderr=DEVNULL,
                        encoding='utf-8').stdout
        parts.append(converted + '\n')
    print('', file=sys.stderr)
    return {blurb: ''.join(parts) for blurb, parts in chunks.items()}
    
# if not os.path.exists('comments/'):
#     os.mkdir('comments')

if __name__=='__main__':
    # The export to convert may be passed as the first command-line argument;
    # without one the script reads "comments.xml" from the working directory.
    xml_path = sys.argv[1] if len(sys.argv) > 1 else "comments.xml"
    root = init(xml_path)
    threads = collect_threads(root)
    posts = collect_posts(root, threads)
    # Print each thread's blurb followed by its converted comments, skipping
    # threads whose text is empty or whitespace-only.
    for blurb, body in posts.items():
        if body.strip():
            print(blurb + '\n' + body)