diff options
author | Matěj Cepl <mcepl@cepl.eu> | 2024-01-08 19:13:22 +0100 |
---|---|---|
committer | Matěj Cepl <mcepl@cepl.eu> | 2024-01-08 19:13:22 +0100 |
commit | 73880905fbfd363f6c6f6a151d4e322443545993 (patch) | |
tree | c638de965c22a7f5ad95357f814d133f565e3a5b | |
parent | 144133677ec6c1f6dc6f4de6298d84e8759c6aa4 (diff) | |
download | disqus_xml2static_rst-73880905fbfd363f6c6f6a151d4e322443545993.tar.gz |
Convert body of comments with pandoc to reStructuredText.
Fixes: https://todo.sr.ht/~mcepl/devel/25
-rw-r--r-- | xml2static_rst.py | 23 |
1 files changed, 21 insertions, 2 deletions
diff --git a/xml2static_rst.py b/xml2static_rst.py index f2d26af..be936a2 100644 --- a/xml2static_rst.py +++ b/xml2static_rst.py @@ -1,7 +1,11 @@ #!/usr/bin/python3 +import logging +import sys from os.path import basename, exists, normpath, splitext from pprint import pprint +from shutil import which +from subprocess import DEVNULL, Popen, run, STDOUT, PIPE from typing import Dict, List from urllib.parse import urlparse from xml.etree import ElementTree as ET @@ -9,8 +13,16 @@ from xml.etree import ElementTree as ET ns = {'dflt': 'http://disqus.com', 'dsi': 'http://disqus.com/disqus-internals', 'dsq': 'http://www.w3.org/2001/XMLSchema-instance'} +logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s', + level=logging.INFO) +log = logging.getLogger() def init(filename: str) -> ET.Element: + try: + pandoc_ver = run(['pandoc', '--version'], stderr=STDOUT, stdout=PIPE) + except FileNotFoundError: + print('Requires pandoc', file=sys.stderr) + sys.exit(1) tree = ET.parse(filename) return tree.getroot() @@ -31,10 +43,12 @@ def collect_threads(root: ET.Element) -> Dict[int, Dict[str, str|List]]: def collect_posts(root: ET.Element, threads: dict) -> dict: out = {} + print('', file=sys.stderr, flush=True) for com_post in root.findall("dflt:post", ns): is_spam = com_post.findtext('dflt:isSpam', '', ns) == 'true' is_del = com_post.findtext('dflt:isDeleted', '', ns) == 'true' if not (is_spam or is_del): + print('.', end='', file=sys.stderr, flush=True) thread = com_post.find('dflt:thread', ns) thread_id = int(thread.get(f"{{{ns['dsi']}}}id", 0)) blurb = threads[thread_id]['blurb'] @@ -42,8 +56,13 @@ def collect_posts(root: ET.Element, threads: dict) -> dict: out[blurb] = "" html_msg = com_post.find('dflt:message', ns) if html_msg is not None: - out[blurb] += html_msg.text + '\n' - + html_msg = html_msg.text + pd_proc = Popen([which('pandoc'), '-f', 'html', '-t', 'rst'], + stderr=DEVNULL, stdout=PIPE, stdin=PIPE, + encoding='utf-8') + html_msg = 
pd_proc.communicate(html_msg)[0] + out[blurb] += html_msg + '\n' + print('', file=sys.stderr) return out # if not os.path.exists('comments/'): |