aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMatěj Cepl <mcepl@cepl.eu>2024-01-08 19:13:22 +0100
committerMatěj Cepl <mcepl@cepl.eu>2024-01-08 19:13:22 +0100
commit73880905fbfd363f6c6f6a151d4e322443545993 (patch)
treec638de965c22a7f5ad95357f814d133f565e3a5b
parent144133677ec6c1f6dc6f4de6298d84e8759c6aa4 (diff)
downloaddisqus_xml2static_rst-73880905fbfd363f6c6f6a151d4e322443545993.tar.gz
Convert body of comments with pandoc to reStructuredText.
Fixes: https://todo.sr.ht/~mcepl/devel/25
-rw-r--r--xml2static_rst.py23
1 files changed, 21 insertions, 2 deletions
diff --git a/xml2static_rst.py b/xml2static_rst.py
index f2d26af..be936a2 100644
--- a/xml2static_rst.py
+++ b/xml2static_rst.py
@@ -1,7 +1,11 @@
#!/usr/bin/python3
+import logging
+import sys
from os.path import basename, exists, normpath, splitext
from pprint import pprint
+from shutil import which
+from subprocess import DEVNULL, Popen, run, STDOUT, PIPE
from typing import Dict, List
from urllib.parse import urlparse
from xml.etree import ElementTree as ET
@@ -9,8 +13,16 @@ from xml.etree import ElementTree as ET
ns = {'dflt': 'http://disqus.com',
'dsi': 'http://disqus.com/disqus-internals',
'dsq': 'http://www.w3.org/2001/XMLSchema-instance'}
+logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s',
+ level=logging.INFO)
+log = logging.getLogger()
def init(filename: str) -> ET.Element:
+ try:
+ pandoc_ver = run(['pandoc', '--version'], stderr=STDOUT, stdout=PIPE)
+ except FileNotFoundError:
+ print('Requires pandoc', file=sys.stderr)
+ sys.exit(1)
tree = ET.parse(filename)
return tree.getroot()
@@ -31,10 +43,12 @@ def collect_threads(root: ET.Element) -> Dict[int, Dict[str, str|List]]:
def collect_posts(root: ET.Element, threads: dict) -> dict:
out = {}
+ print('', file=sys.stderr, flush=True)
for com_post in root.findall("dflt:post", ns):
is_spam = com_post.findtext('dflt:isSpam', '', ns) == 'true'
is_del = com_post.findtext('dflt:isDeleted', '', ns) == 'true'
if not (is_spam or is_del):
+ print('.', end='', file=sys.stderr, flush=True)
thread = com_post.find('dflt:thread', ns)
thread_id = int(thread.get(f"{{{ns['dsi']}}}id", 0))
blurb = threads[thread_id]['blurb']
@@ -42,8 +56,13 @@ def collect_posts(root: ET.Element, threads: dict) -> dict:
out[blurb] = ""
html_msg = com_post.find('dflt:message', ns)
if html_msg is not None:
- out[blurb] += html_msg.text + '\n'
-
+ html_msg = html_msg.text
+ pd_proc = Popen([which('pandoc'), '-f', 'html', '-t', 'rst'],
+ stderr=DEVNULL, stdout=PIPE, stdin=PIPE,
+ encoding='utf-8')
+ html_msg = pd_proc.communicate(html_msg)[0]
+ out[blurb] += html_msg + '\n'
+ print('', file=sys.stderr)
return out
# if not os.path.exists('comments/'):