blob: 5824efc003a6d45f61d95625c2d1dd8afa6678a8 (
plain) (
tree)
|
|
#!/usr/bin/python3
from os.path import basename, exists, normpath, splitext
from pprint import pprint
from urllib.parse import urlparse
from xml.etree import ElementTree as ET
# Namespace prefixes used by the element queries below.  The prefix names are
# local to this script; only the URIs have to match the parsed document.
ns = {
    'dflt': 'http://disqus.com',
    'dsq': 'http://disqus.com/disqus-internals',
    'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
}

# Fully qualified name of the id attribute read from <thread> elements.
_DSQ_ID = f"{{{ns['dsq']}}}id"


def _blurb_from_link(link):
    """Return the last path component of URL *link* without its extension."""
    return splitext(basename(normpath(urlparse(link).path)))[0].strip()


def _flag(post, tag):
    """Return True when child *tag* of *post* has the exact text 'true'.

    A missing or empty child counts as False instead of raising.
    """
    return post.findtext(tag, "", ns) == 'true'


def collect_threads(root):
    """Map every thread id under *root* to its link and blurb.

    Args:
        root: parsed root element whose direct children include the
            ``thread`` elements.

    Returns:
        dict of thread id -> ``{'link': str, 'blurb': str}``.  The blurb is
        the basename of the link's path with the extension removed.  When a
        thread has no link text the thread id itself is used as the blurb so
        its comments can still be grouped.

    Raises:
        KeyError: if a thread element lacks the id attribute.
    """
    threads = {}
    for thread in root.findall("dflt:thread", ns):
        thread_id = thread.attrib[_DSQ_ID]
        if thread_id in threads:
            # First definition of an id wins.
            continue
        # findtext() yields "" for a missing or empty <link>, so strip() is safe.
        link = thread.findtext("dflt:link", "", ns).strip()
        threads[thread_id] = {
            'link': link,
            'blurb': _blurb_from_link(link) if link else thread_id,
        }
    return threads


def collect_messages(root, threads):
    """Group the text of all non-spam, non-deleted posts by thread blurb.

    Args:
        root: parsed root element whose direct children include the
            ``post`` elements.
        threads: mapping as returned by :func:`collect_threads`.

    Returns:
        dict of blurb -> concatenated messages, each message followed by a
        newline.  A blurb whose posts carry no message text maps to ``""``.

    Raises:
        KeyError: if a post refers to a thread id that is not in *threads*.
    """
    grouped = {}
    for post in root.findall("dflt:post", ns):
        if _flag(post, 'dflt:isSpam') or _flag(post, 'dflt:isDeleted'):
            continue
        thread_ref = post.find('dflt:thread', ns)
        blurb = threads[thread_ref.attrib[_DSQ_ID]]['blurb']
        # Register the blurb even when this post turns out to have no text.
        parts = grouped.setdefault(blurb, [])
        msg = post.findtext('dflt:message', None, ns)
        if msg:
            parts.append(msg)
    # Join once per blurb instead of growing a string inside the loop.
    return {
        blurb: "".join(part + '\n' for part in parts)
        for blurb, parts in grouped.items()
    }


def main(path="comments.xml"):
    """Parse the export at *path* and print each blurb with its comments.

    Blurbs whose collected text is empty or whitespace-only are not printed.
    """
    root = ET.parse(path).getroot()
    threads = collect_threads(root)
    for blurb, text in collect_messages(root, threads).items():
        if text.strip():
            print(blurb + '\n' + text)
    # NOTE(review): an earlier revision ended with a commented-out stub that
    # created a 'comments/' directory; presumably per-post output files were
    # planned.  That is not implemented here -- confirm whether it is wanted.


if __name__ == "__main__":
    main()
|