diff options
author | Matěj Cepl <mcepl@cepl.eu> | 2024-01-08 17:31:09 +0100 |
---|---|---|
committer | Matěj Cepl <mcepl@cepl.eu> | 2024-01-08 17:31:09 +0100 |
commit | da2e12c20265160a3a684ae92f2f0817f73e3a34 (patch) | |
tree | 2e0965f33c9bcff04b199a98f6c865587439d616 | |
parent | f4fdcb4adb3e346c42f3be1dcb60ce0a151b1b71 (diff) | |
download | disqus_xml2static_rst-da2e12c20265160a3a684ae92f2f0817f73e3a34.tar.gz |
More steps towards parsing the file.
-rw-r--r-- | comments.xml | 2 | ||||
-rw-r--r-- | xml2static_rst.py | 30 |
2 files changed, 28 insertions, 4 deletions
```diff
diff --git a/comments.xml b/comments.xml
index cd257b1..ffe98a9 100644
--- a/comments.xml
+++ b/comments.xml
@@ -8680,7 +8680,7 @@ xsi:schemaLocation="http://disqus.com/api/schemas/1.0/disqus.xsd http://disqus.c
   </message>
   <createdAt>2021-11-18T16:16:33Z</createdAt>
   <isDeleted>false</isDeleted>
-  <isSpam>false</isSpam>
+  <isSpam>true</isSpam>
   <author>
     <name>morgan</name>
     <isAnonymous>false</isAnonymous>
diff --git a/xml2static_rst.py b/xml2static_rst.py
index 9bbe0d6..f5c0d68 100644
--- a/xml2static_rst.py
+++ b/xml2static_rst.py
@@ -1,6 +1,8 @@
 #!/usr/bin/python3
 
-import os.path
+from os.path import basename, exists, normpath, splitext
+from pprint import pprint
+from urllib.parse import urlparse
 from xml.etree import ElementTree as ET
 
 ns = {'dflt': 'http://disqus.com',
@@ -9,9 +11,31 @@ ns = {'dflt': 'http://disqus.com',
 tree = ET.parse("comments.xml")
 root = tree.getroot()
 
+threads = {}
+for thread in root.findall("dflt:thread", ns):
+    thread_id = thread.attrib[f"{{{ns['xsi']}}}id"]
+    thread_link = thread.find("dflt:link", ns).text.strip()
+    if thread_id not in threads:
+        threads[thread_id] = {}
+    threads[thread_id]['link'] = thread_link
+    threads[thread_id]['blurb'] = splitext(basename(normpath(urlparse(thread_link).path)))[0].strip()
+    threads[thread_id]['msgs'] = []
+
+out = {}
 for com_post in root.findall("dflt:post", ns):
     if com_post.find('dflt:isSpam', ns).text == 'false':
-        print(ET.tostring(com_post, encoding='unicode', default_namespace=ns['dflt']))
-
+        thread = com_post.find('dflt:thread', ns)
+        thread_id = thread.attrib[f"{{{ns['xsi']}}}id"]
+        blurb = threads[thread_id]['blurb']
+        if blurb not in out:
+            out[blurb] = ""
+        msg = com_post.find('dflt:message', ns).text
+        if msg is not None:
+            out[blurb] += msg + '\n'
+
+for post in out:
+    if len(out[post].strip()) > 0:
+        print(post + '\n' + out[post])
+
 # if not os.path.exists('comments/'):
 #     os.mkdir('comments')
```