diff options
author | Matěj Cepl <mcepl@cepl.eu> | 2016-09-07 10:19:19 +0200 |
---|---|---|
committer | Matěj Cepl <mcepl@cepl.eu> | 2016-09-07 10:19:19 +0200 |
commit | 4191ffe4dd1c2d175b854b3f99e069b166ff4712 (patch) | |
tree | 0c9c8c812c14f9032408aad98566c1bdc94d2451 /parsing_milestoned_xml.rst | |
parent | e785d0e1c09d2eca8bbf449efbb240653b24d8d3 (diff) | |
download | blog-source-4191ffe4dd1c2d175b854b3f99e069b166ff4712.tar.gz |
More posts
Diffstat (limited to 'parsing_milestoned_xml.rst')
-rw-r--r-- | parsing_milestoned_xml.rst | 271 |
1 files changed, 271 insertions, 0 deletions
diff --git a/parsing_milestoned_xml.rst b/parsing_milestoned_xml.rst new file mode 100644 index 0000000..0242103 --- /dev/null +++ b/parsing_milestoned_xml.rst @@ -0,0 +1,271 @@ +Parsing milestoned XML in Python +================================ + +:date: 2016-08-25T22:13:29 +:category: computer +:tags: python, xml, bible, recursive, generator + +.. zotero-setup:: + :style: chicago-author-date + +I am trying to write a tool in Python (using Python 3.4 to be able to +use the latest Python standard library on Windows without using any +external libraries on Windows) for some manipulation with the source +code for the Bible texts. + +Let me first explain what milestoned XML is, because many normal +Python programmers dealing with normal XML documents may not be familiar +with it. There is a problem with using XML markup for documents with +complicated structure. One rather complete article on this topic is +:xcite:`@derose:proceedings`. + +Briefly [#]_ , the problem in many areas (especially in document +processing) is with multiple possible hierarchies overlapping each other +(e.g., in Bibles there are divisions of text which are going across +verse and chapter boundaries and sometimes terminating in the middle of +verse, many, especially English, Bibles mark Jesus’ sayings with +a special element, and of course this can go over several verses etc.). +One of the ways to overcome the obvious problem that XML doesn't allow +overlapping elements is to use milestones_. So for example the book of +Bible could be divided not like + +.. code-block:: xml + + <book> + <chapter> + <verse>text</verse> + ... + </chapter> + ... + </book> + +but just putting milestones in the text, i.e.: + +.. code-block:: xml + + <book> + <chapter n="1" /> + <verse sID="ID1.1" />text of verse 1.1 + <verse eID="ID1.1" /> .... + </book> + +So, in my case the part of the document may look like + +.. 
code-block:: xml + + text text + <verse/> + textB textB <czap> textC textC <verse/> textD textD </czap> + +And I would like to get from some kind of iterator this series of +outputs: + +.. code-block:: python + + [(1, 1, "text text", ['text text']), + (1, 2, "textB textB textC textC", + ['<verse/>', 'textB textB', '<czap>', 'textC textC']), + (1, 3, "textD textD", ['<verse/>', 'textD textD', '</czap>'])] + +(the first two numbers should be number of the chapter and verse +respectively). + +My first attempt was in its core this iterator: + +.. code-block:: python + + def __iter__(self) -> Tuple[int, int, str]: + """ + iterate through the first level elements + + NOTE: this iterator assumes only all milestoned elements on the first + level of depth. If this assumption fails, it might be necessary to + rewrite this function (or perhaps ``text`` method) to be recursive. + """ + collected = None + + for child in self.root: + if child.tag in ['titulek']: + continue + if child.tag in ['kap', 'vers']: + if collected and collected.strip(): + yield self.cur_chapter, self.cur_verse, \ + self._list_to_clean_text(collected) + if child.tag == 'kap': + self.cur_chapter = int(child.get('n')) + elif child.tag == 'vers': + self.cur_verse = int(child.get('n')) + collected = child.tail or '' + else: + if collected is not None: + if child.text is not None: + collected += child.text + for sub_child in child: + collected += self._recursive_get_text(sub_child) + if child.tail is not None: + collected += child.tail + +(``self.root`` is a product of +``ElementTree.parse(file_name).getroot()``). The problem of this code +lies in the note. When the ``<verse/>`` element is inside of ``<czap>`` +one, it is ignored. So, obviously we have to make our iterator +recursive. My first idea was to make this script parsing and +regenerating XML: + +.. 
code-block:: python + + #!/usr/bin/env python3 + from xml.etree import ElementTree as ET + from typing import List + + def start_element(elem: ET.Element) -> str: + outx = ['<{} '.format(elem.tag)] + for attr, attval in elem.items(): + outx.append('{}={} '.format(attr, attval)) + outx.append('>') + return ''.join(outx) + + + def recursive_parse(elem: ET.Element) -> List[str]: + col_xml = [] + col_txt = '' + cur_chapter = chap + + if elem.text is None: + col_xml.append(ET.tostring(elem)) + if elem.tail is not None: + col_txt += elem.tail + else: + col_xml.extend([start_element(elem), elem.text]) + col_txt += elem.text + for subch in elem: + subch_xml, subch_text = recursive_parse(subch) + col_xml.extend(subch_xml) + col_txt += subch_text + col_xml.append('</{}>'.format(elem.tag)) + + if elem.tail is not None: + col_xml.append(elem.tail) + col_txt += elem.tail + + return col_xml, col_txt + + + if __name__ == '__main__': + # write result XML to CRLF-delimited file with + # ET.tostring(ET.fromstringlist(result), encoding='utf8') + # or encoding='unicode'? Better for testing? + xml_file = ET.parse('tests/data/Mat-old.xml') + + collected_XML, collected_TEXT = recursive_parse(xml_file.getroot()) + with open('test.xml', 'w', encoding='utf8', newline='\r\n') as outf: + print(ET.tostring(ET.fromstringlist(collected_XML), + encoding='unicode'), file=outf) + + with open('test.txt', 'w', encoding='utf8', newline='\r\n') as outf: + print(collected_TEXT, file=outf) + +This works correctly in sense that the generated file ``test.xml`` is +identical to the original XML file (after reformatting both files with +``tidy -i -xml -utf8``). However, it is not iterator, so I would like to +somehow combine the virtues of both snippets of code into one. +Obviously, the problem is that ``return`` in my ideal code should serve +two purposes. 
Once it should actually yield nicely formatted result from +the iterator, second time it should just provide content of the inner +elements (or not, if the inner element contains ``<verse/>`` element). +In my ideal world I would like to get ``recursive_parse()`` to function +as an iterator capable of something like this: + +.. code-block:: python + + if __name__ == '__main__': + xml_file = ET.parse('tests/data/Mat-old.xml') + parser = ET.XMLParser(target=ET.TreeBuilder()) + + with open('test.txt', 'w', newline='\r\n') as out_txt, \ + open('test.xml', 'w', newline='\r\n') as out_xml: + for ch, v, verse_txt, verse_xml in recursive_parse(xml_file): + print(verse_txt, file=out_txt) + # or directly parser.feed(verse_xml) + # if verse_xml is not a list + parser.feed(''.join(verse_xml)) + + print(ET.tostring(parser.close(), encoding='unicode'), + file=out_xml) + + +So, my first attempt to rewrite the iterator (so far without the XML +part I have): + +.. code-block:: python + + def __iter__(self) -> Tuple[CollectedInfo, str]: + """ + iterate through the first level elements + """ + cur_chapter = 0 + cur_verse = 0 + collected_txt = '' + # collected XML is NOT directly convertable into Element objects, + # it should be treated more like a list of SAX-like events. + # + # xml.etree.ElementTree.fromstringlist(sequence, parser=None) + # Parses an XML document from a sequence of string fragments. + # sequence is a list or other sequence containing XML data fragments. + # parser is an optional parser instance. If not given, the standard + # XMLParser parser is used. Returns an Element instance. 
+ # + # sequence = ["<html><body>", "text</bo", "dy></html>"] + # element = ET.fromstringlist(sequence) + # self.assertEqual(ET.tostring(element), + # b'<html><body>text</body></html>') + # FIXME přidej i sběr XML útržků + # collected_xml = None + + for child in self.root.iter(): + if child.tag in ['titulek']: + collected_txt += '\n{}\n'.format(child.text) + collected_txt += child.tail or '' + if child.tag in ['kap', 'vers']: + if collected_txt and collected_txt.strip(): + yield CollectedInfo(cur_chapter, cur_verse, + re.sub(r'[\s\n]+', ' ', collected_txt, + flags=re.DOTALL).strip()), \ + child.tail or '' + + if child.tag == 'kap': + cur_chapter = int(child.get('n')) + elif child.tag == 'vers': + cur_verse = int(child.get('n')) + else: + collected_txt += child.text or '' + + for sub_child in child: + for sub_info, sub_tail in MilestonedElement(sub_child): + if sub_info.verse == 0 or sub_info.chap == 0: + collected_txt += sub_info.text + sub_tail + else: + # FIXME what happens if sub_element contains + # multiple <verse/> elements? + yield CollectedInfo( + sub_info.chap, sub_info.verse, + collected_txt + sub_info.text), '' + collected_txt = sub_tail + + collected_txt += child.tail or '' + + yield CollectedInfo(0, 0, collected_txt), '' + +Am I going the right way, or did I still not get it? + + +.. [#] From the discussion_ of the topic on the XSL list. + +.. _discussion: + http://www.oxygenxml.com/archives/xsl-list/201202/msg00170.html + +.. _milestones: + https://crosswire.org/wiki/OSIS_Bibles#OSIS_Milestones + + +.. bibliography:: |