summaryrefslogtreecommitdiffstats
path: root/parsing_milestoned_xml.rst
diff options
context:
space:
mode:
authorMatěj Cepl <mcepl@cepl.eu>2016-09-07 10:19:19 +0200
committerMatěj Cepl <mcepl@cepl.eu>2016-09-07 10:19:19 +0200
commit4191ffe4dd1c2d175b854b3f99e069b166ff4712 (patch)
tree0c9c8c812c14f9032408aad98566c1bdc94d2451 /parsing_milestoned_xml.rst
parente785d0e1c09d2eca8bbf449efbb240653b24d8d3 (diff)
downloadblog-source-4191ffe4dd1c2d175b854b3f99e069b166ff4712.tar.gz
More posts
Diffstat (limited to 'parsing_milestoned_xml.rst')
-rw-r--r--parsing_milestoned_xml.rst271
1 files changed, 271 insertions, 0 deletions
diff --git a/parsing_milestoned_xml.rst b/parsing_milestoned_xml.rst
new file mode 100644
index 0000000..0242103
--- /dev/null
+++ b/parsing_milestoned_xml.rst
@@ -0,0 +1,271 @@
+Parsing milestoned XML in Python
+================================
+
+:date: 2016-08-25T22:13:29
+:category: computer
+:tags: python, xml, bible, recursive, generator
+
+.. zotero-setup::
+ :style: chicago-author-date
+
+I am trying to write a tool in Python (using Python 3.4, so that I can
+use the latest Python standard library on Windows without needing any
+external libraries) for some manipulation of the source
+code of the Bible texts.
+
+Let me first explain what milestoned XML is, because many
+Python programmers dealing with ordinary XML documents may not be familiar
+with it. There is a problem with using XML markup for documents with
+a complicated structure. One rather complete article on this topic is
+:xcite:`@derose:proceedings`.
+
+Briefly [#]_, the problem in many areas (especially in document
+processing) is that there are multiple possible hierarchies overlapping
+each other (e.g., in Bibles there are divisions of the text which go across
+verse and chapter boundaries and sometimes terminate in the middle of
+a verse; many Bibles, especially English ones, mark Jesus’ sayings with
+a special element, and of course this can span several verses, etc.).
+One way to overcome the obvious problem that XML doesn't allow
+overlapping elements is to use milestones_. So, for example, a book of
+the Bible could be divided not like
+
+.. code-block:: xml
+
+ <book>
+ <chapter>
+ <verse>text</verse>
+ ...
+ </chapter>
+ ...
+ </book>
+
+but by just putting milestones in the text, i.e.:
+
+.. code-block:: xml
+
+ <book>
+ <chapter n="1" />
+ <verse sID="ID1.1" />text of verse 1.1
+ <verse eID="ID1.1" /> ....
+ </book>
+
+So, in my case a part of the document may look like
+
+.. code-block:: xml
+
+ text text
+ <verse/>
+ textB textB <czap> textC textC <verse/> textD textD </czap>
+
+And I would like to get from some kind of iterator this series of
+outputs:
+
+.. code-block:: python
+
+ [(1, 1, "text text", ['text text']),
+ (1, 2, "textB textB textC textC",
+ ['<verse/>', 'textB textB', '<czap>', 'textC textC']),
+ (1, 3, "textD textD", ['<verse/>', 'textD textD', '</czap>'])]
+
+(the first two numbers should be number of the chapter and verse
+respectively).
+
+At its core, my first attempt was this iterator:
+
+.. code-block:: python
+
+ def __iter__(self) -> Tuple[int, int, str]:
+ """
+ iterate through the first level elements
+
+ NOTE: this iterator assumes only all milestoned elements on the first
+ level of depth. If this assumption fails, it might be necessary to
+ rewrite this function (or perhaps ``text`` method) to be recursive.
+ """
+ collected = None
+
+ for child in self.root:
+ if child.tag in ['titulek']:
+ continue
+ if child.tag in ['kap', 'vers']:
+ if collected and collected.strip():
+ yield self.cur_chapter, self.cur_verse, \
+ self._list_to_clean_text(collected)
+ if child.tag == 'kap':
+ self.cur_chapter = int(child.get('n'))
+ elif child.tag == 'vers':
+ self.cur_verse = int(child.get('n'))
+ collected = child.tail or ''
+ else:
+ if collected is not None:
+ if child.text is not None:
+ collected += child.text
+ for sub_child in child:
+ collected += self._recursive_get_text(sub_child)
+ if child.tail is not None:
+ collected += child.tail
+
+(``self.root`` is the result of
+``ElementTree.parse(file_name).getroot()``). The problem with this code
+is described in the note in its docstring: when the ``<verse/>`` element
+is inside of a ``<czap>`` element, it is ignored. So, obviously, we have
+to make our iterator recursive. My first idea was to write this script,
+which parses and regenerates the XML:
+
+.. code-block:: python
+
+ #!/usr/bin/env python3
+ from xml.etree import ElementTree as ET
+ from typing import List
+
+ def start_element(elem: ET.Element) -> str:
+ outx = ['<{} '.format(elem.tag)]
+ for attr, attval in elem.items():
+ outx.append('{}={} '.format(attr, attval))
+ outx.append('>')
+ return ''.join(outx)
+
+
+ def recursive_parse(elem: ET.Element) -> List[str]:
+ col_xml = []
+ col_txt = ''
+ cur_chapter = chap
+
+ if elem.text is None:
+ col_xml.append(ET.tostring(elem))
+ if elem.tail is not None:
+ col_txt += elem.tail
+ else:
+ col_xml.extend([start_element(elem), elem.text])
+ col_txt += elem.text
+ for subch in elem:
+ subch_xml, subch_text = recursive_parse(subch)
+ col_xml.extend(subch_xml)
+ col_txt += subch_text
+ col_xml.append('</{}>'.format(elem.tag))
+
+ if elem.tail is not None:
+ col_xml.append(elem.tail)
+ col_txt += elem.tail
+
+ return col_xml, col_txt
+
+
+ if __name__ == '__main__':
+ # write result XML to CRLF-delimited file with
+ # ET.tostring(ET.fromstringlist(result), encoding='utf8')
+ # or encoding='unicode'? Better for testing?
+ xml_file = ET.parse('tests/data/Mat-old.xml')
+
+ collected_XML, collected_TEXT = recursive_parse(xml_file.getroot())
+ with open('test.xml', 'w', encoding='utf8', newline='\r\n') as outf:
+ print(ET.tostring(ET.fromstringlist(collected_XML),
+ encoding='unicode'), file=outf)
+
+ with open('test.txt', 'w', encoding='utf8', newline='\r\n') as outf:
+ print(collected_TEXT, file=outf)
+
+This works correctly in the sense that the generated file ``test.xml`` is
+identical to the original XML file (after reformatting both files with
+``tidy -i -xml -utf8``). However, it is not an iterator, so I would like to
+somehow combine the virtues of both snippets of code into one.
+Obviously, the problem is that ``return`` in my ideal code should serve
+two purposes. In one case it should actually yield a nicely formatted
+result from the iterator; in the other it should just provide the content
+of the inner elements (or not, if the inner element contains a ``<verse/>``
+element). In my ideal world I would like ``recursive_parse()`` to function
+as an iterator capable of something like this:
+
+.. code-block:: python
+
+ if __name__ == '__main__':
+ xml_file = ET.parse('tests/data/Mat-old.xml')
+ parser = ET.XMLParser(target=ET.TreeBuilder())
+
+ with open('test.txt', 'w', newline='\r\n') as out_txt, \
+ open('test.xml', 'w', newline='\r\n') as out_xml:
+ for ch, v, verse_txt, verse_xml in recursive_parse(xml_file):
+ print(verse_txt, file=out_txt)
+ # or directly parser.feed(verse_xml)
+ # if verse_xml is not a list
+ parser.feed(''.join(verse_xml))
+
+ print(ET.tostring(parser.close(), encoding='unicode'),
+ file=out_xml)
+
+
+So, here is my first attempt to rewrite the iterator (so far without the
+XML part):
+
+.. code-block:: python
+
+ def __iter__(self) -> Tuple[CollectedInfo, str]:
+ """
+ iterate through the first level elements
+ """
+ cur_chapter = 0
+ cur_verse = 0
+ collected_txt = ''
+ # collected XML is NOT directly convertable into Element objects,
+ # it should be treated more like a list of SAX-like events.
+ #
+ # xml.etree.ElementTree.fromstringlist(sequence, parser=None)
+ # Parses an XML document from a sequence of string fragments.
+ # sequence is a list or other sequence containing XML data fragments.
+ # parser is an optional parser instance. If not given, the standard
+ # XMLParser parser is used. Returns an Element instance.
+ #
+ # sequence = ["<html><body>", "text</bo", "dy></html>"]
+ # element = ET.fromstringlist(sequence)
+ # self.assertEqual(ET.tostring(element),
+ # b'<html><body>text</body></html>')
+ # FIXME přidej i sběr XML útržků
+ # collected_xml = None
+
+ for child in self.root.iter():
+ if child.tag in ['titulek']:
+ collected_txt += '\n{}\n'.format(child.text)
+ collected_txt += child.tail or ''
+ if child.tag in ['kap', 'vers']:
+ if collected_txt and collected_txt.strip():
+ yield CollectedInfo(cur_chapter, cur_verse,
+ re.sub(r'[\s\n]+', ' ', collected_txt,
+ flags=re.DOTALL).strip()), \
+ child.tail or ''
+
+ if child.tag == 'kap':
+ cur_chapter = int(child.get('n'))
+ elif child.tag == 'vers':
+ cur_verse = int(child.get('n'))
+ else:
+ collected_txt += child.text or ''
+
+ for sub_child in child:
+ for sub_info, sub_tail in MilestonedElement(sub_child):
+ if sub_info.verse == 0 or sub_info.chap == 0:
+ collected_txt += sub_info.text + sub_tail
+ else:
+ # FIXME what happens if sub_element contains
+ # multiple <verse/> elements?
+ yield CollectedInfo(
+ sub_info.chap, sub_info.verse,
+ collected_txt + sub_info.text), ''
+ collected_txt = sub_tail
+
+ collected_txt += child.tail or ''
+
+ yield CollectedInfo(0, 0, collected_txt), ''
+
+Am I going the right way, or did I still not get it?
+
+
+.. [#] From the discussion_ of the topic on the XSL list.
+
+.. _discussion:
+ http://www.oxygenxml.com/archives/xsl-list/201202/msg00170.html
+
+.. _milestones:
+ https://crosswire.org/wiki/OSIS_Bibles#OSIS_Milestones
+
+
+.. bibliography::