diff options
author | Chris Little <chrislit@crosswire.org> | 2012-08-10 17:09:14 +0000 |
---|---|---|
committer | Chris Little <chrislit@crosswire.org> | 2012-08-10 17:09:14 +0000 |
commit | 0b7afc544455c9cdc31e129ee603bf8f021e8001 (patch) | |
tree | e6a8a146494748d10440eb08a6539119103598c0 /modules | |
parent | 901e20627dce8fe68bd9d6adba3173db06d64d7a (diff) | |
download | sword-tools-0b7afc544455c9cdc31e129ee603bf8f021e8001.tar.gz |
cleaned up spacing in output
fixed output validation errors due to addition of intro tags
git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@375 07627401-56e2-0310-80f4-f8cd0041bdcd
Diffstat (limited to 'modules')
-rwxr-xr-x | modules/python/usfm2osis.py | 120 |
1 files changed, 63 insertions, 57 deletions
diff --git a/modules/python/usfm2osis.py b/modules/python/usfm2osis.py index e18e6da..698292c 100755 --- a/modules/python/usfm2osis.py +++ b/modules/python/usfm2osis.py @@ -35,7 +35,7 @@ scriptVersion = '0.5' # Employ best-practice conformant OSIS # Employ modularity (functions rather than a big long script) # Employ the same command-line syntax as usfm2osis.pl -# Use & abuse Unicode tags (http://unicode.org/charts/PDF/UE0000.pdf) to simplify Regex processing +# Use non-characters for milestoning ### Roadmap: # 0.5 initial commit, including full coverage of core USFM tags @@ -49,8 +49,8 @@ scriptVersion = '0.5' # 1.x SWORD module output?, requiring SWORD bindings ### Key to non-characters: -# Used : -# Unused : +# Used : +# Unused : # book # chapter # verse @@ -67,6 +67,13 @@ scriptVersion = '0.5' # s4 # s5 # notes +# intro-list +# intro-outline +# is1 +# is2 +# is3 +# is4 +# is5 import sys, codecs, re from encodings.aliases import aliases @@ -325,7 +332,7 @@ def convertToOSIS(sFile): # \rem_text... osis = re.sub(r'\\rem\b\s+(.+)', r'<!-- rem - \1 -->', osis) - # \restore: unpublished, seek example + # \restore_text... if relaxedConformance: osis = re.sub(r'\\restore\b\s+(.+)', r'<!-- restore - \1 -->', osis) @@ -359,20 +366,23 @@ def convertToOSIS(sFile): # \imt#_text... osis = re.sub(r'\\imt(\d?)\s+(.+)', lambda m: '<title ' + ('level="'+m.group(1)+'" ' if m.group(1) else '') + 'type="main" subType="x-introduction">' + m.group(2) + '</title>', osis) + # \imte#_text... + osis = re.sub(r'\\imte(\d?)\b\s+(.+)', lambda m: '<title ' + ('level="'+m.group(1)+'" ' if m.group(1) else '') + 'type="main" subType="x-introduction-end">' + m.group(2) + '</title>', osis) + # \is#_text... osis = re.sub(r'\\is1?\s+(.+)', lambda m: u'<div type="section" subType="x-introduction"><title>' + m.group(1) + '</title>', osis) - osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) - osis = re.sub(r'\\is2\s+(.+)', lambda m: u'<div type="subsection" subType="x-introduction"><title>' + m.group(1) + '</title>', osis) - osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) + osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, flags=re.DOTALL) + osis = re.sub(r'\\is2\s+(.+)', lambda m: u'<div type="subSection" subType="x-introduction"><title>' + m.group(1) + '</title>', osis) + osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, flags=re.DOTALL) osis = re.sub(r'\\is3\s+(.+)', lambda m: u'<div type="x-subSubSection" subType="x-introduction"><title>' + m.group(1) + '</title>', osis) - osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) + osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, flags=re.DOTALL) osis = re.sub(r'\\is4\s+(.+)', lambda m: u'<div type="x-subSubSubSection" subType="x-introduction"><title>' + m.group(1) + '</title>', osis) - osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) + osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, flags=re.DOTALL) osis = re.sub(r'\\is5\s+(.+)', lambda m: u'<div type="x-subSubSubSubSection" subType="x-introduction"><title>' + m.group(1) + '</title>', osis) - osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) + osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, flags=re.DOTALL) # \ip_text... - osis = re.sub(r'\\ip\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'<p subType="x-introduction">\n' + m.group(1) + u'</p>\n', osis, flags=re.DOTALL) + osis = re.sub(r'\\ip\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr|io|iq|i?li|iex?|s)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'<p subType="x-introduction">\n' + m.group(1) + u'</p>\n', osis, flags=re.DOTALL) # \ipi_text... # \im_text... @@ -381,11 +391,11 @@ def convertToOSIS(sFile): # \imq_text... # \ipr_text... pType = {'ipi':'x-indented', 'im':'x-noindent', 'imi':'x-noindent-indented', 'ipq':'x-quote', 'imq':'x-noindent-quote', 'ipr':'x-right'} - osis = re.sub(r'\\(ipi|im|ipq|imq|ipr)\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'<p type="' + pType[m.group(1)] + '" subType="x-introduction">\n' + m.group(2) + u'</p>\n', osis, flags=re.DOTALL) + osis = re.sub(r'\\(ipi|im|ipq|imq|ipr)\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr|io|iq|i?li|iex?|s)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'<p type="' + pType[m.group(1)] + '" subType="x-introduction">\n' + m.group(2) + u'</p>\n', osis, flags=re.DOTALL) # \iq#_text... - osis = re.sub(r'\\iq\b\s*(.*?)(?=(['+u''+r']|\\i?q[\d\s]|<l\b|<lb\b|<title\b))', r'<l level="1" subType="x-introduction">\1</l>', osis, flags=re.DOTALL) - osis = re.sub(r'\\iq(\d)\b\s*(.*?)(?=(['+u''+r']|\\i?q[\d\s]|<l\b|<lb\b|<title\b))', r'<l level="\1" subType="x-introduction">\2</l>', osis, flags=re.DOTALL) + osis = re.sub(r'\\iq\b\s*(.*?)(?=(['+u''+r']|\\i?q[\d\s]|\\fig|<l\b|<lb\b|<title\b))', r'<l level="1" subType="x-introduction">\1</l>', osis, flags=re.DOTALL) + osis = re.sub(r'\\iq(\d)\b\s*(.*?)(?=(['+u''+r']|\\i?q[\d\s]|\\fig|<l\b|<lb\b|<title\b))', r'<l level="\1" subType="x-introduction">\2</l>', osis, flags=re.DOTALL) # \ib osis = re.sub(r'\\ib\b\s?', '<lb type="x-p"/>', osis) @@ -394,18 +404,18 @@ def convertToOSIS(sFile): osis = re.sub('(<lg>.+?</lg>)', lambda m: m.group(1).replace('<lb type="x-p"/>', '</lg><lg>'), osis, flags=re.DOTALL) # re-handle \b that occurs within <lg> # \ili#_text... - osis = re.sub(r'\\ili\b\s*(.*?)(?=(['+u''+r']|\\ili[\d\s]|<lb\b|<title\b))', r'<item type="x-indent-1" subType="x-introduction">\1</item>', osis, flags=re.DOTALL) - osis = re.sub(r'\\ili(\d)\b\s*(.*?)(?=(['+u''+r']|\\ili[\d\s]|<lb\b|<title\b))', r'<item type="x-indent-\1" subType="x-introduction">\2</item>', osis, flags=re.DOTALL) + osis = re.sub(r'\\ili\b\s*(.*?)(?=(['+u''+r']|\\ili[\d\s]|<lb\b|<title\b|<item\b))', ur'<item type="x-indent-1" subType="x-introduction">\1</item>', osis, flags=re.DOTALL) + osis = re.sub(r'\\ili(\d)\b\s*(.*?)(?=(['+u''+r']|\\ili[\d\s]|<lb\b|<title\b|<item\b))', ur'<item type="x-indent-\1" subType="x-introduction">\2</item>', osis, flags=re.DOTALL) osis = osis.replace('\n</item>', '</item>\n') - osis = re.sub(u'(<item [^]+</item>)', r'<list>\1</list>', osis, flags=re.DOTALL) + osis = re.sub(u'(<item [^]+</item>)', ur'<list>\1</list>', osis, flags=re.DOTALL) # \iot_text... # \io#_text...(references range) - osis = re.sub(r'\\io\b\s*(.*?)(?=(['+u''+r']|\\io[t\d\s]|<lb\b|<title\b))', r'<item type="x-indent-1" subType="x-introduction">\1</item>', osis, flags=re.DOTALL) - osis = re.sub(r'\\io(\d)\b\s*(.*?)(?=(['+u''+r']|\\io[t\d\s]|<lb\b|<title\b))', r'<item type="x-indent-\1" subType="x-introduction">\2</item>', osis, flags=re.DOTALL) - osis = re.sub(r'\\iot\b\s*(.*?)(?=(['+u''+r']|\\io[t\d\s]|<lb\b|<title\b))', r'<item type="head">\1</item type="head">', osis, flags=re.DOTALL) + osis = re.sub(r'\\io\b\s*(.*?)(?=(['+u''+r']|\\io[t\d\s]|\\iex?|<lb\b|<title\b|<item\b))', ur'<item type="x-indent-1" subType="x-introduction">\1</item>', osis, flags=re.DOTALL) + osis = re.sub(r'\\io(\d)\b\s*(.*?)(?=(['+u''+r']|\\io[t\d\s]|\\iex?|<lb\b|<title\b|<item\b))', ur'<item type="x-indent-\1" subType="x-introduction">\2</item>', osis, flags=re.DOTALL) + osis = re.sub(r'\\iot\b\s*(.*?)(?=(['+u''+r']|\\io[t\d\s]|\\iex?|<lb\b|<title\b|<item\b))', ur'<item type="head">\1</item type="head">', osis, flags=re.DOTALL) osis = osis.replace('\n</item>', '</item>\n') - osis = re.sub(u'(<item [^]+</item>)', r'<div type="outline"><list>\1</list></div>', osis, flags=re.DOTALL) + osis = re.sub(u'(<item [^]+</item>)', ur'<div type="outline"><list>\1</list></div>', osis, flags=re.DOTALL) osis = re.sub('item type="head"', 'head', osis) # \ior_text...\ior* @@ -417,9 +427,6 @@ def convertToOSIS(sFile): # \iqt_text...\iqt* osis = re.sub(r'\\iqt\s+(.+?)\\iqt\*', r'<q subType="x-introduction">\1</q>', osis, flags=re.DOTALL) - # \imte#_text... - osis = re.sub(r'\\imte(\d?)\b\s+(.+)', lambda m: '<title ' + ('level="'+m.group(1)+'" ' if m.group(1) else '') + 'type="main" subType="x-introduction-end">' + m.group(2) + '</title>', osis) - # \ie osis = re.sub(r'\\ie\b\s*', '<milestone type="x-usfm-ie"/>', osis) @@ -433,33 +440,33 @@ def convertToOSIS(sFile): """ # \ms#_text... osis = re.sub(r'\\ms1?\s+(.+)', lambda m: u'<div type="majorSection"><title>' + m.group(1) + '</title>', osis) - osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) + osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, flags=re.DOTALL) osis = re.sub(r'\\ms2\s+(.+)', lambda m: u'<div type="majorSection" n="2"><title>' + m.group(1) + '</title>', osis) - osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) + osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, flags=re.DOTALL) osis = re.sub(r'\\ms3\s+(.+)', lambda m: u'<div type="majorSection" n="3"><title>' + m.group(1) + '</title>', osis) - osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) + osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, flags=re.DOTALL) osis = re.sub(r'\\ms4\s+(.+)', lambda m: u'<div type="majorSection" n="4"><title>' + m.group(1) + '</title>', osis) - osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) + osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, flags=re.DOTALL) osis = re.sub(r'\\ms5\s+(.+)', lambda m: u'<div type="majorSection" n="5"><title>' + m.group(1) + '</title>', osis) - osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) + osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, flags=re.DOTALL) # \mr_text... osis = re.sub(r'\\mr\s+(.+)', u'<title type="scope"><reference>'+r'\1</reference></title>', osis) # \s#_text... osis = re.sub(r'\\s1?\s+(.+)', lambda m: u'<div type="section"><title>' + m.group(1) + '</title>', osis) - osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) + osis = re.sub(u'(<div type="section">[^]+)', r'\1'+u'</div>\n', osis, flags=re.DOTALL) if relaxedConformance: osis = re.sub(r'\\ss\s+', r'\\s2 ', osis) osis = re.sub(r'\\sss\s+', r'\\s3 ', osis) - osis = re.sub(r'\\s2\s+(.+)', lambda m: u'<div type="subsection"><title>' + m.group(1) + '</title>', osis) - osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) + osis = re.sub(r'\\s2\s+(.+)', lambda m: u'<div type="subSection"><title>' + m.group(1) + '</title>', osis) + osis = re.sub(u'(<div type="subSection">[^]+)', r'\1'+u'</div>\n', osis, flags=re.DOTALL) osis = re.sub(r'\\s3\s+(.+)', lambda m: u'<div type="x-subSubSection"><title>' + m.group(1) + '</title>', osis) - osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) + osis = re.sub(u'(<div type="x-subSubSection">[^]+)', r'\1'+u'</div>\n', osis, flags=re.DOTALL) osis = re.sub(r'\\s4\s+(.+)', lambda m: u'<div type="x-subSubSubSection"><title>' + m.group(1) + '</title>', osis) - osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) + osis = re.sub(u'(<div type="x-subSubSubSection">[^]+)', r'\1'+u'</div>\n', osis, flags=re.DOTALL) osis = re.sub(r'\\s5\s+(.+)', lambda m: u'<div type="x-subSubSubSubSection"><title>' + m.group(1) + '</title>', osis) - osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) + osis = re.sub(u'(<div type="x-subSubSubSubSection">[^]+)', r'\1'+u'</div>\n', osis, flags=re.DOTALL) # \sr_text... osis = re.sub(r'\\sr\s+(.+)', ur'<title type="scope"><reference>\1</reference></title>', osis) @@ -564,10 +571,10 @@ def convertToOSIS(sFile): # \li#(_text...) osis = re.sub(r'\\ph\b\s*', r'\\li ', osis) osis = re.sub(r'\\ph(\d)\b\s*', r'\\li\1 ', osis) - osis = re.sub(r'\\li\b\s*(.*?)(?=(['+u''+r']|\\li[\d\s]|<lb\b|<title\b))', r'<item type="x-indent-1">\1</item>', osis, flags=re.DOTALL) - osis = re.sub(r'\\li(\d)\b\s*(.*?)(?=(['+u''+r']|\\li[\d\s]|<lb\b|<title\b))', r'<item type="x-indent-\1">\2</item>', osis, flags=re.DOTALL) + osis = re.sub(r'\\li\b\s*(.*?)(?=(['+u''+r']|\\li[\d\s]|<lb\b|<title\b|<item\b))', r'<item type="x-indent-1">\1</item>', osis, flags=re.DOTALL) + osis = re.sub(r'\\li(\d)\b\s*(.*?)(?=(['+u''+r']|\\li[\d\s]|<lb\b|<title\b|<item\b))', r'<item type="x-indent-\1">\2</item>', osis, flags=re.DOTALL) osis = osis.replace('\n</item>', '</item>\n') - osis = re.sub(u'(<item [^]+</item>)', r'<list>\1</list>', osis, flags=re.DOTALL) + osis = re.sub(u'(<item [^]+</item>)', ur'<list>\1</list>', osis, flags=re.DOTALL) # \b osis = re.sub(r'\\b\b\s?', '<lb type="x-p"/>', osis) @@ -591,14 +598,14 @@ def convertToOSIS(sFile): osis = re.sub(r'\\qs\b\s(.+?)\\qs\*', r'<l type="selah">\1</l>', osis, flags=re.DOTALL) # \q#(_text...) - osis = re.sub(r'\\q\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|<l\b|<lb\b|<title\b))', r'<l level="1">\1</l>', osis, flags=re.DOTALL) - osis = re.sub(r'\\q(\d)\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|<l\b|<lb\b|<title\b))', r'<l level="\1">\2</l>', osis, flags=re.DOTALL) + osis = re.sub(r'\\q\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b))', r'<l level="1">\1</l>', osis, flags=re.DOTALL) + osis = re.sub(r'\\q(\d)\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b))', r'<l level="\1">\2</l>', osis, flags=re.DOTALL) # \qr_text... # \qc_text... # \qm#(_text...) qType = {'qr':'x-right', 'qc':'x-center', 'qm':'x-embedded" level="1', 'qm1':'x-embedded" level="1', 'qm2':'x-embedded" level="2', 'qm3':'x-embedded" level="3', 'qm4':'x-embedded" level="4', 'qm5':'x-embedded" level="5'} - osis = re.sub(r'\\(qr|qc|qm\d)\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|<l\b|<lb\b|<title\b))', lambda m: '<l type="' + qType[m.group(1)] + '">' + m.group(2) + '</l>', osis, flags=re.DOTALL) + osis = re.sub(r'\\(qr|qc|qm\d)\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b))', lambda m: '<l type="' + qType[m.group(1)] + '">' + m.group(2) + '</l>', osis, flags=re.DOTALL) osis = osis.replace('\n</l>', '</l>\n') osis = re.sub(u'(<l [^]+</l>)', r'<lg>\1</lg>', osis, flags=re.DOTALL) @@ -649,7 +656,7 @@ def convertToOSIS(sFile): note = re.sub(r'\\ft\s', '', note) # \fr_##SEP## - note = re.sub(r'\\fr\b\s(.+?)(?=(\\f|'+u'))', u''+r'<reference>\1</reference>', note) + note = re.sub(r'\\fr\b\s(.+?)(?=(\\f|'+u'))', u''+r'<reference type="annotateRef">\1</reference>', note) # \fk_ note = re.sub(r'\\fk\b\s(.+?)(?=(\\f|'+u'))', u''+r'<catchWord>\1</catchWord>', note) @@ -712,21 +719,20 @@ def convertToOSIS(sFile): # \xq_ note = re.sub(r'\\xq\b\s(.+?)(?=(\\x|'+u'))', u''+r'<catchWord>\1</catchWord>', note) - # \xt_ - note = re.sub(r'\\xt\s', '', note) + # \xo_##SEP## + note = re.sub(r'\\xo\b\s(.+?)(?=(\\x|'+u'))', u''+r'<reference type="annotateRef">\1</reference>', note) + # \xk_ + note = re.sub(r'\\xk\b\s(.+?)(?=(\\x|'+u'))', u''+r'<catchWord>\1</catchWord>', note) + + # \xt_ # This isn't guaranteed to be *the* reference, but it's a good guess. + note = re.sub(r'\\xt\b\s(.+?)(?=(\\x|'+u'))', u''+r'<reference>\1</reference>', note) + if relaxedConformance: # TODO: \xtSee..\xtSee*: Concordance and Names Index markup for an alternate entry target reference. # TODO: \xtSeeAlso...\xtSeeAlso: Concordance and Names Index markup for an additional entry target reference. pass - - # \xo_##SEP## - note = re.sub(r'\\xo\b\s(.+?)(?=(\\x|'+u'))', u''+r'<reference>\1</reference>', note) - - # \xk_ - note = re.sub(r'\\xk\b\s(.+?)(?=(\\x|'+u'))', u''+r'<catchWord>\1</catchWord>', note) - if relaxedConformance: note = note.replace(r'\xq*', '') note = note.replace(r'\xt*', '') @@ -743,7 +749,7 @@ def convertToOSIS(sFile): supported: \\x...\\x*, \\xo, \\xk, \\xq, \\xt, \\xot...\\xot*, \\xnt...\\xnt*, \\xdc...\\xdc* """ # \x_+_...\x* - osis = re.sub(r'\\x\s+([^\s]+?)\s+(.+?)\s*\\x\*', lambda m: '<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="crossReference"><reference>' + m.group(2) + u'</reference></note>', osis, flags=re.DOTALL) + osis = re.sub(r'\\x\s+([^\s]+?)\s+(.+?)\s*\\x\*', lambda m: '<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="crossReference">' + m.group(2) + u'</note>', osis, flags=re.DOTALL) osis = re.sub(r'(<note [^>]*?type="crossReference"[^>]*>.*?</note>)', lambda m: processXref(m.group(1)), osis, flags=re.DOTALL) @@ -871,7 +877,7 @@ def convertToOSIS(sFile): if fig_cap: figure += '<caption>' + fig_cap + '</caption>\n' if fig_ref: - figure += '<reference>' + fig_ref + '</reference>\n' + figure += '<reference type="annotateRef">' + fig_ref + '</reference>\n' if fig_desc: figure += '<!-- fig DESC - ' + fig_desc + ' -->\n' if fig_loc: @@ -1019,12 +1025,12 @@ def convertToOSIS(sFile): osis = re.sub(r'(</[^\s>]+) [^>]*>', r'\1>', osis) osis = osis.replace('<lb type="x-p"/>', '<lb/>') # delete Unicode tags - for c in u'': + for c in u'': osis = osis.replace(c, '') - for endBlock in ['p', 'div', 'note', 'l', 'lg', 'chapter', 'verse']: - osis = re.sub(' +</'+endBlock+'>', '</'+endBlock+r'>', osis) - osis = re.sub(' +<'+endBlock+'( eID=[^/>]+/>)', '</'+endBlock+r'\1', osis) + for endBlock in ['p', 'div', 'note', 'l', 'lg', 'chapter', 'verse', 'head', 'title', 'item', 'list']: + osis = re.sub('\s+</'+endBlock+'>', '</'+endBlock+r'>\n', osis) + osis = re.sub('\s+<'+endBlock+'( eID=[^/>]+/>)', '<'+endBlock+r'\1'+'\n', osis) osis = re.sub(' +((</[^>]+>)+) *', r'\1 ', osis) # strip extra spaces & newlines |