diff options
author | Chris Little <chrislit@crosswire.org> | 2012-08-27 07:48:06 +0000 |
---|---|---|
committer | Chris Little <chrislit@crosswire.org> | 2012-08-27 07:48:06 +0000 |
commit | cc7f674fae7feff6206b2ce03c44562fedeec47c (patch) | |
tree | 4f14ff8f84cf68271e04e2f257bfb7bb68ba705d /modules | |
parent | 50cd7a24860a44e186591d1d6fafffe09f431fee (diff) | |
download | sword-tools-cc7f674fae7feff6206b2ce03c44562fedeec47c.tar.gz |
cleaned up excess spaces
completed Python 3 compatibility implementation (still works with CPython 2 and PyPy, but not Jython, because the script depends on Python 2.6+ features such as multiprocessing)
git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@400 07627401-56e2-0310-80f4-f8cd0041bdcd
Diffstat (limited to 'modules')
-rwxr-xr-x | modules/python/usfm2osis.py | 103 |
1 file changed, 51 insertions, 52 deletions
diff --git a/modules/python/usfm2osis.py b/modules/python/usfm2osis.py index 618c932..ab6e6a8 100755 --- a/modules/python/usfm2osis.py +++ b/modules/python/usfm2osis.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from __future__ import print_function, unicode_literals +#from __future__ import print_function, unicode_literals date = '$Date$' rev = '$Rev$' @@ -20,7 +20,7 @@ scriptVersion = '0.5' # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # The full text of the GNU General Public License is available at: @@ -40,7 +40,7 @@ scriptVersion = '0.5' ### Roadmap: # 0.5 initial commit, including full coverage of core USFM tags -# 0.6 file sorting options (natural/alphabetic/canonical/none); expand sub-verses with ! in osisIDs; Python3 compatability; add optional schema validator (lxml probably); docstrings; unittest; make fully OO; PyDev project? +# 0.6 file sorting options (natural/alphabetic/canonical/none); expand sub-verses with ! in osisIDs; Python3 compatability; add optional schema validator (lxml probably); docstrings; unittest; make fully OO; PyDev project? # 0.7 test suite incorporating all USFM examples from UBS ICAP and other complex cases # 0.8 more clean-up & re-ordering to correctly encapsulate milestones within appropriate containers; clear remaining TODO items, to the extent possible # 1.0 feature complete for release & production use @@ -51,11 +51,11 @@ scriptVersion = '0.5' ### TODO for 0.6: # expand sub-verses with ! in osisIDs -# Python3 compatability # document functions (docstrings) # unittest # make fully OO -# PyDev project? +# PyDev project? 
+# check Python2/3 compatibility ### Key to non-characters: # Used : \uFDD0\uFDD1\uFDD2\uFDD3\uFDD4\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE\uFDDF\uFDE0\uFDE1 @@ -88,7 +88,11 @@ scriptVersion = '0.5' import sys, codecs, re from encodings.aliases import aliases -import multiprocessing, Queue +import multiprocessing +if sys.version_info[0] < 3: + import Queue +else: + import queue as Queue import random date = date.replace('$', '').strip()[6:16] @@ -170,26 +174,26 @@ canonicalOrder = [ # 1) Book representing parts of protocanonical books follow the primary book # 2) Variants follow primary forms # 3) Books that appear in only one tradition or Bible appear following their traditional/attested antecedent - + # There's no fool-proof way to order books without knowing the tradition ahead of time, # but this ordering should get it right often for many common real Bibles. - + # Front Matter - 'FRONT', 'INTRODUCTION', + 'FRONT', 'INTRODUCTION', # OT 'Gen', 'Exod', 'Lev', 'Num', 'Deut', 'Josh', 'JoshA', 'Judg', 'JudgB', 'Ruth', '1Sam', '2Sam', '1Kgs', '2Kgs', '1Chr', '2Chr', 'PrMan', 'Jub', '1En', 'Ezra', 'Neh', 'Tob', 'TobS', 'Jdt', 'Esth', 'EsthGr', 'AddEsth', '1Meq', '2Meq', '3Meq', 'Job', 'Ps', 'AddPs', '5ApocSyrPss', 'PsMet', 'Odes', 'Prov', 'Reproof', 'Eccl', 'Song', - 'Wis', 'Sir', 'WSir', 'PrSol', 'PssSol', - 'Isa', 'Jer', 'Lam', 'PrJer', 'Bar', 'EpJer', '2Bar', 'EpBar', '4Bar', - 'Ezek', 'Dan', 'DanGr', 'DanTh', 'PrAzar', 'Sus', 'SusTh', 'Bel', 'BelTh', + 'Wis', 'Sir', 'WSir', 'PrSol', 'PssSol', + 'Isa', 'Jer', 'Lam', 'PrJer', 'Bar', 'EpJer', '2Bar', 'EpBar', '4Bar', + 'Ezek', 'Dan', 'DanGr', 'DanTh', 'PrAzar', 'Sus', 'SusTh', 'Bel', 'BelTh', 'Hos', 'Joel', 'Amos', 'Obad', 'Jonah', 'Mic', 'Nah', 'Hab', 'Zeph', 'Hag', 'Zech', 'Mal', # Intertestamentals '1Esd', '2Esd', '4Ezra', '5Ezra', '6Ezra', - '1Macc', '2Macc', '3Macc', '4Macc', + '1Macc', '2Macc', '3Macc', '4Macc', # NT 'Matt', 'Mark', 'Luke', 'John', 'Acts', 'Rom', '1Cor', '2Cor', @@ -203,7 
+207,7 @@ canonicalOrder = [ # Private-Use Extensions 'XXA', 'XXB', 'XXC', 'XXD', 'XXE', 'XXF', 'XXG', - + # Back Matter 'BACK', 'CONCORDANCE', 'GLOSSARY', 'INDEX', 'GAZETTEER', 'X-OTHER' @@ -211,12 +215,12 @@ canonicalOrder = [ usfmNumericOrder = [ # Front Matter - 'FRONT', 'INTRODUCTION', + 'FRONT', 'INTRODUCTION', # OT 01-39 'Gen', 'Exod', 'Lev', 'Num', 'Deut', 'Josh', 'Judg', 'Ruth', - '1Sam', '2Sam', '1Kgs', '2Kgs', '1Chr', '2Chr', 'Ezra', 'Neh', - 'Esth', 'Job', 'Ps', 'Prov', 'Eccl', 'Song', 'Isa', 'Jer', + '1Sam', '2Sam', '1Kgs', '2Kgs', '1Chr', '2Chr', 'Ezra', 'Neh', + 'Esth', 'Job', 'Ps', 'Prov', 'Eccl', 'Song', 'Isa', 'Jer', 'Lam', 'Ezek', 'Dan', 'Hos', 'Joel', 'Amos', 'Obad', 'Jonah', 'Mic', 'Nah', 'Hab', 'Zeph', 'Hag', 'Zech', 'Mal', @@ -250,10 +254,10 @@ usfmNumericOrder = [ # Books not currently adopted into USFM, recommended for removal by BFBS 'JoshA', 'JudgB', 'TobS', 'DanTh', 'SusTh', 'BelTh', - + # Private-Use Extensions 'XXA', 'XXB', 'XXC', 'XXD', 'XXE', 'XXF', 'XXG', - + # Back Matter 'BACK', 'CONCORDANCE', 'GLOSSARY', 'INDEX', 'GAZETTEER', 'X-OTHER' @@ -396,7 +400,7 @@ def convertToOsis(sFile): """ # \id_<CODE>_(Name of file, Book name, Language, Last edited, Date etc.) - osis = re.sub(r'\\id\s+([A-Z0-9]{3})\b\s*([^\\'+'\n'+']*?)'+'\n'+r'(.*)(?=\\id|$)', lambda m: '\uFDD0<div type="book" osisID="' + bookDict[m.group(1)] + '">\n' + (('<!-- id comment - ' + m.group(2) + ' -->\n') if m.group(2) else '') + m.group(3) + '</div type="book">\uFDD0\n' , osis, flags=re.DOTALL) + osis = re.sub(r'\\id\s+([A-Z0-9]{3})\b\s*([^\\'+'\n]*?)\n'+r'(.*)(?=\\id|$)', lambda m: '\uFDD0<div type="book" osisID="' + bookDict[m.group(1)] + '">\n' + (('<!-- id comment - ' + m.group(2) + ' -->\n') if m.group(2) else '') + m.group(3) + '</div type="book">\uFDD0\n' , osis, flags=re.DOTALL) # \ide_<ENCODING> osis = re.sub(r'\\ide\b.*'+'\n', '', osis) # delete, since this was handled above @@ -406,7 +410,7 @@ def convertToOsis(sFile): # \rem_text... 
osis = re.sub(r'\\rem\b\s+(.+)', r'<!-- rem - \1 -->', osis) - + # \restore_text... if relaxedConformance: osis = re.sub(r'\\restore\b\s+(.+)', r'<!-- restore - \1 -->', osis) @@ -460,7 +464,7 @@ def convertToOsis(sFile): # \imq_text... # \ipr_text... pType = {'ipi':'x-indented', 'im':'x-noindent', 'imi':'x-noindent-indented', 'ipq':'x-quote', 'imq':'x-noindent-quote', 'ipr':'x-right'} - osis = re.sub(r'\\(ipi|im|ipq|imq|ipr)\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr|io|iq|i?li|iex?|s)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: '\uFDD3<p type="' + pType[m.group(1)] + '" subType="x-introduction">\n' + m.group(2) + '\uFDD3</p>\n', osis, flags=re.DOTALL) + osis = re.sub(r'\\(ipi|im|ipq|imq|ipr)\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr|io|iq|i?li|iex?|s)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: '\uFDD3<p type="' + pType[m.group(1)] + '" subType="x-introduction">\n' + m.group(2) + '\uFDD3</p>\n', osis, flags=re.DOTALL) # \iq#_text... osis = re.sub(r'\\iq\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\i?q[\d\s]|\\fig|<l\b|<lb\b|<title\b))', r'<l level="1" subType="x-introduction">\1</l>', osis, flags=re.DOTALL) @@ -473,24 +477,24 @@ def convertToOsis(sFile): osis = re.sub('(<lg>.+?</lg>)', lambda m: m.group(1).replace('<lb type="x-p"/>', '</lg><lg>'), osis, flags=re.DOTALL) # re-handle \b that occurs within <lg> # \ili#_text... 
- osis = re.sub(r'\\ili\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\ili[\d\s]|<lb\b|<title\b|<item\b))', r'<item type="x-indent-1" subType="x-introduction">\uFDE0\1\uFDE0</item>', osis, flags=re.DOTALL) - osis = re.sub(r'\\ili(\d)\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\ili[\d\s]|<lb\b|<title\b|<item\b))', r'<item type="x-indent-\1" subType="x-introduction">\uFDE0\2\uFDE0</item>', osis, flags=re.DOTALL) + osis = re.sub(r'\\ili\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\ili[\d\s]|<lb\b|<title\b|<item\b))', '<item type="x-indent-1" subType="x-introduction">\uFDE0'+r'\1'+'\uFDE0</item>', osis, flags=re.DOTALL) + osis = re.sub(r'\\ili(\d)\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\ili[\d\s]|<lb\b|<title\b|<item\b))', '<item type="x-indent-\1" subType="x-introduction">\uFDE0'+r'\2'+'\uFDE0</item>', osis, flags=re.DOTALL) osis = osis.replace('\n</item>', '</item>\n') - osis = re.sub('(<item [^\uFDD0\uFDD1\uFDD3\uFDD4]+</item>)', r'\uFDD3<list>\1</list>\uFDD3', osis, flags=re.DOTALL) + osis = re.sub('(<item [^\uFDD0\uFDD1\uFDD3\uFDD4]+</item>)', '\uFDD3<list>'+r'\1'+'</list>\uFDD3', osis, flags=re.DOTALL) # \iot_text... 
# \io#_text...(references range) - osis = re.sub(r'\\io\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\io[t\d\s]|\\iex?|<lb\b|<title\b|<item\b))', r'<item type="x-indent-1" subType="x-introduction">\uFDE1\1\uFDE1</item>', osis, flags=re.DOTALL) - osis = re.sub(r'\\io(\d)\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\io[t\d\s]|\\iex?|<lb\b|<title\b|<item\b))', r'<item type="x-indent-\1" subType="x-introduction">\uFDE1\2\uFDE1</item>', osis, flags=re.DOTALL) - osis = re.sub(r'\\iot\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\io[t\d\s]|\\iex?|<lb\b|<title\b|<item\b))', r'<item type="head">\uFDE1\1\uFDE1</item type="head">', osis, flags=re.DOTALL) + osis = re.sub(r'\\io\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\io[t\d\s]|\\iex?|<lb\b|<title\b|<item\b))', '<item type="x-indent-1" subType="x-introduction">\uFDE1'+r'\1'+'\uFDE1</item>', osis, flags=re.DOTALL) + osis = re.sub(r'\\io(\d)\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\io[t\d\s]|\\iex?|<lb\b|<title\b|<item\b))', '<item type="x-indent-\1" subType="x-introduction">\uFDE1'+r'\2'+'\uFDE1</item>', osis, flags=re.DOTALL) + osis = re.sub(r'\\iot\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\io[t\d\s]|\\iex?|<lb\b|<title\b|<item\b))', '<item type="head">\uFDE1'+r'\1'+'\uFDE1</item type="head">', osis, flags=re.DOTALL) osis = osis.replace('\n</item>', '</item>\n') - osis = re.sub('(<item [^\uFDD0\uFDD1\uFDD3\uFDD4\uFDE0]+</item>)', r'\uFDD3<div type="outline"><list>\1</list></div>\uFDD3', osis, flags=re.DOTALL) + osis = re.sub('(<item [^\uFDD0\uFDD1\uFDD3\uFDD4\uFDE0]+</item>)', '\uFDD3<div type="outline"><list>'+r'\1'+'</list></div>\uFDD3', osis, flags=re.DOTALL) osis = re.sub('item type="head"', 'head', osis) # \ior_text...\ior* osis = re.sub(r'\\ior\b\s+(.+?)\\ior\*', r'<reference>\1</reference>', osis, flags=re.DOTALL) - - # \iex # TODO: look for example; I have no idea what this would look like in context + + # \iex # TODO: look for example; I have no idea what this would look like in 
context osis = re.sub(r'\\iex\b\s*(.+?)'+'?=(\s*(\\c|</div type="book">\uFDD0))', r'<div type="bridge">\1</div>', osis, flags=re.DOTALL) # \iqt_text...\iqt* @@ -538,14 +542,14 @@ def convertToOsis(sFile): osis = re.sub('(\uFDDE<div type="x-subSubSubSubSection">[^\uFDD5\uFDD0\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE]+)', r'\1'+'</div>\uFDDE\n', osis, flags=re.DOTALL) # \sr_text... - osis = re.sub(r'\\sr\s+(.+)', r'\uFDD4<title type="scope"><reference>\1</reference></title>', osis) + osis = re.sub(r'\\sr\s+(.+)', '\uFDD4<title type="scope"><reference>'+r'\1</reference></title>', osis) # \r_text... - osis = re.sub(r'\\r\s+(.+)', r'\uFDD4<title type="parallel"><reference type="parallel">\1</reference></title>', osis) + osis = re.sub(r'\\r\s+(.+)', '\uFDD4<title type="parallel"><reference type="parallel">'+r'\1</reference></title>', osis) # \rq_text...\rq* osis = re.sub(r'\\rq\s+(.+?)\\rq\*', r'<reference type="source">\1</reference>', osis, flags=re.DOTALL) # \d_text... - osis = re.sub(r'\\d\s+(.+)', r'\uFDD4<title canonical="true" type="psalm">\1</title>', osis) + osis = re.sub(r'\\d\s+(.+)', '\uFDD4<title canonical="true" type="psalm">'+r'\1</title>', osis) # \sp_text... osis = re.sub(r'\\sp\s+(.+)', r'<speaker>\1</speaker>', osis) @@ -564,7 +568,7 @@ def convertToOsis(sFile): supported: \c, \ca...\ca*, \cl, \cp, \cd, \v, \va...\va*, \vp...\vp* """ # \c_# - osis = re.sub(r'\\c\s+([^\s]+)\b(.+?)(?=(\\c\s+|</div type="book"))', lambda m: '\uFDD1<chapter osisID="$BOOK$.' + m.group(1) + r'" sID="$BOOK$.' + m.group(1) + '"/>' + m.group(2) + '<chapter eID="$BOOK$.' + m.group(1) + '"/>\uFDD3\n', osis, flags=re.DOTALL) + osis = re.sub(r'\\c\s+([^\s]+)\b(.+?)(?=(\\c\s+|</div type="book"))', lambda m: '\uFDD1<chapter osisID="$BOOK$.' + m.group(1) + r'" sID="$BOOK$.' + m.group(1) + '"/>' + m.group(2) + '<chapter eID="$BOOK$.' 
+ m.group(1) + '"/>\uFDD3\n', osis, flags=re.DOTALL) # \cp_# # \ca_#\ca* @@ -590,7 +594,7 @@ def convertToOsis(sFile): osis = re.sub(r'\\cd\b\s+(.+)', '\uFDD4<title type="x-description">'+r'\1</title>', osis) # \v_# - osis = re.sub(r'\\v\s+([^\s]+)\b\s*(.+?)(?=(\\v\s+|</div type="book"|<chapter eID))', lambda m: '\uFDD2<verse osisID="$BOOK$.$CHAP$.' + m.group(1) + '" sID="$BOOK$.$CHAP$.' + m.group(1) + '"/>' + m.group(2) + '<verse eID="$BOOK$.$CHAP$.' + m.group(1) + '"/>\uFDD2\n', osis, flags=re.DOTALL) + osis = re.sub(r'\\v\s+([^\s]+)\b\s*(.+?)(?=(\\v\s+|</div type="book"|<chapter eID))', lambda m: '\uFDD2<verse osisID="$BOOK$.$CHAP$.' + m.group(1) + '" sID="$BOOK$.$CHAP$.' + m.group(1) + '"/>' + m.group(2) + '<verse eID="$BOOK$.$CHAP$.' + m.group(1) + '"/>\uFDD2\n', osis, flags=re.DOTALL) # \vp_#\vp* # \va_#\va* @@ -626,7 +630,7 @@ def convertToOsis(sFile): # \pmo(_text...) # \pm(_text...) # \pmc(_text...) - # \pmr_text... # deprecated: map to same as \pr + # \pmr_text... # deprecated: map to same as \pr # \pi#(_Sample text...) # \mi(_text...) # \nb @@ -638,7 +642,7 @@ def convertToOsis(sFile): paragraphregex = 'pc|pr|m|pmo|pm|pmc|pmr|pi|pi1|pi2|pi3|pi4|pi5|mi|nb' if relaxedConformance: paragraphregex += '|phi|ps|psi|p1|p2|p3|p4|p5' - osis = re.sub(r'\\('+paragraphregex+r')\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: '\uFDD3<p type="' + pType[m.group(1)] + '">\n' + m.group(2) + '\uFDD3</p>\n', osis, flags=re.DOTALL) + osis = re.sub(r'\\('+paragraphregex+r')\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: '\uFDD3<p type="' + pType[m.group(1)] + '">\n' + m.group(2) + '\uFDD3</p>\n', osis, flags=re.DOTALL) # \cls_text... 
osis = re.sub(r'\\m\s+(.+?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: '\uFDD3<closer>' + m.group(1) + '\uFDD3</closer>\n', osis, flags=re.DOTALL) @@ -650,7 +654,7 @@ def convertToOsis(sFile): osis = re.sub(r'\\li\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\li[\d\s]|<lb\b|<title\b|<item\b))', r'<item type="x-indent-1">\1</item>', osis, flags=re.DOTALL) osis = re.sub(r'\\li(\d)\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\li[\d\s]|<lb\b|<title\b|<item\b))', r'<item type="x-indent-\1">\2</item>', osis, flags=re.DOTALL) osis = osis.replace('\n</item>', '</item>\n') - osis = re.sub('(<item [^\uFDD0\uFDD1\uFDD3\uFDD4\uFDE0\uFDE1]+</item>)', r'\uFDD3<list>\1</list>\uFDD3', osis, flags=re.DOTALL) + osis = re.sub('(<item [^\uFDD0\uFDD1\uFDD3\uFDD4\uFDE0\uFDE1]+</item>)', '\uFDD3<list>'+r'\1'+'</list>\uFDD3', osis, flags=re.DOTALL) # \b osis = re.sub(r'\\b\b\s?', '<lb type="x-p"/>', osis) @@ -789,7 +793,7 @@ def convertToOsis(sFile): # \xt_ # This isn't guaranteed to be *the* reference, but it's a good guess. note = re.sub(r'\\xt\b\s(.+?)(?=(\\x|'+'\uFDDF))', '\uFDDF'+r'<reference>\1</reference>', note) - + if relaxedConformance: # TODO: move this to a concorance/index-specific section? # \xtSee..\xtSee*: Concordance and Names Index markup for an alternate entry target reference. @@ -862,7 +866,7 @@ def convertToOsis(sFile): osis = re.sub(r'\\dc\b\s*(.+?)\\dc\*', r'<transChange type="added" editions="dc">\1</transChange>', osis, flags=re.DOTALL) # \sls_...\sls* - osis = re.sub(r'\\sls\b\s*(.+?)\\sls\*', r'<foreign>/1</foreign>', osis, flags=re.DOTALL) # find a better mapping than <foreign>? + osis = re.sub(r'\\sls\b\s*(.+?)\\sls\*', r'<foreign>/1</foreign>', osis, flags=re.DOTALL) # TODO: find a better mapping than <foreign>? 
if relaxedConformance: # \addpn...\addpn* @@ -873,7 +877,6 @@ def convertToOsis(sFile): osis = re.sub(r'\\k3\s+(.+?)\\k3\*', r'<seg type="keyword" n="3">\1</seg>', osis, flags=re.DOTALL) osis = re.sub(r'\\k4\s+(.+?)\\k4\*', r'<seg type="keyword" n="4">\1</seg>', osis, flags=re.DOTALL) osis = re.sub(r'\\k5\s+(.+?)\\k5\*', r'<seg type="keyword" n="5">\1</seg>', osis, flags=re.DOTALL) - return osis @@ -930,7 +933,7 @@ def convertToOsis(sFile): def makeFigure(matchObject): fig_desc,fig_file,fig_size,fig_loc,fig_copy,fig_cap,fig_ref = matchObject.groups() figure = '<figure' - if fig_file: + if fig_file: figure += ' src="' + fig_file + '"' if fig_size: figure += ' size="' + fig_size + '"' @@ -990,7 +993,7 @@ def convertToOsis(sFile): periph += 'introduction" subType="x-' + introPeripherals[periphType] else: periph += 'x-unknown' - periph += '">\n' + contents + '</div>\n' + periph += '">\n' + contents + '</div>\n' return periph osis = re.sub(r'\\periph\s+([^'+'\n'+r']+)\s*'+'\n'+r'(.+?)(?=(</div type="book">|\\periph\s+))', tagPeriph, osis, flags=re.DOTALL) @@ -1012,7 +1015,7 @@ def convertToOsis(sFile): osis = re.sub(r'(<note [^>]*?type="crossReference"[^>]*>.*?</note>)', lambda m: processXref(m.group(1)), osis, flags=re.DOTALL) # \esb...\esbex # TODO: this likely needs to go much earlier in the process - osis = re.sub(r'\\esb\b\s*(.+?)\\esbe\b\s*', r'\uFDD5<div type="x-sidebar">\1</div>\uFDD5'+'\n', osis, flags=re.DOTALL) + osis = re.sub(r'\\esb\b\s*(.+?)\\esbe\b\s*', '\uFDD5<div type="x-sidebar">'+r'\1'+'</div>\uFDD5\n', osis, flags=re.DOTALL) # \cat_<TAG>\cat* osis = re.sub(r'\\cat\b\s+(.+?)\\cat\*', r'<index index="category" level1="\1"/>', osis) @@ -1036,7 +1039,7 @@ def convertToOsis(sFile): # \z{X}...\z{X}* osis = re.sub(r'\z([^\s]+)\s(.+?)(\z\1\*)', r'<seg type="x-\1">\2</seg>', osis, flags=re.DOTALL) - + # \z{X} osis = re.sub(r'\\z([^\s]+)', r'<milestone type="x-usfm-z-\1"/>', osis) @@ -1063,7 +1066,6 @@ def convertToOsis(sFile): return ' '.join(osisID) osis = 
re.sub(r'\$BOOK\$\.\$CHAP\$\.(\d+(,\d+)+)"', lambda m: expandSeries(m.group(1))+'"', osis) - # fill in book & chapter values bookChunks = osis.split('\uFDD0') osis = '' @@ -1110,7 +1112,6 @@ def convertToOsis(sFile): osis = re.sub(' ?\n\n+', '\n', osis) return osis - ### Processing starts here if encoding: osis = codecs.open(sFile, 'r', encoding).read().strip() + '\n' @@ -1128,7 +1129,6 @@ def convertToOsis(sFile): print(('WARNING: Encoding "' + encoding + '" unknown, processing ' + sFile + ' as UTF-8.')) encoding = 'utf-8' - # call individual conversion processors in series osis = cvtPreprocess(osis, relaxedConformance) osis = cvtRelaxedConformanceRemaps(osis, relaxedConformance) @@ -1154,7 +1154,7 @@ def convertToOsis(sFile): # change type on special books for sb in specialBooks: - osis = osis.replace('<div type="book" osisID="' + sb + '">', '<div type="' + sb.lower() + '">') + osis = osis.replace('<div type="book" osisID="' + sb + '">', '<div type="' + sb.lower() + '">') if DEBUG: localUnhandledTags = set(re.findall(r'(\\[^\s\*]+?\b\*?)', osis)) @@ -1366,7 +1366,6 @@ if __name__ == "__main__": k,v=result_queue.get() osisSegment[k]=v - verbosePrint('Assembling OSIS document...') osisDoc = '<osis xmlns="http://www.bibletechnologies.net/2003/OSIS/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.bibletechnologies.net/2003/OSIS/namespace http://www.bibletechnologies.net/osisCore.'+osisVersion+'.xsd">\n<osisText osisRefWork="Bible" xml:lang="und" osisIDWork="' + osisWork + '">\n<header>\n<work osisWork="' + osisWork + '"/>\n</header>\n' @@ -1389,7 +1388,7 @@ if __name__ == "__main__": except ImportError: verbosePrint('For schema validation, install lxml') except etree.XMLSyntaxError as eVal: - print('XML Validation error: ' + eVal) + print('XML Validation error: ' + str(eVal)) osisFile = codecs.open(osisFileName, 'w', 'utf-8') osisFile.write('<?xml version="1.0" encoding="UTF-8"?>\n') |