diff options
author | Chris Little <chrislit@crosswire.org> | 2012-08-06 16:24:47 +0000 |
---|---|---|
committer | Chris Little <chrislit@crosswire.org> | 2012-08-06 16:24:47 +0000 |
commit | b00d48669293724fb1411a72922f22be87b79114 (patch) | |
tree | bc1631be40d21f28df15be935b9fcb964dd58c18 /modules | |
parent | fe24aabbdcf33428e21727247b935422e175a18f (diff) | |
download | sword-tools-b00d48669293724fb1411a72922f22be87b79114.tar.gz |
eliminated use of Plane 14 Language Tags, which required UCS-4 builds of Python, in favor of the U+FDD0-U+FDEF non-characters
git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@368 07627401-56e2-0310-80f4-f8cd0041bdcd
Diffstat (limited to 'modules')
-rwxr-xr-x | modules/python/usfm2osis.py | 152 |
1 files changed, 90 insertions, 62 deletions
diff --git a/modules/python/usfm2osis.py b/modules/python/usfm2osis.py index 36390a8..228603d 100755 --- a/modules/python/usfm2osis.py +++ b/modules/python/usfm2osis.py @@ -48,6 +48,25 @@ scriptVersion = '0.5' # 1.x IMP output? # 1.x SWORD module output?, requiring SWORD bindings +### Key to non-characters: +# Used : +# Unused : +# book +# chapter +# verse +# paragraph +# title +# ms1 +# ms2 +# ms3 +# ms4 +# ms5 +# s1 +# s2 +# s3 +# s4 +# s5 +# notes import sys, codecs, re from encodings.aliases import aliases @@ -145,6 +164,7 @@ introPeripherals = { osis2locBk = dict() loc2osisBk = dict() verbose = bool() +ucs4 = (sys.maxunicode > 0xFFFF) """ BEGIN PSF-licensed segment @@ -206,7 +226,7 @@ def convertToOSIS(sFile): """ global loc2osisBk, osis2locBk # \id_<CODE>_(Name of file, Book name, Language, Last edited, Date etc.) ###TESTED### - osis = re.sub(r'\\id\s+([A-Z0-9]{3})\b\s*([^\\\n]*?)\n(.*)(?=\\id|$)', lambda m: u'<div type="book" osisID="' + bookDict[m.group(1)] + '">\n' + (('<!-- id comment - ' + m.group(2) + ' -->\n') if m.group(2) else '') + m.group(3) + u'</div type="book">\n' , osis, flags=re.DOTALL) + osis = re.sub(r'\\id\s+([A-Z0-9]{3})\b\s*([^\\\n]*?)\n(.*)(?=\\id|$)', lambda m: u'<div type="book" osisID="' + bookDict[m.group(1)] + '">\n' + (('<!-- id comment - ' + m.group(2) + ' -->\n') if m.group(2) else '') + m.group(3) + u'</div type="book">\n' , osis, flags=re.DOTALL) # keep a copy of the OSIS book abbreviation for below (\toc3 processing) to store for mapping localized book names to/from OSIS osisBook = re.search(r'\\id\s+([A-Z0-9]{3})', osis) if osisBook: @@ -281,36 +301,44 @@ def convertToOSIS(sFile): supported: \mt#, \mte#, \ms#, \mr, \s#, \sr, \r, \rq...\rq*, \d, \sp """ # \ms#_text... 
###TESTED### ##NB: supports only \ms1 to \ms3 - osis = re.sub(r'\\ms1?\s+(.+)', lambda m: u'<div type="majorSection"><title>' + m.group(1) + '</title>', osis) - osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) - osis = re.sub(r'\\ms2\s+(.+)', lambda m: u'<div type="majorSection" n="2"><title>' + m.group(1) + '</title>', osis) - osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) - osis = re.sub(r'\\ms3\s+(.+)', lambda m: u'<div type="majorSection" n="3"><title>' + m.group(1) + '</title>', osis) - osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) + osis = re.sub(r'\\ms1?\s+(.+)', lambda m: u'<div type="majorSection"><title>' + m.group(1) + '</title>', osis) + osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) + osis = re.sub(r'\\ms2\s+(.+)', lambda m: u'<div type="majorSection" n="2"><title>' + m.group(1) + '</title>', osis) + osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) + osis = re.sub(r'\\ms3\s+(.+)', lambda m: u'<div type="majorSection" n="3"><title>' + m.group(1) + '</title>', osis) + osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) + osis = re.sub(r'\\ms4\s+(.+)', lambda m: u'<div type="majorSection" n="4"><title>' + m.group(1) + '</title>', osis) + osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) + osis = re.sub(r'\\ms5\s+(.+)', lambda m: u'<div type="majorSection" n="5"><title>' + m.group(1) + '</title>', osis) + osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) # \mr_text... - osis = re.sub(r'\\mr\s+(.+)', u'<title type="scope"><reference>'+r'\1</reference></title>', osis) + osis = re.sub(r'\\mr\s+(.+)', u'<title type="scope"><reference>'+r'\1</reference></title>', osis) # \s#_text... 
###TESTED### ##NB: supports only \s1 to \s3 - osis = re.sub(r'\\s1?\s+(.+)', lambda m: u'<div type="section"><title>' + m.group(1) + '</title>', osis) - osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) + osis = re.sub(r'\\s1?\s+(.+)', lambda m: u'<div type="section"><title>' + m.group(1) + '</title>', osis) + osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) if relaxedConformance: osis = re.sub(r'\\ss\s+', r'\\s2 ', osis) osis = re.sub(r'\\sss\s+', r'\\s3 ', osis) - osis = re.sub(r'\\s2\s+(.+)', lambda m: u'<div type="subsection"><title>' + m.group(1) + '</title>', osis) - osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) - osis = re.sub(r'\\s3\s+(.+)', lambda m: u'<div type="x-subSubSection"><title>' + m.group(1) + '</title>', osis) - osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) + osis = re.sub(r'\\s2\s+(.+)', lambda m: u'<div type="subsection"><title>' + m.group(1) + '</title>', osis) + osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) + osis = re.sub(r'\\s3\s+(.+)', lambda m: u'<div type="x-subSubSection"><title>' + m.group(1) + '</title>', osis) + osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) + osis = re.sub(r'\\s4\s+(.+)', lambda m: u'<div type="x-subSubSubSection"><title>' + m.group(1) + '</title>', osis) + osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) + osis = re.sub(r'\\s5\s+(.+)', lambda m: u'<div type="x-subSubSubSubSection"><title>' + m.group(1) + '</title>', osis) + osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL) # \sr_text... - osis = re.sub(r'\\sr\s+(.+)', u'<title type="scope"><reference>'+r'\1</reference></title>', osis) + osis = re.sub(r'\\sr\s+(.+)', u'<title type="scope"><reference>'+r'\1</reference></title>', osis) # \r_text... 
- osis = re.sub(r'\\r\s+(.+)', u'<title type="parallel"><reference type="parallel">'+r'\1</reference></title>', osis) + osis = re.sub(r'\\r\s+(.+)', u'<title type="parallel"><reference type="parallel">'+r'\1</reference></title>', osis) # \rq_text...\rq* osis = re.sub(r'\\rq\s+(.+?)\\rq\*', u'<reference type="source">'+r'\1</reference>', osis, flags=re.DOTALL) # \d_text... ###TESTED### - osis = re.sub(r'\\d\s+(.+)', u'<title canonical="true" type="psalm">'+r'\1</title>', osis) + osis = re.sub(r'\\d\s+(.+)', u'<title canonical="true" type="psalm">'+r'\1</title>', osis) # \sp_text... ###TESTED### osis = re.sub(r'\\sp\s+(.+)', r'<speaker>\1</speaker>', osis) @@ -329,7 +357,7 @@ def convertToOSIS(sFile): supported: \c, \ca...\ca*, \cl, \cp, \cd, \v, \va...\va*, \vp...\vp* """ # \c_# ###TESTED### - osis = re.sub(r'\\c\s+([^\s]+)\b(.+?)(?=(\\c\s+|</div type="book"))', lambda m: u'<chapter osisID="$BOOK$.' + m.group(1) + r'" sID="$BOOK$.' + m.group(1) + '"/>' + m.group(2) + u'<chapter eID="$BOOK$.' + m.group(1) + u'"/>\n', osis, flags=re.DOTALL) + osis = re.sub(r'\\c\s+([^\s]+)\b(.+?)(?=(\\c\s+|</div type="book"))', lambda m: u'<chapter osisID="$BOOK$.' + m.group(1) + r'" sID="$BOOK$.' + m.group(1) + '"/>' + m.group(2) + u'<chapter eID="$BOOK$.' + m.group(1) + u'"/>\n', osis, flags=re.DOTALL) # \cp_# # \ca_#\ca* @@ -349,13 +377,13 @@ def convertToOSIS(sFile): osis = re.sub(r'(<chapter [^<]+sID[^<]+/>.+?<chapter eID[^>]+/>)', replaceChapterNumber, osis, flags=re.DOTALL) # \cl_ - osis = re.sub(r'\\cl\s+(.+)', u'<title>'+r'\1</title>', osis) + osis = re.sub(r'\\cl\s+(.+)', u'<title>'+r'\1</title>', osis) # \cd_# <--This # seems to be an error - osis = re.sub(r'\\cd\b\s+(.+)', u'<title type="x-description">'+r'\1</title>', osis) + osis = re.sub(r'\\cd\b\s+(.+)', u'<title type="x-description">'+r'\1</title>', osis) # \v_# ###TESTED### - osis = re.sub(r'\\v\s+([^\s]+)\b\s*(.+?)(?=(\\v\s+|</div type="book"|<chapter eID))', lambda m: u'<verse osisID="$BOOK$.$CHAP$.' 
+ m.group(1) + r'" sID="$BOOK$.$CHAP$.' + m.group(1) + r'"/>' + m.group(2) + r'<verse eID="$BOOK$.$CHAP$.' + m.group(1) + u'"/>\n', osis, flags=re.DOTALL) + osis = re.sub(r'\\v\s+([^\s]+)\b\s*(.+?)(?=(\\v\s+|</div type="book"|<chapter eID))', lambda m: u'<verse osisID="$BOOK$.$CHAP$.' + m.group(1) + r'" sID="$BOOK$.$CHAP$.' + m.group(1) + r'"/>' + m.group(2) + r'<verse eID="$BOOK$.$CHAP$.' + m.group(1) + u'"/>\n', osis, flags=re.DOTALL) # \vp_#\vp* # \va_#\va* @@ -383,7 +411,7 @@ def convertToOSIS(sFile): supported: \p, \m, \pmo, \pm, \pmc, \pmr, \pi#, \mi, \nb, \cls, \li#, \pc, \pr, \ph#, \b """ # \p(_text...) ###TESTED### - osis = re.sub(r'\\p\s+(.*?)(?=(\\(m|p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'<p>\n' + m.group(1) + u'</p>\n', osis, flags=re.DOTALL) + osis = re.sub(r'\\p\s+(.*?)(?=(\\(m|p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'<p>\n' + m.group(1) + u'</p>\n', osis, flags=re.DOTALL) # \pc(_text...) # \pr(_text...) @@ -396,19 +424,19 @@ def convertToOSIS(sFile): # \mi(_text...) # \nb ###TESTED### pType = {'pc':'x-center', 'pr':'x-right', 'm':'x-noindent', 'pmo':'x-embedded-opening', 'pm':'x-embedded', 'pmc':'x-embedded-closing', 'pmr':'x-right', 'pi':'x-indented-1', 'pi1':'x-indented-1', 'pi2':'x-indented-2', 'pi3':'x-indented-3', 'pi4':'x-indented-4', 'pi5':'x-indented-5', 'mi':'x-noindent-indented', 'nb':'x-nobreak'} - osis = re.sub(r'\\(pc|pr|m|pmo|pm|pmc|pmr|pi|pi1|pi2|pi3|pi4|pi5|mi|nb)\s+(.*?)(?=(\\(m|p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'<p type="' + pType[m.group(1)] + '">\n' + m.group(2) + u'</p>\n', osis, flags=re.DOTALL) + osis = re.sub(r'\\(pc|pr|m|pmo|pm|pmc|pmr|pi|pi1|pi2|pi3|pi4|pi5|mi|nb)\s+(.*?)(?=(\\(m|p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'<p type="' + pType[m.group(1)] + '">\n' + m.group(2) + u'</p>\n', osis, flags=re.DOTALL) # \cls_text... 
- osis = re.sub(r'\\m\s+(.+?)(?=(\\(m|p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'<closer>' + m.group(1) + u'</closer>\n', osis, flags=re.DOTALL) + osis = re.sub(r'\\m\s+(.+?)(?=(\\(m|p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'<closer>' + m.group(1) + u'</closer>\n', osis, flags=re.DOTALL) # \ph#(_text...) # \li#(_text...) ###TESTED### osis = re.sub(r'\\ph\b\s*', r'\\li ', osis) osis = re.sub(r'\\ph(\d+)\b\s*', r'\\li\1 ', osis) - osis = re.sub(r'\\li\b\s*(.*?)(?=(['+u''+r']|\\li[\d\s]|<lb\b|<title\b))', r'<item type="x-indent-1">\1</item>', osis, flags=re.DOTALL) - osis = re.sub(r'\\li(\d+)\b\s*(.*?)(?=(['+u''+r']|\\li[\d\s]|<lb\b|<title\b))', r'<item type="x-indent-\1">\2</item>', osis, flags=re.DOTALL) + osis = re.sub(r'\\li\b\s*(.*?)(?=(['+u''+r']|\\li[\d\s]|<lb\b|<title\b))', r'<item type="x-indent-1">\1</item>', osis, flags=re.DOTALL) + osis = re.sub(r'\\li(\d+)\b\s*(.*?)(?=(['+u''+r']|\\li[\d\s]|<lb\b|<title\b))', r'<item type="x-indent-\1">\2</item>', osis, flags=re.DOTALL) osis = osis.replace('\n</item>', '</item>\n') - osis = re.sub(u'(<item [^]+</item>)', r'<list>\1</list>', osis, flags=re.DOTALL) + osis = re.sub(u'(<item [^]+</item>)', r'<list>\1</list>', osis, flags=re.DOTALL) # \b ###TESTED### osis = re.sub(r'\\b\b\s?', r'<lb type="p"/>', osis) @@ -425,23 +453,23 @@ def convertToOSIS(sFile): osis = re.sub(r'\\qs\b\s(.+?)\\qs\*', r'<l type="selah">\1</l>', osis, flags=re.DOTALL) # \q#(_text...) 
###TESTED### - osis = re.sub(r'\\q\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|<l\b|<lb\b|<title\b))', r'<l level="1">\1</l>', osis, flags=re.DOTALL) - osis = re.sub(r'\\q(\d+)\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|<l\b|<lb\b|<title\b))', r'<l level="\1">\2</l>', osis, flags=re.DOTALL) + osis = re.sub(r'\\q\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|<l\b|<lb\b|<title\b))', r'<l level="1">\1</l>', osis, flags=re.DOTALL) + osis = re.sub(r'\\q(\d+)\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|<l\b|<lb\b|<title\b))', r'<l level="\1">\2</l>', osis, flags=re.DOTALL) # \qr_text... # \qc_text... # \qm#(_text...) qType = {'qr':'x-right', 'qc':'x-center', 'qm':'x-embedded" level="1', 'qm1':'x-embedded" level="1', 'qm2':'x-embedded" level="2', 'qm3':'x-embedded" level="3', 'qm4':'x-embedded" level="4', 'qm5':'x-embedded" level="5'} - osis = re.sub(r'\\(qr|qc|qm\d+)\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|<l\b|<lb\b|<title\b))', lambda m: r'<l type="' + qType[m.group(1)] + '">' + m.group(2) + '</l>', osis, flags=re.DOTALL) + osis = re.sub(r'\\(qr|qc|qm\d+)\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|<l\b|<lb\b|<title\b))', lambda m: r'<l type="' + qType[m.group(1)] + '">' + m.group(2) + '</l>', osis, flags=re.DOTALL) osis = osis.replace('\n</l>', '</l>\n') - osis = re.sub(u'(<l [^]+</l>)', r'<lg>\1</lg>', osis, flags=re.DOTALL) + osis = re.sub(u'(<l [^]+</l>)', r'<lg>\1</lg>', osis, flags=re.DOTALL) # \b ###TESTED### osis = re.sub('(<lg>.+?</lg>)', lambda m: m.group(1).replace(r'<lb type="p"/>', r'</lg><lg>'), osis, flags=re.DOTALL) # re-handle \b that occurs within <lg> # \qa_text... 
- osis = re.sub(r'\\qa\s+(.+)', u'<title type="acrostic">'+r'\1</title>', osis) + osis = re.sub(r'\\qa\s+(.+)', u'<title type="acrostic">'+r'\1</title>', osis) # \qac_text...\qac* osis = re.sub(r'\\qac\s+(.+?)\\qac\*', r'<hi type="acrostic">\1</hi>', osis, flags=re.DOTALL) @@ -455,7 +483,7 @@ def convertToOSIS(sFile): supported: \tr, \th#, \thr#, \tc#, \tcr# """ # \tr_ - osis = re.sub(r'\\tr\b\s*(.*?)(?=(['+u''+r']|\\tr\s|<lb\b|<title\b))', r'<row>\1</row>', osis, flags=re.DOTALL) + osis = re.sub(r'\\tr\b\s*(.*?)(?=(['+u''+r']|\\tr\s|<lb\b|<title\b))', r'<row>\1</row>', osis, flags=re.DOTALL) # \th#_text... # \thr#_text... @@ -474,36 +502,36 @@ def convertToOSIS(sFile): note = re.sub(r'\\fdc\b\s(.+?)\\fdc\b\*', r'<seg editions="dc">\1</seg>', note) # \fq_ ###TESTED### - note = re.sub(r'\\fq\b\s(.+?)(?=(\\f|'+u'))', u''+r'<catchWord>\1</catchWord>', note) + note = re.sub(r'\\fq\b\s(.+?)(?=(\\f|'+u'))', u''+r'<catchWord>\1</catchWord>', note) # \fqa_ ###TESTED### - note = re.sub(r'\\fqa\b\s(.+?)(?=(\\f|'+u'))', u''+r'<rdg type="alternate">\1</rdg>', note) + note = re.sub(r'\\fqa\b\s(.+?)(?=(\\f|'+u'))', u''+r'<rdg type="alternate">\1</rdg>', note) # \ft_ ###TESTED### note = re.sub(r'\\ft\s', r'', note) # \fr_##SEP## - note = re.sub(r'\\fr\b\s(.+?)(?=(\\f|'+u'))', u''+r'<reference>\1</reference>', note) + note = re.sub(r'\\fr\b\s(.+?)(?=(\\f|'+u'))', u''+r'<reference>\1</reference>', note) # \fk_ - note = re.sub(r'\\fk\b\s(.+?)(?=(\\f|'+u'))', u''+r'<catchWord>\1</catchWord>', note) + note = re.sub(r'\\fk\b\s(.+?)(?=(\\f|'+u'))', u''+r'<catchWord>\1</catchWord>', note) # \fl_ - note = re.sub(r'\\fl\b\s(.+?)(?=(\\f|'+u'))', u''+r'<label>\1</label>', note) + note = re.sub(r'\\fl\b\s(.+?)(?=(\\f|'+u'))', u''+r'<label>\1</label>', note) # \fp_ note = re.sub(r'\\fp\b\s(.+?)(?=(\\fp|$))', r'<p>\1</p>', note) note = re.sub(r'(<note\b[^>]*?>)(.*?)<p>', r'\1<p>\2</p><p>', note) # \fv_ - note = re.sub(r'\\fv\b\s(.+?)(?=(\\f|'+u'))', u''+r'<hi type="super">\1</hi>', note) + note 
= re.sub(r'\\fv\b\s(.+?)(?=(\\f|'+u'))', u''+r'<hi type="super">\1</hi>', note) if relaxedConformance: note = note.replace(r'\ft*', r'') note = note.replace(r'\fq*', r'') note = note.replace(r'\fqa*', r'') - note = note.replace(u'', '') + note = note.replace(u'', '') return note @@ -513,10 +541,10 @@ def convertToOSIS(sFile): supported:\f...\f*, \fe...\fe*, \fr, \fk, \fq, \fqa, \fl, \fp, \fv, \ft, \fdc...\fdc*, \fm...\fm* """ # \f_+_...\f* ###TESTED### - osis = re.sub(r'\\f\s+([^\s\\]+)?\s*(.+?)\s*\\f\*', lambda m: r'<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' placement="foot">' + m.group(2) + u'</note>', osis, flags=re.DOTALL) + osis = re.sub(r'\\f\s+([^\s\\]+)?\s*(.+?)\s*\\f\*', lambda m: r'<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' placement="foot">' + m.group(2) + u'</note>', osis, flags=re.DOTALL) # \fe_+_...\fe* - osis = re.sub(r'\\fe\s+([^\s\\]+?)\s*(.+?)\s*\\fe\*', lambda m: r'<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' placement="end">' + m.group(2) + u'</note>', osis, flags=re.DOTALL) + osis = re.sub(r'\\fe\s+([^\s\\]+?)\s*(.+?)\s*\\fe\*', lambda m: r'<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' placement="end">' + m.group(2) + u'</note>', osis, flags=re.DOTALL) osis = re.sub(r'(<note\b[^>]*?>.*?</note>)', lambda m: processNote(m.group(1)), osis, flags=re.DOTALL) @@ -530,31 +558,31 @@ def convertToOSIS(sFile): note = note.replace('\n', ' ') # \xot_refs...\xot* - note = re.sub(r'\\xot\b\s(.+?)\\xot\b\*', u''+r'<seg editions="ot">\1</seg>', note) + note = re.sub(r'\\xot\b\s(.+?)\\xot\b\*', u''+r'<seg editions="ot">\1</seg>', note) # \xnt_refs...\xnt* - note = re.sub(r'\\xnt\b\s(.+?)\\xnt\b\*', u''+r'<seg editions="nt">\1</seg>', note) + note = re.sub(r'\\xnt\b\s(.+?)\\xnt\b\*', 
u''+r'<seg editions="nt">\1</seg>', note) # \xdc_refs...\xdc* - note = re.sub(r'\\xdc\b\s(.+?)\\xdc\b\*', u''+r'<seg editions="dc">\1</seg>', note) + note = re.sub(r'\\xdc\b\s(.+?)\\xdc\b\*', u''+r'<seg editions="dc">\1</seg>', note) # \xq_ - note = re.sub(r'\\xq\b\s(.+?)(?=(\\x|'+u'))', u''+r'<catchWord>\1</catchWord>', note) + note = re.sub(r'\\xq\b\s(.+?)(?=(\\x|'+u'))', u''+r'<catchWord>\1</catchWord>', note) # \xt_ ###TESTED### note = re.sub(r'\\xt\s', r'', note) # \xo_##SEP## - note = re.sub(r'\\xo\b\s(.+?)(?=(\\x|'+u'))', u''+r'<reference>\1</reference>', note) + note = re.sub(r'\\xo\b\s(.+?)(?=(\\x|'+u'))', u''+r'<reference>\1</reference>', note) # \xk_ - note = re.sub(r'\\xk\b\s(.+?)(?=(\\x|'+u'))', u''+r'<catchWord>\1</catchWord>', note) + note = re.sub(r'\\xk\b\s(.+?)(?=(\\x|'+u'))', u''+r'<catchWord>\1</catchWord>', note) if relaxedConformance: note = note.replace(r'\xt*', r'') note = note.replace(r'\xq*', r'') - note = note.replace(u'', '') + note = note.replace(u'', '') return note @@ -564,7 +592,7 @@ def convertToOSIS(sFile): supported: \\x...\\x*, \\xo, \\xk, \\xq, \\xt, \\xot...\\xot*, \\xnt...\\xnt*, \\xdc...\\xdc* """ # \x_+_...\x* ###TESTED### - osis = re.sub(r'\\x\s+([^\s]+?)\s+(.+?)\s*\\x\*', lambda m: r'<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="crossReference"><reference>' + m.group(2) + u'</reference></note>', osis, flags=re.DOTALL) + osis = re.sub(r'\\x\s+([^\s]+?)\s+(.+?)\s*\\x\*', lambda m: r'<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="crossReference"><reference>' + m.group(2) + u'</reference></note>', osis, flags=re.DOTALL) osis = re.sub(r'(<note [^>]*?type="crossReference"[^>]*>.*?</note>)', lambda m: processXref(m.group(1)), osis, flags=re.DOTALL) @@ -610,7 +638,7 @@ def convertToOSIS(sFile): osis = re.sub(r'\\k\s+(.+?)\\k\*', r'<seg type="keyword">\1</seg>', osis, flags=re.DOTALL) 
# \lit - osis = re.sub(r'\\lit\s+(.*?)(?=(\\(m|p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'<p type="x-liturgical">\n' + m.group(1) + u'</p>\n', osis, flags=re.DOTALL) + osis = re.sub(r'\\lit\s+(.*?)(?=(\\(m|p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'<p type="x-liturgical">\n' + m.group(1) + u'</p>\n', osis, flags=re.DOTALL) # \dc_...\dc* #### TODO: Find an example---should this really be transChange? osis = re.sub(r'\\dc\b\s*(.+?)\\dc\*', r'<transChange type="added" editions="dc">\1</transChange>', osis, flags=re.DOTALL) @@ -742,15 +770,15 @@ def convertToOSIS(sFile): supported: \ef...\ef*, \ex...\ex*, \esb...\esbe, \cat """ # \ef...\ef* - osis = re.sub(r'\\ef\s+([^\s\\]+?)\s*(.+?)\s*\\ef\*', lambda m: r'<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="study">' + m.group(2) + u'</note>', osis, flags=re.DOTALL) + osis = re.sub(r'\\ef\s+([^\s\\]+?)\s*(.+?)\s*\\ef\*', lambda m: r'<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="study">' + m.group(2) + u'</note>', osis, flags=re.DOTALL) osis = re.sub(r'(<note\b[^>]*?>.*?</note>)', lambda m: processNote(m.group(1)), osis, flags=re.DOTALL) # \ex...\ex* - osis = re.sub(r'\\ex\s+([^\s]+?)\s+(.+?)\s*\\ex\*', lambda m: r'<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="crossReference" subType="x-study"><reference>' + m.group(2) + u'</reference></note>', osis, flags=re.DOTALL) + osis = re.sub(r'\\ex\s+([^\s]+?)\s+(.+?)\s*\\ex\*', lambda m: r'<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="crossReference" subType="x-study"><reference>' + m.group(2) + u'</reference></note>', osis, flags=re.DOTALL) osis = re.sub(r'(<note [^>]*?type="crossReference"[^>]*>.*?</note>)', lambda m: 
processXref(m.group(1)), osis, flags=re.DOTALL) # \esb...\esbex ### TODO: this likely needs to go much earlier in the process - osis = re.sub(r'\\esb\b\s*(.+?)\\esbe\b\s*', '<div type="x-sidebar">\1</div>\n', osis, flags=re.DOTALL) + osis = re.sub(r'\\esb\b\s*(.+?)\\esbe\b\s*', '<div type="x-sidebar">\1</div>\n', osis, flags=re.DOTALL) # \cat_<TAG>\cat* osis = re.sub(r'\\cat\b\s+(.+?)\\cat\*', r'<index index="category" level1="\1"/>', osis) @@ -790,14 +818,14 @@ def convertToOSIS(sFile): # fill in book & chapter values - bookChunks = osis.split(u'') + bookChunks = osis.split(u'') osis = '' for bc in bookChunks: bookValue = re.search(r'<div type="book" osisID="([^"]+?)"', bc) if bookValue: bookValue = bookValue.group(1) bc = bc.replace('$BOOK$', bookValue) - chapChunks = bc.split(u'') + chapChunks = bc.split(u'') newbc = '' for cc in chapChunks: chapValue = re.search(r'<chapter osisID="[^\."]+\.([^"]+)', cc) @@ -812,17 +840,17 @@ def convertToOSIS(sFile): def osisReorderAndCleanup(osis): # assorted re-orderings - osis = re.sub(u'(<chapter eID=.+?\n)(<verse eID=.+?>)\n?', r'\2\n\1', osis) - osis = re.sub(u'([]</div>)([^]*<chapter eID.+?>)', r'\2\1', osis) - osis = re.sub(u'(</p>\n?<p>)\n?(<verse eID=.+?>)\n?', r'\2\n\1\n', osis) - osis = re.sub(u'\n(<verse eID=.+?>)', r'\1\n', osis) - osis = re.sub(u'\n*(<l.+?>)(<verse eID=.+?>[\n]*<verse osisID=.+?>)', r'\2\1', osis) + osis = re.sub(u'(<chapter eID=.+?\n)(<verse eID=.+?>)\n?', r'\2\n\1', osis) + osis = re.sub(u'([]</div>)([^]*<chapter eID.+?>)', r'\2\1', osis) + osis = re.sub(u'(</p>\n?<p>)\n?(<verse eID=.+?>)\n?', r'\2\n\1\n', osis) + osis = re.sub(u'\n(<verse eID=.+?>)', r'\1\n', osis) + osis = re.sub(u'\n*(<l.+?>)(<verse eID=.+?>[\n]*<verse osisID=.+?>)', r'\2\1', osis) # delete attributes from end tags (since they are invalid) osis = re.sub(r'(</[^\s>]+) [^>]*>', r'\1>', osis) osis = osis.replace(r'<lb type="p"/>', r'<lb/>') # delete Unicode tags - for c in u'': + for c in u'': osis = osis.replace(c, '') for 
endBlock in ['p', 'div', 'note', 'l', 'lg', 'chapter', 'verse']: |