diff options
author | Chris Little <chrislit@crosswire.org> | 2012-08-10 19:34:38 +0000 |
---|---|---|
committer | Chris Little <chrislit@crosswire.org> | 2012-08-10 19:34:38 +0000 |
commit | a8c0d13fa6251133dd9a8121721cea414ae75a66 (patch) | |
tree | 1a7440b7fad3b86f9d88421ddcb39e05d7a17495 /modules | |
parent | 0b7afc544455c9cdc31e129ee603bf8f021e8001 (diff) | |
download | sword-tools-a8c0d13fa6251133dd9a8121721cea414ae75a66.tar.gz |
completed handling of non-USFM tags from style sheet & deprecated tags
git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@376 07627401-56e2-0310-80f4-f8cd0041bdcd
Diffstat (limited to 'modules')
-rwxr-xr-x | modules/python/usfm2osis.py | 75 |
1 file changed, 45 insertions, 30 deletions
diff --git a/modules/python/usfm2osis.py b/modules/python/usfm2osis.py index 698292c..aa3c059 100755 --- a/modules/python/usfm2osis.py +++ b/modules/python/usfm2osis.py @@ -75,6 +75,8 @@ scriptVersion = '0.5' # is4 # is5 +# sections + import sys, codecs, re from encodings.aliases import aliases import multiprocessing, Queue @@ -272,6 +274,8 @@ def convertToOSIS(sFile): osis = osis.replace('<', '<') osis = osis.replace('>', '>') + #osis = re.sub('\n'+r'(\\[^\s]+\b\*)', r' \1', osis) + return osis @@ -561,8 +565,15 @@ def convertToOSIS(sFile): # \pi#(_Sample text...) # \mi(_text...) # \nb - pType = {'pc':'x-center', 'pr':'x-right', 'm':'x-noindent', 'pmo':'x-embedded-opening', 'pm':'x-embedded', 'pmc':'x-embedded-closing', 'pmr':'x-right', 'pi':'x-indented-1', 'pi1':'x-indented-1', 'pi2':'x-indented-2', 'pi3':'x-indented-3', 'pi4':'x-indented-4', 'pi5':'x-indented-5', 'mi':'x-noindent-indented', 'nb':'x-nobreak'} - osis = re.sub(r'\\(pc|pr|m|pmo|pm|pmc|pmr|pi|pi1|pi2|pi3|pi4|pi5|mi|nb)\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'<p type="' + pType[m.group(1)] + '">\n' + m.group(2) + u'</p>\n', osis, flags=re.DOTALL) + # \phi # deprecated + # \ps # deprecated + # \psi # deprecated + # \p# # deprecated + pType = {'pc':'x-center', 'pr':'x-right', 'm':'x-noindent', 'pmo':'x-embedded-opening', 'pm':'x-embedded', 'pmc':'x-embedded-closing', 'pmr':'x-right', 'pi':'x-indented-1', 'pi1':'x-indented-1', 'pi2':'x-indented-2', 'pi3':'x-indented-3', 'pi4':'x-indented-4', 'pi5':'x-indented-5', 'mi':'x-noindent-indented', 'nb':'x-nobreak', 'phi':'x-indented-hanging', 'ps':'x-nobreakNext', 'psi':'x-nobreakNext-indented', 'p1':'x-level-1', 'p2':'x-level-2', 'p3':'x-level-3', 'p4':'x-level-4', 'p5':'x-level-5'} + paragraphregex = 'pc|pr|m|pmo|pm|pmc|pmr|pi|pi1|pi2|pi3|pi4|pi5|mi|nb' + if relaxedConformance: + paragraphregex += '|phi|ps|psi|p1|p2|p3|p4|p5' + osis = 
re.sub(r'\\('+paragraphregex+r')\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'<p type="' + pType[m.group(1)] + '">\n' + m.group(2) + u'</p>\n', osis, flags=re.DOTALL) # \cls_text... osis = re.sub(r'\\m\s+(.+?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'<closer>' + m.group(1) + u'</closer>\n', osis, flags=re.DOTALL) @@ -579,13 +590,6 @@ def convertToOSIS(sFile): # \b osis = re.sub(r'\\b\b\s?', '<lb type="x-p"/>', osis) - if relaxedConformance: - # TODO: \phi: DEP: Paragraph text, indented with hanging indent - # TODO: \ps: DEP: Paragraph text, no break with next paragraph text at chapter boundary - # TODO: \psi: DEP: Paragraph text, indented, with no break with next paragraph text (at chapter boundary) - # TODO: \p#: Front or back matter text paragraph, level # (if multiple levels) - pass - return osis @@ -598,17 +602,17 @@ def convertToOSIS(sFile): osis = re.sub(r'\\qs\b\s(.+?)\\qs\*', r'<l type="selah">\1</l>', osis, flags=re.DOTALL) # \q#(_text...) - osis = re.sub(r'\\q\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b))', r'<l level="1">\1</l>', osis, flags=re.DOTALL) - osis = re.sub(r'\\q(\d)\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b))', r'<l level="\1">\2</l>', osis, flags=re.DOTALL) + osis = re.sub(r'\\q\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b|<list\b|</?div\b))', r'<l level="1">\1</l>', osis, flags=re.DOTALL) + osis = re.sub(r'\\q(\d)\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b|<list\b|</?div\b))', r'<l level="\1">\2</l>', osis, flags=re.DOTALL) # \qr_text... # \qc_text... # \qm#(_text...) 
qType = {'qr':'x-right', 'qc':'x-center', 'qm':'x-embedded" level="1', 'qm1':'x-embedded" level="1', 'qm2':'x-embedded" level="2', 'qm3':'x-embedded" level="3', 'qm4':'x-embedded" level="4', 'qm5':'x-embedded" level="5'} - osis = re.sub(r'\\(qr|qc|qm\d)\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b))', lambda m: '<l type="' + qType[m.group(1)] + '">' + m.group(2) + '</l>', osis, flags=re.DOTALL) + osis = re.sub(r'\\(qr|qc|qm\d)\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b|<list\b|</?div\b))', lambda m: '<l type="' + qType[m.group(1)] + '">' + m.group(2) + '</l>', osis, flags=re.DOTALL) osis = osis.replace('\n</l>', '</l>\n') - osis = re.sub(u'(<l [^]+</l>)', r'<lg>\1</lg>', osis, flags=re.DOTALL) + osis = re.sub(u'(<l [^]+</l>)', r'<lg>\1</lg>', osis, flags=re.DOTALL) # \b osis = re.sub('(<lg>.+?</lg>)', lambda m: m.group(1).replace('<lb type="x-p"/>', '</lg><lg>'), osis, flags=re.DOTALL) # re-handle \b that occurs within <lg> @@ -729,9 +733,11 @@ def convertToOSIS(sFile): note = re.sub(r'\\xt\b\s(.+?)(?=(\\x|'+u'))', u''+r'<reference>\1</reference>', note) if relaxedConformance: - # TODO: \xtSee..\xtSee*: Concordance and Names Index markup for an alternate entry target reference. - # TODO: \xtSeeAlso...\xtSeeAlso: Concordance and Names Index markup for an additional entry target reference. - pass + # TODO: move this to a concorance/index-specific section? + # \xtSee..\xtSee*: Concordance and Names Index markup for an alternate entry target reference. + note = re.sub(r'\\xtSee\b\s(.+?)\\xtSee\b\*', u''+r'<reference osisRef="\1">See: \1</reference>', note) + # \xtSeeAlso...\xtSeeAlso: Concordance and Names Index markup for an additional entry target reference. 
+ note = re.sub(r'\\xtSeeAlso\b\s(.+?)\\xtSeeAlso\b\*', u''+r'<reference osisRef="\1">See also: \1</reference>', note) if relaxedConformance: note = note.replace(r'\xq*', '') @@ -804,9 +810,15 @@ def convertToOSIS(sFile): osis = re.sub(r'\\sls\b\s*(.+?)\\sls\*', r'<foreign>/1</foreign>', osis, flags=re.DOTALL) # find a better mapping than <foreign>? if relaxedConformance: - # TODO: \addpn...\addpn*: For chinese words to be dot underline & underline - # TODO: \k#: Concordance main entry text or keyword, level # - pass + # \addpn...\addpn* + osis = re.sub(r'\\addpn\s+(.+?)\\addpn\*', r'<hi type="x-dotUndeline">\1</hi>', osis, flags=re.DOTALL) + # \k# # TODO: unsure of this tag's purpose + osis = re.sub(r'\\k1\s+(.+?)\\k1\*', r'<seg type="keyword" n="1">\1</seg>', osis, flags=re.DOTALL) + osis = re.sub(r'\\k2\s+(.+?)\\k2\*', r'<seg type="keyword" n="2">\1</seg>', osis, flags=re.DOTALL) + osis = re.sub(r'\\k3\s+(.+?)\\k3\*', r'<seg type="keyword" n="3">\1</seg>', osis, flags=re.DOTALL) + osis = re.sub(r'\\k4\s+(.+?)\\k4\*', r'<seg type="keyword" n="4">\1</seg>', osis, flags=re.DOTALL) + osis = re.sub(r'\\k5\s+(.+?)\\k5\*', r'<seg type="keyword" n="5">\1</seg>', osis, flags=re.DOTALL) + return osis @@ -902,8 +914,8 @@ def convertToOSIS(sFile): osis = re.sub(r'\\wh\s+(.+?)(\s*)\\wh\*', r'\1<index index="Hebrew" level1="\1"/>\2', osis, flags=re.DOTALL) if relaxedConformance: - # TODO: \wr...\wr*: OBS: Auxiliary - Wordlist/Glossary Reference - pass + # \wr...\wr* + osis = re.sub(r'\\wr\s+(.+?)(\s*)\\wr\*', r'\1<index index="Reference" level1="\1"/>\2', osis, flags=re.DOTALL) return osis @@ -959,15 +971,18 @@ def convertToOSIS(sFile): supported: \z<Extension> We can't really know what these mean, but will preserve them as <milestone/> elements. 
""" - if relaxedConformance: - # publishing assistant markers - # \zpa-xb...\zpa-xb* : \periph Book - # \zpa-xc...\zpa-xc* : \periph Chapter - # \zpa-xv...\zpa-xv* : \periph Verse - # \zpa-xd...\zpa-xd* : \periph Description - pass - - # \z + # publishing assistant markers + # \zpa-xb...\zpa-xb* : \periph Book + # \zpa-xc...\zpa-xc* : \periph Chapter + # \zpa-xv...\zpa-xv* : \periph Verse + # \zpa-xd...\zpa-xd* : \periph Description + # TODO: Decide how these should actually be encoded. In lieu of that, + # these can all be handled by the default \z Namespace handlers: + + # \z{X}...\z{X}* + osis = re.sub(r'\z([^\s]+)\s(.+?)(\z\1\*)', r'<seg type="x-\1">\2</seg>', osis, flags=re.DOTALL) + + # \z{X} osis = re.sub(r'\\z([^\s]+)', r'<milestone type="x-usfm-z-\1"/>', osis) return osis |