summary refs log tree commit diff stats
path: root/modules
diff options
context:
space:
mode:
author Chris Little <chrislit@crosswire.org> 2012-08-10 19:34:38 +0000
committer Chris Little <chrislit@crosswire.org> 2012-08-10 19:34:38 +0000
commit a8c0d13fa6251133dd9a8121721cea414ae75a66 (patch)
tree 1a7440b7fad3b86f9d88421ddcb39e05d7a17495 /modules
parent 0b7afc544455c9cdc31e129ee603bf8f021e8001 (diff)
download sword-tools-a8c0d13fa6251133dd9a8121721cea414ae75a66.tar.gz
completed handling of non-USFM tags from style sheet & deprecated tags
git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@376 07627401-56e2-0310-80f4-f8cd0041bdcd
Diffstat (limited to 'modules')
-rwxr-xr-x modules/python/usfm2osis.py 75
1 files changed, 45 insertions, 30 deletions
diff --git a/modules/python/usfm2osis.py b/modules/python/usfm2osis.py
index 698292c..aa3c059 100755
--- a/modules/python/usfm2osis.py
+++ b/modules/python/usfm2osis.py
@@ -75,6 +75,8 @@ scriptVersion = '0.5'
# ﷥ is4
# ﷦ is5
+# ﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞ sections
+
import sys, codecs, re
from encodings.aliases import aliases
import multiprocessing, Queue
@@ -272,6 +274,8 @@ def convertToOSIS(sFile):
osis = osis.replace('<', '&lt;')
osis = osis.replace('>', '&gt;')
+ #osis = re.sub('\n'+r'(\\[^\s]+\b\*)', r' \1', osis)
+
return osis
@@ -561,8 +565,15 @@ def convertToOSIS(sFile):
# \pi#(_Sample text...)
# \mi(_text...)
# \nb
- pType = {'pc':'x-center', 'pr':'x-right', 'm':'x-noindent', 'pmo':'x-embedded-opening', 'pm':'x-embedded', 'pmc':'x-embedded-closing', 'pmr':'x-right', 'pi':'x-indented-1', 'pi1':'x-indented-1', 'pi2':'x-indented-2', 'pi3':'x-indented-3', 'pi4':'x-indented-4', 'pi5':'x-indented-5', 'mi':'x-noindent-indented', 'nb':'x-nobreak'}
- osis = re.sub(r'\\(pc|pr|m|pmo|pm|pmc|pmr|pi|pi1|pi2|pi3|pi4|pi5|mi|nb)\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'﷓<p type="' + pType[m.group(1)] + '">\n' + m.group(2) + u'﷓</p>\n', osis, flags=re.DOTALL)
+ # \phi # deprecated
+ # \ps # deprecated
+ # \psi # deprecated
+ # \p# # deprecated
+ pType = {'pc':'x-center', 'pr':'x-right', 'm':'x-noindent', 'pmo':'x-embedded-opening', 'pm':'x-embedded', 'pmc':'x-embedded-closing', 'pmr':'x-right', 'pi':'x-indented-1', 'pi1':'x-indented-1', 'pi2':'x-indented-2', 'pi3':'x-indented-3', 'pi4':'x-indented-4', 'pi5':'x-indented-5', 'mi':'x-noindent-indented', 'nb':'x-nobreak', 'phi':'x-indented-hanging', 'ps':'x-nobreakNext', 'psi':'x-nobreakNext-indented', 'p1':'x-level-1', 'p2':'x-level-2', 'p3':'x-level-3', 'p4':'x-level-4', 'p5':'x-level-5'}
+ paragraphregex = 'pc|pr|m|pmo|pm|pmc|pmr|pi|pi1|pi2|pi3|pi4|pi5|mi|nb'
+ if relaxedConformance:
+ paragraphregex += '|phi|ps|psi|p1|p2|p3|p4|p5'
+ osis = re.sub(r'\\('+paragraphregex+r')\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'﷓<p type="' + pType[m.group(1)] + '">\n' + m.group(2) + u'﷓</p>\n', osis, flags=re.DOTALL)
# \cls_text...
osis = re.sub(r'\\m\s+(.+?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'﷓<closer>' + m.group(1) + u'﷓</closer>\n', osis, flags=re.DOTALL)
@@ -579,13 +590,6 @@ def convertToOSIS(sFile):
# \b
osis = re.sub(r'\\b\b\s?', '<lb type="x-p"/>', osis)
- if relaxedConformance:
- # TODO: \phi: DEP: Paragraph text, indented with hanging indent
- # TODO: \ps: DEP: Paragraph text, no break with next paragraph text at chapter boundary
- # TODO: \psi: DEP: Paragraph text, indented, with no break with next paragraph text (at chapter boundary)
- # TODO: \p#: Front or back matter text paragraph, level # (if multiple levels)
- pass
-
return osis
@@ -598,17 +602,17 @@ def convertToOSIS(sFile):
osis = re.sub(r'\\qs\b\s(.+?)\\qs\*', r'<l type="selah">\1</l>', osis, flags=re.DOTALL)
# \q#(_text...)
- osis = re.sub(r'\\q\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b))', r'<l level="1">\1</l>', osis, flags=re.DOTALL)
- osis = re.sub(r'\\q(\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b))', r'<l level="\1">\2</l>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\q\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞'+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b|<list\b|</?div\b))', r'<l level="1">\1</l>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\q(\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞'+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b|<list\b|</?div\b))', r'<l level="\1">\2</l>', osis, flags=re.DOTALL)
# \qr_text...
# \qc_text...
# \qm#(_text...)
qType = {'qr':'x-right', 'qc':'x-center', 'qm':'x-embedded" level="1', 'qm1':'x-embedded" level="1', 'qm2':'x-embedded" level="2', 'qm3':'x-embedded" level="3', 'qm4':'x-embedded" level="4', 'qm5':'x-embedded" level="5'}
- osis = re.sub(r'\\(qr|qc|qm\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b))', lambda m: '<l type="' + qType[m.group(1)] + '">' + m.group(2) + '</l>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\(qr|qc|qm\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞'+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b|<list\b|</?div\b))', lambda m: '<l type="' + qType[m.group(1)] + '">' + m.group(2) + '</l>', osis, flags=re.DOTALL)
osis = osis.replace('\n</l>', '</l>\n')
- osis = re.sub(u'(<l [^﷐﷑﷓﷔]+</l>)', r'<lg>\1</lg>', osis, flags=re.DOTALL)
+ osis = re.sub(u'(<l [^﷐﷑﷓﷔﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞]+</l>)', r'<lg>\1</lg>', osis, flags=re.DOTALL)
# \b
osis = re.sub('(<lg>.+?</lg>)', lambda m: m.group(1).replace('<lb type="x-p"/>', '</lg><lg>'), osis, flags=re.DOTALL) # re-handle \b that occurs within <lg>
@@ -729,9 +733,11 @@ def convertToOSIS(sFile):
note = re.sub(r'\\xt\b\s(.+?)(?=(\\x|'+u'﷟))', u'﷟'+r'<reference>\1</reference>', note)
if relaxedConformance:
- # TODO: \xtSee..\xtSee*: Concordance and Names Index markup for an alternate entry target reference.
- # TODO: \xtSeeAlso...\xtSeeAlso: Concordance and Names Index markup for an additional entry target reference.
- pass
+ # TODO: move this to a concordance/index-specific section?
+ # \xtSee..\xtSee*: Concordance and Names Index markup for an alternate entry target reference.
+ note = re.sub(r'\\xtSee\b\s(.+?)\\xtSee\b\*', u'﷟'+r'<reference osisRef="\1">See: \1</reference>', note)
+ # \xtSeeAlso...\xtSeeAlso*: Concordance and Names Index markup for an additional entry target reference.
+ note = re.sub(r'\\xtSeeAlso\b\s(.+?)\\xtSeeAlso\b\*', u'﷟'+r'<reference osisRef="\1">See also: \1</reference>', note)
if relaxedConformance:
note = note.replace(r'\xq*', '')
@@ -804,9 +810,15 @@ def convertToOSIS(sFile):
osis = re.sub(r'\\sls\b\s*(.+?)\\sls\*', r'<foreign>/1</foreign>', osis, flags=re.DOTALL) # find a better mapping than <foreign>?
if relaxedConformance:
- # TODO: \addpn...\addpn*: For chinese words to be dot underline & underline
- # TODO: \k#: Concordance main entry text or keyword, level #
- pass
+ # \addpn...\addpn*
+ osis = re.sub(r'\\addpn\s+(.+?)\\addpn\*', r'<hi type="x-dotUndeline">\1</hi>', osis, flags=re.DOTALL)
+ # \k# # TODO: unsure of this tag's purpose
+ osis = re.sub(r'\\k1\s+(.+?)\\k1\*', r'<seg type="keyword" n="1">\1</seg>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\k2\s+(.+?)\\k2\*', r'<seg type="keyword" n="2">\1</seg>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\k3\s+(.+?)\\k3\*', r'<seg type="keyword" n="3">\1</seg>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\k4\s+(.+?)\\k4\*', r'<seg type="keyword" n="4">\1</seg>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\k5\s+(.+?)\\k5\*', r'<seg type="keyword" n="5">\1</seg>', osis, flags=re.DOTALL)
+
return osis
@@ -902,8 +914,8 @@ def convertToOSIS(sFile):
osis = re.sub(r'\\wh\s+(.+?)(\s*)\\wh\*', r'\1<index index="Hebrew" level1="\1"/>\2', osis, flags=re.DOTALL)
if relaxedConformance:
- # TODO: \wr...\wr*: OBS: Auxiliary - Wordlist/Glossary Reference
- pass
+ # \wr...\wr*
+ osis = re.sub(r'\\wr\s+(.+?)(\s*)\\wr\*', r'\1<index index="Reference" level1="\1"/>\2', osis, flags=re.DOTALL)
return osis
@@ -959,15 +971,18 @@ def convertToOSIS(sFile):
supported: \z<Extension>
We can't really know what these mean, but will preserve them as <milestone/> elements.
"""
- if relaxedConformance:
- # publishing assistant markers
- # \zpa-xb...\zpa-xb* : \periph Book
- # \zpa-xc...\zpa-xc* : \periph Chapter
- # \zpa-xv...\zpa-xv* : \periph Verse
- # \zpa-xd...\zpa-xd* : \periph Description
- pass
-
- # \z
+ # publishing assistant markers
+ # \zpa-xb...\zpa-xb* : \periph Book
+ # \zpa-xc...\zpa-xc* : \periph Chapter
+ # \zpa-xv...\zpa-xv* : \periph Verse
+ # \zpa-xd...\zpa-xd* : \periph Description
+ # TODO: Decide how these should actually be encoded. In lieu of that,
+ # these can all be handled by the default \z Namespace handlers:
+
+ # \z{X}...\z{X}*
+ osis = re.sub(r'\z([^\s]+)\s(.+?)(\z\1\*)', r'<seg type="x-\1">\2</seg>', osis, flags=re.DOTALL)
+
+ # \z{X}
osis = re.sub(r'\\z([^\s]+)', r'<milestone type="x-usfm-z-\1"/>', osis)
return osis