_(Name of file, Book name, Language, Last edited, Date etc.)
osis = re.sub(r'\\id\s+([A-Z0-9]{3})\b\s*([^\\'+'\n]*?)\n'+r'(.*)(?=\\id|$)', lambda m: '\uFDD0\n' + (('\n') if m.group(2) else '') + m.group(3) + '\uFDD0\n' , osis, flags=re.DOTALL)
# \ide_
osis = re.sub(r'\\ide\b.*'+'\n', '', osis) # delete, since this was handled above
# \sts_
osis = re.sub(r'\\sts\b\s+(.+)\s*'+'\n', r' '+'\n', osis)
# \rem_text...
osis = re.sub(r'\\rem\b\s+(.+)', r'', osis)
# \restore_text...
if relaxedConformance:
osis = re.sub(r'\\restore\b\s+(.+)', r'', osis)
# \h#_text...
osis = re.sub(r'\\h\b\s+(.+)\s*'+'\n', r'\1 '+'\n', osis)
osis = re.sub(r'\\h(\d)\b\s+(.+)\s*'+'\n', r'\2 '+'\n', osis)
# \toc1_text...
osis = re.sub(r'\\toc1\b\s+(.+)\s*'+'\n', r' '+'\n', osis)
# \toc2_text...
osis = re.sub(r'\\toc2\b\s+(.+)\s*'+'\n', r' '+'\n', osis)
# \toc3_text...
osis = re.sub(r'\\toc3\b\s+(.+)\s*'+'\n', r' '+'\n', osis)
return osis
def cvtIntroductions(osis, relaxedConformance):
"""Converts USFM **Introduction** tags to OSIS, returning the processed text as a string.
Supported tags: \imt#, \is#, \ip, \ipi, \im, \imi, \ipq, \imq, \ipr, \iq#, \ib, \ili#, \iot, \io#, \ior...\ior*, \iex, \iqt...\iqt*, \imte, \ie
Keyword arguments:
osis -- The document as a string.
relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags.
"""
# \imt#_text...
osis = re.sub(r'\\imt(\d?)\s+(.+)', lambda m: '' + m.group(2) + ' ', osis)
# \imte#_text...
osis = re.sub(r'\\imte(\d?)\b\s+(.+)', lambda m: '' + m.group(2) + ' ', osis)
# \is#_text...
osis = re.sub(r'\\is1?\s+(.+)', lambda m: '\uFDE2' + m.group(1) + ' ', osis)
osis = re.sub('(\uFDE2[^\uFDE2]+)(?!\\c\b)', r'\1'+'\uFDE2\n', osis, flags=re.DOTALL)
osis = re.sub(r'\\is2\s+(.+)', lambda m: '\uFDE3' + m.group(1) + ' ', osis)
osis = re.sub('(\uFDE3[^\uFDE2\uFDE3]+)(?!\\c\b)', r'\1'+'\uFDE3\n', osis, flags=re.DOTALL)
osis = re.sub(r'\\is3\s+(.+)', lambda m: '\uFDE4' + m.group(1) + ' ', osis)
osis = re.sub('(\uFDE4[^\uFDE2\uFDE3\uFDE4]+)(?!\\c\b)', r'\1'+'\uFDE4\n', osis, flags=re.DOTALL)
osis = re.sub(r'\\is4\s+(.+)', lambda m: '\uFDE5' + m.group(1) + ' ', osis)
osis = re.sub('(\uFDE5[^\uFDE2\uFDE3\uFDE4\uFDE5]+)(?!\\c\b)', r'\1'+'\uFDE5\n', osis, flags=re.DOTALL)
osis = re.sub(r'\\is5\s+(.+)', lambda m: '\uFDE6' + m.group(1) + ' ', osis)
osis = re.sub('(\uFDE6[^\uFDE2\uFDE3\uFDE4\uFDE5\uFDE6]+?)(?!\\c\b)', r'\1'+'\uFDE6\n', osis, flags=re.DOTALL)
# \ip_text...
osis = re.sub(r'\\ip\s+(.*?)(?=(\\(i?m[iq]?|i?p[iqr]?|lit|cls|tr|io[\dt]?|iqt?|i?li|iex?|s|c)\b|<(/?div|p|closer)\b))', lambda m: '\uFDD3\n' + m.group(1) + '\uFDD3
\n', osis, flags=re.DOTALL)
# \ipi_text...
# \im_text...
# \imi_text...
# \ipq_text...
# \imq_text...
# \ipr_text...
pType = {'ipi':'x-indented', 'im':'x-noindent', 'imi':'x-noindent-indented', 'ipq':'x-quote', 'imq':'x-noindent-quote', 'ipr':'x-right'}
osis = re.sub(r'\\(ipi|im|ipq|imq|ipr)\s+(.*?)(?=(\\(i?m[iq]?|i?p[iqr]?|lit|cls|tr|io[\dt]?|iqt?|i?li|iex?|s|c)\b|<(/?div|p|closer)\b))', lambda m: '\uFDD3\n' + m.group(2) + '\uFDD3
\n', osis, flags=re.DOTALL)
# \iq#_text...
osis = re.sub(r'\\iq\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\(iq\d?|fig|q\d?|b)\b|\1', osis, flags=re.DOTALL)
osis = re.sub(r'\\iq(\d)\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\(iq\d?|fig|q\d?|b)\b|\2', osis, flags=re.DOTALL)
# \ib
osis = re.sub(r'\\ib\b\s?', '\uFDE7
', osis)
osis = osis.replace('\n', '\n')
# \ili#_text...
osis = re.sub(r'\\ili\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4\uFDE7'+r']|\\(ili\d?|c|p|iot|io\d?|iex?)\b|<(lb|title|item|\?div)\b))', '- \uFDE0'+r'\1'+'\uFDE0
', osis, flags=re.DOTALL)
osis = re.sub(r'\\ili(\d)\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4\uFDE7'+r']|\\(ili\d?|c|p|iot|io\d?|iex?)\b|<(lb|title|item|\?div)\b))', r'- \uFDE0'+r'\2'+'\uFDE0
', osis, flags=re.DOTALL)
osis = osis.replace('\n', '\n')
osis = re.sub('(- )', '\uFDD3
'+r'\1'+'
\uFDD3', osis, flags=re.DOTALL)
# \iot_text...
# \io#_text...(references range)
osis = re.sub(r'\\io\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4\uFDE7'+r']|\\(iot|io\d?|iex?|c|p)\b|<(lb|title|item|\?div)\b))', '- \uFDE1'+r'\1'+'\uFDE1
', osis, flags=re.DOTALL)
osis = re.sub(r'\\io(\d)\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4\uFDE7'+r']|\\(iot|io\d?|iex?|c|p)\b|<(lb|title|item|\?div)\b))', r'- \uFDE1'+r'\2'+'\uFDE1
', osis, flags=re.DOTALL)
osis = re.sub(r'\\iot\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4\uFDE7'+r']|\\(iot|io\d?|iex?|c|p)\b|<(lb|title|item|\?div)\b))', '- \uFDE1'+r'\1'+'\uFDE1
', osis, flags=re.DOTALL)
osis = osis.replace('\n ', '\n')
osis = re.sub('(- )', '\uFDD3
'+r'\1'+'
\uFDD3', osis, flags=re.DOTALL)
osis = re.sub('item type="head"', 'head', osis)
# \ior_text...\ior*
osis = re.sub(r'\\ior\b\s+(.+?)\\ior\*', r'\1 ', osis, flags=re.DOTALL)
# \iex # TODO: look for example; I have no idea what this would look like in context
osis = re.sub(r'\\iex\b\s*(.+?)'+'?=(\s*(\\c| \uFDD0))', r'\1', osis, flags=re.DOTALL)
# \iqt_text...\iqt*
osis = re.sub(r'\\iqt\s+(.+?)\\iqt\*', r'\1
', osis, flags=re.DOTALL)
# \ie
osis = re.sub(r'\\ie\b\s*', ' ', osis)
return osis
def cvtTitles(osis, relaxedConformance):
    r"""Converts USFM **Title, Heading, and Label** tags to OSIS, returning the processed text as a string.

    Supported tags: \mt#, \mte#, \ms#, \mr, \s#, \sr, \r, \rq...\rq*, \d, \sp

    Keyword arguments:
    osis -- The document as a string.
    relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags.

    NOTE(review): the replacements below emit only marker characters from the
    U+FDD0..U+FDEF range plus the captured text; any literal markup they were
    meant to carry is not visible here -- confirm against the upstream file.
    """
    bookMark = '\uFDD0'
    majorMarks = '\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9'    # one marker per \ms level 1..5
    sectionMarks = '\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE'  # one marker per \s level 1..5

    def openAndClose(text, tagPattern, mark, stopChars):
        """Replace a one-line heading tag by `mark`, then terminate the span it opens.

        The span runs from `mark` up to (but not including) the next character
        listed in `stopChars`, or to the end of the text; `mark` plus a newline
        is appended at that point.
        """
        text = re.sub(tagPattern + r'\s+(.+)', lambda m: mark + m.group(1) + ' ', text)
        return re.sub('(' + mark + '[^' + stopChars + ']+)', r'\1' + mark + '\n', text, flags=re.DOTALL)

    def levelTag(name, level):
        """Return the regex for `\\<name><level>`; level 1 also accepts the bare tag."""
        return r'\\' + name + ('1?' if level == 1 else str(level))

    # \ms#_text...
    # a major section ends at the next book marker or at a major section of
    # the same or a higher level
    for level, mark in enumerate(majorMarks, 1):
        osis = openAndClose(osis, levelTag('ms', level), mark, bookMark + majorMarks[:level])

    # \mr_text...
    osis = re.sub(r'\\mr\s+(.+)', '\uFDD4'+r'\1 ', osis)

    # \s#_text...
    # a section ends at a book marker, any major section, or a section of the
    # same or a higher level
    for level, mark in enumerate(sectionMarks, 1):
        if level == 2 and relaxedConformance:
            # deprecated \ss and \sss are remapped just before \s2 is handled
            osis = re.sub(r'\\ss\s+', r'\\s2 ', osis)
            osis = re.sub(r'\\sss\s+', r'\\s3 ', osis)
        osis = openAndClose(osis, levelTag('s', level), mark, bookMark + majorMarks + sectionMarks[:level])

    # \sr_text...
    osis = re.sub(r'\\sr\s+(.+)', '\uFDD4'+r'\1 ', osis)

    # \r_text...
    osis = re.sub(r'\\r\s+(.+)', '\uFDD4'+r'\1 ', osis)

    # \rq_text...\rq*
    osis = re.sub(r'\\rq\s+(.+?)\\rq\*', r'\1 ', osis, flags=re.DOTALL)

    # \d_text...
    # a verse marker directly after \d is kept in front of the title marker
    osis = re.sub(r'\\d\s+(\\v\s+\S+\s+)?(.+)', lambda m: (m.group(1) or '') + '\uFDD4' + m.group(2) + ' ', osis)

    # \sp_text...
    # USFM \sp tags represent printed non-canonical secondary titles, whereas the OSIS tag is intended to hold a canonical name associated with elements.
    osis = re.sub(r'\\sp\s+(.+)', '\uFDD4'+r'\1 ', osis)

    # \mt#_text...
    osis = re.sub(r'\\mt(\d?)\s+(.+)', lambda m: '\uFDD4' + m.group(2) + ' ', osis)

    # \mte#_text...
    osis = re.sub(r'\\mte(\d?)\s+(.+)', lambda m: '\uFDD4' + m.group(2) + ' ', osis)

    return osis
def cvtChaptersAndVerses(osis, relaxedConformance):
"""Converts USFM **Chapter and Verse** tags to OSIS, returning the processed text as a string.
Supported tags: \c, \ca...\ca*, \cl, \cp, \cd, \v, \va...\va*, \vp...\vp*
Keyword arguments:
osis -- The document as a string.
relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags.
"""
# \c_#
osis = re.sub(r'\\c\s+([^\s]+)\b(.+?)(?=(\\c\s+| ' + m.group(2) + ' \uFDD3\n', osis, flags=re.DOTALL)
# \cp_#
# \ca_#\ca*
def replaceChapterNumber(matchObject):
"""Regex helper function to replace chapter numbers from \c_# with values that appeared in \cp_# and \ca_#\ca*, returing the chapter text as a string.
Keyword arguments:
matchObject -- a regex match object in which the first element is the chapter text
"""
ctext = matchObject.group(1)
cp = re.search(r'\\cp\s+(.+?)(?=(\\|\s))', ctext)
if cp:
ctext = re.sub(r'\\cp\s+(.+?)(?=(\\|\s))', '', ctext, flags=re.DOTALL)
cp = cp.group(1)
ctext = re.sub(r'"\$BOOK\$\.([^"\.]+)"', '"$BOOK$.'+cp+'"', ctext)
ca = re.search(r'\\ca\s+(.+?)\\ca\*', ctext)
if ca:
ctext = re.sub(r'\\ca\s+(.+?)\\ca\*', '', ctext, flags=re.DOTALL)
ca = ca.group(1)
ctext = re.sub(r'(osisID="\$BOOK\$\.[^"\.]+)"', r'\1 $BOOK$.'+ca+'"', ctext)
return ctext
osis = re.sub(r'( .+?]+/>)', replaceChapterNumber, osis, flags=re.DOTALL)
# \cl_
# If \cl is found just before the first \c it is a generic term to be utilized at the top of every chapter.
# Otherwise \cl is a single chapter label.
preChapterLabel = re.search(r'\\cl\s+([^\n]*?)((.{0,2}[^>]+>.{0,2})*]+>.{0,2})*]*>)', lambda m: m.group(1)+'\uFDD4'+(re.sub(r'\d+', m.group(2), preChapterLabel.group(1), 1) if re.search(r'\d+', preChapterLabel.group(1)) else preChapterLabel.group(1)+' '+m.group(2))+' ', osis)
osis = re.sub(r'\\cl\s+(.+)', '\uFDD4'+r'\1 ', osis)
# \cd_# <--This # seems to be an error
osis = re.sub(r'\\cd\b\s+(.+)', '\uFDD4'+r'\1 ', osis)
# \v_#
osis = re.sub(r'\\v\s+([^\s]+)\b\s*(.+?)(?=(\\v\s+| ' + m.group(2) + ' \uFDD2\n', osis, flags=re.DOTALL)
# \vp_#\vp*
# \va_#\va*
def replaceVerseNumber(matchObject):
"""Regex helper function to replace verse numbers from \v_# with values that appeared in \vp_#\vp* and \va_#\va*, returing the verse text as a string.
Keyword arguments:
matchObject -- a regex match object in which the first element is the verse text
"""
vtext = matchObject.group(1)
vp = re.search(r'\\vp\s+(.+?)\\vp\*', vtext)
if vp:
vtext = re.sub(r'\\vp\s+(.+?)\\vp\*', '', vtext, flags=re.DOTALL)
vp = vp.group(1)
vtext = re.sub(r'"\$BOOK\$\.\$CHAP\$\.([^"\.]+)"', '"$BOOK$.$CHAP$.'+vp+'"', vtext)
va = re.search(r'\\va\s+(.+?)\\va\*', vtext)
if va:
vtext = re.sub(r'\\va\s+(.+?)\\va\*', '', vtext, flags=re.DOTALL)
va = va.group(1)
vtext = re.sub(r'(osisID="\$BOOK\$\.\$CHAP\$\.[^"\.]+)"', r'\1 $BOOK$.$CHAP$.'+va+'"', vtext)
return vtext
osis = re.sub(r'( .+?]+/>)', replaceVerseNumber, osis, flags=re.DOTALL)
return osis
def cvtParagraphs(osis, relaxedConformance):
"""Converts USFM **Paragraph** tags to OSIS, returning the processed text as a string.
Supported tags: \p, \m, \pmo, \pm, \pmc, \pmr, \pi#, \mi, \nb, \cls, \li#, \pc, \pr, \ph#, \b
Keyword arguments:
osis -- The document as a string.
relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags.
"""
paragraphregex = 'pc|pr|m|pmo|pm|pmc|pmr|pi|pi1|pi2|pi3|pi4|pi5|mi|nb'
if relaxedConformance:
paragraphregex += '|phi|ps|psi|p1|p2|p3|p4|p5'
# \p(_text...)
osis = re.sub(r'\\p\s+(.*?)(?=(\\(i?m|i?p|lit|cls|tr|p|'+paragraphregex+r')\b|\n' + m.group(1) + '\uFDD3\n', osis, flags=re.DOTALL)
# \pc(_text...)
# \pr(_text...)
# \m(_text...)
# \pmo(_text...)
# \pm(_text...)
# \pmc(_text...)
# \pmr_text... # deprecated: map to same as \pr
# \pi#(_Sample text...)
# \mi(_text...)
# \nb
# \phi # deprecated
# \ps # deprecated
# \psi # deprecated
# \p# # deprecated
pType = {'pc':'x-center', 'pr':'x-right', 'm':'x-noindent', 'pmo':'x-embedded-opening', 'pm':'x-embedded', 'pmc':'x-embedded-closing', 'pmr':'x-right', 'pi':'x-indented-1', 'pi1':'x-indented-1', 'pi2':'x-indented-2', 'pi3':'x-indented-3', 'pi4':'x-indented-4', 'pi5':'x-indented-5', 'mi':'x-noindent-indented', 'nb':'x-nobreak', 'phi':'x-indented-hanging', 'ps':'x-nobreakNext', 'psi':'x-nobreakNext-indented', 'p1':'x-level-1', 'p2':'x-level-2', 'p3':'x-level-3', 'p4':'x-level-4', 'p5':'x-level-5'}
osis = re.sub(r'\\('+paragraphregex+r')\s+(.*?)(?=(\\(i?m|i?p|lit|cls|tr|'+paragraphregex+r')\b|\n' + m.group(2) + '\uFDD3\n', osis, flags=re.DOTALL)
# \cls_text...
osis = re.sub(r'\\m\s+(.+?)(?=(\\(i?m|i?p|lit|cls|tr)\b|' + m.group(1) + '\uFDD3\n', osis, flags=re.DOTALL)
# \ph#(_text...)
# \li#(_text...)
osis = re.sub(r'\\ph\b\s*', r'\\li ', osis)
osis = re.sub(r'\\ph(\d)\b\s*', r'\\li\1 ', osis)
osis = re.sub(r'\\li\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4\uFDE0\uFDE1\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE\uFDE7'+r']|\\li\d?\b|<(lb|title|item|/?div|/?chapter)\b))', r'- \1
', osis, flags=re.DOTALL)
osis = re.sub(r'\\li(\d)\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4\uFDE0\uFDE1\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE\uFDE7'+r']|\\li\d?\b|<(lb|title|item|/?div|/?chapter)\b))', r'- \2
', osis, flags=re.DOTALL)
osis = osis.replace('\n', '\n')
osis = re.sub('(- )', '\uFDD3
'+r'\1'+'
\uFDD3', osis, flags=re.DOTALL)
# \b
osis = re.sub(r'\\b\b\s?', '\uFDE7
', osis)
return osis
def cvtPoetry(osis, relaxedConformance):
    r"""Converts USFM **Poetry** tags to OSIS, returning the processed text as a string.

    Supported tags: \q#, \qr, \qc, \qs...\qs*, \qa, \qac...\qac*, \qm#, \b

    Keyword arguments:
    osis -- The document as a string.
    relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags.
                          (Not read by the code visible in this function.)

    NOTE(review): several replacement strings and patterns below look as if
    literal markup has been lost from them (see the notes on the individual
    lines) -- confirm against the upstream file before relying on this code.
    """
    # \qa_text...
    # the rest of the line after \qa is prefixed with the U+FDD4 marker
    osis = re.sub(r'\\qa\s+(.+)', '\uFDD4'+r'\1 ', osis)

    # \qac_text...\qac*
    # only the enclosed text is kept
    osis = re.sub(r'\\qac\s+(.+?)\\qac\*', r'\1', osis, flags=re.DOTALL)

    # \qs_(Selah)\qs*
    osis = re.sub(r'\\qs\b\s(.+?)\\qs\*', r'\1 ', osis, flags=re.DOTALL)

    # \q#(_text...)
    # a poetic line runs until one of the U+FDDx/U+FDE7 marker characters, the
    # next \q-family or \fig tag, or one of the listed opening elements
    osis = re.sub(r'\\q\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE\uFDE7'+r']|\\(q[\drcm]?|qm\d|fig)\b|<(l|lb|title|list|/?div)\b))', r'\1 ', osis, flags=re.DOTALL)
    # numbered variant: group 1 is the level digit, group 2 the line text
    osis = re.sub(r'\\q(\d)\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE\uFDE7'+r']|\\(q[\drcm]?|qm\d|fig)\b|<(l|lb|title|list|/?div)\b))', r'\2 ', osis, flags=re.DOTALL)

    # \qr_text...
    # \qc_text...
    # \qm#(_text...)
    # NOTE(review): qType is built but not referenced by any statement visible
    # below -- presumably the lambda in the next substitution used it; confirm.
    qType = {'qr':'x-right', 'qc':'x-center', 'qm':'x-embedded" level="1', 'qm1':'x-embedded" level="1', 'qm2':'x-embedded" level="2', 'qm3':'x-embedded" level="3', 'qm4':'x-embedded" level="4', 'qm5':'x-embedded" level="5'}
    osis = re.sub(r'\\(qr|qc|qm\d)\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE\uFDE7'+r']|\\(q\d?|fig)\b|<(l|lb|title|list|/?div)\b))', lambda m: '' + m.group(2) + ' ', osis, flags=re.DOTALL)

    # NOTE(review): as written this replaces a newline with a newline (no-op).
    osis = osis.replace('\n', '\n')
    # NOTE(review): the pattern is an empty group, so this inserts a space at
    # every position of the text -- almost certainly not the intent; confirm.
    osis = re.sub('()', r'\1 ', osis, flags=re.DOTALL)

    # x-to-next-level allows line folding like Paratext
    # NOTE(review): the pattern defines two groups, yet the lambda reads groups
    # 3, 4 and 5; any match therefore raises IndexError ("no such group").
    osis = re.sub('(.*? (\s*)?)', lambda m: m.group(1)+' subType="x-to-next-level"'+m.group(3) if m.group(4) and int(m.group(2))+1 == int(m.group(5)) else m.group(1)+m.group(3), osis, flags=re.DOTALL)
    return osis
def cvtTables(osis, relaxedConformance):
"""Converts USFM **Table** tags to OSIS, returning the processed text as a string.
Supported tags: \tr, \th#, \thr#, \tc#, \tcr#
Keyword arguments:
osis -- The document as a string.
relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags.
"""
# \tr_
osis = re.sub(r'\\tr\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4\uFDE7'+r']|\\tr\s|<(lb|title)\b))', r'\1
', osis, flags=re.DOTALL)
# \th#_text...
# \thr#_text...
# \tc#_text...
# \tcr#_text...
tType = {'th':' role="label"', 'thr':' role="label" type="x-right"', 'tc':'', 'tcr':' type="x-right"'}
osis = re.sub(r'\\(thr?|tcr?)\d*\b\s*(.*?)(?=(\\t[hc]|' + m.group(2) + '', osis, flags=re.DOTALL)
osis = re.sub(r'(.*?
)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4\uFDE7'+r']|\\tr\s|<(lb|title)\b))', r'\1
', osis, flags=re.DOTALL)
return osis
def processNote(note):
"""Convert note-internal USFM tags to OSIS, returning the note as a string.
Keyword arguments:
note -- The note as a string.
"""
note = note.replace('\n', ' ')
# \fdc_refs...\fdc*
note = re.sub(r'\\fdc\b\s(.+?)\\fdc\*', r'\1 ', note)
# \fq_
note = re.sub(r'\\fq\b\s(.+?)(?=(\\f|'+'\uFDDF))', '\uFDDF'+r'\1 ', note)
# \fqa_
note = re.sub(r'\\fqa\b\s(.+?)(?=(\\f|'+'\uFDDF))', '\uFDDF'+r'\1 ', note)
# \fr_
note = re.sub(r'\\fr\b\s(.+?)(?=(\\f|'+'\uFDDF))', '\uFDDF'+r'\1 ', note)
# \fk_
note = re.sub(r'\\fk\b\s(.+?)(?=(\\f|'+'\uFDDF))', '\uFDDF'+r'\1 ', note)
# \fl_
note = re.sub(r'\\fl\b\s(.+?)(?=(\\f|'+'\uFDDF))', '\uFDDF'+r'', note)
# \fp_
note = re.sub(r'\\fp\b\s(.+?)(?=(\\fp|$))', r'\1
', note)
note = re.sub(r'(]*?>)(.*?)', r'\1
\2
', note)
# \fv_
note = re.sub(r'\\fv\b\s(.+?)(?=(\\f|'+'\uFDDF))', '\uFDDF'+r'\1', note)
# \ft_ handle this lastly, so it may properly end any previous footnote tag
note = re.sub(r'\\ft\s', '', note)
# \fq*,\fqa*,\ft*,\fr*,\fk*,\fl*,\fp*,\fv*
note = re.sub(r'\\f(q|qa|t|r|k|l|p|v)\*', '', note)
note = note.replace('\uFDDF', '')
return note
def cvtFootnotes(osis, relaxedConformance):
    r"""Converts USFM **Footnote** tags to OSIS, returning the processed text as a string.

    Supported tags: \f...\f*, \fe...\fe*, \fr, \fk, \fq, \fqa, \fl, \fp, \fv, \ft, \fdc...\fdc*, \fm...\fm*

    Keyword arguments:
    osis -- The document as a string.
    relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags.
                          (Not read by this function.)
    """
    def noteBody(match):
        """Return the note text (group 2) followed by the U+FDDF end-of-field marker and a space."""
        return match.group(2) + '\uFDDF '

    # \f_+_...\f*
    # group 1 (the caller character) is optional here and is discarded
    osis = re.sub(r'\\f\s+([^\s\\]+)?\s*(.+?)\s*\\f\*', noteBody, osis, flags=re.DOTALL)

    # \fe_+_...\fe*
    osis = re.sub(r'\\fe\s+([^\s\\]+?)\s*(.+?)\s*\\fe\*', noteBody, osis, flags=re.DOTALL)

    # hand every matched span to processNote for note-internal tag conversion
    # NOTE(review): this pattern starts with a bare ']' and matches any '>'
    # followed by text up to the next space; it looks as if the element names
    # that should delimit a note were lost -- confirm against upstream.
    osis = re.sub(r'(]*?>.*? )', lambda m: processNote(m.group(1)), osis, flags=re.DOTALL)

    # \fm_...\fm*
    osis = re.sub(r'\\fm\b\s(.+?)\\fm\*', r'\1', osis)

    return osis
def processXref(note):
    """Convert cross-reference note-internal USFM tags to OSIS, returning the cross-reference note as a string.

    Keyword arguments:
    note -- The cross-reference note as a string.

    Reads `relaxedConformance` from the enclosing scope to decide whether the
    non-standard xtSee / xtSeeAlso markers are handled.
    """
    fieldEnd = '\uFDDF'
    # a field ends just before the next \x... tag or the next end-of-field marker
    untilNextField = r'(?=(\\x|' + fieldEnd + '))'

    note = ' '.join(note.split('\n'))

    # \xot_  \xnt_  \xdc_  \xq_  \xo_##SEP##  \xk_  \xt_
    # (\xt isn't guaranteed to be *the* reference, but it's a good guess.)
    for tag in ('xot', 'xnt', 'xdc', 'xq', 'xo', 'xk', 'xt'):
        note = re.sub(r'\\' + tag + r'\b\s(.+?)' + untilNextField, fieldEnd + r'\1 ', note)

    if relaxedConformance:
        # TODO: move this to a concordance/index-specific section?
        # \xtSee..\xtSee*: Concordance and Names Index markup for an alternate entry target reference.
        note = re.sub(r'\\xtSee\b\s(.+?)\\xtSee\*', fieldEnd + r'See: \1 ', note)
        # \xtSeeAlso...\xtSeeAlso*: Concordance and Names Index markup for an additional entry target reference.
        note = re.sub(r'\\xtSeeAlso\b\s(.+?)\\xtSeeAlso\*', fieldEnd + r'See also: \1 ', note)

    # \xot*,\xnt*,\xdc*,\xq*,\xt*,\xo*,\xk*
    note = re.sub(r'\\x(ot|nt|dc|q|t|o|k)\*', '', note)

    # drop the temporary end-of-field markers
    return note.replace(fieldEnd, '')
def cvtCrossReferences(osis, relaxedConformance):
    r"""Converts USFM **Cross Reference** tags to OSIS, returning the processed text as a string.

    Supported tags: \x...\x*, \xo, \xk, \xq, \xt, \xot...\xot*, \xnt...\xnt*, \xdc...\xdc*

    Keyword arguments:
    osis -- The document as a string.
    relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags.
    """
    def noteBody(match):
        """Keep the note content (group 2) and terminate it with U+FDDF plus a space."""
        return match.group(2) + '\uFDDF '

    def convertNote(match):
        """Run the matched span through processXref."""
        return processXref(match.group(1))

    # \x_+_...\x*
    # group 1 (the caller character) is dropped
    withNotes = re.sub(r'\\x\s+([^\s]+?)\s+(.+?)\s*\\x\*', noteBody, osis, flags=re.DOTALL)

    # convert the tags inside every span flagged as a cross-reference
    return re.sub(r'(]*?type="crossReference"[^>]*>.*? )', convertNote, withNotes, flags=re.DOTALL)
### Special Text and Character Styles
def cvtSpecialText(osis, relaxedConformance):
"""Converts USFM **Special Text** tags to OSIS, returning the processed text as a string.
Supported tags: \add...\add*, \bk...\bk*, \dc...\dc*, \k...\k*, \lit, \nd...\nd*, \ord...\ord*, \pn...\pn*, \qt...\qt*, \sig...\sig*, \sls...\sls*, \tl...\tl*, \wj...\wj*
Keyword arguments:
osis -- The document as a string.
relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags.
"""
# \add_...\add*
osis = re.sub(r'\\add\s+(.+?)\\add\*', r'\1 ', osis, flags=re.DOTALL)
# \wj_...\wj*
osis = re.sub(r'\\wj\s+(.+?)\\wj\*', r'\1
', osis, flags=re.DOTALL)
# \nd_...\nd*
osis = re.sub(r'\\nd\s+(.+?)\\nd\*', r'\1 ', osis, flags=re.DOTALL)
# \pn_...\pn*
osis = re.sub(r'\\pn\s+(.+?)\\pn\*', r'\1 ', osis, flags=re.DOTALL)
# \qt_...\qt* # TODO:should this be ?
osis = re.sub(r'\\qt\s+(.+?)\\qt\*', r'\1 ', osis, flags=re.DOTALL)
# \sig_...\sig*
osis = re.sub(r'\\sig\s+(.+?)\\sig\*', r'\1 ', osis, flags=re.DOTALL)
# \ord_...\ord*
osis = re.sub(r'\\ord\s+(.+?)\\ord\*', r'\1', osis, flags=re.DOTALL) # semantic incongruity (ordinal -> superscript)
# \tl_...\tl*
osis = re.sub(r'\\tl\s+(.+?)\\tl\*', r'\1 ', osis, flags=re.DOTALL)
# \bk_...\bk*
osis = re.sub(r'\\bk\s+(.+?)\\bk\*', r'\1 ', osis, flags=re.DOTALL)
# \k_...\k*
osis = re.sub(r'\\k\s+(.+?)\\k\*', r'\1 ', osis, flags=re.DOTALL)
# \lit
osis = re.sub(r'\\lit\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr)\b|<(chapter eID|/?div|p|closer)\b))', lambda m: '\uFDD3\n' + m.group(1) + '\uFDD3
\n', osis, flags=re.DOTALL)
# \dc_...\dc* # TODO: Find an example---should this really be transChange?
osis = re.sub(r'\\dc\b\s*(.+?)\\dc\*', r'\1 ', osis, flags=re.DOTALL)
# \sls_...\sls*
osis = re.sub(r'\\sls\b\s*(.+?)\\sls\*', r'/1 ', osis, flags=re.DOTALL) # TODO: find a better mapping than ?
if relaxedConformance:
# \addpn...\addpn*
osis = re.sub(r'\\addpn\s+(.+?)\\addpn\*', r'\1', osis, flags=re.DOTALL)
# \k# # TODO: unsure of this tag's purpose
osis = re.sub(r'\\k1\s+(.+?)\\k1\*', r'\1 ', osis, flags=re.DOTALL)
osis = re.sub(r'\\k2\s+(.+?)\\k2\*', r'\1 ', osis, flags=re.DOTALL)
osis = re.sub(r'\\k3\s+(.+?)\\k3\*', r'\1 ', osis, flags=re.DOTALL)
osis = re.sub(r'\\k4\s+(.+?)\\k4\*', r'\1 ', osis, flags=re.DOTALL)
osis = re.sub(r'\\k5\s+(.+?)\\k5\*', r'\1 ', osis, flags=re.DOTALL)
return osis
def cvtCharacterStyling(osis, relaxedConformance):
    r"""Converts USFM **Character Styling** tags to OSIS, returning the processed text as a string.

    Supported tags: \em...\em*, \bd...\bd*, \it...\it*, \bdit...\bdit*, \no...\no*, \sc...\sc*

    Keyword arguments:
    osis -- The document as a string.
    relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags.
                          (Not read by this function.)

    NOTE(review): every pair is replaced by its bare content; if styling
    markup was meant to wrap the text it is not present here -- confirm
    against the upstream file.
    """
    # \em_...\em*  \bd_...\bd*  \it_...\it*  \bdit_...\bdit*  \no_...\no*  \sc_...\sc*
    # The opening tag must be followed by whitespace, so \bd never matches
    # the start of \bdit.
    for tag in ('em', 'bd', 'it', 'bdit', 'no', 'sc'):
        osis = re.sub(r'\\' + tag + r'\s+(.+?)\\' + tag + r'\*', r'\1', osis, flags=re.DOTALL)
    return osis
def cvtSpacingAndBreaks(osis, relaxedConformance):
"""Converts USFM **Spacing and Breaks** tags to OSIS, returning the processed text as a string.
Supported tags: ~, //, \pb
Keyword arguments:
osis -- The document as a string.
relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags.
"""
# ~
osis = osis.replace('~', '\u00A0')
# //
osis = osis.replace('//', '\uFDE7
')
# \pb
osis = re.sub(r'\\pb\s*', ' \n', osis, flags=re.DOTALL)
return osis
def cvtSpecialFeatures(osis, relaxedConformance):
    r"""Converts USFM **Special Feature** tags to OSIS, returning the processed text as a string.

    Supported tags: \fig...\fig*, \ndx...\ndx*, \pro...\pro*, \w...\w*, \wg...\wg*, \wh...\wh*

    Keyword arguments:
    osis -- The document as a string.
    relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags.
    """
    # \fig DESC|FILE|SIZE|LOC|COPY|CAP|REF\fig*
    def makeFigure(matchObject):
        r"""Regex helper function to convert USFM \fig to OSIS, returning the OSIS element as a string.

        Keyword arguments:
        matchObject -- a regex match object containing the elements of a USFM \fig tag
                       (groups 1-7: DESC, FILE, SIZE, LOC, COPY, CAP, REF)

        NOTE(review): this currently returns an empty string, i.e. the whole
        \fig...\fig* span is deleted and none of the seven fields is used.
        The element-building code appears to be missing -- confirm upstream.
        """
        return ''

    def unwrap(tag, text):
        """Replace `\\<tag> content\\<tag>*` by the content plus a space, keeping trailing whitespace after it."""
        return re.sub(r'\\' + tag + r'\s+(.+?)(\s*)\\' + tag + r'\*', r'\1 \2', text, flags=re.DOTALL)

    osis = re.sub(r'\\fig\b\s+([^\|]*)\s*\|([^\|]*)\s*\|([^\|]*)\s*\|([^\|]*)\s*\|([^\|]*)\s*\|([^\|]*)\s*\|([^\\]*)\s*\\fig\*', makeFigure, osis)

    # \ndx_...\ndx* # TODO tag with x-glossary instead of ? Is containerable?
    osis = unwrap('ndx', osis)

    # \pro_...\pro*
    # groups: 1 = preceding word, 2 = whitespace after it, 3 = pronunciation
    # text (dropped from the output), 4 = whitespace before the closing tag
    osis = re.sub(r'([^\s]+)(\s*)\\pro\s+(.+?)(\s*)\\pro\*', r'\1 \2\4', osis, flags=re.DOTALL)

    # \w_...\w*, \wg_...\wg*, \wh_...\wh*
    wordTags = ['w', 'wg', 'wh']
    if relaxedConformance:
        # \wr...\wr*
        wordTags.append('wr')
    for tag in wordTags:
        osis = unwrap(tag, osis)

    return osis
def cvtPeripherals(osis, relaxedConformance):
    r"""Converts USFM **Peripheral** tags to OSIS, returning the processed text as a string.

    Supported tag: \periph

    Keyword arguments:
    osis -- The document as a string.
    relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags.
                          (Not read by this function.)
    """
    # \periph
    def tagPeriph(matchObject):
        """Regex helper function to tag peripherals, returning the contents wrapped in newlines.

        Keyword arguments:
        matchObject -- a regex match object containing the peripheral type (group 1) and contents (group 2)

        NOTE(review): the peripheral type is captured but not used in the
        output; presumably an enclosing element was meant to carry it -- confirm.
        """
        return '\n' + matchObject.group(2) + '\n'

    # group 1: rest of the \periph line; group 2: the content that follows it.
    # NOTE(review): the look-ahead's first alternative is empty, so it succeeds
    # immediately and the lazy group 2 always captures exactly one character.
    # That alternative looks like it lost a literal terminator -- confirm.
    osis = re.sub(r'\\periph\s+([^'+'\n'+r']+)\s*'+'\n'+r'(.+?)(?=(|\\periph\s+))', tagPeriph, osis, flags=re.DOTALL)
    return osis
def cvtStudyBibleContent(osis, relaxedConformance):
    r"""Converts USFM **Study Bible Content** tags to OSIS, returning the processed text as a string.

    Supported tags: \ef...\ef*, \ex...\ex*, \esb...\esbe, \cat

    Keyword arguments:
    osis -- The document as a string.
    relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags.
                          (Not read by this function.)
    """
    # \ef...\ef*
    # keep the note content (group 2); the caller character (group 1) is dropped
    osis = re.sub(r'\\ef\s+([^\s\\]+?)\s*(.+?)\s*\\ef\*', lambda m: m.group(2) + '\uFDDF ', osis, flags=re.DOTALL)
    # NOTE(review): this pattern matches any '>' followed by text up to the next
    # space and hands it to processNote; the element names that should delimit
    # a note appear to be missing -- confirm against upstream.
    osis = re.sub(r'(]*?>.*? )', lambda m: processNote(m.group(1)), osis, flags=re.DOTALL)

    # \ex...\ex*
    # unlike \ef, a space is inserted between the content and the U+FDDF marker
    osis = re.sub(r'\\ex\s+([^\s]+?)\s+(.+?)\s*\\ex\*', lambda m: m.group(2) + ' \uFDDF ', osis, flags=re.DOTALL)
    osis = re.sub(r'(]*?type="crossReference"[^>]*>.*? )', lambda m: processXref(m.group(1)), osis, flags=re.DOTALL)

    # \esb...\esbe # TODO: this likely needs to go much earlier in the process
    # sidebar content is bracketed by a pair of U+FDD5 markers
    osis = re.sub(r'\\esb\b\s*(.+?)\\esbe\b\s*', '\uFDD5'+r'\1'+'\uFDD5\n', osis, flags=re.DOTALL)

    # \cat_\cat*
    # the category span is replaced by a single space
    osis = re.sub(r'\\cat\b\s+(.+?)\\cat\*', r' ', osis)

    return osis
def cvtPrivateUseExtensions(osis, relaxedConformance):
    r"""Converts USFM **\z namespace** tags to OSIS, returning the processed text as a string.

    Supported tags: \z

    Keyword arguments:
    osis -- The document as a string.
    relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags.
                          (Not read by this function.)
    """
    ### We can't really know what these mean, but will preserve them as elements.

    # publishing assistant markers
    # \zpa-xb...\zpa-xb* : \periph Book
    # \zpa-xc...\zpa-xc* : \periph Chapter
    # \zpa-xv...\zpa-xv* : \periph Verse
    # \zpa-xd...\zpa-xd* : \periph Description
    # TODO: Decide how these should actually be encoded. In lieu of that,
    # these can all be handled by the default \z Namespace handlers:

    # \z{X}...\z{X}*
    # The backslash must be doubled: inside a regex a single "\z" is an escape
    # sequence, not a literal backslash followed by "z", so the paired form
    # could never match the USFM marker. The closing tag reuses the captured
    # name via the \1 back-reference.
    # NOTE(review): only the enclosed text plus a space is emitted; if an
    # element was meant to wrap it, that markup is not visible here -- confirm.
    osis = re.sub(r'\\z([^\s]+)\s(.+?)(\\z\1\*)', r'\2 ', osis, flags=re.DOTALL)

    # \z{X}
    # any remaining stand-alone \z marker is replaced by a single space
    osis = re.sub(r'\\z([^\s]+)', r' ', osis)

    return osis
def processOsisIDs(osis):
"""Perform postprocessing on an OSIS document, returning the processed text as a string.
Recurses through chapter & verses, substituting actual book IDs & chapter numbers for placeholders.
Keyword arguments:
osis -- The document as a string.
"""
# TODO: add support for subverses, including in ranges/series, e.g. Matt.1.1!b-Matt.2.5,Matt.2.7!a
# TODO: make sure that descending ranges generate invalid markup (osisID="")
# expand verse ranges, series
def expandRange(vRange):
"""Expands a verse range into its constituent verses as a string.
Keyword arguments:
vRange -- A string of the lower & upper bounds of the range, with a hyphen in between.
"""
vRange = re.findall(r'\d+', vRange)
osisID = list()
for n in range(int(vRange[0]), int(vRange[1])+1):
osisID.append('$BOOK$.$CHAP$.'+str(n))
return ' '.join(osisID)
osis = re.sub(r'\$BOOK\$\.\$CHAP\$\.(\d+-\d+)"', lambda m: expandRange(m.group(1))+'"', osis)
def expandSeries(vSeries):
"""Expands a verse series (list) into its constituent verses as a string.
Keyword arguments:
vSeries -- A comma-separated list of verses.
"""
vSeries = re.findall(r'\d+', vSeries)
osisID = list()
for n in vSeries:
osisID.append('$BOOK$.$CHAP$.'+str(n))
return ' '.join(osisID)
osis = re.sub(r'\$BOOK\$\.\$CHAP\$\.(\d+(,\d+)+)"', lambda m: expandSeries(m.group(1))+'"', osis)
# fill in book & chapter values
bookChunks = osis.split('\uFDD0')
osis = ''
for bc in bookChunks:
bookValue = re.search(r'\uFDD2)\n?', r'\2'+'\n'+r'\1', osis) # can this ever occur?
# ... --> ...
osis = re.sub('([\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9])([^\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9]*)', r'\2\1', osis)
# delete Unicode non-characters
for c in '\uFDD0\uFDD1\uFDD2\uFDD3\uFDD4\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE\uFDDF\uFDE0\uFDE1\uFDE2\uFDE3\uFDE4\uFDE5\uFDE6\uFDE7\uFDE8\uFDE9\uFDEA\uFDEB\uFDEC\uFDED\uFDEE\uFDEF':
osis = osis.replace(c, '')
#
-->
osis = re.sub('(((\s*)?]*)?>.*? |<([pl]|lg)(\s[^>]*)?>|\s)+)(]*>)', r'\7\1', osis)
# -->
osis = re.sub('(((\s*)?]*)?>.*? |<([pl]|lg)(\s[^>]*)?>|\s)+)(]*>)', r'\7\1', osis)
# -->
osis = re.sub('(]*>)((([pl]|lg)(\s[^>]*)?>|\s)+)', r'\2\1', osis)
# -->
osis = re.sub('(]*>)((([pl]|lg)(\s[^>]*)?>|\s)+)', r'\2\1', osis)
# NOTE --> NOTE
osis = re.sub('()()', r'\2\1', osis)
# delete attributes from end tags (since they are invalid)
osis = re.sub(r'([^\s>]+) [^>]*>', r'\1>', osis)
osis = osis.replace('
', '
')
for endBlock in ['p', 'div', 'note', 'l', 'lg', 'chapter', 'verse', 'head', 'title', 'item', 'list']:
osis = re.sub('\s+'+endBlock+'>', ''+endBlock+r'>\n', osis)
osis = re.sub('\s+<'+endBlock+'( eID=[^/>]+/>)', '<'+endBlock+r'\1'+'\n', osis)
osis = re.sub(' +(([^>]+>)+) *', r'\1 ', osis)
# normalize p, lg, l and other containers for prettier OSIS
osis = re.sub('\s*(?(title|lg)>)\s*', r'\1', osis)
osis = re.sub('\s*((p|l)(?=[\s>])[^>]*>)\s*', r'\1', osis)
osis = re.sub('\s*(<(p|l)(?=[\s>])[^>]*>)\s*', '\n'+r'\1', osis)
osis = re.sub('\s*(]*>)\s*', '\n'+r'\1', osis)
osis = re.sub('\s*(]*>)\s*', r'\1'+'\n', osis)
osis = re.sub('\s*(]*>)\s*', '\n'+r'\1'+'\n', osis)
# strip extra spaces & newlines
osis = re.sub(' +', ' ', osis)
osis = re.sub(' ?\n\n+', '\n', osis)
return osis
### Processing starts here
if encoding:
osis = codecs.open(sFile, 'r', encoding).read().strip() + '\n'
else:
encoding = 'utf-8'
osis = codecs.open(sFile, 'r', encoding).read().strip() + '\n'
# \ide_
encoding = re.search(r'\\ide\s+(.+)'+'\n', osis)
if encoding:
encoding = encoding.group(1).lower().strip()
if encoding != 'utf-8':
if encoding in aliases:
osis = codecs.open(sFile, 'r', encoding).read().strip() + '\n'
else:
print(('WARNING: Encoding "' + encoding + '" unknown, processing ' + sFile + ' as UTF-8'))
encoding = 'utf-8'
if sys.version_info[0] < 3:
osis = osis.lstrip(unichr(0xFEFF))
else:
osis = osis.lstrip(chr(0xFEFF))
# call individual conversion processors in series
osis = cvtPreprocess(osis, relaxedConformance)
osis = cvtRelaxedConformanceRemaps(osis, relaxedConformance)
osis = cvtIdentification(osis, relaxedConformance)
osis = cvtIntroductions(osis, relaxedConformance)
osis = cvtTitles(osis, relaxedConformance)
osis = cvtChaptersAndVerses(osis, relaxedConformance)
osis = cvtParagraphs(osis, relaxedConformance)
osis = cvtPoetry(osis, relaxedConformance)
osis = cvtTables(osis, relaxedConformance)
osis = cvtFootnotes(osis, relaxedConformance)
osis = cvtCrossReferences(osis, relaxedConformance)
osis = cvtSpecialText(osis, relaxedConformance)
osis = cvtCharacterStyling(osis, relaxedConformance)
osis = cvtSpacingAndBreaks(osis, relaxedConformance)
osis = cvtSpecialFeatures(osis, relaxedConformance)
osis = cvtPeripherals(osis, relaxedConformance)
osis = cvtStudyBibleContent(osis, relaxedConformance)
osis = cvtPrivateUseExtensions(osis, relaxedConformance)
osis = processOsisIDs(osis)
osis = osisReorderAndCleanup(osis)
# change type on special books
for sb in specialBooks:
osis = osis.replace('', '')
if DEBUG:
localUnhandledTags = set(re.findall(r'(\\[^\s]*)', osis))
if localUnhandledTags:
print(('Unhandled USFM tags in ' + sFile + ': ' + ', '.join(localUnhandledTags) + ' (' + str(len(localUnhandledTags)) + ' total)'))
return osis
def readIdentifiersFromOsis(filename):
    """Reads a USFM file and stores which Bible book it represents and its localized abbreviation in global variables.

    The book code following the id marker is mapped through bookDict and stored
    in filename2osis[filename]. When the file also has a toc3 marker, its text
    is stored in osis2locBk / loc2osisBk as the localized abbreviation.

    The file is read with the global encoding override when one is set;
    otherwise it is read as UTF-8 and re-read with the encoding named by the
    file's ide marker if that encoding is listed in aliases. The global
    encoding value itself is never modified, so an encoding declared by one
    file does not carry over to the files processed after it.

    Keyword arguments:
    filename -- a USFM filename

    Raises KeyError if the book code after the id marker is not in bookDict.
    """
    global loc2osisBk, osis2locBk, filename2osis

    def _readText(textEncoding):
        # codecs.open() objects are context managers; closing promptly avoids
        # keeping one handle open per input file.
        with codecs.open(filename, 'r', textEncoding) as usfmFile:
            return usfmFile.read().strip() + '\n'

    ### Processing starts here
    if encoding:
        # explicit override: trust it and ignore any \ide marker
        osis = _readText(encoding)
    else:
        # NOTE(review): this first pass decodes strictly as UTF-8, so a file in
        # another encoding containing non-ASCII bytes would presumably raise
        # before its \ide line can be inspected -- confirm.
        osis = _readText('utf-8')
        # \ide_<ENCODING>
        declared = re.search(r'\\ide\s+(.+)'+'\n', osis)
        if declared:
            fileEncoding = declared.group(1).lower().strip()
            # unknown encodings silently stay with the UTF-8 text read above
            if fileEncoding != 'utf-8' and fileEncoding in aliases:
                osis = _readText(fileEncoding)

    # keep a copy of the OSIS book abbreviation for below (\toc3 processing) to store for mapping localized book names to/from OSIS
    osisBook = None
    idMatch = re.search(r'\\id\s+([A-Z0-9]+)', osis)
    if idMatch:
        osisBook = bookDict[idMatch.group(1)]
        filename2osis[filename] = osisBook

    tocMatch = re.search(r'\\toc3\b\s+(.+)\s*'+'\n', osis)
    if tocMatch and osisBook:
        locBook = tocMatch.group(1)
        osis2locBk[osisBook] = locBook
        loc2osisBk[locBook] = osisBook
def verbosePrint(text):
    """Prints text only when the global verbose flag is set; otherwise does nothing.

    Keyword arguments:
    text -- the value to hand to print()
    """
    if not verbose:
        return
    print(text)
def printUsage():
    """Prints usage statement.

    The synopsis, option summary and example are always printed. The list of
    supported encodings goes through verbosePrint, so it only appears in
    verbose mode.
    """
    usageLines = (
        'usfm2osis.py -- USFM ' + usfmVersion + ' to OSIS ' + osisVersion + ' converter version ' + scriptVersion,
        ' Revision: ' + rev + ' (' + date + ')',
        '',
        # the first positional argument is the osisWork name, the trailing ones are the USFM input files
        'Usage: usfm2osis.py <osisWork> [OPTION] ... <USFM filename|wildcard> ...',
        '',
        ' -d debug mode (single-threaded, verbose output)',
        " -e ENCODING input encoding override (default is to read the USFM file's",
        ' \\ide value or assume UTF-8 encoding in its absence)',
        ' -h, --help print this usage information',
        ' -l LANGUAGE input language code - (default "und")',
        ' -o FILENAME output filename (default is: <osisWork>.osis.xml)',
        ' -r enable relaxed markup processing (for non-standard USFM)',
        ' -s mode set book sorting mode: natural (default), alpha, canonical,',
        ' usfm, random, none',
        ' -v verbose feedback',
        ' -x disable XML validation',
        '',
        'As an example, if you want to generate the osisWork <Bible.KJV> and your USFM',
        ' are located in the ./KJV folder, enter:',
        ' python usfm2osis.py Bible.KJV ./KJV/*.usfm',
    )
    for usageLine in usageLines:
        print(usageLine)
    verbosePrint('')
    verbosePrint('Supported encodings: ' + ', '.join(aliases))
class Worker(multiprocessing.Process):
    """Process that pulls USFM filenames off a work queue and converts each one."""

    def __init__(self, work_queue, result_queue):
        """Set up the process and remember its two queues.

        Keyword arguments:
        work_queue -- queue the jobs are taken from
        result_queue -- queue the (job, converted text) pairs are put on
        """
        super(Worker, self).__init__()
        # job management state
        self.kill_received = False
        self.work_queue = work_queue
        self.result_queue = result_queue

    def run(self):
        """Convert jobs until the work queue reports empty or kill_received is set."""
        while True:
            if self.kill_received:
                return
            # fetch the next job without blocking; an empty queue ends this worker
            try:
                job = self.work_queue.get_nowait()
            except Queue.Empty:
                return
            converted = convertToOsis(job)
            # TODO: move XML validation here?
            # hand back the job together with its result so the parent can match them up
            self.result_queue.put((job, converted))
# XML Schema source text; the main script parses it with etree.XML() and builds
# an etree.XMLSchema from it when XML validation is enabled.
# NOTE(review): the literal here is a single space, which presumably cannot be
# parsed as a schema document -- confirm the schema text was not lost.
osisSchema = r' '
if __name__ == "__main__":
    # Command-line entry point: parse the options in sys.argv, convert every
    # USFM file in worker processes, assemble the results into one OSIS
    # document, optionally validate it, and write it to the output file.
    #
    # encoding, relaxedConformance, verbose, DEBUG etc. are assigned at module
    # scope here, so they are already globals; no `global` statement is needed.

    def _optionValue(flag):
        """Return the argument that follows flag in sys.argv.

        Prints the usage statement and exits with status 2 when flag is the
        last argument, instead of failing later with an IndexError.
        """
        valueIdx = sys.argv.index(flag) + 1
        if valueIdx >= len(sys.argv):
            printUsage()
            sys.exit(2)
        return sys.argv[valueIdx]

    num_processes = max(1, multiprocessing.cpu_count()-1)
    num_jobs = num_processes

    encoding = ''
    relaxedConformance = False
    inputFilesIdx = 2 # This marks the point in the sys.argv array, after which all values represent USFM files to be converted.
    usfmDocList = list()

    if '-v' in sys.argv:
        verbose = True
        inputFilesIdx += 1
    else:
        verbose = False

    if '-x' in sys.argv:
        validatexml = False
        inputFilesIdx += 1
    else:
        validatexml = True

    if '-d' in sys.argv:
        # debug mode forces a single worker and verbose output
        DEBUG = True
        inputFilesIdx += 1
        num_processes = 1
        num_jobs = 1
        verbose = True
    else:
        DEBUG = False

    if '-h' in sys.argv or '--help' in sys.argv or len(sys.argv) < 3:
        printUsage()
    else:
        osisWork = sys.argv[1]

        if '-o' in sys.argv:
            osisFileName = _optionValue('-o')
            inputFilesIdx += 2 # increment 2, reflecting 2 args for -o
        else:
            osisFileName = osisWork + '.osis.xml'

        if '-e' in sys.argv:
            encoding = _optionValue('-e')
            inputFilesIdx += 2 # increment 2, reflecting 2 args for -e

        if '-r' in sys.argv:
            relaxedConformance = True
            # relaxed mode also accepts the additional book codes
            bookDict = dict(list(bookDict.items()) + list(addBookDict.items()))
            inputFilesIdx += 1

        if '-l' in sys.argv:
            language = _optionValue('-l')
            inputFilesIdx += 2 # increment 2, reflecting 2 args for -l
        else:
            language = 'und'

        if '-s' in sys.argv:
            sortMode = _optionValue('-s')
            if sortMode.startswith('a'):
                sortKey = None
                print('Sorting book files alphanumerically')
            elif sortMode.startswith('na'):
                sortKey = keynat
                print('Sorting book files naturally')
            elif sortMode.startswith('c'):
                sortKey = keycanon
                print('Sorting book files canonically')
            elif sortMode.startswith('u'):
                sortKey = keyusfm
                print('Sorting book files by USFM book number')
            elif sortMode.startswith('random'): # for testing only
                sortKey = lambda filename: int(random.random()*256)
                print('Sorting book files randomly')
            else:
                sortKey = keysupplied
                print('Leaving book files unsorted, in the order in which they were supplied')
            inputFilesIdx += 2 # increment 2, reflecting 2 args for -s
        else:
            sortKey = keynat
            print('Sorting book files naturally')

        usfmDocList = sys.argv[inputFilesIdx:]

        # gather book identifiers first, before the file list is sorted
        for filename in usfmDocList:
            readIdentifiersFromOsis(filename)
        usfmDocList = sorted(usfmDocList, key=sortKey)

        # run
        # load up work queue
        work_queue = multiprocessing.Queue()
        for job in usfmDocList:
            work_queue.put(job)

        # create a queue to pass to workers to store the results
        result_queue = multiprocessing.Queue()

        # spawn workers
        print('Converting USFM documents to OSIS...')
        for i in range(num_processes):
            worker = Worker(work_queue, result_queue)
            worker.start()

        # collect the results off the queue: exactly one result per input file,
        # arriving in whatever order the workers finish
        osisSegment = dict()
        for _ in usfmDocList:
            k, v = result_queue.get()
            osisSegment[k] = v

        print('Assembling OSIS document')
        conversionInfo = '\n'
        docParts = ['\n\n\n' + conversionInfo + ' \n \n']
        unhandledTags = set()
        for doc in usfmDocList:
            # anything still starting with a backslash is a USFM tag no processor consumed
            unhandledTags.update(re.findall(r'(\\[^\s]*)', osisSegment[doc]))
            docParts.append(osisSegment[doc])
        docParts.append(' \n \n')
        osisDoc = ''.join(docParts)

        if validatexml:
            try:
                from lxml import etree
                print('Validating XML...')
                osisParser = etree.XMLParser(schema = etree.XMLSchema(etree.XML(osisSchema)))
                etree.fromstring(osisDoc, osisParser)
                print('XML Valid')
            except ImportError:
                # lxml is optional; without it the document is written unvalidated
                print('For schema validation, install lxml')
            except etree.XMLSyntaxError as eVal:
                print('XML Validation error: ' + str(eVal))

        # the with-block closes (and thereby flushes) the output file
        with codecs.open(osisFileName, 'w', 'utf-8') as osisFile:
            osisFile.write('\n')
            osisFile.write(osisDoc)

        print('Done!')

        if unhandledTags:
            print('')
            print(('Unhandled USFM tags: ' + ', '.join(sorted(unhandledTags)) + ' (' + str(len(unhandledTags)) + ' total)'))
            if not relaxedConformance:
                print('Consider using the -r option for relaxed markup processing')