#!/usr/bin/env python # -*- coding: utf-8 -*- from __future__ import print_function, unicode_literals date = '$Date$' rev = '$Rev$' id = '$Id$' usfmVersion = '2.35' # http://ubs-icap.org/chm/usfm/2.35/index.html osisVersion = '2.1.1' # http://www.bibletechnologies.net/osisCore.2.1.1.xsd scriptVersion = '0.5' # usfm2osis.py # Copyright 2012 by the CrossWire Bible Society # # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the # Free Software Foundation version 2. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # The full text of the GNU General Public License is available at: # . ### Guidelines & objectives: # Target CPython 2.7+ (but support CPython 3 and other interpreters if possible) # Require no non-default libraries # Don't require SWORD bindings # Handle all USFM characters from the USFM reference: # # Employ best-practice conformant OSIS # Employ modularity (functions rather than a big long script) # Employ the same command-line syntax as usfm2osis.pl # Use non-characters for milestoning ### Roadmap: # 0.5 initial commit, including full coverage of core USFM tags # 0.6 file sorting options (natural/alphabetic/canonical/none); expand sub-verses with ! in osisIDs; Python3 compatability; add optional schema validator (lxml probably); docstrings; unittest; make fully OO; PyDev project? # 0.7 test suite incorporating all USFM examples from UBS ICAP and other complex cases # 0.8 more clean-up & re-ordering to correctly encapsulate milestones within appropriate containers; clear remaining TODO items, to the extent possible # 1.0 feature complete for release & production use # 1.x xreffix.pl-functionality (osisParse(ref)), requiring SWORD bindings; use toc3 for localization # 1.x SWORD-mode output? # 1.x IMP output? # 1.x SWORD module output?, requiring SWORD bindings ### TODO for 0.6: # expand sub-verses with ! in osisIDs # unittest # make fully OO # PyDev project? # check Python2/3 compatibility ### Key to non-characters: # Used : \uFDD0\uFDD1\uFDD2\uFDD3\uFDD4\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE\uFDDF\uFDE0\uFDE1 # Unused : \uFDE7\uFDE8\uFDE9\uFDEA\uFDEB\uFDEC\uFDED\uFDEE\uFDEF # \uFDD0 book # \uFDD1 chapter # \uFDD2 verse # \uFDD3 paragraph # \uFDD4 title # \uFDD5 ms1 # \uFDD6 ms2 # \uFDD7 ms3 # \uFDD8 ms4 # \uFDD9 ms5 # \uFDDA s1 # \uFDDB s2 # \uFDDC s3 # \uFDDD s4 # \uFDDE s5 # \uFDDF notes # \uFDE0 intro-list # \uFDE1 intro-outline # \uFDE2 is1 # \uFDE3 is2 # \uFDE4 is3 # \uFDE5 is4 # \uFDE6 is5 # \uFDD5\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE sections import sys, codecs, re from encodings.aliases import aliases import multiprocessing if sys.version_info[0] < 3: import Queue else: import queue as Queue import random date = date.replace('$', '').strip()[6:16] rev = rev.replace('$', '').strip()[5:] bookDict = { ### Known USFM Book codes from Paratext ### Cf. http://ubs-icap.org/chm/usfm/2.35/index.html?book_codes.htm # OT 'GEN':'Gen', 'EXO':'Exod', 'LEV':'Lev', 'NUM':'Num', 'DEU':'Deut', 'JOS':'Josh', 'JDG':'Judg', 'RUT':'Ruth', '1SA':'1Sam', '2SA':'2Sam', '1KI':'1Kgs', '2KI':'2Kgs', '1CH':'1Chr', '2CH':'2Chr', 'EZR':'Ezra', 'NEH':'Neh', 'EST':'Esth', 'JOB':'Job', 'PSA':'Ps', 'PRO':'Prov', 'ECC':'Eccl', 'SNG':'Song', 'ISA':'Isa', 'JER':'Jer', 'LAM':'Lam', 'EZK':'Ezek', 'DAN':'Dan', 'HOS':'Hos', 'JOL':'Joel', 'AMO':'Amos', 'OBA':'Obad', 'JON':'Jonah', 'MIC':'Mic', 'NAM':'Nah', 'HAB':'Hab', 'ZEP':'Zeph', 'HAG':'Hag', 'ZEC':'Zech', 'MAL':'Mal', # NT 'MAT':'Matt', 'MRK':'Mark', 'LUK':'Luke', 'JHN':'John', 'ACT':'Acts', 'ROM':'Rom', '1CO':'1Cor', '2CO':'2Cor', 'GAL':'Gal', 'EPH':'Eph', 'PHP':'Phil', 'COL':'Col', '1TH':'1Thess', '2TH':'2Thess', '1TI':'1Tim', '2TI':'2Tim', 'TIT':'Titus', 'PHM':'Phlm', 'HEB':'Heb', 'JAS':'Jas', '1PE':'1Pet', '2PE':'2Pet', '1JN':'1John', '2JN':'2John', '3JN':'3John', 'JUD':'Jude', 'REV':'Rev', # DC - Catholic 'TOB':'Tob', 'JDT':'Jdt', 'ESG':'EsthGr', 'WIS':'Wis', 'SIR':'Sir', 'BAR':'Bar', 'LJE':'EpJer', 'S3Y':'PrAzar', 'SUS':'Sus', 'BEL':'Bel', '1MA':'1Macc', '2MA':'2Macc', # DC - Eastern Orthodox '3MA':'3Macc', '4MA':'4Macc', '1ES':'1Esd', '2ES':'2Esd', 'MAN':'PrMan', 'PS2':'AddPs', # Rahlfs' LXX 'ODA':'Odes', 'PSS':'PssSol', # Esdrae 'EZA':'4Ezra', '5EZ':'5Ezra', '6EZ':'6Ezra', # Inconsistency with Esther 'DAG':'DanGr', # Syriac 'PS3':'5ApocSyrPss', '2BA':'2Bar', 'LBA':'EpBar', # Ethiopic 'JUB':'Jub', 'ENO':'1En', '1MQ':'1Meq', '2MQ':'2Meq', '3MQ':'3Meq', 'REP':'Reproof', '4BA':'4Bar', # Vulgate 'LAO':'EpLao', # Additional non-biblical books 'XXA':'XXA', 'XXB':'XXB', 'XXC':'XXC', 'XXD':'XXD', 'XXE':'XXE', 'XXF':'XXF', 'XXG':'XXG', # Peripheral books 'FRT':'FRONT', 'INT':'INTRODUCTION', 'BAK':'BACK', 'CNC':'CONCORDANCE', 'GLO':'GLOSSARY', 'TDX':'INDEX', 'NDX':'GAZETTEER', 'OTH':'X-OTHER' } addBookDict = { ### Deprecated # Rahlfs 'JSA':'JoshA', 'JDB':'JudgB', 'TBS':'TobS', 'SST':'SusTh', 'DNT':'DanTh', 'BLT':'BelTh', # Esdrae '4ES':'4Ezra', '5ES':'5Ezra', '6ES':'6Ezra', ### Proposed Additions # Alternate Psalms 'PSB':'PsMet', # Vulgate 'PSO':'PrSol', 'PJE':'PrJer', # Armenian 'WSI':'WSir', 'COP':'EpCorPaul', '3CO':'3Cor', 'EUT':'PrEuth', 'DOJ':'DormJohn', # Apostolic Fathers '1CL':'1Clem', '2CL':'2Clem', 'SHE':'Herm', 'LBA':'Barn', 'DID':'Did', ### # Proposed replacements 'ODE':'Odes', # Additional biblical books 'ADE':'AddEsth' } canonicalOrder = [ # General principles of ordering: # 1) Protocanonical books follow standard Protestant order within OT & NT # 2) Intertestamentals follow the OT # 3) NT-Apocrypha follow the NT # 4) Apostolic Fathers follow NT-deuterocanonicals # Specific principles: # 1) Book representing parts of protocanonical books follow the primary book # 2) Variants follow primary forms # 3) Books that appear in only one tradition or Bible appear following their traditional/attested antecedent # There's no fool-proof way to order books without knowing the tradition ahead of time, # but this ordering should get it right often for many common real Bibles. # Front Matter 'FRONT', 'INTRODUCTION', # OT 'Gen', 'Exod', 'Lev', 'Num', 'Deut', 'Josh', 'JoshA', 'Judg', 'JudgB', 'Ruth', '1Sam', '2Sam', '1Kgs', '2Kgs', '1Chr', '2Chr', 'PrMan', 'Jub', '1En', 'Ezra', 'Neh', 'Tob', 'TobS', 'Jdt', 'Esth', 'EsthGr', 'AddEsth', '1Meq', '2Meq', '3Meq', 'Job', 'Ps', 'AddPs', '5ApocSyrPss', 'PsMet', 'Odes', 'Prov', 'Reproof', 'Eccl', 'Song', 'Wis', 'Sir', 'WSir', 'PrSol', 'PssSol', 'Isa', 'Jer', 'Lam', 'PrJer', 'Bar', 'EpJer', '2Bar', 'EpBar', '4Bar', 'Ezek', 'Dan', 'DanGr', 'DanTh', 'PrAzar', 'Sus', 'SusTh', 'Bel', 'BelTh', 'Hos', 'Joel', 'Amos', 'Obad', 'Jonah', 'Mic', 'Nah', 'Hab', 'Zeph', 'Hag', 'Zech', 'Mal', # Intertestamentals '1Esd', '2Esd', '4Ezra', '5Ezra', '6Ezra', '1Macc', '2Macc', '3Macc', '4Macc', # NT 'Matt', 'Mark', 'Luke', 'John', 'Acts', 'Rom', '1Cor', '2Cor', 'Gal', 'Eph', 'Phil', 'Col', '1Thess', '2Thess', '1Tim', '2Tim', 'Titus', 'Phlm', 'Heb', 'Jas', '1Pet', '2Pet', '1John', '2John', '3John', 'Jude', 'Rev', # NT-Apocrypha 'EpLao', 'EpCorPaul', '3Cor', 'PrEuth', 'DormJohn', # AF '1Clem', '2Clem', 'Herm', 'Barn', 'Did', # Private-Use Extensions 'XXA', 'XXB', 'XXC', 'XXD', 'XXE', 'XXF', 'XXG', # Back Matter 'BACK', 'CONCORDANCE', 'GLOSSARY', 'INDEX', 'GAZETTEER', 'X-OTHER' ] usfmNumericOrder = [ # Front Matter 'FRONT', 'INTRODUCTION', # OT 01-39 'Gen', 'Exod', 'Lev', 'Num', 'Deut', 'Josh', 'Judg', 'Ruth', '1Sam', '2Sam', '1Kgs', '2Kgs', '1Chr', '2Chr', 'Ezra', 'Neh', 'Esth', 'Job', 'Ps', 'Prov', 'Eccl', 'Song', 'Isa', 'Jer', 'Lam', 'Ezek', 'Dan', 'Hos', 'Joel', 'Amos', 'Obad', 'Jonah', 'Mic', 'Nah', 'Hab', 'Zeph', 'Hag', 'Zech', 'Mal', # NT 41-67 'Matt', 'Mark', 'Luke', 'John', 'Acts', 'Rom', '1Cor', '2Cor', 'Gal', 'Eph', 'Phil', 'Col', '1Thess', '2Thess', '1Tim', '2Tim', 'Titus', 'Phlm', 'Heb', 'Jas', '1Pet', '2Pet', '1John', '2John', '3John', 'Jude', 'Rev', # Apocrypha 68-87 (plus AddEsth, inserted after EsthGr) 'Tob', 'Jdt', 'EsthGr', 'AddEsth', 'Wis', 'Sir', 'Bar', 'EpJer', 'PrAzar', 'Sus', 'Bel', '1Macc', '2Macc', '3Macc', '4Macc', '1Esd', '2Esd', 'PrMan', 'AddPs', 'Odes', 'PssSol', # Esdrae A4-A6 '4Ezra', '5Ezra', '6Ezra', # Gk. Daniel, Syriac additions, Ethiopic additions, Laodiceans B2-C2 'DanGr', '5ApocSyrPss', '2Bar', 'EpBar', 'Jub', '1En', '1Meq', '2Meq', '3Meq', 'Reproof', '4Bar', 'EpLao', # Books not currently adopted into USFM, in order given by BFBS # Metrical Psalms 'PsMet', # Vulgate 'PrSol', 'PrJer', # Armenian 'WSir', 'EpCorPaul', '3Cor', 'PrEuth', 'DormJohn', # NT Codices '1Clem', '2Clem', 'Herm', 'Barn', 'Did', # Books not currently adopted into USFM, recommended for removal by BFBS 'JoshA', 'JudgB', 'TobS', 'DanTh', 'SusTh', 'BelTh', # Private-Use Extensions 'XXA', 'XXB', 'XXC', 'XXD', 'XXE', 'XXF', 'XXG', # Back Matter 'BACK', 'CONCORDANCE', 'GLOSSARY', 'INDEX', 'GAZETTEER', 'X-OTHER' ] specialBooks = ['FRONT', 'INTRODUCTION', 'BACK', 'CONCORDANCE', 'GLOSSARY', 'INDEX', 'GAZETTEER', 'X-OTHER'] peripherals = { 'Title Page':'titlePage', 'Half Title Page':'x-halfTitlePage', 'Promotional Page':'x-promotionalPage', 'Imprimatur':'imprimatur', 'Publication Data':'publicationData', 'Foreword':'x-foreword', 'Preface':'preface', 'Table of Contents':'tableofContents', 'Alphabetical Contents':'x-alphabeticalContents', 'Table of Abbreviations':'x-tableofAbbreviations', 'Chronology':'x-chronology', 'Weights and Measures':'x-weightsAndMeasures', 'Map Index':'x-mapIndex', 'NT Quotes from LXX':'x-ntQuotesFromLXX', 'Cover':'coverPage', 'Spine':'x-spine' } introPeripherals = { 'Bible Introduction':'bible', 'Old Testament Introduction':'oldTestament', 'Pentateuch Introduction':'pentateuch', 'History Introduction':'history', 'Poetry Introduction':'poetry', 'Prophecy Introduction':'prophecy', 'New Testament Introduction':'newTestament', 'Gospels Introduction':'gospels', 'Acts Introduction':'acts', 'Epistles Introduction':'epistles', 'Letters Introduction':'letters', 'Deuterocanon Introduction':'deuterocanon' } osis2locBk = dict() loc2osisBk = dict() filename2osis = dict() verbose = bool() ucs4 = (sys.maxunicode > 0xFFFF) # BEGIN PSF-licensed segment # keynat from http://code.activestate.com/recipes/285264-natural-string-sorting/ def keynat(string): r'''A natural sort helper function for sort() and sorted() without using regular expressions or exceptions. >>> items = ('Z', 'a', '10th', '1st', '9') >>> sorted(items) ['10th', '1st', '9', 'Z', 'a'] >>> sorted(items, key=keynat) ['1st', '9', '10th', 'a', 'Z'] ''' it = type(1) r = [] for c in string: if c.isdigit(): d = int(c) if r and type( r[-1] ) == it: r[-1] = r[-1] * 10 + d else: r.append(d) else: r.append(c.lower()) return r # END PSF-licened segment def keycanon(filename): """Sort helper function that orders according to canon position (defined in canonicalOrder list), returning canonical position or infinity if not in the list.""" if filename in filename2osis: return canonicalOrder.index(filename2osis[filename]) return float('inf') def keyusfm(filename): """Sort helper function that orders according to USFM book number (defined in usfmNumericOrder list), returning USFM book number or infinity if not in the list.""" if filename in filename2osis: return usfmNumericOrder.index(filename2osis[filename]) return float('inf') def keysupplied(filename): """Sort helper function that keeps the items in the order in which they were supplied (i.e. it doesn't sort at all), returning the number of times the function has been called.""" if not hasattr(keysupplied, "counter"): keysupplied.counter = 0 keysupplied.counter += 1 return keysupplied.counter def convertToOsis(sFile): """Open a USFM file and return a string consisting of its OSIS equivalent. Keyword arguments: sFile -- Path to the USFM file to be converted """ global encoding global relaxedConformance verbosePrint(('Processing: ' + sFile)) def cvtPreprocess(osis, relaxedConformance): """Perform preprocessing on a USFM document, returning the processed text as a string. Removes excess spaces & CRs and escapes XML entities. Keyword arguments: osis -- The document as a string. relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags. """ # lines should never start with non-tags osis = re.sub('\n\s*([^\\\s])', r' \1', osis) # TODO: test this # convert CR to LF osis = osis.replace('\r', '\n') # lines should never end with whitespace (other than \n) osis = re.sub('\s+\n', '\n', osis) # replace with XML entities, as necessary osis = osis.replace('&', '&') osis = osis.replace('<', '<') osis = osis.replace('>', '>') #osis = re.sub('\n'+r'(\\[^\s]+\b\*)', r' \1', osis) return osis def cvtRelaxedConformanceRemaps(osis, relaxedConformance): """Perform preprocessing on a USFM document, returning the processed text as a string. Remaps certain deprecated USFM tags to recommended alternatives. Keyword arguments: osis -- The document as a string. relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags. """ if not relaxedConformance: return osis # \tr#: DEP: map to \tr osis = re.sub(r'\\tr\d\b', r'\\tr', osis) # remapped 2.0 periphs # \pub osis = re.sub(r'\\pub\b\s', '\\periph Publication Data\n', osis) # \toc : \periph Table of Contents osis = re.sub(r'\\toc\b\s', '\\periph Table of Contents\n', osis) # \pref osis = re.sub(r'\\pref\b\s', '\\periph Preface\n', osis) # \maps osis = re.sub(r'\\maps\b\s', '\\periph Map Index\n', osis) # \cov osis = re.sub(r'\\cov\b\s', '\\periph Cover\n', osis) # \spine osis = re.sub(r'\\spine\b\s', '\\periph Spine\n', osis) # \pubinfo osis = re.sub(r'\\pubinfo\b\s', '\\periph Publication Information\n', osis) # \intro osis = re.sub(r'\\intro\b\s', '\\id INT\n', osis) # \conc osis = re.sub(r'\\conc\b\s', '\\id CNC\n', osis) # \glo osis = re.sub(r'\\glo\b\s', '\\id GLO\n', osis) # \idx osis = re.sub(r'\\idx\b\s', '\\id TDX\n', osis) return osis def cvtIdentification(osis, relaxedConformance): """Converts USFM **Identification** tags to OSIS, returning the processed text as a string. Supported tags: \id, \ide, \sts, \rem, \h, \toc1, \toc2, \toc3 Keyword arguments: osis -- The document as a string. relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags. """ # \id__(Name of file, Book name, Language, Last edited, Date etc.) osis = re.sub(r'\\id\s+([A-Z0-9]{3})\b\s*([^\\'+'\n]*?)\n'+r'(.*)(?=\\id|$)', lambda m: '\uFDD0
\n' + (('\n') if m.group(2) else '') + m.group(3) + '
\uFDD0\n' , osis, flags=re.DOTALL) # \ide_ osis = re.sub(r'\\ide\b.*'+'\n', '', osis) # delete, since this was handled above # \sts_ osis = re.sub(r'\\sts\b\s+(.+)\s*'+'\n', r''+'\n', osis) # \rem_text... osis = re.sub(r'\\rem\b\s+(.+)', r'', osis) # \restore_text... if relaxedConformance: osis = re.sub(r'\\restore\b\s+(.+)', r'', osis) # \h#_text... osis = re.sub(r'\\h\b\s+(.+)\s*'+'\n', r'\1'+'\n', osis) osis = re.sub(r'\\h(\d)\b\s+(.+)\s*'+'\n', r'\2'+'\n', osis) # \toc1_text... osis = re.sub(r'\\toc1\b\s+(.+)\s*'+'\n', r''+'\n', osis) # \toc2_text... osis = re.sub(r'\\toc2\b\s+(.+)\s*'+'\n', r''+'\n', osis) # \toc3_text... osis = re.sub(r'\\toc3\b\s+(.+)\s*'+'\n', lambda m: r''+'\n', osis) return osis def cvtIntroductions(osis, relaxedConformance): """Converts USFM **Introduction** tags to OSIS, returning the processed text as a string. Supported tags: \imt#, \is#, \ip, \ipi, \im, \imi, \ipq, \imq, \ipr, \iq#, \ib, \ili#, \iot, \io#, \ior...\ior*, \iex, \iqt...\iqt*, \imte, \ie Keyword arguments: osis -- The document as a string. relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags. """ # \imt#_text... osis = re.sub(r'\\imt(\d?)\s+(.+)', lambda m: '' + m.group(2) + '', osis) # \imte#_text... osis = re.sub(r'\\imte(\d?)\b\s+(.+)', lambda m: '' + m.group(2) + '', osis) # \is#_text... osis = re.sub(r'\\is1?\s+(.+)', lambda m: '\uFDDA
' + m.group(1) + '', osis) osis = re.sub('(\uFDDA[^\uFDD5\uFDD0\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA]+)', r'\1'+'
\uFDDA\n', osis, flags=re.DOTALL) osis = re.sub(r'\\is2\s+(.+)', lambda m: '\uFDDB
' + m.group(1) + '', osis) osis = re.sub('(\uFDDB[^\uFDD5\uFDD0\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB]+)', r'\1'+'
\uFDDB\n', osis, flags=re.DOTALL) osis = re.sub(r'\\is3\s+(.+)', lambda m: '\uFDDC
' + m.group(1) + '', osis) osis = re.sub('(\uFDDC[^\uFDD5\uFDD0\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC]+)', r'\1'+'
\uFDDC\n', osis, flags=re.DOTALL) osis = re.sub(r'\\is4\s+(.+)', lambda m: '\uFDDD
' + m.group(1) + '', osis) osis = re.sub('(\uFDDD[^\uFDD5\uFDD0\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD]+)', r'\1'+'
\uFDDD\n', osis, flags=re.DOTALL) osis = re.sub(r'\\is5\s+(.+)', lambda m: '\uFDDE
' + m.group(1) + '', osis) osis = re.sub('(\uFDDE[^\uFDD5\uFDD0\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE]+)', r'\1'+'
\uFDDE\n', osis, flags=re.DOTALL) # \ip_text... osis = re.sub(r'\\ip\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr|io|iq|i?li|iex?|s)|\n' + m.group(1) + '\uFDD3

\n', osis, flags=re.DOTALL) # \ipi_text... # \im_text... # \imi_text... # \ipq_text... # \imq_text... # \ipr_text... pType = {'ipi':'x-indented', 'im':'x-noindent', 'imi':'x-noindent-indented', 'ipq':'x-quote', 'imq':'x-noindent-quote', 'ipr':'x-right'} osis = re.sub(r'\\(ipi|im|ipq|imq|ipr)\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr|io|iq|i?li|iex?|s)|\n' + m.group(2) + '\uFDD3

\n', osis, flags=re.DOTALL) # \iq#_text... osis = re.sub(r'\\iq\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\i?q[\d\s]|\\fig|\1', osis, flags=re.DOTALL) osis = re.sub(r'\\iq(\d)\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\i?q[\d\s]|\\fig|\2', osis, flags=re.DOTALL) # \ib osis = re.sub(r'\\ib\b\s?', '', osis) osis = osis.replace('\n', '\n') osis = re.sub('()', r'\1', osis, flags=re.DOTALL) osis = re.sub('(.+?)', lambda m: m.group(1).replace('', ''), osis, flags=re.DOTALL) # re-handle \b that occurs within # \ili#_text... osis = re.sub(r'\\ili\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\ili[\d\s]|\uFDE0'+r'\1'+'\uFDE0', osis, flags=re.DOTALL) osis = re.sub(r'\\ili(\d)\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\ili[\d\s]|\uFDE0'+r'\2'+'\uFDE0', osis, flags=re.DOTALL) osis = osis.replace('\n', '\n') osis = re.sub('()', '\uFDD3'+r'\1'+'\uFDD3', osis, flags=re.DOTALL) # \iot_text... # \io#_text...(references range) osis = re.sub(r'\\io\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\io[t\d\s]|\\iex?|\uFDE1'+r'\1'+'\uFDE1', osis, flags=re.DOTALL) osis = re.sub(r'\\io(\d)\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\io[t\d\s]|\\iex?|\uFDE1'+r'\2'+'\uFDE1', osis, flags=re.DOTALL) osis = re.sub(r'\\iot\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\io[t\d\s]|\\iex?|\uFDE1'+r'\1'+'\uFDE1', osis, flags=re.DOTALL) osis = osis.replace('\n', '\n') osis = re.sub('()', '\uFDD3
'+r'\1'+'
\uFDD3', osis, flags=re.DOTALL) osis = re.sub('item type="head"', 'head', osis) # \ior_text...\ior* osis = re.sub(r'\\ior\b\s+(.+?)\\ior\*', r'\1', osis, flags=re.DOTALL) # \iex # TODO: look for example; I have no idea what this would look like in context osis = re.sub(r'\\iex\b\s*(.+?)'+'?=(\s*(\\c|\uFDD0))', r'
\1
', osis, flags=re.DOTALL) # \iqt_text...\iqt* osis = re.sub(r'\\iqt\s+(.+?)\\iqt\*', r'\1', osis, flags=re.DOTALL) # \ie osis = re.sub(r'\\ie\b\s*', '', osis) return osis def cvtTitles(osis, relaxedConformance): """Converts USFM **Title, Heading, and Label** tags to OSIS, returning the processed text as a string. Supported tags: \mt#, \mte#, \ms#, \mr, \s#, \sr, \r, \rq...\rq*, \d, \sp Keyword arguments: osis -- The document as a string. relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags. """ # \ms#_text... osis = re.sub(r'\\ms1?\s+(.+)', lambda m: '\uFDD5
' + m.group(1) + '', osis) osis = re.sub('(\uFDD5[^\uFDD5\uFDD0]+)', r'\1'+'
\uFDD5\n', osis, flags=re.DOTALL) osis = re.sub(r'\\ms2\s+(.+)', lambda m: '\uFDD6
' + m.group(1) + '', osis) osis = re.sub('(\uFDD6[^\uFDD5\uFDD0\uFDD6]+)', r'\1'+'
\uFDD6\n', osis, flags=re.DOTALL) osis = re.sub(r'\\ms3\s+(.+)', lambda m: '\uFDD7
' + m.group(1) + '', osis) osis = re.sub('(\uFDD7[^\uFDD5\uFDD0\uFDD6\uFDD7]+)', r'\1'+'
\uFDD7\n', osis, flags=re.DOTALL) osis = re.sub(r'\\ms4\s+(.+)', lambda m: '\uFDD8
' + m.group(1) + '', osis) osis = re.sub('(\uFDD8[^\uFDD5\uFDD0\uFDD6\uFDD7\uFDD8]+)', r'\1'+'
\uFDD8\n', osis, flags=re.DOTALL) osis = re.sub(r'\\ms5\s+(.+)', lambda m: '\uFDD9
' + m.group(1) + '', osis) osis = re.sub('(\uFDD9[^\uFDD5\uFDD0\uFDD6\uFDD7\uFDD8\uFDD9]+)', r'\1'+'
\uFDD9\n', osis, flags=re.DOTALL) # \mr_text... osis = re.sub(r'\\mr\s+(.+)', '\uFDD4<reference>'+r'\1</reference>', osis) # \s#_text... osis = re.sub(r'\\s1?\s+(.+)', lambda m: '\uFDDA
' + m.group(1) + '', osis) osis = re.sub('(\uFDDA
[^\uFDD5\uFDD0\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA]+)', r'\1'+'
\uFDDA\n', osis, flags=re.DOTALL) if relaxedConformance: osis = re.sub(r'\\ss\s+', r'\\s2 ', osis) osis = re.sub(r'\\sss\s+', r'\\s3 ', osis) osis = re.sub(r'\\s2\s+(.+)', lambda m: '\uFDDB
' + m.group(1) + '', osis) osis = re.sub('(\uFDDB
[^\uFDD5\uFDD0\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB]+)', r'\1'+'
\uFDDB\n', osis, flags=re.DOTALL) osis = re.sub(r'\\s3\s+(.+)', lambda m: '\uFDDC
' + m.group(1) + '', osis) osis = re.sub('(\uFDDC
[^\uFDD5\uFDD0\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC]+)', r'\1'+'
\uFDDC\n', osis, flags=re.DOTALL) osis = re.sub(r'\\s4\s+(.+)', lambda m: '\uFDDD
' + m.group(1) + '', osis) osis = re.sub('(\uFDDD
[^\uFDD5\uFDD0\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD]+)', r'\1'+'
\uFDDD\n', osis, flags=re.DOTALL) osis = re.sub(r'\\s5\s+(.+)', lambda m: '\uFDDE
' + m.group(1) + '', osis) osis = re.sub('(\uFDDE
[^\uFDD5\uFDD0\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE]+)', r'\1'+'
\uFDDE\n', osis, flags=re.DOTALL) # \sr_text... osis = re.sub(r'\\sr\s+(.+)', '\uFDD4<reference>'+r'\1</reference>', osis) # \r_text... osis = re.sub(r'\\r\s+(.+)', '\uFDD4<reference type="parallel">'+r'\1</reference>', osis) # \rq_text...\rq* osis = re.sub(r'\\rq\s+(.+?)\\rq\*', r'\1', osis, flags=re.DOTALL) # \d_text... osis = re.sub(r'\\d\s+(.+)', '\uFDD4'+r'\1', osis) # \sp_text... osis = re.sub(r'\\sp\s+(.+)', r'\1', osis) # \mt#_text... osis = re.sub(r'\\mt(\d?)\s+(.+)', lambda m: '' + m.group(2) + '', osis) # \mte#_text... osis = re.sub(r'\\mte(\d?)\s+(.+)', lambda m: '' + m.group(2) + '', osis) return osis def cvtChaptersAndVerses(osis, relaxedConformance): """Converts USFM **Chapter and Verse** tags to OSIS, returning the processed text as a string. Supported tags: \c, \ca...\ca*, \cl, \cp, \cd, \v, \va...\va*, \vp...\vp* Keyword arguments: osis -- The document as a string. relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags. """ # \c_# osis = re.sub(r'\\c\s+([^\s]+)\b(.+?)(?=(\\c\s+|
' + m.group(2) + '\uFDD3\n', osis, flags=re.DOTALL) # \cp_# # \ca_#\ca* def replaceChapterNumber(matchObject): """Regex helper function to replace chapter numbers from \c_# with values that appeared in \cp_# and \ca_#\ca*, returing the chapter text as a string. Keyword arguments: matchObject -- a regex match object in which the first element is the chapter text """ ctext = matchObject.group(1) cp = re.search(r'\\cp\s+(.+?)(?=(\\|\s))', ctext) if cp: ctext = re.sub(r'\\cp\s+(.+?)(?=(\\|\s))', '', ctext, flags=re.DOTALL) cp = cp.group(1) ctext = re.sub(r'"\$BOOK\$\.([^"\.]+)"', '"$BOOK$.'+cp+'"', ctext) ca = re.search(r'\\ca\s+(.+?)\\ca\*', ctext) if ca: ctext = re.sub(r'\\ca\s+(.+?)\\ca\*', '', ctext, flags=re.DOTALL) ca = ca.group(1) ctext = re.sub(r'(osisID="\$BOOK\$\.[^"\.]+)"', r'\1 $BOOK$.'+ca+'"', ctext) return ctext osis = re.sub(r'(.+?]+/>)', replaceChapterNumber, osis, flags=re.DOTALL) # \cl_ osis = re.sub(r'\\cl\s+(.+)', '\uFDD4'+r'\1', osis) # \cd_# <--This # seems to be an error osis = re.sub(r'\\cd\b\s+(.+)', '\uFDD4'+r'\1', osis) # \v_# osis = re.sub(r'\\v\s+([^\s]+)\b\s*(.+?)(?=(\\v\s+|
' + m.group(2) + '\uFDD2\n', osis, flags=re.DOTALL) # \vp_#\vp* # \va_#\va* def replaceVerseNumber(matchObject): """Regex helper function to replace verse numbers from \v_# with values that appeared in \vp_#\vp* and \va_#\va*, returing the verse text as a string. Keyword arguments: matchObject -- a regex match object in which the first element is the verse text """ vtext = matchObject.group(1) vp = re.search(r'\\vp\s+(.+?)\\vp\*', vtext) if vp: vtext = re.sub(r'\\vp\s+(.+?)\\vp\*', '', vtext, flags=re.DOTALL) vp = vp.group(1) vtext = re.sub(r'"\$BOOK\$\.\$CHAP\$\.([^"\.]+)"', '"$BOOK$.$CHAP$.'+vp+'"', vtext) va = re.search(r'\\va\s+(.+?)\\va\*', vtext) if va: vtext = re.sub(r'\\va\s+(.+?)\\va\*', '', vtext, flags=re.DOTALL) va = va.group(1) vtext = re.sub(r'(osisID="\$BOOK\$\.\$CHAP\$\.[^"\.]+)"', r'\1 $BOOK$.$CHAP$.'+va+'"', vtext) return vtext osis = re.sub(r'(.+?]+/>)', replaceVerseNumber, osis, flags=re.DOTALL) return osis def cvtParagraphs(osis, relaxedConformance): """Converts USFM **Paragraph** tags to OSIS, returning the processed text as a string. Supported tags: \p, \m, \pmo, \pm, \pmc, \pmr, \pi#, \mi, \nb, \cls, \li#, \pc, \pr, \ph#, \b Keyword arguments: osis -- The document as a string. relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags. """ # \p(_text...) osis = re.sub(r'\\p\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|\n' + m.group(1) + '\uFDD3

\n', osis, flags=re.DOTALL) # \pc(_text...) # \pr(_text...) # \m(_text...) # \pmo(_text...) # \pm(_text...) # \pmc(_text...) # \pmr_text... # deprecated: map to same as \pr # \pi#(_Sample text...) # \mi(_text...) # \nb # \phi # deprecated # \ps # deprecated # \psi # deprecated # \p# # deprecated pType = {'pc':'x-center', 'pr':'x-right', 'm':'x-noindent', 'pmo':'x-embedded-opening', 'pm':'x-embedded', 'pmc':'x-embedded-closing', 'pmr':'x-right', 'pi':'x-indented-1', 'pi1':'x-indented-1', 'pi2':'x-indented-2', 'pi3':'x-indented-3', 'pi4':'x-indented-4', 'pi5':'x-indented-5', 'mi':'x-noindent-indented', 'nb':'x-nobreak', 'phi':'x-indented-hanging', 'ps':'x-nobreakNext', 'psi':'x-nobreakNext-indented', 'p1':'x-level-1', 'p2':'x-level-2', 'p3':'x-level-3', 'p4':'x-level-4', 'p5':'x-level-5'} paragraphregex = 'pc|pr|m|pmo|pm|pmc|pmr|pi|pi1|pi2|pi3|pi4|pi5|mi|nb' if relaxedConformance: paragraphregex += '|phi|ps|psi|p1|p2|p3|p4|p5' osis = re.sub(r'\\('+paragraphregex+r')\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|\n' + m.group(2) + '\uFDD3

\n', osis, flags=re.DOTALL) # \cls_text... osis = re.sub(r'\\m\s+(.+?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|' + m.group(1) + '\uFDD3\n', osis, flags=re.DOTALL) # \ph#(_text...) # \li#(_text...) osis = re.sub(r'\\ph\b\s*', r'\\li ', osis) osis = re.sub(r'\\ph(\d)\b\s*', r'\\li\1 ', osis) osis = re.sub(r'\\li\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\li[\d\s]|\1', osis, flags=re.DOTALL) osis = re.sub(r'\\li(\d)\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\li[\d\s]|\2', osis, flags=re.DOTALL) osis = osis.replace('\n', '\n') osis = re.sub('()', '\uFDD3'+r'\1'+'\uFDD3', osis, flags=re.DOTALL) # \b osis = re.sub(r'\\b\b\s?', '', osis) return osis def cvtPoetry(osis, relaxedConformance): """Converts USFM **Poetry** tags to OSIS, returning the processed text as a string. Supported tags: \q#, \qr, \qc, \qs...\qs*, \qa, \qac...\qac*, \qm#, \b Keyword arguments: osis -- The document as a string. relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags. """ # \qs_(Selah)\qs* osis = re.sub(r'\\qs\b\s(.+?)\\qs\*', r'\1', osis, flags=re.DOTALL) # \q#(_text...) osis = re.sub(r'\\q\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE'+r']|\\q[\d\s]|\\fig|\1', osis, flags=re.DOTALL) osis = re.sub(r'\\q(\d)\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE'+r']|\\q[\d\s]|\\fig|\2', osis, flags=re.DOTALL) # \qr_text... # \qc_text... # \qm#(_text...) qType = {'qr':'x-right', 'qc':'x-center', 'qm':'x-embedded" level="1', 'qm1':'x-embedded" level="1', 'qm2':'x-embedded" level="2', 'qm3':'x-embedded" level="3', 'qm4':'x-embedded" level="4', 'qm5':'x-embedded" level="5'} osis = re.sub(r'\\(qr|qc|qm\d)\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE'+r']|\\q[\d\s]|\\fig|' + m.group(2) + '', osis, flags=re.DOTALL) osis = osis.replace('\n', '\n') osis = re.sub('()', r'\1', osis, flags=re.DOTALL) # \b osis = re.sub('(.+?)', lambda m: m.group(1).replace('', ''), osis, flags=re.DOTALL) # re-handle \b that occurs within # \qa_text... osis = re.sub(r'\\qa\s+(.+)', '\uFDD4'+r'\1', osis) # \qac_text...\qac* osis = re.sub(r'\\qac\s+(.+?)\\qac\*', r'\1', osis, flags=re.DOTALL) return osis def cvtTables(osis, relaxedConformance): """Converts USFM **Table** tags to OSIS, returning the processed text as a string. Supported tags: \tr, \th#, \thr#, \tc#, \tcr# Keyword arguments: osis -- The document as a string. relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags. """ # \tr_ osis = re.sub(r'\\tr\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\tr\s|\1', osis, flags=re.DOTALL) # \th#_text... # \thr#_text... # \tc#_text... # \tcr#_text... tType = {'th':' role="label"', 'thr':' role="label" type="x-right"', 'tc':'', 'tcr':' type="x-right'} osis = re.sub(r'\\(thr?|tcr?)\d*\b\s*(.*?)(?=(\\t[hc]|' + m.group(2) + '', osis, flags=re.DOTALL) return osis def processNote(note): """Convert note-internal USFM tags to OSIS, returning the note as a string. Keyword arguments: note -- The note as a string. """ note = note.replace('\n', ' ') # \fdc_refs...\fdc* note = re.sub(r'\\fdc\b\s(.+?)\\fdc\b\*', r'\1', note) # \fq_ note = re.sub(r'\\fq\b\s(.+?)(?=(\\f|'+'\uFDDF))', '\uFDDF'+r'\1', note) # \fqa_ note = re.sub(r'\\fqa\b\s(.+?)(?=(\\f|'+'\uFDDF))', '\uFDDF'+r'\1', note) # \ft_ note = re.sub(r'\\ft\s', '', note) # \fr_##SEP## note = re.sub(r'\\fr\b\s(.+?)(?=(\\f|'+'\uFDDF))', '\uFDDF'+r'\1', note) # \fk_ note = re.sub(r'\\fk\b\s(.+?)(?=(\\f|'+'\uFDDF))', '\uFDDF'+r'\1', note) # \fl_ note = re.sub(r'\\fl\b\s(.+?)(?=(\\f|'+'\uFDDF))', '\uFDDF'+r'', note) # \fp_ note = re.sub(r'\\fp\b\s(.+?)(?=(\\fp|$))', r'

\1

', note) note = re.sub(r'(]*?>)(.*?)

', r'\1

\2

', note) # \fv_ note = re.sub(r'\\fv\b\s(.+?)(?=(\\f|'+'\uFDDF))', '\uFDDF'+r'\1', note) # \fq*,\fqa*,\ft*,\fr*,\fk*,\fl*,\fp*,\fv* note = re.sub(r'\\f(q|qa|t|r|k|l|p|v)\*', '', note) note = note.replace('\uFDDF', '') return note def cvtFootnotes(osis, relaxedConformance): """Converts USFM **Footnote** tags to OSIS, returning the processed text as a string. Supported tags: \f...\f*, \fe...\fe*, \fr, \fk, \fq, \fqa, \fl, \fp, \fv, \ft, \fdc...\fdc*, \fm...\fm* Keyword arguments: osis -- The document as a string. relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags. """ # \f_+_...\f* osis = re.sub(r'\\f\s+([^\s\\]+)?\s*(.+?)\s*\\f\*', lambda m: '' + m.group(2) + '\uFDDF', osis, flags=re.DOTALL) # \fe_+_...\fe* osis = re.sub(r'\\fe\s+([^\s\\]+?)\s*(.+?)\s*\\fe\*', lambda m: '' + m.group(2) + '\uFDDF', osis, flags=re.DOTALL) osis = re.sub(r'(]*?>.*?)', lambda m: processNote(m.group(1)), osis, flags=re.DOTALL) # \fm_...\fm* osis = re.sub(r'\\fm\b\s(.+?)\\fm\*', r'\1', osis) return osis def processXref(note): """Convert cross-reference note-internal USFM tags to OSIS, returning the cross-reference note as a string. Keyword arguments: note -- The cross-reference note as a string. """ note = note.replace('\n', ' ') # \xot_refs...\xot* note = re.sub(r'\\xot\b\s(.+?)\\xot\b\*', '\uFDDF'+r'\1', note) # \xnt_refs...\xnt* note = re.sub(r'\\xnt\b\s(.+?)\\xnt\b\*', '\uFDDF'+r'\1', note) # \xdc_refs...\xdc* note = re.sub(r'\\xdc\b\s(.+?)\\xdc\b\*', '\uFDDF'+r'\1', note) # \xq_ note = re.sub(r'\\xq\b\s(.+?)(?=(\\x|'+'\uFDDF))', '\uFDDF'+r'\1', note) # \xo_##SEP## note = re.sub(r'\\xo\b\s(.+?)(?=(\\x|'+'\uFDDF))', '\uFDDF'+r'\1', note) # \xk_ note = re.sub(r'\\xk\b\s(.+?)(?=(\\x|'+'\uFDDF))', '\uFDDF'+r'\1', note) # \xt_ # This isn't guaranteed to be *the* reference, but it's a good guess. note = re.sub(r'\\xt\b\s(.+?)(?=(\\x|'+'\uFDDF))', '\uFDDF'+r'\1', note) if relaxedConformance: # TODO: move this to a concorance/index-specific section? # \xtSee..\xtSee*: Concordance and Names Index markup for an alternate entry target reference. note = re.sub(r'\\xtSee\b\s(.+?)\\xtSee\b\*', '\uFDDF'+r'See: \1', note) # \xtSeeAlso...\xtSeeAlso: Concordance and Names Index markup for an additional entry target reference. note = re.sub(r'\\xtSeeAlso\b\s(.+?)\\xtSeeAlso\b\*', '\uFDDF'+r'See also: \1', note) # \xq*,\xt*,\xo*,\xk* note = re.sub(r'\\x(q|t|o|k)\*', '', note) note = note.replace('\uFDDF', '') return note def cvtCrossReferences(osis, relaxedConformance): """Converts USFM **Cross Reference** tags to OSIS, returning the processed text as a string. Supported tags: \\x...\\x*, \\xo, \\xk, \\xq, \\xt, \\xot...\\xot*, \\xnt...\\xnt*, \\xdc...\\xdc* Keyword arguments: osis -- The document as a string. relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags. """ # \x_+_...\x* osis = re.sub(r'\\x\s+([^\s]+?)\s+(.+?)\s*\\x\*', lambda m: '' + m.group(2) + '\uFDDF', osis, flags=re.DOTALL) osis = re.sub(r'(]*?type="crossReference"[^>]*>.*?)', lambda m: processXref(m.group(1)), osis, flags=re.DOTALL) return osis ### Special Text and Character Styles def cvtSpecialText(osis, relaxedConformance): """Converts USFM **Special Text** tags to OSIS, returning the processed text as a string. Supported tags: \add...\add*, \bk...\bk*, \dc...\dc*, \k...\k*, \lit, \nd...\nd*, \ord...\ord*, \pn...\pn*, \qt...\qt*, \sig...\sig*, \sls...\sls*, \tl...\tl*, \wj...\wj* Keyword arguments: osis -- The document as a string. relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags. """ # \add_...\add* osis = re.sub(r'\\add\s+(.+?)\\add\*', r'\1', osis, flags=re.DOTALL) # \wj_...\wj* osis = re.sub(r'\\wj\s+(.+?)\\wj\*', r'\1', osis, flags=re.DOTALL) # \nd_...\nd* osis = re.sub(r'\\nd\s+(.+?)\\nd\*', r'\1', osis, flags=re.DOTALL) # \pn_...\pn* osis = re.sub(r'\\pn\s+(.+?)\\pn\*', r'\1', osis, flags=re.DOTALL) # \qt_...\qt* # TODO:should this be ? osis = re.sub(r'\\qt\s+(.+?)\\qt\*', r'\1', osis, flags=re.DOTALL) # \sig_...\sig* osis = re.sub(r'\\sig\s+(.+?)\\sig\*', r'\1', osis, flags=re.DOTALL) # \ord_...\ord* osis = re.sub(r'\\ord\s+(.+?)\\ord\*', r'\1', osis, flags=re.DOTALL) # semantic incongruity (ordinal -> superscript) # \tl_...\tl* osis = re.sub(r'\\tl\s+(.+?)\\tl\*', r'\1', osis, flags=re.DOTALL) # \bk_...\bk* osis = re.sub(r'\\bk\s+(.+?)\\bk\*', r'\1', osis, flags=re.DOTALL) # \k_...\k* osis = re.sub(r'\\k\s+(.+?)\\k\*', r'\1', osis, flags=re.DOTALL) # \lit osis = re.sub(r'\\lit\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|\n' + m.group(1) + '\uFDD3

\n', osis, flags=re.DOTALL) # \dc_...\dc* # TODO: Find an example---should this really be transChange? osis = re.sub(r'\\dc\b\s*(.+?)\\dc\*', r'\1', osis, flags=re.DOTALL) # \sls_...\sls* osis = re.sub(r'\\sls\b\s*(.+?)\\sls\*', r'/1', osis, flags=re.DOTALL) # TODO: find a better mapping than ? if relaxedConformance: # \addpn...\addpn* osis = re.sub(r'\\addpn\s+(.+?)\\addpn\*', r'\1', osis, flags=re.DOTALL) # \k# # TODO: unsure of this tag's purpose osis = re.sub(r'\\k1\s+(.+?)\\k1\*', r'\1', osis, flags=re.DOTALL) osis = re.sub(r'\\k2\s+(.+?)\\k2\*', r'\1', osis, flags=re.DOTALL) osis = re.sub(r'\\k3\s+(.+?)\\k3\*', r'\1', osis, flags=re.DOTALL) osis = re.sub(r'\\k4\s+(.+?)\\k4\*', r'\1', osis, flags=re.DOTALL) osis = re.sub(r'\\k5\s+(.+?)\\k5\*', r'\1', osis, flags=re.DOTALL) return osis def cvtCharacterStyling(osis, relaxedConformance): """Converts USFM **Character Styling** tags to OSIS, returning the processed text as a string. Supported tags: \em...\em*, \bd...\bd*, \it...\it*, \bdit...\bdit*, \no...\no*, \sc...\sc* Keyword arguments: osis -- The document as a string. relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags. """ # \em_...\em* osis = re.sub(r'\\em\s+(.+?)\\em\*', r'\1', osis, flags=re.DOTALL) # \bd_...\bd* osis = re.sub(r'\\bd\s+(.+?)\\bd\*', r'\1', osis, flags=re.DOTALL) # \it_...\it* osis = re.sub(r'\\it\s+(.+?)\\it\*', r'\1', osis, flags=re.DOTALL) # \bdit_...\bdit* osis = re.sub(r'\\bdit\s+(.+?)\\bdit\*', r'\1', osis, flags=re.DOTALL) # \no_...\no* osis = re.sub(r'\\no\s+(.+?)\\no\*', r'\1', osis, flags=re.DOTALL) # \sc_...\sc* osis = re.sub(r'\\sc\s+(.+?)\\sc\*', r'\1', osis, flags=re.DOTALL) return osis def cvtSpacingAndBreaks(osis, relaxedConformance): """Converts USFM **Spacing and Breaks** tags to OSIS, returning the processed text as a string. Supported tags: ~, //, \pb Keyword arguments: osis -- The document as a string. relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags. """ # ~ osis = osis.replace('~', '\u00A0') # // osis = osis.replace('//', '') # \pb osis = re.sub(r'\\pb\s*', '\n', osis, flags=re.DOTALL) return osis def cvtSpecialFeatures(osis, relaxedConformance): """Converts USFM **Special Feature** tags to OSIS, returning the processed text as a string. Supported tags: \fig...\fig*, \ndx...\ndx*, \pro...\pro*, \w...\w*, \wg...\wg*, \wh...\wh* Keyword arguments: osis -- The document as a string. relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags. """ # \fig DESC|FILE|SIZE|LOC|COPY|CAP|REF\fig* def makeFigure(matchObject): """Regex helper function to convert USFM \fig to OSIS
, returning the OSIS element as a string. Keyword arguments: matchObject -- a regex match object containing the elements of a USFM \fig tag """ fig_desc,fig_file,fig_size,fig_loc,fig_copy,fig_cap,fig_ref = matchObject.groups() figure = '\n' if fig_ref: figure += '' + fig_ref + '\n' if fig_desc: figure += '\n' if fig_loc: figure += '\n' figure += '
' return figure osis = re.sub(r'\\fig\b\s+([^\|]*)\s*\|([^\|]*)\s*\|([^\|]*)\s*\|([^\|]*)\s*\|([^\|]*)\s*\|([^\|]*)\s*\|([^\\]*)\s*\\fig\*', makeFigure, osis) # \ndx_...\ndx* # TODO tag with x-glossary instead of ? Is containerable? osis = re.sub(r'\\ndx\s+(.+?)(\s*)\\ndx\*', r'\1\2', osis, flags=re.DOTALL) # \pro_...\pro* osis = re.sub(r'([^\s]+)(\s*)\\pro\s+(.+?)(\s*)\\pro\*', r'\1\2\4', osis, flags=re.DOTALL) # \w_...\w* osis = re.sub(r'\\w\s+(.+?)(\s*)\\w\*', r'\1\2', osis, flags=re.DOTALL) # \wg_...\wg* osis = re.sub(r'\\wg\s+(.+?)(\s*)\\wg\*', r'\1\2', osis, flags=re.DOTALL) # \wh_...\wh* osis = re.sub(r'\\wh\s+(.+?)(\s*)\\wh\*', r'\1\2', osis, flags=re.DOTALL) if relaxedConformance: # \wr...\wr* osis = re.sub(r'\\wr\s+(.+?)(\s*)\\wr\*', r'\1\2', osis, flags=re.DOTALL) return osis def cvtPeripherals(osis, relaxedConformance): """Converts USFM **Peripheral** tags to OSIS, returning the processed text as a string. Supported tag: \periph Keyword arguments: osis -- The document as a string. relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags. """ # \periph def tagPeriph(matchObject): """Regex helper function to tag peripherals, returning a
-encapsulated string. Keyword arguments: matchObject -- a regex match object containing the peripheral type and contents """ periphType,contents = matchObject periph = '
\n' + contents + '
\n' return periph osis = re.sub(r'\\periph\s+([^'+'\n'+r']+)\s*'+'\n'+r'(.+?)(?=(
|\\periph\s+))', tagPeriph, osis, flags=re.DOTALL) return osis def cvtStudyBibleContent(osis, relaxedConformance): """Converts USFM **Study Bible Content** tags to OSIS, returning the processed text as a string. Supported tags: \ef...\ef*, \ex...\ex*, \esb...\esbe, \cat Keyword arguments: osis -- The document as a string. relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags. """ # \ef...\ef* osis = re.sub(r'\\ef\s+([^\s\\]+?)\s*(.+?)\s*\\ef\*', lambda m: '' + m.group(2) + '\uFDDF', osis, flags=re.DOTALL) osis = re.sub(r'(]*?>.*?)', lambda m: processNote(m.group(1)), osis, flags=re.DOTALL) # \ex...\ex* osis = re.sub(r'\\ex\s+([^\s]+?)\s+(.+?)\s*\\ex\*', lambda m: '' + m.group(2) + '\uFDDF', osis, flags=re.DOTALL) osis = re.sub(r'(]*?type="crossReference"[^>]*>.*?)', lambda m: processXref(m.group(1)), osis, flags=re.DOTALL) # \esb...\esbex # TODO: this likely needs to go much earlier in the process osis = re.sub(r'\\esb\b\s*(.+?)\\esbe\b\s*', '\uFDD5
'+r'\1'+'
\uFDD5\n', osis, flags=re.DOTALL) # \cat_\cat* osis = re.sub(r'\\cat\b\s+(.+?)\\cat\*', r'', osis) return osis def cvtPrivateUseExtensions(osis, relaxedConformance): """Converts USFM **\z namespace** tags to OSIS, returning the processed text as a string. Supported tags: \z Keyword arguments: osis -- The document as a string. relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags. """ ### We can't really know what these mean, but will preserve them as elements. # publishing assistant markers # \zpa-xb...\zpa-xb* : \periph Book # \zpa-xc...\zpa-xc* : \periph Chapter # \zpa-xv...\zpa-xv* : \periph Verse # \zpa-xd...\zpa-xd* : \periph Description # TODO: Decide how these should actually be encoded. In lieu of that, # these can all be handled by the default \z Namespace handlers: # \z{X}...\z{X}* osis = re.sub(r'\z([^\s]+)\s(.+?)(\z\1\*)', r'\2', osis, flags=re.DOTALL) # \z{X} osis = re.sub(r'\\z([^\s]+)', r'', osis) return osis def processOsisIDs(osis): """Perform postprocessing on an OSIS document, returning the processed text as a string. Recurses through chapter & verses, substituting acutal book IDs & chapter numbers for placeholders. Keyword arguments: osis -- The document as a string. """ # TODO: add support for subverses, including in ranges/series, e.g. Matt.1.1!b-Matt.2.5,Matt.2.7!a # TODO: make sure that descending ranges generate invalid markup (osisID="") # expand verse ranges, series def expandRange(vRange): """Expands a verse range into its constituent verses as a string. Keyword arguments: vRange -- A string of the lower & upper bounds of the range, with a hypen in between. """ vRange = re.findall(r'\d+', vRange) osisID = list() for n in range(int(vRange[0]), int(vRange[1])+1): osisID.append('$BOOK$.$CHAP$.'+str(n)) return ' '.join(osisID) osis = re.sub(r'\$BOOK\$\.\$CHAP\$\.(\d+-\d+)"', lambda m: expandRange(m.group(1))+'"', osis) def expandSeries(vSeries): """Expands a verse series (list) into its constituent verses as a string. Keyword arguments: vSeries -- A comma-separated list of verses. """ vSeries = re.findall(r'\d+', vSeries) osisID = list() for n in vSeries: osisID.append('$BOOK$.$CHAP$.'+str(n)) return ' '.join(osisID) osis = re.sub(r'\$BOOK\$\.\$CHAP\$\.(\d+(,\d+)+)"', lambda m: expandSeries(m.group(1))+'"', osis) # fill in book & chapter values bookChunks = osis.split('\uFDD0') osis = '' for bc in bookChunks: bookValue = re.search(r'
\uFDD2)\n?', r'\2'+'\n'+r'\1', osis) osis = re.sub('([\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9]
)([^\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9]*)', r'\2\1', osis) osis = re.sub('(\uFDD3

\n?\uFDD3

)\n?(\uFDD2)\n?', r'\2'+'\n'+r'\1'+'\n', osis) osis = re.sub('\n(\uFDD2)', r'\1'+'\n', osis) osis = re.sub('\n*()([\uFDD2\n]*)', r'\2\1', osis) # delete attributes from end tags (since they are invalid) osis = re.sub(r'(]+) [^>]*>', r'\1>', osis) osis = osis.replace('', '') # delete Unicode non-characters for c in '\uFDD0\uFDD1\uFDD2\uFDD3\uFDD4\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE\uFDDF\uFDE0\uFDE1\uFDE2\uFDE3\uFDE4\uFDE5\uFDE6\uFDE7\uFDE8\uFDE9\uFDEA\uFDEB\uFDEC\uFDED\uFDEE\uFDEF': osis = osis.replace(c, '') for endBlock in ['p', 'div', 'note', 'l', 'lg', 'chapter', 'verse', 'head', 'title', 'item', 'list']: osis = re.sub('\s+', '\n', osis) osis = re.sub('\s+<'+endBlock+'( eID=[^/>]+/>)', '<'+endBlock+r'\1'+'\n', osis) osis = re.sub(' +((]+>)+) *', r'\1 ', osis) # strip extra spaces & newlines osis = re.sub(' +', ' ', osis) osis = re.sub(' ?\n\n+', '\n', osis) return osis ### Processing starts here if encoding: osis = codecs.open(sFile, 'r', encoding).read().strip() + '\n' else: encoding = 'utf-8' osis = codecs.open(sFile, 'r', encoding).read().strip() + '\n' # \ide_ encoding = re.search(r'\\ide\s+(.+)'+'\n', osis) if encoding: encoding = encoding.group(1).lower().strip() if encoding != 'utf-8': if encoding in aliases: osis = codecs.open(sFile, 'r', encoding).read().strip() + '\n' else: print(('WARNING: Encoding "' + encoding + '" unknown, processing ' + sFile + ' as UTF-8.')) encoding = 'utf-8' # call individual conversion processors in series osis = cvtPreprocess(osis, relaxedConformance) osis = cvtRelaxedConformanceRemaps(osis, relaxedConformance) osis = cvtIdentification(osis, relaxedConformance) osis = cvtIntroductions(osis, relaxedConformance) osis = cvtTitles(osis, relaxedConformance) osis = cvtChaptersAndVerses(osis, relaxedConformance) osis = cvtParagraphs(osis, relaxedConformance) osis = cvtPoetry(osis, relaxedConformance) osis = cvtTables(osis, relaxedConformance) osis = cvtFootnotes(osis, relaxedConformance) osis = cvtCrossReferences(osis, relaxedConformance) osis = cvtSpecialText(osis, relaxedConformance) osis = cvtCharacterStyling(osis, relaxedConformance) osis = cvtSpacingAndBreaks(osis, relaxedConformance) osis = cvtSpecialFeatures(osis, relaxedConformance) osis = cvtPeripherals(osis, relaxedConformance) osis = cvtStudyBibleContent(osis, relaxedConformance) osis = cvtPrivateUseExtensions(osis, relaxedConformance) osis = processOsisIDs(osis) osis = osisReorderAndCleanup(osis) # change type on special books for sb in specialBooks: osis = osis.replace('

', '
') if DEBUG: localUnhandledTags = set(re.findall(r'(\\[^\s\*]+?\b\*?)', osis)) if localUnhandledTags: print(('Unhandled USFM tags in ' + sFile + ': ' + ', '.join(localUnhandledTags) + ' (' + str(len(localUnhandledTags)) + ' total)')) return osis def readIdentifiersFromOsis(filename): """Reads the USFM file and stores information about which Bible book it represents and localized abbrevations in global variables. Keyword arguments: filename -- a USFM filename """ global encoding global loc2osisBk, osis2locBk, filename2osis ### Processing starts here if encoding: osis = codecs.open(filename, 'r', encoding).read().strip() + '\n' else: encoding = 'utf-8' osis = codecs.open(filename, 'r', encoding).read().strip() + '\n' # \ide_ encoding = re.search(r'\\ide\s+(.+)'+'\n', osis) if encoding: encoding = encoding.group(1).lower().strip() if encoding != 'utf-8': if encoding in aliases: osis = codecs.open(filename, 'r', encoding).read().strip() + '\n' else: #print(('WARNING: Encoding "' + encoding + '" unknown, processing ' + filename + ' as UTF-8.')) encoding = 'utf-8' # keep a copy of the OSIS book abbreviation for below (\toc3 processing) to store for mapping localized book names to/from OSIS osisBook = re.search(r'\\id\s+([A-Z0-9]+)', osis) if osisBook: osisBook = bookDict[osisBook.group(1)] filename2osis[filename] = osisBook locBook = re.search(r'\\toc3\b\s+(.+)\s*'+'\n', osis) if locBook: locBook = locBook.group(1) if osisBook: osis2locBk[osisBook]=locBook loc2osisBk[locBook]=osisBook def verbosePrint(text): """Wraper for print() that only prints if verbose is True.""" if verbose: print(text) def printUsage(): """Prints usage statement.""" print(('usfm2osis.py -- USFM ' + usfmVersion + ' to OSIS ' + osisVersion + ' converter version ' + scriptVersion)) print((' Revision: ' + rev + ' (' + date + ')')) print('') print('Usage: usfm2osis.py [OPTION] ... ...') print('') print(' -d debug mode (single-threaded, verbose output') print(' -e ENCODING input encoding override (default is to read the USFM file\'s') print(' \\ide value or assume UTF-8 encoding in its absence)') print(' -h, --help print this usage information') print(' -o FILENAME output filename (default is: .osis.xml)') print(' -r enable relaxed markup processing (for non-standard USFM)') print(' -s mode set book sorting mode: natural (default), alpha, canonical,') print(' usfm, random, none') print(' -v verbose feedback') print(' -x disable XML validation') print('') print('As an example, if you want to generate the osisWork and your USFM') print(' are located in the ./KJV folder, enter:') print(' python usfm2osis.py Bible.KJV ./KJV/*.usfm') verbosePrint('') verbosePrint('Supported encodings: ' + ', '.join(aliases)) class Worker(multiprocessing.Process): """Worker object for multiprocessing.""" def __init__(self, work_queue, result_queue): # base class initialization multiprocessing.Process.__init__(self) # job management stuff self.work_queue = work_queue self.result_queue = result_queue self.kill_received = False def run(self): while not self.kill_received: # get a task try: job = self.work_queue.get_nowait() except Queue.Empty: break # the actual processing osis = convertToOsis(job) # TODO: move XML validation here? # store the result self.result_queue.put((job,osis)) osisSchema = r'' if __name__ == "__main__": global encoding global relaxedConformance num_processes = multiprocessing.cpu_count() num_jobs = num_processes encoding = '' relaxedConformance = False inputFilesIdx = 2 # This marks the point in the sys.argv array, after which all values represent USFM files to be converted. usfmDocList = list() if '-v' in sys.argv: verbose = True inputFilesIdx += 1 else: verbose = False if '-x' in sys.argv: validatexml = False inputFilesIdx += 1 else: validatexml = True if '-d' in sys.argv: DEBUG = True inputFilesIdx += 1 num_processes = 1 num_jobs = 1 verbose = True else: DEBUG = False if '-h' in sys.argv or '--help' in sys.argv or len(sys.argv) < 3: printUsage() else: osisWork = sys.argv[1] if '-o' in sys.argv: i = sys.argv.index('-o')+1 if len(sys.argv) < i+1: printUsage() osisFileName = sys.argv[i] inputFilesIdx += 2 # increment 2, reflecting 2 args for -o else: osisFileName = osisWork + '.osis.xml' if '-e' in sys.argv: i = sys.argv.index('-e')+1 if len(sys.argv) < i+1: printUsage() encoding = sys.argv[i] inputFilesIdx += 2 # increment 2, reflecting 2 args for -e if '-r' in sys.argv: relaxedConformance = True bookDict = dict(list(bookDict.items()) + list(addBookDict.items())) inputFilesIdx += 1 if '-s' in sys.argv: i = sys.argv.index('-s')+1 if len(sys.argv) < i+1: printUsage() if sys.argv[i].startswith('a'): sortKey = None print('Sorting book files alphanumerically.') elif sys.argv[i].startswith('na'): sortKey = keynat print('Sorting book files naturally.') elif sys.argv[i].startswith('c'): sortKey = keycanon print('Sorting book files canonically.') elif sys.argv[i].startswith('u'): sortKey = keyusfm print('Sorting book files by USFM book number.') elif sys.argv[i].startswith('random'): # for testing only sortKey = lambda filename: int(random.random()*256) print('Sorting book files randomly.') else: sortKey = keysupplied print('Leaving book files unsorted, in the order in which they were supplied.') inputFilesIdx += 2 # increment 2, reflecting 2 args for -s else: sortKey = keynat print('Sorting book files naturally.') usfmDocList = sys.argv[inputFilesIdx:] for filename in usfmDocList: readIdentifiersFromOsis(filename) usfmDocList = sorted(usfmDocList, key=sortKey) # run # load up work queue work_queue = multiprocessing.Queue() for job in usfmDocList: work_queue.put(job) # create a queue to pass to workers to store the results result_queue = multiprocessing.Queue() # spawn workers for i in range(num_processes): worker = Worker(work_queue, result_queue) worker.start() # collect the results off the queue osisSegment = dict() for i in usfmDocList: k,v=result_queue.get() osisSegment[k]=v verbosePrint('Assembling OSIS document...') osisDoc = '\n\n
\n\n
\n' unhandledTags = set() for doc in usfmDocList: unhandledTags |= set(re.findall(r'(\\[^\s\*]+?\b\*?)', osisSegment[doc])) osisDoc += osisSegment[doc] osisDoc += '
\n
\n' if validatexml: try: #import urllib from lxml import etree verbosePrint('Validating XML...') osisParser = etree.XMLParser(schema = etree.XMLSchema(etree.XML(osisSchema))) #osisParser = etree.XMLParser(schema = etree.XMLSchema(etree.XML(urllib.urlopen('http://www.bibletechnologies.net/osisCore.' + osisVersion + '.xsd').read()))) etree.fromstring(osisDoc, osisParser) verbosePrint('XML Valid') except ImportError: verbosePrint('For schema validation, install lxml') except etree.XMLSyntaxError as eVal: print('XML Validation error: ' + str(eVal)) osisFile = codecs.open(osisFileName, 'w', 'utf-8') osisFile.write('\n') osisFile.write(osisDoc) if unhandledTags: if verbose: print('') print(('Unhandled USFM tags: ' + ', '.join(sorted(unhandledTags)) + ' (' + str(len(unhandledTags)) + ' total)')) if not relaxedConformance: print('Consider using the -r option for relaxed markup processing.')