#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Revision-control keyword placeholders.  Later in this file `date` and `rev`
# are stripped of '$' and sliced, so they must exist as real module-level
# names rather than being swallowed by the shebang comment.
# NOTE(review): these look like Subversion-style keywords that are expanded
# on checkout -- confirm against the repository configuration.
# NOTE: `id` shadows the builtin of the same name; it is kept unchanged
# because it is part of the module's public namespace.
date = '$Date$'
rev = '$Rev$'
id = '$Id$'

USFMversion = '2.35'  # http://ubs-icap.org/chm/usfm/2.35/index.html
OSISversion = '2.1.1'  # http://www.bibletechnologies.net/osisCore.2.1.1.xsd
scriptVersion = '0.5'

# usfm2osis.py
# Copyright 2012 by the CrossWire Bible Society
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# The full text of the GNU General Public License is available at:
# <http://www.gnu.org/licenses/gpl-3.0.txt>.

### Guidelines & objectives:
# Target Python 2.7+ (but support 3.2 if possible)
# Use no non-default libraries (this may change in the future)
# Don't use SWORD bindings (this will probably change to allow *optional* use of bindings, if installed)
# Achieve full coverage of USFM according to UBS spec:
#  <http://ubs-icap.org/chm/usfm/2.35/index.html>
# Employ best-practice conformant OSIS
# Employ modularity (functions rather than a big long script)
# Employ the same command-line syntax as usfm2osis.pl
# Use non-characters for milestoning

### Roadmap:
# 0.5 initial commit, including full coverage of core USFM tags
# 0.6 file sorting options (natural/alphabetic/canonical/none); expand sub-verses with ! in osisIDs; Python3 compatibility; add optional schema validator (lxml probably); docstrings; unittest; make fully OO; PyDev project?
# 0.7 test suite incorporating all USFM examples from UBS ICAP and other complex cases # 0.8 more clean-up & re-ordering to correctly encapsulate milestones within appropriate containers; clear remaining TODO items, to the extent possible # 1.0 feature complete for release & production use # 1.x xreffix.pl-functionality (osisParse(ref)), requiring SWORD bindings # 1.x SWORD-mode output? # 1.x IMP output? # 1.x SWORD module output?, requiring SWORD bindings ### TODO for next milestone: # file sorting options (natural/alphabetic/canonical/none) # expand sub-verses with ! in osisIDs # Python3 compatibility # add optional schema validator (lxml probably) # document functions (docstrings) # unittest # make fully OO # PyDev project? ### Key to non-characters: # Used : ﷐﷑﷒﷓﷔﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞﷟﷠﷡ # Unused : ﷧﷨﷩﷪﷫﷬﷭﷮﷯ # ﷐ book # ﷑ chapter # ﷒ verse # ﷓ paragraph # ﷔ title # ﷕ ms1 # ﷖ ms2 # ﷗ ms3 # ﷘ ms4 # ﷙ ms5 # ﷚ s1 # ﷛ s2 # ﷜ s3 # ﷝ s4 # ﷞ s5 # ﷟ notes # ﷠ intro-list # ﷡ intro-outline # ﷢ is1 # ﷣ is2 # ﷤ is3 # ﷥ is4 # ﷦ is5 # ﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞ sections import sys, codecs, re from encodings.aliases import aliases import multiprocessing, Queue date = date.replace('$', '').strip()[6:16] rev = rev.replace('$', '').strip()[5:] bookDict = { ### Known USFM Book codes from Paratext ### Cf. 
http://ubs-icap.org/chm/usfm/2.35/index.html?book_codes.htm # OT 'GEN':'Gen', 'EXO':'Exod', 'LEV':'Lev', 'NUM':'Num', 'DEU':'Deut', 'JOS':'Josh', 'JDG':'Judg', 'RUT':'Ruth', '1SA':'1Sam', '2SA':'2Sam', '1KI':'1Kgs', '2KI':'2Kgs', '1CH':'1Chr', '2CH':'2Chr', 'EZR':'Ezra', 'NEH':'Neh', 'EST':'Esth', 'JOB':'Job', 'PSA':'Ps', 'PRO':'Prov', 'ECC':'Eccl', 'SNG':'Song', 'ISA':'Isa', 'JER':'Jer', 'LAM':'Lam', 'EZK':'Ezek', 'DAN':'Dan', 'HOS':'Hos', 'JOL':'Joel', 'AMO':'Amos', 'OBA':'Obad', 'JON':'Jonah', 'MIC':'Mic', 'NAM':'Nah', 'HAB':'Hab', 'ZEP':'Zeph', 'HAG':'Hag', 'ZEC':'Zech', 'MAL':'Mal', # NT 'MAT':'Matt', 'MRK':'Mark', 'LUK':'Luke', 'JHN':'John', 'ACT':'Acts', 'ROM':'Rom', '1CO':'1Cor', '2CO':'2Cor', 'GAL':'Gal', 'EPH':'Eph', 'PHP':'Phil', 'COL':'Col', '1TH':'1Thess', '2TH':'2Thess', '1TI':'1Tim', '2TI':'2Tim', 'TIT':'Titus', 'PHM':'Phlm', 'HEB':'Heb', 'JAS':'Jas', '1PE':'1Pet', '2PE':'2Pet', '1JN':'1John', '2JN':'2John', '3JN':'3John', 'JUD':'Jude', 'REV':'Rev', # DC - Catholic 'TOB':'Tob', 'JDT':'Jdt', 'ESG':'EsthGr', 'WIS':'Wis', 'SIR':'Sir', 'BAR':'Bar', 'LJE':'EpJer', 'S3Y':'PrAzar', 'SUS':'Sus', 'BEL':'Bel', '1MA':'1Macc', '2MA':'2Macc', # DC - Eastern Orthodox '3MA':'3Macc', '4MA':'4Macc', '1ES':'1Esd', '2ES':'2Esd', 'MAN':'PrMan', 'PS2':'Ps151', # Rahlfs' LXX 'ODA':'Odes', 'PSS':'PssSol', # Esdrae 'EZA':'4Ezra', '5EZ':'5Ezra', '6EZ':'6Ezra', # Inconsistency with Esther 'DAG':'DanGr', # Syriac 'PS3':'5ApocSyrPss', '2BA':'2Bar', 'LBA':'EpBar', # Ethiopic 'JUB':'Jub', 'ENO':'1En', '1MQ':'1Meq', '2MQ':'2Meq', '3MQ':'3Meq', 'REP':'Reproof', '4BA':'4Bar', # Vulgate 'LAO':'EpLao', # Additional non-biblical books 'XXA':'XXA', 'XXB':'XXB', 'XXC':'XXC', 'XXD':'XXD', 'XXE':'XXE', 'XXF':'XXF', 'XXG':'XXG', # Peripheral books 'FRT':'FRONT', 'INT':'INTRODUCTION', 'BAK':'BACK', 'CNC':'CONCORDANCE', 'GLO':'GLOSSARY', 'TDX':'INDEX', 'NDX':'GAZETTEER', 'OTH':'X-OTHER' } addBookDict = { ### Deprecated # Rahlfs 'JSA':'JoshA', 'JDB':'JudgB', 'TBS':'TobS', 'SST':'SusTh', 
'DNT':'DanTh', 'BLT':'BelTh', # Esdrae '4ES':'4Ezra', '5ES':'5Ezra', '6ES':'6Ezra', ### Proposed Additions # Alternate Psalms 'PSB':'Ps', # Vulgate 'PSO':'PrSol', 'PJE':'PrJer', # Armenian 'WSI':'WSir', 'COP':'CorCorr', '3CO':'3Cor', 'EUT':'PrEut', 'DOJ':'DJohn', # Apostolic Fathers '1CL':'1Clem', '2CL':'2Clem', 'SHE':'Herm', 'LBA':'Barn', 'DID':'Did', ### # Proposed replacements 'ODE':'Odes', # Additional biblical books 'ADE':'AddEsth' } canonicalOrder = ( # OT 'GEN', 'EXO', 'LEV', 'NUM', 'DEU', 'JOS', 'JDG', 'RUT', '1SA', '2SA', '1KI', '2KI', '1CH', '2CH', 'EZR', 'NEH', 'EST', 'JOB', 'PSA', 'PRO', 'ECC', 'SNG', 'ISA', 'JER', 'LAM', 'EZK', 'DAN', 'HOS', 'JOL', 'AMO', 'OBA', 'JON', 'MIC', 'NAM', 'HAB', 'ZEP', 'HAG', 'ZEC', 'MAL', # DC - Catholic 'TOB', 'JDT', 'ESG', 'ADE', 'WIS', 'SIR', 'PSS', 'BAR', 'LJE', 'DAG', 'S3Y', 'SUS', 'BEL', '1MA', '2MA', # DC - Eastern Orthodox '1ES', 'MAN', 'PS2', '3MA', '2ES', '4MA', # NT 'MAT', 'MRK', 'LUK', 'JHN', 'ACT', 'ROM', '1CO', '2CO', 'GAL', 'EPH', 'PHP', 'COL', '1TH', '2TH', '1TI', '2TI', 'TIT', 'PHM', 'HEB', 'JAS', '1PE', '2PE', '1JN', '2JN', '3JN', 'JUD', 'REV', # Rahlfs' LXX 'ODA', 'ODE', # Esdrae 'EZA', '5EZ', '6EZ', # Inconsistency with Esther # Syriac 'PS3', '2BA', 'LBA', # Ethiopic 'JUB', 'ENO', '1MQ', '2MQ', '3MQ', 'REP', '4BA', # Vulgate 'LAO', # Additional non-biblical books 'XXA', 'XXB', 'XXC', 'XXD', 'XXE', 'XXF', 'XXG', # Peripheral books 'FRT', 'INT', 'BAK', 'CNC', 'GLO', 'TDX', 'NDX', 'OTH' ### Deprecated # Rahlfs 'JSA', 'JDB', 'TBS', 'SST', 'DNT', 'BLT', # Esdrae '4ES', '5ES', '6ES', # Alternate Psalms 'PSB', # Vulgate 'PSO', 'PJE', # Armenian 'WSI', 'COP', '3CO', 'EUT', 'DOJ', # Apostolic Fathers '1CL', '2CL', 'SHE', 'LBA', 'DID', ### # Proposed replacements ) specialBooks = ['FRONT', 'INTRODUCTION', 'BACK', 'CONCORDANCE', 'GLOSSARY', 'INDEX', 'GAZETTEER', 'X-OTHER'] peripherals = { 'Title Page':'titlePage', 'Half Title Page':'x-halfTitlePage', 'Promotional Page':'x-promotionalPage', 
'Imprimatur':'imprimatur', 'Publication Data':'publicationData', 'Foreword':'x-foreword', 'Preface':'preface', 'Table of Contents':'tableofContents', 'Alphabetical Contents':'x-alphabeticalContents', 'Table of Abbreviations':'x-tableofAbbreviations', 'Chronology':'x-chronology', 'Weights and Measures':'x-weightsAndMeasures', 'Map Index':'x-mapIndex', 'NT Quotes from LXX':'x-ntQuotesFromLXX', 'Cover':'coverPage', 'Spine':'x-spine' } introPeripherals = { 'Bible Introduction':'bible', 'Old Testament Introduction':'oldTestament', 'Pentateuch Introduction':'pentateuch', 'History Introduction':'history', 'Poetry Introduction':'poetry', 'Prophecy Introduction':'prophecy', 'New Testament Introduction':'newTestament', 'Gospels Introduction':'gospels', 'Acts Introduction':'acts', 'Epistles Introduction':'epistles', 'Letters Introduction':'letters', 'Deuterocanon Introduction':'deuterocanon' } osis2locBk = dict() loc2osisBk = dict() verbose = bool() ucs4 = (sys.maxunicode > 0xFFFF) """ BEGIN PSF-licensed segment """ """ keynat from http://code.activestate.com/recipes/285264-natural-string-sorting/ """ def keynat(string): r'''A natural sort helper function for sort() and sorted() without using regular expressions or exceptions. 
>>> items = ('Z', 'a', '10th', '1st', '9') >>> sorted(items) ['10th', '1st', '9', 'Z', 'a'] >>> sorted(items, key=keynat) ['1st', '9', '10th', 'a', 'Z'] ''' it = type(1) r = [] for c in string: if c.isdigit(): d = int(c) if r and type( r[-1] ) == it: r[-1] = r[-1] * 10 + d else: r.append(d) else: r.append(c.lower()) return r """ END PSF-licened segment """ def convertToOSIS(sFile): global encoding global relaxedConformance verbosePrint('Processing: ' + sFile) def cvtPreprocess(osis, relaxedConformance): # lines should never start with non-tags osis = re.sub('\n\s*([^\\\s])', r' \1', osis) # TODO: test this # convert CR to LF osis = osis.replace('\r', '\n') # lines should never end with whitespace (other than \n) osis = re.sub('\s+\n', '\n', osis) # XML-encode as necessary osis = osis.replace('&', '&') osis = osis.replace('<', '<') osis = osis.replace('>', '>') #osis = re.sub('\n'+r'(\\[^\s]+\b\*)', r' \1', osis) return osis def cvtRelaxedConformanceRemaps(osis, relaxedConformance): if not relaxedConformance: return osis # \tr#: DEP: map to \tr osis = re.sub(r'\\tr\d\b', r'\\tr', osis) # remapped 2.0 periphs # \pub osis = re.sub(r'\\pub\b\s', '\\periph Publication Data\n', osis) # \toc : \periph Table of Contents osis = re.sub(r'\\toc\b\s', '\\periph Table of Contents\n', osis) # \pref osis = re.sub(r'\\pref\b\s', '\\periph Preface\n', osis) # \maps osis = re.sub(r'\\maps\b\s', '\\periph Map Index\n', osis) # \cov osis = re.sub(r'\\cov\b\s', '\\periph Cover\n', osis) # \spine osis = re.sub(r'\\spine\b\s', '\\periph Spine\n', osis) # \pubinfo osis = re.sub(r'\\pubinfo\b\s', '\\periph Publication Information\n', osis) # \intro osis = re.sub(r'\\intro\b\s', '\\id INT\n', osis) # \conc osis = re.sub(r'\\conc\b\s', '\\id CNC\n', osis) # \glo osis = re.sub(r'\\glo\b\s', '\\id GLO\n', osis) # \idx osis = re.sub(r'\\idx\b\s', '\\id TDX\n', osis) return osis def cvtIdentification(osis, relaxedConformance): """ Identification supported: \id, \ide, \sts, \rem, \h, \toc1, 
\toc2, \toc3 """ global loc2osisBk, osis2locBk # \id__(Name of file, Book name, Language, Last edited, Date etc.) osis = re.sub(r'\\id\s+([A-Z0-9]{3})\b\s*([^\\'+'\n'+']*?)'+'\n'+r'(.*)(?=\\id|$)', lambda m: u'﷐
\n' + (('\n') if m.group(2) else '') + m.group(3) + u'
﷐\n' , osis, flags=re.DOTALL)#@ # keep a copy of the OSIS book abbreviation for below (\toc3 processing) to store for mapping localized book names to/from OSIS osisBook = re.search(r'\\id\s+([A-Z0-9]{3})', osis) if osisBook: osisBook = bookDict[osisBook.group(1)] # \ide_ osis = re.sub(r'\\ide\b.*'+'\n', '', osis) # delete, since this was handled above # \sts_ osis = re.sub(r'\\sts\b\s+(.+)\s*'+'\n', r''+'\n', osis) # \rem_text... osis = re.sub(r'\\rem\b\s+(.+)', r'', osis) # \restore_text... if relaxedConformance: osis = re.sub(r'\\restore\b\s+(.+)', r'', osis) # \h#_text... osis = re.sub(r'\\h\b\s+(.+)\s*'+'\n', r'\1'+'\n', osis) osis = re.sub(r'\\h(\d)\b\s+(.+)\s*'+'\n', r'\2'+'\n', osis) # \toc1_text... osis = re.sub(r'\\toc1\b\s+(.+)\s*'+'\n', r''+'\n', osis) # \toc2_text... osis = re.sub(r'\\toc2\b\s+(.+)\s*'+'\n', r''+'\n', osis) # \toc3_text... locBook = re.search(r'\\toc3\b\s+(.+)\s*'+'\n', osis) if locBook: locBook = locBook.group(1) if osisBook: osis2locBk[osisBook]=locBook loc2osisBk[locBook]=osisBook osis = re.sub(r'\\toc3\b\s+(.+)\s*'+'\n', lambda m: r''+'\n', osis) return osis def cvtIntroductions(osis, relaxedConformance): """ Introductions supported: \imt#, \is#, \ip, \ipi, \im, \imi, \ipq, \imq, \ipr, \iq#, \ib, \ili#, \iot, \io#, \ior...\ior*, \iex, \iqt...\iqt*, \imte, \ie """ # \imt#_text... osis = re.sub(r'\\imt(\d?)\s+(.+)', lambda m: '' + m.group(2) + '', osis) # \imte#_text... osis = re.sub(r'\\imte(\d?)\b\s+(.+)', lambda m: '' + m.group(2) + '', osis) # \is#_text... osis = re.sub(r'\\is1?\s+(.+)', lambda m: u'﷚
' + m.group(1) + '', osis) osis = re.sub(u'(﷚[^﷕﷐﷖﷗﷘﷙﷚]+)', r'\1'+u'
﷚\n', osis, flags=re.DOTALL) osis = re.sub(r'\\is2\s+(.+)', lambda m: u'﷛
' + m.group(1) + '', osis) osis = re.sub(u'(﷛[^﷕﷐﷖﷗﷘﷙﷚﷛]+)', r'\1'+u'
﷛\n', osis, flags=re.DOTALL) osis = re.sub(r'\\is3\s+(.+)', lambda m: u'﷜
' + m.group(1) + '', osis) osis = re.sub(u'(﷜[^﷕﷐﷖﷗﷘﷙﷚﷛﷜]+)', r'\1'+u'
﷜\n', osis, flags=re.DOTALL) osis = re.sub(r'\\is4\s+(.+)', lambda m: u'﷝
' + m.group(1) + '', osis) osis = re.sub(u'(﷝[^﷕﷐﷖﷗﷘﷙﷚﷛﷜﷝]+)', r'\1'+u'
﷝\n', osis, flags=re.DOTALL) osis = re.sub(r'\\is5\s+(.+)', lambda m: u'﷞
' + m.group(1) + '', osis) osis = re.sub(u'(﷞[^﷕﷐﷖﷗﷘﷙﷚﷛﷜﷝﷞]+)', r'\1'+u'
﷞\n', osis, flags=re.DOTALL) # \ip_text... osis = re.sub(r'\\ip\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr|io|iq|i?li|iex?|s)|\n' + m.group(1) + u'﷓

\n', osis, flags=re.DOTALL) # \ipi_text... # \im_text... # \imi_text... # \ipq_text... # \imq_text... # \ipr_text... pType = {'ipi':'x-indented', 'im':'x-noindent', 'imi':'x-noindent-indented', 'ipq':'x-quote', 'imq':'x-noindent-quote', 'ipr':'x-right'} osis = re.sub(r'\\(ipi|im|ipq|imq|ipr)\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr|io|iq|i?li|iex?|s)|\n' + m.group(2) + u'﷓

\n', osis, flags=re.DOTALL) # \iq#_text... osis = re.sub(r'\\iq\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\i?q[\d\s]|\\fig|\1', osis, flags=re.DOTALL) osis = re.sub(r'\\iq(\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\i?q[\d\s]|\\fig|\2', osis, flags=re.DOTALL) # \ib osis = re.sub(r'\\ib\b\s?', '', osis) osis = osis.replace('\n', '\n') osis = re.sub(u'()', r'\1', osis, flags=re.DOTALL) osis = re.sub('(.+?)', lambda m: m.group(1).replace('', ''), osis, flags=re.DOTALL) # re-handle \b that occurs within # \ili#_text... osis = re.sub(r'\\ili\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\ili[\d\s]|﷠\1﷠', osis, flags=re.DOTALL) osis = re.sub(r'\\ili(\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\ili[\d\s]|﷠\2﷠', osis, flags=re.DOTALL) osis = osis.replace('\n', '\n') osis = re.sub(u'()', ur'﷓\1﷓', osis, flags=re.DOTALL) # \iot_text... # \io#_text...(references range) osis = re.sub(r'\\io\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\io[t\d\s]|\\iex?|﷡\1﷡', osis, flags=re.DOTALL) osis = re.sub(r'\\io(\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\io[t\d\s]|\\iex?|﷡\2﷡', osis, flags=re.DOTALL) osis = re.sub(r'\\iot\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\io[t\d\s]|\\iex?|﷡\1﷡', osis, flags=re.DOTALL) osis = osis.replace('\n', '\n') osis = re.sub(u'()', ur'﷓
\1
﷓', osis, flags=re.DOTALL) osis = re.sub('item type="head"', 'head', osis) # \ior_text...\ior* osis = re.sub(r'\\ior\b\s+(.+?)\\ior\*', r'\1', osis, flags=re.DOTALL) # \iex # TODO: look for example; I have no idea what this would look like in context osis = re.sub(r'\\iex\b\s*(.+?)'+u'?=(\s*(\\c|﷐))', r'
\1
', osis, flags=re.DOTALL) # \iqt_text...\iqt* osis = re.sub(r'\\iqt\s+(.+?)\\iqt\*', r'\1', osis, flags=re.DOTALL) # \ie osis = re.sub(r'\\ie\b\s*', '', osis) return osis def cvtTitles(osis, relaxedConformance): """ Titles, Headings, and Labels supported: \mt#, \mte#, \ms#, \mr, \s#, \sr, \r, \rq...\rq*, \d, \sp """ # \ms#_text... osis = re.sub(r'\\ms1?\s+(.+)', lambda m: u'﷕
' + m.group(1) + '', osis) osis = re.sub(u'(﷕[^﷕﷐]+)', r'\1'+u'
﷕\n', osis, flags=re.DOTALL) osis = re.sub(r'\\ms2\s+(.+)', lambda m: u'﷖
' + m.group(1) + '', osis) osis = re.sub(u'(﷖[^﷕﷐﷖]+)', r'\1'+u'
﷖\n', osis, flags=re.DOTALL) osis = re.sub(r'\\ms3\s+(.+)', lambda m: u'﷗
' + m.group(1) + '', osis) osis = re.sub(u'(﷗[^﷕﷐﷖﷗]+)', r'\1'+u'
﷗\n', osis, flags=re.DOTALL) osis = re.sub(r'\\ms4\s+(.+)', lambda m: u'﷘
' + m.group(1) + '', osis) osis = re.sub(u'(﷘[^﷕﷐﷖﷗﷘]+)', r'\1'+u'
﷘\n', osis, flags=re.DOTALL) osis = re.sub(r'\\ms5\s+(.+)', lambda m: u'﷙
' + m.group(1) + '', osis) osis = re.sub(u'(﷙[^﷕﷐﷖﷗﷘﷙]+)', r'\1'+u'
﷙\n', osis, flags=re.DOTALL) # \mr_text... osis = re.sub(r'\\mr\s+(.+)', u'﷔<reference>'+r'\1</reference>', osis) # \s#_text... osis = re.sub(r'\\s1?\s+(.+)', lambda m: u'﷚
' + m.group(1) + '', osis) osis = re.sub(u'(﷚
[^﷕﷐﷖﷗﷘﷙﷚]+)', r'\1'+u'
﷚\n', osis, flags=re.DOTALL) if relaxedConformance: osis = re.sub(r'\\ss\s+', r'\\s2 ', osis) osis = re.sub(r'\\sss\s+', r'\\s3 ', osis) osis = re.sub(r'\\s2\s+(.+)', lambda m: u'﷛
' + m.group(1) + '', osis) osis = re.sub(u'(﷛
[^﷕﷐﷖﷗﷘﷙﷚﷛]+)', r'\1'+u'
﷛\n', osis, flags=re.DOTALL) osis = re.sub(r'\\s3\s+(.+)', lambda m: u'﷜
' + m.group(1) + '', osis) osis = re.sub(u'(﷜
[^﷕﷐﷖﷗﷘﷙﷚﷛﷜]+)', r'\1'+u'
﷜\n', osis, flags=re.DOTALL) osis = re.sub(r'\\s4\s+(.+)', lambda m: u'﷝
' + m.group(1) + '', osis) osis = re.sub(u'(﷝
[^﷕﷐﷖﷗﷘﷙﷚﷛﷜﷝]+)', r'\1'+u'
﷝\n', osis, flags=re.DOTALL) osis = re.sub(r'\\s5\s+(.+)', lambda m: u'﷞
' + m.group(1) + '', osis) osis = re.sub(u'(﷞
[^﷕﷐﷖﷗﷘﷙﷚﷛﷜﷝﷞]+)', r'\1'+u'
﷞\n', osis, flags=re.DOTALL) # \sr_text... osis = re.sub(r'\\sr\s+(.+)', ur'﷔<reference>\1</reference>', osis) # \r_text... osis = re.sub(r'\\r\s+(.+)', ur'﷔<reference type="parallel">\1</reference>', osis) # \rq_text...\rq* osis = re.sub(r'\\rq\s+(.+?)\\rq\*', ur'\1', osis, flags=re.DOTALL) # \d_text... osis = re.sub(r'\\d\s+(.+)', ur'﷔\1', osis) # \sp_text... osis = re.sub(r'\\sp\s+(.+)', r'\1', osis) # \mt#_text... osis = re.sub(r'\\mt(\d?)\s+(.+)', lambda m: '' + m.group(2) + '', osis) # \mte#_text... osis = re.sub(r'\\mte(\d?)\s+(.+)', lambda m: '' + m.group(2) + '', osis) return osis def cvtChaptersAndVerses(osis, relaxedConformance): """ Chapters and Verses supported: \c, \ca...\ca*, \cl, \cp, \cd, \v, \va...\va*, \vp...\vp* """ # \c_# osis = re.sub(r'\\c\s+([^\s]+)\b(.+?)(?=(\\c\s+|
' + m.group(2) + u'﷓\n', osis, flags=re.DOTALL) # \cp_# # \ca_#\ca* def replaceChapterNumber(matchObj): ctext = matchObj.group(1) cp = re.search(r'\\cp\s+(.+?)(?=(\\|\s))', ctext) if cp: ctext = re.sub(r'\\cp\s+(.+?)\\cp*', '', ctext, flags=re.DOTALL) cp = cp.group(1) ctext = re.sub(r'"\$BOOK\$\.([^"\.]+)"', '"$BOOK$.'+ca+'"', ctext) ca = re.search(r'\\ca\s+(.+?)\\ca\*', ctext) if ca: ctext = re.sub(r'\\ca\s+(.+?)\\ca*', '', ctext, flags=re.DOTALL) ca = ca.group(1) ctext = re.sub(r'(osisID="\$BOOK\$\.[^"\.]+)"', r'\1 $BOOK$.'+ca+'"', ctext) return ctext osis = re.sub(r'(.+?]+/>)', replaceChapterNumber, osis, flags=re.DOTALL) # \cl_ osis = re.sub(r'\\cl\s+(.+)', u'﷔'+r'\1', osis) # \cd_# <--This # seems to be an error osis = re.sub(r'\\cd\b\s+(.+)', u'﷔'+r'\1', osis) # \v_# osis = re.sub(r'\\v\s+([^\s]+)\b\s*(.+?)(?=(\\v\s+|
' + m.group(2) + '﷒\n', osis, flags=re.DOTALL) # \vp_#\vp* # \va_#\va* def replaceVerseNumber(matchObj): vtext = matchObj.group(1) vp = re.search(r'\\vp\s+(.+?)\\vp*', vtext) if vp: vtext = re.sub(r'\\vp\s+(.+?)\\vp*', '', vtext, flags=re.DOTALL) vp = vp.group(1) vtext = re.sub(r'"\$BOOK\$\.\$CHAP\$\.([^"\.]+)"', '"$BOOK$.$CHAP$.'+va+'"', vtext) va = re.search(r'\\va\s+(.+?)\\va\*', vtext) if va: vtext = re.sub(r'\\va\s+(.+?)\\va*', '', vtext, flags=re.DOTALL) va = va.group(1) vtext = re.sub(r'(osisID="\$BOOK\$\.\$CHAP\$\.[^"\.]+)"', r'\1 $BOOK$.$CHAP$.'+va+'"', vtext) return vtext osis = re.sub(r'(.+?]+/>)', replaceVerseNumber, osis, flags=re.DOTALL) return osis def cvtParagraphs(osis, relaxedConformance): """ Paragraphs supported: \p, \m, \pmo, \pm, \pmc, \pmr, \pi#, \mi, \nb, \cls, \li#, \pc, \pr, \ph#, \b """ # \p(_text...) osis = re.sub(r'\\p\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|\n' + m.group(1) + u'﷓

\n', osis, flags=re.DOTALL) # \pc(_text...) # \pr(_text...) # \m(_text...) # \pmo(_text...) # \pm(_text...) # \pmc(_text...) # \pmr_text... # deprecated: map to same as \pr # \pi#(_Sample text...) # \mi(_text...) # \nb # \phi # deprecated # \ps # deprecated # \psi # deprecated # \p# # deprecated pType = {'pc':'x-center', 'pr':'x-right', 'm':'x-noindent', 'pmo':'x-embedded-opening', 'pm':'x-embedded', 'pmc':'x-embedded-closing', 'pmr':'x-right', 'pi':'x-indented-1', 'pi1':'x-indented-1', 'pi2':'x-indented-2', 'pi3':'x-indented-3', 'pi4':'x-indented-4', 'pi5':'x-indented-5', 'mi':'x-noindent-indented', 'nb':'x-nobreak', 'phi':'x-indented-hanging', 'ps':'x-nobreakNext', 'psi':'x-nobreakNext-indented', 'p1':'x-level-1', 'p2':'x-level-2', 'p3':'x-level-3', 'p4':'x-level-4', 'p5':'x-level-5'} paragraphregex = 'pc|pr|m|pmo|pm|pmc|pmr|pi|pi1|pi2|pi3|pi4|pi5|mi|nb' if relaxedConformance: paragraphregex += '|phi|ps|psi|p1|p2|p3|p4|p5' osis = re.sub(r'\\('+paragraphregex+r')\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|\n' + m.group(2) + u'﷓

\n', osis, flags=re.DOTALL) # \cls_text... osis = re.sub(r'\\m\s+(.+?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|' + m.group(1) + u'﷓\n', osis, flags=re.DOTALL) # \ph#(_text...) # \li#(_text...) osis = re.sub(r'\\ph\b\s*', r'\\li ', osis) osis = re.sub(r'\\ph(\d)\b\s*', r'\\li\1 ', osis) osis = re.sub(r'\\li\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\li[\d\s]|\1', osis, flags=re.DOTALL) osis = re.sub(r'\\li(\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\li[\d\s]|\2', osis, flags=re.DOTALL) osis = osis.replace('\n', '\n') osis = re.sub(u'()', ur'﷓\1﷓', osis, flags=re.DOTALL) # \b osis = re.sub(r'\\b\b\s?', '', osis) return osis def cvtPoetry(osis, relaxedConformance): """ Poetry supported: \q#, \qr, \qc, \qs...\qs*, \qa, \qac...\qac*, \qm#, \b """ # \qs_(Selah)\qs* osis = re.sub(r'\\qs\b\s(.+?)\\qs\*', r'\1', osis, flags=re.DOTALL) # \q#(_text...) osis = re.sub(r'\\q\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞'+r']|\\q[\d\s]|\\fig|\1', osis, flags=re.DOTALL) osis = re.sub(r'\\q(\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞'+r']|\\q[\d\s]|\\fig|\2', osis, flags=re.DOTALL) # \qr_text... # \qc_text... # \qm#(_text...) qType = {'qr':'x-right', 'qc':'x-center', 'qm':'x-embedded" level="1', 'qm1':'x-embedded" level="1', 'qm2':'x-embedded" level="2', 'qm3':'x-embedded" level="3', 'qm4':'x-embedded" level="4', 'qm5':'x-embedded" level="5'} osis = re.sub(r'\\(qr|qc|qm\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞'+r']|\\q[\d\s]|\\fig|' + m.group(2) + '', osis, flags=re.DOTALL) osis = osis.replace('\n', '\n') osis = re.sub(u'()', r'\1', osis, flags=re.DOTALL) # \b osis = re.sub('(.+?)', lambda m: m.group(1).replace('', ''), osis, flags=re.DOTALL) # re-handle \b that occurs within # \qa_text... osis = re.sub(r'\\qa\s+(.+)', u'﷔'+r'\1', osis) # \qac_text...\qac* osis = re.sub(r'\\qac\s+(.+?)\\qac\*', r'\1', osis, flags=re.DOTALL) return osis def cvtTables(osis, relaxedConformance): """ Tables supported: \tr, \th#, \thr#, \tc#, \tcr# """ # \tr_ osis = re.sub(r'\\tr\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\tr\s|\1', osis, flags=re.DOTALL) # \th#_text... 
# \thr#_text... # \tc#_text... # \tcr#_text... tType = {'th':' role="label"', 'thr':' role="label" type="x-right"', 'tc':'', 'tcr':' type="x-right'} osis = re.sub(r'\\(thr?|tcr?)\d*\b\s*(.*?)(?=(\\t[hc]|' + m.group(2) + '', osis, flags=re.DOTALL) return osis def processNote(note): note = note.replace('\n', ' ') # \fdc_refs...\fdc* note = re.sub(r'\\fdc\b\s(.+?)\\fdc\b\*', r'\1', note) # \fq_ note = re.sub(r'\\fq\b\s(.+?)(?=(\\f|'+u'﷟))', u'﷟'+r'\1', note) # \fqa_ note = re.sub(r'\\fqa\b\s(.+?)(?=(\\f|'+u'﷟))', u'﷟'+r'\1', note) # \ft_ note = re.sub(r'\\ft\s', '', note) # \fr_##SEP## note = re.sub(r'\\fr\b\s(.+?)(?=(\\f|'+u'﷟))', u'﷟'+r'\1', note) # \fk_ note = re.sub(r'\\fk\b\s(.+?)(?=(\\f|'+u'﷟))', u'﷟'+r'\1', note) # \fl_ note = re.sub(r'\\fl\b\s(.+?)(?=(\\f|'+u'﷟))', u'﷟'+r'', note) # \fp_ note = re.sub(r'\\fp\b\s(.+?)(?=(\\fp|$))', r'

\1

', note) note = re.sub(r'(]*?>)(.*?)

', r'\1

\2

', note) # \fv_ note = re.sub(r'\\fv\b\s(.+?)(?=(\\f|'+u'﷟))', u'﷟'+r'\1', note) if relaxedConformance: note = note.replace(r'\fq*', '') note = note.replace(r'\fqa*', '') note = note.replace(r'\ft*', '') note = note.replace(r'\fr*', '') note = note.replace(r'\fk*', '') note = note.replace(r'\fl*', '') note = note.replace(r'\fp*', '') note = note.replace(r'\fv*', '') note = note.replace(u'﷟', '') return note def cvtFootnotes(osis, relaxedConformance): """ Footnotes supported:\f...\f*, \fe...\fe*, \fr, \fk, \fq, \fqa, \fl, \fp, \fv, \ft, \fdc...\fdc*, \fm...\fm* """ # \f_+_...\f* osis = re.sub(r'\\f\s+([^\s\\]+)?\s*(.+?)\s*\\f\*', lambda m: '' + m.group(2) + u'﷟', osis, flags=re.DOTALL) # \fe_+_...\fe* osis = re.sub(r'\\fe\s+([^\s\\]+?)\s*(.+?)\s*\\fe\*', lambda m: '' + m.group(2) + u'﷟', osis, flags=re.DOTALL) osis = re.sub(r'(]*?>.*?)', lambda m: processNote(m.group(1)), osis, flags=re.DOTALL) # \fm_...\fm* osis = re.sub(r'\\fm\b\s(.+?)\\fm\*', r'\1', osis) return osis def processXref(note): note = note.replace('\n', ' ') # \xot_refs...\xot* note = re.sub(r'\\xot\b\s(.+?)\\xot\b\*', u'﷟'+r'\1', note) # \xnt_refs...\xnt* note = re.sub(r'\\xnt\b\s(.+?)\\xnt\b\*', u'﷟'+r'\1', note) # \xdc_refs...\xdc* note = re.sub(r'\\xdc\b\s(.+?)\\xdc\b\*', u'﷟'+r'\1', note) # \xq_ note = re.sub(r'\\xq\b\s(.+?)(?=(\\x|'+u'﷟))', u'﷟'+r'\1', note) # \xo_##SEP## note = re.sub(r'\\xo\b\s(.+?)(?=(\\x|'+u'﷟))', u'﷟'+r'\1', note) # \xk_ note = re.sub(r'\\xk\b\s(.+?)(?=(\\x|'+u'﷟))', u'﷟'+r'\1', note) # \xt_ # This isn't guaranteed to be *the* reference, but it's a good guess. note = re.sub(r'\\xt\b\s(.+?)(?=(\\x|'+u'﷟))', u'﷟'+r'\1', note) if relaxedConformance: # TODO: move this to a concorance/index-specific section? # \xtSee..\xtSee*: Concordance and Names Index markup for an alternate entry target reference. note = re.sub(r'\\xtSee\b\s(.+?)\\xtSee\b\*', u'﷟'+r'See: \1', note) # \xtSeeAlso...\xtSeeAlso: Concordance and Names Index markup for an additional entry target reference. 
note = re.sub(r'\\xtSeeAlso\b\s(.+?)\\xtSeeAlso\b\*', u'﷟'+r'See also: \1', note) if relaxedConformance: note = note.replace(r'\xq*', '') note = note.replace(r'\xt*', '') note = note.replace(r'\xo*', '') note = note.replace(r'\xk*', '') note = note.replace(u'﷟', '') return note def cvtCrossReferences(osis, relaxedConformance): """ Cross References supported: \\x...\\x*, \\xo, \\xk, \\xq, \\xt, \\xot...\\xot*, \\xnt...\\xnt*, \\xdc...\\xdc* """ # \x_+_...\x* osis = re.sub(r'\\x\s+([^\s]+?)\s+(.+?)\s*\\x\*', lambda m: '' + m.group(2) + u'﷟', osis, flags=re.DOTALL) osis = re.sub(r'(]*?type="crossReference"[^>]*>.*?)', lambda m: processXref(m.group(1)), osis, flags=re.DOTALL) return osis """ Special Text and Character Styles """ def cvtSpecialText(osis, relaxedConformance): """ Special Text supported: \add...\add*, \bk...\bk*, \dc...\dc*, \k...\k*, \lit, \nd...\nd*, \ord...\ord*, \pn...\pn*, \qt...\qt*, \sig...\sig*, \sls...\sls*, \tl...\tl*, \wj...\wj* """ # \add_...\add* osis = re.sub(r'\\add\s+(.+?)\\add\*', r'\1', osis, flags=re.DOTALL) # \wj_...\wj* osis = re.sub(r'\\wj\s+(.+?)\\wj\*', r'\1', osis, flags=re.DOTALL) # \nd_...\nd* osis = re.sub(r'\\nd\s+(.+?)\\nd\*', r'\1', osis, flags=re.DOTALL) # \pn_...\pn* osis = re.sub(r'\\pn\s+(.+?)\\pn\*', r'\1', osis, flags=re.DOTALL) # \qt_...\qt* # TODO:should this be ? osis = re.sub(r'\\qt\s+(.+?)\\qt\*', r'\1', osis, flags=re.DOTALL) # \sig_...\sig* osis = re.sub(r'\\sig\s+(.+?)\\sig\*', r'\1', osis, flags=re.DOTALL) # \ord_...\ord* osis = re.sub(r'\\ord\s+(.+?)\\ord\*', r'\1', osis, flags=re.DOTALL) # semantic incongruity (ordinal -> superscript) # \tl_...\tl* osis = re.sub(r'\\tl\s+(.+?)\\tl\*', r'\1', osis, flags=re.DOTALL) # \bk_...\bk* osis = re.sub(r'\\bk\s+(.+?)\\bk\*', r'\1', osis, flags=re.DOTALL) # \k_...\k* osis = re.sub(r'\\k\s+(.+?)\\k\*', r'\1', osis, flags=re.DOTALL) # \lit osis = re.sub(r'\\lit\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|\n' + m.group(1) + u'﷓

\n', osis, flags=re.DOTALL) # \dc_...\dc* # TODO: Find an example---should this really be transChange? osis = re.sub(r'\\dc\b\s*(.+?)\\dc\*', r'\1', osis, flags=re.DOTALL) # \sls_...\sls* osis = re.sub(r'\\sls\b\s*(.+?)\\sls\*', r'/1', osis, flags=re.DOTALL) # find a better mapping than ? if relaxedConformance: # \addpn...\addpn* osis = re.sub(r'\\addpn\s+(.+?)\\addpn\*', r'\1', osis, flags=re.DOTALL) # \k# # TODO: unsure of this tag's purpose osis = re.sub(r'\\k1\s+(.+?)\\k1\*', r'\1', osis, flags=re.DOTALL) osis = re.sub(r'\\k2\s+(.+?)\\k2\*', r'\1', osis, flags=re.DOTALL) osis = re.sub(r'\\k3\s+(.+?)\\k3\*', r'\1', osis, flags=re.DOTALL) osis = re.sub(r'\\k4\s+(.+?)\\k4\*', r'\1', osis, flags=re.DOTALL) osis = re.sub(r'\\k5\s+(.+?)\\k5\*', r'\1', osis, flags=re.DOTALL) return osis def cvtCharacterStyling(osis, relaxedConformance): """ Character Styling supported: \em...\em*, \bd...\bd*, \it...\it*, \bdit...\bdit*, \no...\no*, \sc...\sc* """ # \em_...\em* osis = re.sub(r'\\em\s+(.+?)\\em\*', r'\1', osis, flags=re.DOTALL) # \bd_...\bd* osis = re.sub(r'\\bd\s+(.+?)\\bd\*', r'\1', osis, flags=re.DOTALL) # \it_...\it* osis = re.sub(r'\\it\s+(.+?)\\it\*', r'\1', osis, flags=re.DOTALL) # \bdit_...\bdit* osis = re.sub(r'\\bdit\s+(.+?)\\bdit\*', r'\1', osis, flags=re.DOTALL) # \no_...\no* osis = re.sub(r'\\no\s+(.+?)\\no\*', r'\1', osis, flags=re.DOTALL) # \sc_...\sc* osis = re.sub(r'\\sc\s+(.+?)\\sc\*', r'\1', osis, flags=re.DOTALL) return osis def cvtSpacingAndBreaks(osis, relaxedConformance): """ Spacing and Breaks supported: ~, //, \pb """ # ~ osis = osis.replace('~', '\uA0') # // osis = osis.replace('//', '') # \pb osis = re.sub(r'\\pb\s*', '\n', osis, flags=re.DOTALL) return osis def cvtSpecialFeatures(osis, relaxedConformance): """ Special Features supported: \fig...\fig*, \ndx...\ndx*, \pro...\pro*, \w...\w*, \wg...\wg*, \wh...\wh* """ # \fig DESC|FILE|SIZE|LOC|COPY|CAP|REF\fig* def makeFigure(matchObject): 
fig_desc,fig_file,fig_size,fig_loc,fig_copy,fig_cap,fig_ref = matchObject.groups() figure = '\n' if fig_ref: figure += '' + fig_ref + '\n' if fig_desc: figure += '\n' if fig_loc: figure += '\n' figure += '' return figure osis = re.sub(r'\\fig\b\s+([^\|]*)\s*\|([^\|]*)\s*\|([^\|]*)\s*\|([^\|]*)\s*\|([^\|]*)\s*\|([^\|]*)\s*\|([^\\]*)\s*\\fig\*', makeFigure, osis) # \ndx_...\ndx* # TODO tag with x-glossary instead of ? Is containerable? osis = re.sub(r'\\ndx\s+(.+?)(\s*)\\ndx\*', r'\1\2', osis, flags=re.DOTALL) # \pro_...\pro* osis = re.sub(r'([^\s]+)(\s*)\\pro\s+(.+?)(\s*)\\pro\*', r'\1\2\4', osis, flags=re.DOTALL) # \w_...\w* osis = re.sub(r'\\w\s+(.+?)(\s*)\\w\*', r'\1\2', osis, flags=re.DOTALL) # \wg_...\wg* osis = re.sub(r'\\wg\s+(.+?)(\s*)\\wg\*', r'\1\2', osis, flags=re.DOTALL) # \wh_...\wh* osis = re.sub(r'\\wh\s+(.+?)(\s*)\\wh\*', r'\1\2', osis, flags=re.DOTALL) if relaxedConformance: # \wr...\wr* osis = re.sub(r'\\wr\s+(.+?)(\s*)\\wr\*', r'\1\2', osis, flags=re.DOTALL) return osis def cvtPeripherals(osis, relaxedConformance): """ Peripherals supported: \periph """ # \periph def tagPeriph(matchObject): periphType,contents = matchObject periph = '
\n' + contents + '
\n' return periph osis = re.sub(r'\\periph\s+([^'+'\n'+r']+)\s*'+'\n'+r'(.+?)(?=(
|\\periph\s+))', tagPeriph, osis, flags=re.DOTALL) return osis def cvtStudyBibleContent(osis, relaxedConformance): """ Study Bible Content supported: \ef...\ef*, \ex...\ex*, \esb...\esbe, \cat """ # \ef...\ef* osis = re.sub(r'\\ef\s+([^\s\\]+?)\s*(.+?)\s*\\ef\*', lambda m: '' + m.group(2) + u'﷟', osis, flags=re.DOTALL) osis = re.sub(r'(]*?>.*?)', lambda m: processNote(m.group(1)), osis, flags=re.DOTALL) # \ex...\ex* osis = re.sub(r'\\ex\s+([^\s]+?)\s+(.+?)\s*\\ex\*', lambda m: '' + m.group(2) + u'﷟', osis, flags=re.DOTALL) osis = re.sub(r'(]*?type="crossReference"[^>]*>.*?)', lambda m: processXref(m.group(1)), osis, flags=re.DOTALL) # \esb...\esbex # TODO: this likely needs to go much earlier in the process osis = re.sub(r'\\esb\b\s*(.+?)\\esbe\b\s*', ur'﷕
\1
﷕'+'\n', osis, flags=re.DOTALL) # \cat_\cat* osis = re.sub(r'\\cat\b\s+(.+?)\\cat\*', r'', osis) return osis def cvtPrivateUseExtensions(osis, relaxedConformance): """ \z namespace supported: \z We can't really know what these mean, but will preserve them as elements. """ # publishing assistant markers # \zpa-xb...\zpa-xb* : \periph Book # \zpa-xc...\zpa-xc* : \periph Chapter # \zpa-xv...\zpa-xv* : \periph Verse # \zpa-xd...\zpa-xd* : \periph Description # TODO: Decide how these should actually be encoded. In lieu of that, # these can all be handled by the default \z Namespace handlers: # \z{X}...\z{X}* osis = re.sub(r'\z([^\s]+)\s(.+?)(\z\1\*)', r'\2', osis, flags=re.DOTALL) # \z{X} osis = re.sub(r'\\z([^\s]+)', r'', osis) return osis def processOsisIDs(osis): # TODO: add support for subverses, including in ranges/series, e.g. Matt.1.1!b-Matt.2.5,Matt.2.7!a # expand verse ranges, series def expandRange(vRange): vRange = re.findall(r'\d+', vRange) osisID = list() for n in range(int(vRange[0]), max(int(vRange[0]), int(vRange[1]))+1): osisID.append('$BOOK$.$CHAP$.'+str(n)) return ' '.join(osisID) osis = re.sub(r'\$BOOK\$\.\$CHAP\$\.(\d+-\d+)"', lambda m: expandRange(m.group(1))+'"', osis) def expandSeries(vSeries): vSeries = re.findall(r'\d+', vSeries) osisID = list() for n in vSeries: osisID.append('$BOOK$.$CHAP$.'+str(n)) return ' '.join(osisID) osis = re.sub(r'\$BOOK\$\.\$CHAP\$\.(\d+(,\d+)+)"', lambda m: expandSeries(m.group(1))+'"', osis) # fill in book & chapter values bookChunks = osis.split(u'﷐') osis = '' for bc in bookChunks: bookValue = re.search(r'
﷒)\n?', r'\2'+'\n'+r'\1', osis) osis = re.sub(u'([﷕﷖﷗﷘﷙]
)([^﷕﷖﷗﷘﷙]*)', r'\2\1', osis) osis = re.sub(u'(﷓

\n?﷓

)\n?(﷒)\n?', r'\2'+'\n'+r'\1'+'\n', osis) osis = re.sub(u'\n(﷒)', r'\1'+'\n', osis) osis = re.sub(u'\n*()([﷒\n]*)', r'\2\1', osis) # delete attributes from end tags (since they are invalid) osis = re.sub(r'(]+) [^>]*>', r'\1>', osis) osis = osis.replace('', '') # delete Unicode tags for c in u'﷐﷑﷒﷓﷔﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞﷟﷠﷡﷢﷣﷤﷥﷦﷧﷨﷩﷪﷫﷬﷭﷮﷯': osis = osis.replace(c, '') for endBlock in ['p', 'div', 'note', 'l', 'lg', 'chapter', 'verse', 'head', 'title', 'item', 'list']: osis = re.sub('\s+', '\n', osis) osis = re.sub('\s+<'+endBlock+'( eID=[^/>]+/>)', '<'+endBlock+r'\1'+'\n', osis) osis = re.sub(' +((]+>)+) *', r'\1 ', osis) # strip extra spaces & newlines osis = re.sub(' +', ' ', osis) osis = re.sub(' ?\n\n+', '\n', osis) return osis ### Processing starts here if encoding: osis = codecs.open(sFile, 'r', encoding).read().strip() + '\n' else: encoding = 'utf-8' osis = codecs.open(sFile, 'r', encoding).read().strip() + '\n' # \ide_ encoding = re.search(r'\\ide\s+(.+)'+'\n', osis) if encoding: encoding = encoding.group(1).lower().strip() if encoding != 'utf-8': if encoding in aliases: osis = codecs.open(sFile, 'r', encoding).read().strip() + '\n' else: print('WARNING: Encoding "' + encoding + '" unknown, processing ' + sFile + ' as UTF-8.') encoding = 'utf-8' # call individual conversion processors in series osis = cvtPreprocess(osis, relaxedConformance) osis = cvtRelaxedConformanceRemaps(osis, relaxedConformance) osis = cvtIdentification(osis, relaxedConformance) osis = cvtIntroductions(osis, relaxedConformance) osis = cvtTitles(osis, relaxedConformance) osis = cvtChaptersAndVerses(osis, relaxedConformance) osis = cvtParagraphs(osis, relaxedConformance) osis = cvtPoetry(osis, relaxedConformance) osis = cvtTables(osis, relaxedConformance) osis = cvtFootnotes(osis, relaxedConformance) osis = cvtCrossReferences(osis, relaxedConformance) osis = cvtSpecialText(osis, relaxedConformance) osis = cvtCharacterStyling(osis, relaxedConformance) osis = cvtSpacingAndBreaks(osis, 
relaxedConformance) osis = cvtSpecialFeatures(osis, relaxedConformance) osis = cvtPeripherals(osis, relaxedConformance) osis = cvtStudyBibleContent(osis, relaxedConformance) osis = cvtPrivateUseExtensions(osis, relaxedConformance) osis = processOsisIDs(osis) osis = osisReorderAndCleanup(osis) # change type on special books for sb in specialBooks: osis = osis.replace('

', '
') if DEBUG: localUnhandledTags = set(re.findall(r'(\\[^\s\*]+?\b\*?)', osis)) if localUnhandledTags: print('Unhandled USFM tags in ' + sFile + ': ' + ', '.join(localUnhandledTags) + ' (' + str(len(localUnhandledTags)) + ' total)') return osis def writeOSISHeader(oFile, workID, lang='en'): oFile.write('\n\n\n
\n\n
\n') def writeOSISFooter(oFile): oFile.write('
\n
try:
    from queue import Empty as _QueueEmpty  # Python 3
except ImportError:
    from Queue import Empty as _QueueEmpty  # Python 2


def verbosePrint(text):
    """Print text only when the module-level ``verbose`` flag is true."""
    if verbose:
        print(text)


def printUsage():
    """Print the command-line help; also lists encodings in verbose mode."""
    print('usfm2osis.py -- USFM ' + USFMversion + ' to OSIS ' + OSISversion + ' converter version ' + scriptVersion)
    print(' Revision: ' + rev + ' (' + date + ')')
    print('')
    print('Usage: usfm2osis.py <osisWork> [OPTION] ... <USFM filename|wildcard> ...')
    print('')
    print(' -d debug mode (single-threaded, verbose output)')
    print(' -e ENCODING input encoding override (default is to read the USFM file\'s')
    print(' \\ide value or assume UTF-8 encoding in its absence)')
    print(' -h, --help print this usage information')
    print(' -o FILENAME output filename (default is: <osisWork>.osis.xml)')
    print(' -r enable relaxed markup processing (for non-standard USFM)')
    print(' -s mode set book sorting mode: natural (default), alpha, canonical, none')
    print(' -v verbose feedback')
    print('')
    print('As an example, if you want to generate the osisWork and your USFM')
    print(' are located in the ./KJV folder, enter:')
    print(' python usfm2osis.py Bible.KJV ./KJV/*.usfm')
    verbosePrint('')
    verbosePrint('Supported encodings: ' + ', '.join(aliases))


class Worker(multiprocessing.Process):
    """Process that converts queued USFM files and posts (job, osis) results."""

    def __init__(self, work_queue, result_queue):
        # base class initialization
        multiprocessing.Process.__init__(self)

        # job management stuff
        self.work_queue = work_queue
        self.result_queue = result_queue
        self.kill_received = False

    def run(self):
        """Convert jobs from the work queue until it reports empty."""
        while not self.kill_received:
            # get a task; an empty queue means there is nothing left to do
            try:
                job = self.work_queue.get_nowait()
            except _QueueEmpty:
                break

            # the actual processing
            osis = convertToOSIS(job)

            # store the result, keyed by the job so the parent can reorder
            self.result_queue.put((job, osis))


def parseCommandLine(argv):
    """Parse a usfm2osis command line without touching any global state.

    The expected layout is ``script osisWork [options] files...``: every
    recognised option shifts the index at which the USFM file list starts
    (flags by one, options that take a value by two).

    Returns None when usage information should be shown (``-h``, ``--help``
    or fewer than three arguments); otherwise a dict with the keys
    ``osisWork``, ``outputFile``, ``encoding``, ``relaxed``, ``debug``,
    ``verbose``, ``sortMode`` ('alpha', 'natural', 'canonical' or 'none')
    and ``files``.

    Raises ValueError when ``-o``, ``-e`` or ``-s`` is the last argument and
    therefore has no value.
    """
    if '-h' in argv or '--help' in argv or len(argv) < 3:
        return None

    debug = '-d' in argv
    options = {
        'osisWork': argv[1],
        'outputFile': argv[1] + '.osis.xml',
        'encoding': '',
        'relaxed': '-r' in argv,
        'debug': debug,
        'verbose': debug or '-v' in argv,  # debug mode implies verbose
        'sortMode': 'natural',
    }

    # argv[0] is the script and argv[1] the work ID, so files start at 2
    # unless options are present.
    inputFilesIdx = 2
    for flag in ('-v', '-d', '-r'):
        if flag in argv:
            inputFilesIdx += 1

    for flag, key in (('-o', 'outputFile'), ('-e', 'encoding'), ('-s', 'sortMode')):
        if flag in argv:
            valueIdx = argv.index(flag) + 1
            if valueIdx >= len(argv):
                raise ValueError('Option ' + flag + ' requires a value.')
            options[key] = argv[valueIdx]
            inputFilesIdx += 2  # the flag itself plus its value

    if '-s' in argv:
        # Same precedence as the historical checks: 'a', then 'na', then 'c';
        # anything else leaves the files unsorted.
        requested = options['sortMode']
        if requested.startswith('a'):
            options['sortMode'] = 'alpha'
        elif requested.startswith('na'):
            options['sortMode'] = 'natural'
        elif requested.startswith('c'):
            options['sortMode'] = 'canonical'
        else:
            options['sortMode'] = 'none'

    options['files'] = argv[inputFilesIdx:]
    return options


if __name__ == "__main__":
    # These module-level names are read by verbosePrint() and by the
    # conversion code, so they are set before anything else runs.
    DEBUG = '-d' in sys.argv
    verbose = DEBUG or '-v' in sys.argv
    encoding = ''
    relaxedConformance = False
    usfmDocList = list()

    # debug mode is single-threaded
    num_processes = 1 if DEBUG else multiprocessing.cpu_count()
    num_jobs = num_processes

    try:
        options = parseCommandLine(sys.argv)
    except ValueError as err:
        print('ERROR: ' + str(err))
        printUsage()
        sys.exit(2)

    if options is None:
        printUsage()
    else:
        OSISwork = options['osisWork']
        OSISfileName = options['outputFile']
        encoding = options['encoding']

        if options['relaxed']:
            relaxedConformance = True
            mergedBooks = dict(bookDict)
            mergedBooks.update(addBookDict)
            bookDict = mergedBooks

        # The file list must exist before a sort helper is bound to it.
        usfmDocList = options['files']

        sortMode = options['sortMode']
        if sortMode == 'alpha':
            sortHelper = keynat  # TODO: write appropriate helpers
            print('Sorting book files alphanumerically.')
        elif sortMode == 'natural':
            sortHelper = keynat
            print('Sorting book files naturally.')
        elif sortMode == 'canonical':
            sortHelper = keynat  # TODO: write appropriate helpers
            print('Sorting book files canonically.')
        else:
            sortHelper = usfmDocList.index
            print('Leaving book files unsorted.')

        # One ordering is used both for queueing and for writing the output.
        orderedDocs = sorted(usfmDocList, key=sortHelper)

        with codecs.open(OSISfileName, 'w', 'utf-8') as OSISfile:
            writeOSISHeader(OSISfile, OSISwork)

            # load up work queue
            work_queue = multiprocessing.Queue()
            for job in orderedDocs:
                work_queue.put(job)

            # create a queue to pass to workers to store the results
            result_queue = multiprocessing.Queue()

            # spawn workers
            for i in range(num_processes):
                worker = Worker(work_queue, result_queue)
                worker.start()

            # collect the results off the queue (one result per input file)
            osisSegment = dict()
            for i in usfmDocList:
                k, v = result_queue.get()
                osisSegment[k] = v

            unhandledTags = set()
            for doc in orderedDocs:
                unhandledTags |= set(re.findall(r'(\\[^\s\*]+?\b\*?)', osisSegment[doc]))
                OSISfile.write(osisSegment[doc])

            writeOSISFooter(OSISfile)

        if unhandledTags:
            if verbose:
                print('')
            print('Unhandled USFM tags: ' + ', '.join(sorted(unhandledTags)) + ' (' + str(len(unhandledTags)) + ' total)')
            if not relaxedConformance:
                print('Consider using the -r option for relaxed markup processing.')