#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Keyword placeholders in $Keyword$ form; presumably expanded by the version
# control system on checkout (TODO confirm). `date` and `rev` are trimmed to
# their bare values further down in this file.
date = '$Date$'
rev = '$Rev$'
id = '$Id$'  # NOTE(review): shadows the builtin id() at module level

# Versions of the specifications this converter targets, and of the script itself.
USFMversion = '2.35'  # http://ubs-icap.org/chm/usfm/2.35/index.html
OSISversion = '2.1.1' # http://www.bibletechnologies.net/osisCore.2.1.1.xsd
scriptVersion = '0.5'

# usfm2osis.py
# Copyright 2012 by the CrossWire Bible Society <http://www.crosswire.org/>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# The full text of the GNU General Public License is available at:
# <http://www.gnu.org/licenses/gpl-3.0.txt>.


### Guidelines & objectives:
# Target Python 2.7+ (but support 3.2 if possible)
# Use no non-default libraries (this may change in the future)
# Don't use SWORD bindings (this will probably change to allow *optional* use of bindings, if installed)
# Achieve full coverage of USFM according to UBS spec:
#      <http://paratext.ubs-translations.org/about/usfm>
# Employ best-practice conformant OSIS
# Employ modularity (functions rather than a big long script)
# Employ the same command-line syntax as usfm2osis.pl
# Use non-characters for milestoning

### Roadmap:
# 0.5 initial commit, including full coverage of core USFM tags
# 0.6 file sorting options (natural/alphabetic/canonical/none); expand sub-verses with ! in osisIDs; Python3 compatibility; add optional schema validator (lxml probably); docstrings; unittest; make fully OO; PyDev project?
# 0.7 test suite incorporating all USFM examples from UBS ICAP and other complex cases
# 0.8 more clean-up & re-ordering to correctly encapsulate milestones within appropriate containers; clear remaining TODO items, to the extent possible
# 1.0 feature complete for release & production use
# 1.x xreffix.pl-functionality (osisParse(ref)), requiring SWORD bindings
# 1.x SWORD-mode output?
# 1.x IMP output?
# 1.x SWORD module output?, requiring SWORD bindings

### TODO for next milestone:
# file sorting options (natural/alphabetic/canonical/none)
# expand sub-verses with ! in osisIDs
# Python3 compatibility
# add optional schema validator (lxml probably)
# document functions (docstrings)
# unittest
# make fully OO
# PyDev project? 

### Key to non-characters:
# Used   : ﷐﷑﷒﷓﷔﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞﷟﷠﷡
# Unused : ﷧﷨﷩﷪﷫﷬﷭﷮﷯
# ﷐ book
# ﷑ chapter
# ﷒ verse
# ﷓ paragraph
# ﷔ title
# ﷕ ms1
# ﷖ ms2
# ﷗ ms3
# ﷘ ms4
# ﷙ ms5
# ﷚ s1
# ﷛ s2
# ﷜ s3
# ﷝ s4
# ﷞ s5
# ﷟ notes
# ﷠ intro-list
# ﷡ intro-outline
# ﷢ is1
# ﷣ is2
# ﷤ is3
# ﷥ is4
# ﷦ is5

# ﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞ sections

import sys, codecs, re
from encodings.aliases import aliases
import multiprocessing, Queue

# Reduce the keyword strings to bare values: drop the '$' delimiters, strip
# surrounding whitespace, then slice off the leading label. The offsets
# presumably correspond to "Date: " (6 chars, keeping the next 10) and
# "Rev: " (5 chars) -- TODO confirm against an expanded keyword.
# With unexpanded placeholders the date slice yields an empty string.
date = date.replace('$', '').strip()[6:16]
rev = rev.replace('$', '').strip()[5:]

# Maps three-character USFM book codes (as they appear after \id) to the book
# identifiers this script writes into OSIS output (e.g. the osisID of the
# book-level <div>).
bookDict = {
    ### Known USFM Book codes from Paratext
    ### Cf. http://ubs-icap.org/chm/usfm/2.35/index.html?book_codes.htm
    # OT
    'GEN':'Gen', 'EXO':'Exod', 'LEV':'Lev', 'NUM':'Num', 'DEU':'Deut', 'JOS':'Josh', 'JDG':'Judg', 'RUT':'Ruth',
    '1SA':'1Sam', '2SA':'2Sam', '1KI':'1Kgs', '2KI':'2Kgs', '1CH':'1Chr', '2CH':'2Chr', 'EZR':'Ezra', 'NEH':'Neh',
    'EST':'Esth', 'JOB':'Job', 'PSA':'Ps', 'PRO':'Prov', 'ECC':'Eccl', 'SNG':'Song', 'ISA':'Isa', 'JER':'Jer',
    'LAM':'Lam', 'EZK':'Ezek', 'DAN':'Dan', 'HOS':'Hos', 'JOL':'Joel', 'AMO':'Amos', 'OBA':'Obad', 'JON':'Jonah',
    'MIC':'Mic', 'NAM':'Nah', 'HAB':'Hab', 'ZEP':'Zeph', 'HAG':'Hag', 'ZEC':'Zech', 'MAL':'Mal',
    # NT
    'MAT':'Matt', 'MRK':'Mark', 'LUK':'Luke', 'JHN':'John', 'ACT':'Acts', 'ROM':'Rom', '1CO':'1Cor', '2CO':'2Cor',
    'GAL':'Gal', 'EPH':'Eph', 'PHP':'Phil', 'COL':'Col', '1TH':'1Thess', '2TH':'2Thess', '1TI':'1Tim', '2TI':'2Tim',
    'TIT':'Titus', 'PHM':'Phlm', 'HEB':'Heb', 'JAS':'Jas', '1PE':'1Pet', '2PE':'2Pet', '1JN':'1John', '2JN':'2John',
    '3JN':'3John', 'JUD':'Jude', 'REV':'Rev',
    # DC - Catholic
    'TOB':'Tob', 'JDT':'Jdt', 'ESG':'EsthGr', 'WIS':'Wis', 'SIR':'Sir', 'BAR':'Bar', 'LJE':'EpJer', 'S3Y':'PrAzar',
    'SUS':'Sus', 'BEL':'Bel', '1MA':'1Macc', '2MA':'2Macc',
    # DC - Eastern Orthodox
    '3MA':'3Macc', '4MA':'4Macc', '1ES':'1Esd', '2ES':'2Esd', 'MAN':'PrMan', 'PS2':'Ps151',
    # Rahlfs' LXX
    'ODA':'Odes', 'PSS':'PssSol',
    # Esdrae
    'EZA':'4Ezra', '5EZ':'5Ezra', '6EZ':'6Ezra',
    # Inconsistency with Esther
    'DAG':'DanGr',
    # Syriac
    'PS3':'5ApocSyrPss', '2BA':'2Bar', 'LBA':'EpBar',
    # Ethiopic
    'JUB':'Jub', 'ENO':'1En', '1MQ':'1Meq', '2MQ':'2Meq', '3MQ':'3Meq', 'REP':'Reproof', '4BA':'4Bar',
    # Vulgate
    'LAO':'EpLao',

    # Additional non-biblical books
    'XXA':'XXA', 'XXB':'XXB', 'XXC':'XXC', 'XXD':'XXD', 'XXE':'XXE', 'XXF':'XXF', 'XXG':'XXG',

    # Peripheral books
    'FRT':'FRONT', 'INT':'INTRODUCTION', 'BAK':'BACK', 'CNC':'CONCORDANCE', 'GLO':'GLOSSARY',
    'TDX':'INDEX', 'NDX':'GAZETTEER', 'OTH':'X-OTHER'
    }

# Supplementary book-code table in the same code -> identifier form as
# bookDict: deprecated codes plus proposed additions and replacements.
# NOTE(review): 'LBA' is mapped here to 'Barn' but to 'EpBar' in bookDict;
# how the two tables are combined is not visible in this part of the file.
addBookDict = {
    ### Deprecated
    # Rahlfs
    'JSA':'JoshA', 'JDB':'JudgB', 'TBS':'TobS', 'SST':'SusTh', 'DNT':'DanTh', 'BLT':'BelTh',
    # Esdrae
    '4ES':'4Ezra', '5ES':'5Ezra', '6ES':'6Ezra',


    ### Proposed Additions <http://lc.bfbs.org.uk/e107_files/downloads/canonicalissuesinparatext.pdf>
    # Alternate Psalms
    'PSB':'Ps',
    # Vulgate
    'PSO':'PrSol', 'PJE':'PrJer',
    # Armenian
    'WSI':'WSir', 'COP':'CorCorr', '3CO':'3Cor', 'EUT':'PrEut', 'DOJ':'DJohn',
    # Apostolic Fathers
    '1CL':'1Clem', '2CL':'2Clem', 'SHE':'Herm', 'LBA':'Barn', 'DID':'Did',
    ###
    # Proposed replacements <http://lc.bfbs.org.uk/e107_files/downloads/canonicalissuesinparatext.pdf>
    'ODE':'Odes',

    # Additional biblical books
    'ADE':'AddEsth'
    }

# USFM book codes in the order used for canonical sorting.
# Every entry is a three-character code; keep a comma after every line so that
# adjacent string literals are never silently concatenated.
# NOTE(review): 'LBA' is listed twice (Syriac and Apostolic Fathers); the
# first occurrence is the one tuple.index() reports.
canonicalOrder = (
    # OT
    'GEN', 'EXO', 'LEV', 'NUM', 'DEU', 'JOS', 'JDG', 'RUT', '1SA', '2SA', '1KI', '2KI', '1CH', '2CH', 'EZR', 'NEH',
    'EST', 'JOB', 'PSA', 'PRO', 'ECC', 'SNG', 'ISA', 'JER', 'LAM', 'EZK', 'DAN', 'HOS', 'JOL', 'AMO', 'OBA', 'JON',
    'MIC', 'NAM', 'HAB', 'ZEP', 'HAG', 'ZEC', 'MAL',
    # DC - Catholic
    'TOB', 'JDT', 'ESG', 'ADE', 'WIS', 'SIR', 'PSS', 'BAR', 'LJE', 'DAG', 'S3Y', 'SUS', 'BEL', '1MA', '2MA',
    # DC - Eastern Orthodox
    '1ES', 'MAN', 'PS2', '3MA', '2ES', '4MA',
    # NT
    'MAT', 'MRK', 'LUK', 'JHN', 'ACT', 'ROM', '1CO', '2CO', 'GAL', 'EPH', 'PHP', 'COL', '1TH', '2TH', '1TI', '2TI',
    'TIT', 'PHM', 'HEB', 'JAS', '1PE', '2PE', '1JN', '2JN', '3JN', 'JUD', 'REV',
    # Rahlfs' LXX
    'ODA', 'ODE',
    # Esdrae
    'EZA', '5EZ', '6EZ',
    # Inconsistency with Esther

    # Syriac
    'PS3', '2BA', 'LBA',
    # Ethiopic
    'JUB', 'ENO', '1MQ', '2MQ', '3MQ', 'REP', '4BA',
    # Vulgate
    'LAO',

    # Additional non-biblical books
    'XXA', 'XXB', 'XXC', 'XXD', 'XXE', 'XXF', 'XXG',

    # Peripheral books
    'FRT', 'INT', 'BAK', 'CNC', 'GLO',
    'TDX', 'NDX', 'OTH',  # trailing comma required: 'OTH' used to fuse with 'JSA' below
    ### Deprecated
    # Rahlfs
    'JSA', 'JDB', 'TBS', 'SST', 'DNT', 'BLT',
    # Esdrae
    '4ES', '5ES', '6ES',

    # Alternate Psalms
    'PSB',
    # Vulgate
    'PSO', 'PJE',
    # Armenian
    'WSI', 'COP', '3CO', 'EUT', 'DOJ',
    # Apostolic Fathers
    '1CL', '2CL', 'SHE', 'LBA', 'DID',
    ###
    # Proposed replacements <http://lc.bfbs.org.uk/e107_files/downloads/canonicalissuesinparatext.pdf>
    )

# The identifiers that bookDict assigns to the peripheral (non-scripture)
# book codes FRT, INT, BAK, CNC, GLO, TDX, NDX and OTH.
specialBooks = ['FRONT', 'INTRODUCTION', 'BACK', 'CONCORDANCE', 'GLOSSARY', 'INDEX', 'GAZETTEER', 'X-OTHER']

# Maps a peripheral division title to a type string; values prefixed "x-" are
# extension types. Presumably the keys are the titles that follow \periph and
# the values become OSIS <div type="..."> values -- the consumer is not
# visible in this part of the file (TODO confirm).
peripherals = {
    'Title Page':'titlePage', 'Half Title Page':'x-halfTitlePage', 'Promotional Page':'x-promotionalPage',
    'Imprimatur':'imprimatur', 'Publication Data':'publicationData', 'Foreword':'x-foreword', 'Preface':'preface',
    'Table of Contents':'tableofContents', 'Alphabetical Contents':'x-alphabeticalContents',
    'Table of Abbreviations':'x-tableofAbbreviations', 'Chronology':'x-chronology',
    'Weights and Measures':'x-weightsAndMeasures', 'Map Index':'x-mapIndex',
    'NT Quotes from LXX':'x-ntQuotesFromLXX',
    'Cover':'coverPage',
    'Spine':'x-spine'
    }

# Maps an introduction title to a short scope name (bible, oldTestament, ...).
# Presumably used like `peripherals` for introduction divisions -- TODO confirm.
introPeripherals = {
    'Bible Introduction':'bible', 'Old Testament Introduction':'oldTestament',
    'Pentateuch Introduction':'pentateuch', 'History Introduction':'history', 'Poetry Introduction':'poetry',
    'Prophecy Introduction':'prophecy', 'New Testament Introduction':'newTestament',
    'Gospels Introduction':'gospels', 'Acts Introduction':'acts', 'Epistles Introduction':'epistles',
    'Letters Introduction':'letters', 'Deuterocanon Introduction':'deuterocanon'
    }

# Lookup tables between OSIS book identifiers and localized book names;
# both start empty and are filled in during conversion.
osis2locBk = {}
loc2osisBk = {}
# Verbose-output flag; starts out False.
verbose = False
# True when this interpreter's largest code point lies beyond U+FFFF.
ucs4 = sys.maxunicode > 0xFFFF

"""
BEGIN PSF-licensed segment
"""
"""
keynat from http://code.activestate.com/recipes/285264-natural-string-sorting/
"""
def keynat(string):
    r'''Build a natural-sort key for use with sort() and sorted().

    Each run of consecutive digits collapses into one integer and every
    other character is lower-cased; neither regular expressions nor
    exception handling are involved.

    The ordering shown below relies on integers and strings being mutually
    comparable inside the key lists.

    >>> items = ('Z', 'a', '10th', '1st', '9')
    >>> sorted(items)
    ['10th', '1st', '9', 'Z', 'a']
    >>> sorted(items, key=keynat)
    ['1st', '9', '10th', 'a', 'Z']
    '''
    key = []
    for char in string:
        if not char.isdigit():
            key.append(char.lower())
            continue
        digit = int(char)
        # extend the number being built if the previous element is one
        if key and type(key[-1]) is int:
            key[-1] = key[-1] * 10 + digit
        else:
            key.append(digit)
    return key
"""
END PSF-licensed segment
"""

def convertToOSIS(sFile):
    global encoding
    global relaxedConformance

    verbosePrint('Processing: ' + sFile)

    def cvtPreprocess(osis, relaxedConformance):
        r"""Normalize raw USFM text before any marker is converted.

        Steps, in order: convert CR to LF, fold every line that does not
        start with a backslash marker into the preceding line, drop
        whitespace at line ends, and XML-escape ``&``, ``<`` and ``>``.

        osis -- the USFM text to clean up
        relaxedConformance -- unused here; accepted so that this function has
            the same signature as the other cvt* steps

        Returns the cleaned-up text.
        """
        # convert CR to LF first, so the line-based rules below also apply to
        # the line breaks of CR and CRLF input
        osis = osis.replace('\r', '\n')
        # lines should never start with non-tags: join such a line to the one
        # before it. The pattern must be a raw string -- as a plain literal the
        # class collapsed to "not backslash, not the letter s", so continuation
        # lines beginning with "s" were left unjoined.
        osis = re.sub(r'\n\s*([^\\\s])', r' \1', osis)  # TODO: test this
        # lines should never end with whitespace (other than \n)
        osis = re.sub(r'\s+\n', '\n', osis)
        # XML-encode as necessary ('&' first, so the entities added for
        # '<' and '>' are not escaped a second time)
        osis = osis.replace('&', '&amp;')
        osis = osis.replace('<', '&lt;')
        osis = osis.replace('>', '&gt;')

        #osis = re.sub('\n'+r'(\\[^\s]+\b\*)', r' \1', osis)

        return osis


    def cvtRelaxedConformanceRemaps(osis, relaxedConformance):
        r"""Rewrite deprecated markers into their current equivalents.

        Does nothing unless relaxedConformance is true. Otherwise numbered
        \tr# markers become \tr, the old peripheral markers become
        "\periph <title>" lines, and the old book-level markers become
        "\id <CODE>" lines.

        osis -- the USFM text
        relaxedConformance -- enables the remapping when true

        Returns the (possibly rewritten) text.
        """
        if not relaxedConformance:
            return osis

        # \tr#: DEP: map to \tr
        osis = re.sub(r'\\tr\d\b', r'\\tr', osis)

        # (pattern, literal replacement) pairs, applied in this order
        remaps = (
            # remapped 2.0 periphs
            (r'\\pub\b\s', '\\periph Publication Data\n'),
            # \toc : \periph Table of Contents
            (r'\\toc\b\s', '\\periph Table of Contents\n'),
            (r'\\pref\b\s', '\\periph Preface\n'),
            (r'\\maps\b\s', '\\periph Map Index\n'),
            (r'\\cov\b\s', '\\periph Cover\n'),
            (r'\\spine\b\s', '\\periph Spine\n'),
            (r'\\pubinfo\b\s', '\\periph Publication Information\n'),
            # remapped books
            (r'\\intro\b\s', '\\id INT\n'),
            (r'\\conc\b\s', '\\id CNC\n'),
            (r'\\glo\b\s', '\\id GLO\n'),
            (r'\\idx\b\s', '\\id TDX\n'),
        )
        for pattern, replacement in remaps:
            # A callable replacement is inserted verbatim. Passing these
            # strings as templates makes re parse "\p" and "\i" as escapes,
            # which newer Python versions reject as a bad escape.
            osis = re.sub(pattern, lambda m, text=replacement: text, osis)

        return osis


    def cvtIdentification(osis, relaxedConformance):
        r"""
        Identification
        supported: \id, \ide, \sts, \rem, \h, \toc1, \toc2, \toc3

        Wraps the book in a <div type="book"> milestone pair, turns the
        identification markers into titles, milestones and XML comments, and
        records the \toc3 book name in the global loc2osisBk / osis2locBk
        tables.

        osis -- the USFM text
        relaxedConformance -- when true, \restore is handled as well

        Returns the converted text. Raises KeyError if the \id code is not a
        key of bookDict.
        """
        global loc2osisBk, osis2locBk
        # Look up the OSIS book abbreviation *before* the \id line is
        # rewritten below; afterwards the marker is no longer in the text.
        # It is needed for the \toc3 processing, which stores the mapping of
        # localized book names to/from OSIS.
        osisBook = re.search(r'\\id\s+([A-Z0-9]{3})', osis)
        if osisBook:
            osisBook = bookDict[osisBook.group(1)]

        # \id_<CODE>_(Name of file, Book name, Language, Last edited, Date etc.)
        osis = re.sub(
            r'\\id\s+([A-Z0-9]{3})\b\s*([^\\'+'\n'+']*?)'+'\n'+r'(.*)(?=\\id|$)',
            lambda m: u'﷐<div type="book" osisID="' + bookDict[m.group(1)] + '">\n'
                      + (('<!-- id comment - ' + m.group(2) + ' -->\n') if m.group(2) else '')
                      + m.group(3) + u'</div type="book">﷐\n',
            osis, flags=re.DOTALL)

        # \ide_<ENCODING>
        osis = re.sub(r'\\ide\b.*'+'\n', '', osis) # delete, since this was handled above

        # \sts_<STATUS CODE>
        osis = re.sub(r'\\sts\b\s+(.+)\s*'+'\n', r'<milestone type="x-usfm-sts" n="\1"/>'+'\n', osis)

        # \rem_text...
        osis = re.sub(r'\\rem\b\s+(.+)', r'<!-- rem - \1 -->', osis)

        # \restore_text...
        if relaxedConformance:
            osis = re.sub(r'\\restore\b\s+(.+)', r'<!-- restore - \1 -->', osis)

        # \h#_text...
        osis = re.sub(r'\\h\b\s+(.+)\s*'+'\n', r'<title type="runningHead">\1</title>'+'\n', osis)
        osis = re.sub(r'\\h(\d)\b\s+(.+)\s*'+'\n', r'<title type="runningHead" n="\1">\2</title>'+'\n', osis)

        # \toc1_text...
        osis = re.sub(r'\\toc1\b\s+(.+)\s*'+'\n', r'<milestone type="x-usfm-toc1" n="\1"/>'+'\n', osis)

        # \toc2_text...
        osis = re.sub(r'\\toc2\b\s+(.+)\s*'+'\n', r'<milestone type="x-usfm-toc2" n="\1"/>'+'\n', osis)

        # \toc3_text...
        locBook = re.search(r'\\toc3\b\s+(.+)\s*'+'\n', osis)
        if locBook:
            locBook = locBook.group(1)
            if osisBook:
                osis2locBk[osisBook]=locBook
                loc2osisBk[locBook]=osisBook
        # A template string, not a function: a function's return value is
        # inserted as-is, which left a literal "\1" in the n attribute.
        osis = re.sub(r'\\toc3\b\s+(.+)\s*'+'\n', r'<milestone type="x-usfm-toc3" n="\1"/>'+'\n', osis)

        return osis


    def cvtIntroductions(osis, relaxedConformance):
        r"""
        Introductions
        supported: \imt#, \is#, \ip, \ipi, \im, \imi, \ipq, \imq, \ipr, \iq#, \ib, \ili#, \iot, \io#, \ior...\ior*, \iex, \iqt...\iqt*, \imte, \ie

        Converts the introduction markers into OSIS titles, section divs,
        paragraphs, poetry lines, lists and outlines, each tagged with
        subType="x-introduction" where the original did so.

        osis -- the text being converted
        relaxedConformance -- unused here; accepted so that this function has
            the same signature as the other cvt* steps

        Returns the converted text.
        """
        # \imt#_text...
        osis = re.sub(r'\\imt(\d?)\s+(.+)', lambda m: '<title ' + ('level="'+m.group(1)+'" ' if m.group(1) else '') + 'type="main" subType="x-introduction">' + m.group(2) + '</title>', osis)

        # \imte#_text...
        osis = re.sub(r'\\imte(\d?)\b\s+(.+)', lambda m: '<title ' + ('level="'+m.group(1)+'" ' if m.group(1) else '') + 'type="main" subType="x-introduction-end">' + m.group(2) + '</title>', osis)

        # \is#_text...
        osis = re.sub(r'\\is1?\s+(.+)', lambda m: u'﷚<div type="section" subType="x-introduction"><title>' + m.group(1) + '</title>', osis)
        osis = re.sub(u'(﷚[^﷕﷐﷖﷗﷘﷙﷚]+)', r'\1'+u'</div>﷚\n', osis, flags=re.DOTALL)
        osis = re.sub(r'\\is2\s+(.+)', lambda m: u'﷛<div type="subSection" subType="x-introduction"><title>' + m.group(1) + '</title>', osis)
        osis = re.sub(u'(﷛[^﷕﷐﷖﷗﷘﷙﷚﷛]+)', r'\1'+u'</div>﷛\n', osis, flags=re.DOTALL)
        osis = re.sub(r'\\is3\s+(.+)', lambda m: u'﷜<div type="x-subSubSection" subType="x-introduction"><title>' + m.group(1) + '</title>', osis)
        osis = re.sub(u'(﷜[^﷕﷐﷖﷗﷘﷙﷚﷛﷜]+)', r'\1'+u'</div>﷜\n', osis, flags=re.DOTALL)
        osis = re.sub(r'\\is4\s+(.+)', lambda m: u'﷝<div type="x-subSubSubSection" subType="x-introduction"><title>' + m.group(1) + '</title>', osis)
        osis = re.sub(u'(﷝[^﷕﷐﷖﷗﷘﷙﷚﷛﷜﷝]+)', r'\1'+u'</div>﷝\n', osis, flags=re.DOTALL)
        osis = re.sub(r'\\is5\s+(.+)', lambda m: u'﷞<div type="x-subSubSubSubSection" subType="x-introduction"><title>' + m.group(1) + '</title>', osis)
        osis = re.sub(u'(﷞[^﷕﷐﷖﷗﷘﷙﷚﷛﷜﷝﷞]+)', r'\1'+u'</div>﷞\n', osis, flags=re.DOTALL)

        # what ends an introduction paragraph (shared by all paragraph kinds)
        paraEnd = r'(?=(\\(i?m|i?p|nb|lit|cls|tr|io|iq|i?li|iex?|s)|<chapter eID|</?div\b|<(p|closer)\b))'

        # \ip_text...
        osis = re.sub(r'\\ip\s+(.*?)' + paraEnd, lambda m: u'﷓<p subType="x-introduction">\n' + m.group(1) + u'﷓</p>\n', osis, flags=re.DOTALL)

        # \ipi_text...
        # \im_text...
        # \imi_text...
        # \ipq_text...
        # \imq_text...
        # \ipr_text...
        pType = {'ipi':'x-indented', 'im':'x-noindent', 'imi':'x-noindent-indented', 'ipq':'x-quote', 'imq':'x-noindent-quote', 'ipr':'x-right'}
        # 'imi' has an entry in pType above but was missing from this alternation
        osis = re.sub(r'\\(ipi|im|imi|ipq|imq|ipr)\s+(.*?)' + paraEnd, lambda m: u'﷓<p type="' + pType[m.group(1)]  + '" subType="x-introduction">\n' + m.group(2) + u'﷓</p>\n', osis, flags=re.DOTALL)

        # \iq#_text...
        lineEnd = r'(?=(['+u'﷐﷑﷓﷔'+r']|\\i?q[\d\s]|\\fig|<l\b|<lb\b|<title\b))'
        osis = re.sub(r'\\iq\b\s*(.*?)' + lineEnd, r'<l level="1" subType="x-introduction">\1</l>', osis, flags=re.DOTALL)
        osis = re.sub(r'\\iq(\d)\b\s*(.*?)' + lineEnd, r'<l level="\1" subType="x-introduction">\2</l>', osis, flags=re.DOTALL)

        # \ib
        osis = re.sub(r'\\ib\b\s?', '<lb type="x-p"/>', osis)
        osis = osis.replace('\n</l>', '</l>\n')
        osis = re.sub(u'(<l [^﷐﷑﷓﷔]+</l>)', r'<lg>\1</lg>', osis, flags=re.DOTALL)
        osis = re.sub('(<lg>.+?</lg>)', lambda m: m.group(1).replace('<lb type="x-p"/>', '</lg><lg>'), osis, flags=re.DOTALL) # re-handle \b that occurs within <lg>

        # \ili#_text...
        # (templates are assembled from u'' and r'' pieces; the combined ur''
        # prefix does not exist in Python 3)
        listEnd = r'(?=(['+u'﷐﷑﷓﷔'+r']|\\ili[\d\s]|<lb\b|<title\b|<item\b))'
        osis = re.sub(r'\\ili\b\s*(.*?)' + listEnd, u'<item type="x-indent-1" subType="x-introduction">﷠' + r'\1' + u'﷠</item>', osis, flags=re.DOTALL)
        osis = re.sub(r'\\ili(\d)\b\s*(.*?)' + listEnd, r'<item type="x-indent-\1" subType="x-introduction">' + u'﷠' + r'\2' + u'﷠</item>', osis, flags=re.DOTALL)
        osis = osis.replace('\n</item>', '</item>\n')
        osis = re.sub(u'(<item [^﷐﷑﷓﷔]+</item>)', u'﷓<list>' + r'\1' + u'</list>﷓', osis, flags=re.DOTALL)

        # \iot_text...
        # \io#_text...(references range)
        outlineEnd = r'(?=(['+u'﷐﷑﷓﷔'+r']|\\io[t\d\s]|\\iex?|<lb\b|<title\b|<item\b))'
        osis = re.sub(r'\\io\b\s*(.*?)' + outlineEnd, u'<item type="x-indent-1" subType="x-introduction">﷡' + r'\1' + u'﷡</item>', osis, flags=re.DOTALL)
        osis = re.sub(r'\\io(\d)\b\s*(.*?)' + outlineEnd, r'<item type="x-indent-\1" subType="x-introduction">' + u'﷡' + r'\2' + u'﷡</item>', osis, flags=re.DOTALL)
        osis = re.sub(r'\\iot\b\s*(.*?)' + outlineEnd, u'<item type="head">﷡' + r'\1' + u'﷡</item type="head">', osis, flags=re.DOTALL)
        osis = osis.replace('\n</item>', '</item>\n')
        # the intro-list marker is excluded so that \ili lists are not wrapped again
        osis = re.sub(u'(<item [^﷐﷑﷓﷔﷠]+</item>)', u'﷓<div type="outline"><list>' + r'\1' + u'</list></div>﷓', osis, flags=re.DOTALL)
        osis = re.sub('item type="head"', 'head', osis)

        # \ior_text...\ior*
        osis = re.sub(r'\\ior\b\s+(.+?)\\ior\*', r'<reference>\1</reference>', osis, flags=re.DOTALL)

        # \iex    # TODO: look for example; I have no idea what this would look like in context
        # The text runs up to the next \c or the end of the book. The
        # lookahead had lost its opening "(" and so demanded a literal "=".
        osis = re.sub(r'\\iex\b\s*(.+?)(?=(\s*(\\c|</div type="book">'+u'﷐)))', r'<div type="bridge">\1</div>', osis, flags=re.DOTALL)

        # \iqt_text...\iqt*
        osis = re.sub(r'\\iqt\s+(.+?)\\iqt\*', r'<q subType="x-introduction">\1</q>', osis, flags=re.DOTALL)

        # \ie
        osis = re.sub(r'\\ie\b\s*', '<milestone type="x-usfm-ie"/>', osis)

        return osis


    def cvtTitles(osis, relaxedConformance):
        r"""
        Titles, Headings, and Labels
        supported: \mt#, \mte#, \ms#, \mr, \s#, \sr, \r, \rq...\rq*, \d, \sp

        Converts heading markers into OSIS <title> elements and opens/closes
        the majorSection and (sub)section divs, using the non-character
        markers to find where each div ends.

        osis -- the text being converted
        relaxedConformance -- when true, \ss and \sss are accepted as
            \s2 and \s3

        Returns the converted text.
        """
        # \ms#_text...
        osis = re.sub(r'\\ms1?\s+(.+)', lambda m: u'﷕<div type="majorSection"><title>' + m.group(1) + '</title>', osis)
        osis = re.sub(u'(﷕[^﷕﷐]+)', r'\1'+u'</div>﷕\n', osis, flags=re.DOTALL)
        osis = re.sub(r'\\ms2\s+(.+)', lambda m: u'﷖<div type="majorSection" n="2"><title>' + m.group(1) + '</title>', osis)
        osis = re.sub(u'(﷖[^﷕﷐﷖]+)', r'\1'+u'</div>﷖\n', osis, flags=re.DOTALL)
        osis = re.sub(r'\\ms3\s+(.+)', lambda m: u'﷗<div type="majorSection" n="3"><title>' + m.group(1) + '</title>', osis)
        osis = re.sub(u'(﷗[^﷕﷐﷖﷗]+)', r'\1'+u'</div>﷗\n', osis, flags=re.DOTALL)
        osis = re.sub(r'\\ms4\s+(.+)', lambda m: u'﷘<div type="majorSection" n="4"><title>' + m.group(1) + '</title>', osis)
        osis = re.sub(u'(﷘[^﷕﷐﷖﷗﷘]+)', r'\1'+u'</div>﷘\n', osis, flags=re.DOTALL)
        osis = re.sub(r'\\ms5\s+(.+)', lambda m: u'﷙<div type="majorSection" n="5"><title>' + m.group(1) + '</title>', osis)
        osis = re.sub(u'(﷙[^﷕﷐﷖﷗﷘﷙]+)', r'\1'+u'</div>﷙\n', osis, flags=re.DOTALL)

        # \mr_text...
        osis = re.sub(r'\\mr\s+(.+)', u'﷔<title type="scope"><reference>'+r'\1</reference></title>', osis)

        # \s#_text...
        # (each closing pattern names the exact opening tag, so divs that
        # carry additional attributes are not closed a second time here)
        osis = re.sub(r'\\s1?\s+(.+)', lambda m: u'﷚<div type="section"><title>' + m.group(1) + '</title>', osis)
        osis = re.sub(u'(﷚<div type="section">[^﷕﷐﷖﷗﷘﷙﷚]+)', r'\1'+u'</div>﷚\n', osis, flags=re.DOTALL)
        if relaxedConformance:
            osis = re.sub(r'\\ss\s+', r'\\s2 ', osis)
            osis = re.sub(r'\\sss\s+', r'\\s3 ', osis)
        osis = re.sub(r'\\s2\s+(.+)', lambda m: u'﷛<div type="subSection"><title>' + m.group(1) + '</title>', osis)
        osis = re.sub(u'(﷛<div type="subSection">[^﷕﷐﷖﷗﷘﷙﷚﷛]+)', r'\1'+u'</div>﷛\n', osis, flags=re.DOTALL)
        osis = re.sub(r'\\s3\s+(.+)', lambda m: u'﷜<div type="x-subSubSection"><title>' + m.group(1) + '</title>', osis)
        osis = re.sub(u'(﷜<div type="x-subSubSection">[^﷕﷐﷖﷗﷘﷙﷚﷛﷜]+)', r'\1'+u'</div>﷜\n', osis, flags=re.DOTALL)
        osis = re.sub(r'\\s4\s+(.+)', lambda m: u'﷝<div type="x-subSubSubSection"><title>' + m.group(1) + '</title>', osis)
        osis = re.sub(u'(﷝<div type="x-subSubSubSection">[^﷕﷐﷖﷗﷘﷙﷚﷛﷜﷝]+)', r'\1'+u'</div>﷝\n', osis, flags=re.DOTALL)
        osis = re.sub(r'\\s5\s+(.+)', lambda m: u'﷞<div type="x-subSubSubSubSection"><title>' + m.group(1) + '</title>', osis)
        osis = re.sub(u'(﷞<div type="x-subSubSubSubSection">[^﷕﷐﷖﷗﷘﷙﷚﷛﷜﷝﷞]+)', r'\1'+u'</div>﷞\n', osis, flags=re.DOTALL)

        # The templates below are assembled from u'' and r'' pieces; the
        # combined ur'' prefix does not exist in Python 3.
        # \sr_text...
        osis = re.sub(r'\\sr\s+(.+)', u'﷔'+r'<title type="scope"><reference>\1</reference></title>', osis)
        # \r_text...
        osis = re.sub(r'\\r\s+(.+)', u'﷔'+r'<title type="parallel"><reference type="parallel">\1</reference></title>', osis)
        # \rq_text...\rq*
        osis = re.sub(r'\\rq\s+(.+?)\\rq\*', r'<reference type="source">\1</reference>', osis, flags=re.DOTALL)

        # \d_text...
        osis = re.sub(r'\\d\s+(.+)', u'﷔'+r'<title canonical="true" type="psalm">\1</title>', osis)

        # \sp_text...
        osis = re.sub(r'\\sp\s+(.+)', r'<speaker>\1</speaker>', osis)

        # \mt#_text...
        osis = re.sub(r'\\mt(\d?)\s+(.+)', lambda m: '<title ' + ('level="'+m.group(1)+'" ' if m.group(1) else '') + 'type="main">' + m.group(2) + '</title>', osis)
        # \mte#_text...
        osis = re.sub(r'\\mte(\d?)\s+(.+)', lambda m: '<title ' + ('level="'+m.group(1)+'" ' if m.group(1) else '') + 'type="main" subType="x-end">' + m.group(2) + '</title>', osis)

        return osis


    def cvtChaptersAndVerses(osis, relaxedConformance):
        r"""
        Chapters and Verses
        supported: \c, \ca...\ca*, \cl, \cp, \cd, \v, \va...\va*, \vp...\vp*

        Turns \c and \v into milestone pairs (<chapter sID/eID>,
        <verse sID/eID>) whose IDs use the $BOOK$ and $CHAP$ placeholders,
        then applies published (\cp, \vp) and alternate (\ca, \va) numbers
        to those IDs.

        osis -- the text being converted
        relaxedConformance -- unused here; accepted so that this function has
            the same signature as the other cvt* steps

        Returns the converted text.
        """
        # \c_#
        osis = re.sub(r'\\c\s+([^\s]+)\b(.+?)(?=(\\c\s+|</div type="book"))', lambda m: u'﷑<chapter osisID="$BOOK$.' + m.group(1) + r'" sID="$BOOK$.' + m.group(1) + '"/>' + m.group(2) +  u'<chapter eID="$BOOK$.' + m.group(1) + u'"/>﷓\n', osis, flags=re.DOTALL)

        # \cp_#
        # \ca_#\ca*
        def replaceChapterNumber(matchObj):
            """Apply the \\cp and \\ca numbers found inside one chapter."""
            ctext = matchObj.group(1)
            # \cp has no end marker: its value runs to the next whitespace or marker
            cpPattern = r'\\cp\s+(.+?)(?=(\\|\s))'
            cp = re.search(cpPattern, ctext)
            if cp:
                ctext = re.sub(cpPattern, '', ctext)
                cp = cp.group(1)
                # the published number replaces the number in every chapter ID
                # (callable replacement: the value is inserted verbatim)
                ctext = re.sub(r'"\$BOOK\$\.([^"\.]+)"', lambda m: '"$BOOK$.' + cp + '"', ctext)
            caPattern = r'\\ca\s+(.+?)\\ca\*'
            ca = re.search(caPattern, ctext)
            if ca:
                ctext = re.sub(caPattern, '', ctext, flags=re.DOTALL)
                ca = ca.group(1)
                # the alternate number is added to osisID as a second ID
                ctext = re.sub(r'(osisID="\$BOOK\$\.[^"\.]+)"', lambda m: m.group(1) + ' $BOOK$.' + ca + '"', ctext)
            return ctext
        osis = re.sub(r'(<chapter [^<]+sID[^<]+/>.+?<chapter eID[^>]+/>)', replaceChapterNumber, osis, flags=re.DOTALL)

        # \cl_
        osis = re.sub(r'\\cl\s+(.+)', u'﷔<title>'+r'\1</title>', osis)

        # \cd_#   <--This # seems to be an error
        osis = re.sub(r'\\cd\b\s+(.+)', u'﷔<title type="x-description">'+r'\1</title>', osis)

        # \v_#
        osis = re.sub(r'\\v\s+([^\s]+)\b\s*(.+?)(?=(\\v\s+|</div type="book"|<chapter eID))', lambda m: u'﷒<verse osisID="$BOOK$.$CHAP$.' + m.group(1) + '" sID="$BOOK$.$CHAP$.' + m.group(1) + '"/>' + m.group(2) +  '<verse eID="$BOOK$.$CHAP$.' + m.group(1) + u'"/>﷒\n', osis, flags=re.DOTALL)

        # \vp_#\vp*
        # \va_#\va*
        def replaceVerseNumber(matchObj):
            """Apply the \\vp and \\va numbers found inside one verse."""
            vtext = matchObj.group(1)
            vpPattern = r'\\vp\s+(.+?)\\vp\*'
            vp = re.search(vpPattern, vtext)
            if vp:
                vtext = re.sub(vpPattern, '', vtext, flags=re.DOTALL)
                vp = vp.group(1)
                # the published number replaces the number in every verse ID
                vtext = re.sub(r'"\$BOOK\$\.\$CHAP\$\.([^"\.]+)"', lambda m: '"$BOOK$.$CHAP$.' + vp + '"', vtext)
            vaPattern = r'\\va\s+(.+?)\\va\*'
            va = re.search(vaPattern, vtext)
            if va:
                vtext = re.sub(vaPattern, '', vtext, flags=re.DOTALL)
                va = va.group(1)
                # the alternate number is added to osisID as a second ID
                vtext = re.sub(r'(osisID="\$BOOK\$\.\$CHAP\$\.[^"\.]+)"', lambda m: m.group(1) + ' $BOOK$.$CHAP$.' + va + '"', vtext)
            return vtext
        osis = re.sub(r'(<verse [^<]+sID[^<]+/>.+?<verse eID[^>]+/>)', replaceVerseNumber, osis, flags=re.DOTALL)

        return osis


    def cvtParagraphs(osis, relaxedConformance):
        r"""
        Paragraphs
        supported: \p, \m, \pmo, \pm, \pmc, \pmr, \pi#, \mi, \nb, \cls, \li#, \pc, \pr, \ph#, \b

        Converts paragraph markers into <p> elements (with a type attribute
        for everything other than plain \p), \cls into <closer>, \li#/\ph#
        into list items wrapped in <list>, and \b into a line break.

        osis -- the text being converted
        relaxedConformance -- when true, the deprecated \phi, \ps, \psi and
            \p# markers are converted as well

        Returns the converted text.
        """
        # what ends a paragraph (shared by every paragraph kind below)
        paraEnd = r'(?=(\\(i?m|i?p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))'

        # \p(_text...)
        osis = re.sub(r'\\p\s+(.*?)' + paraEnd, lambda m: u'﷓<p>\n' + m.group(1) + u'﷓</p>\n', osis, flags=re.DOTALL)

        # \pc(_text...)
        # \pr(_text...)
        # \m(_text...)
        # \pmo(_text...)
        # \pm(_text...)
        # \pmc(_text...)
        # \pmr_text...          # deprecated: map to same as \pr
        # \pi#(_Sample text...)
        # \mi(_text...)
        # \nb
        # \phi # deprecated
        # \ps # deprecated
        # \psi # deprecated
        # \p# # deprecated
        pType = {'pc':'x-center', 'pr':'x-right', 'm':'x-noindent', 'pmo':'x-embedded-opening', 'pm':'x-embedded', 'pmc':'x-embedded-closing', 'pmr':'x-right', 'pi':'x-indented-1', 'pi1':'x-indented-1', 'pi2':'x-indented-2', 'pi3':'x-indented-3', 'pi4':'x-indented-4', 'pi5':'x-indented-5', 'mi':'x-noindent-indented', 'nb':'x-nobreak', 'phi':'x-indented-hanging', 'ps':'x-nobreakNext', 'psi':'x-nobreakNext-indented', 'p1':'x-level-1', 'p2':'x-level-2', 'p3':'x-level-3', 'p4':'x-level-4', 'p5':'x-level-5'}
        paragraphregex = 'pc|pr|m|pmo|pm|pmc|pmr|pi|pi1|pi2|pi3|pi4|pi5|mi|nb'
        if relaxedConformance:
            paragraphregex += '|phi|ps|psi|p1|p2|p3|p4|p5'
        osis = re.sub(r'\\('+paragraphregex+r')\s+(.*?)' + paraEnd, lambda m: u'﷓<p type="' + pType[m.group(1)]  + '">\n' + m.group(2) + u'﷓</p>\n', osis, flags=re.DOTALL)

        # \cls_text...
        # (this rule used to look for \m, which the step above has already
        # consumed, so \cls itself was never converted)
        osis = re.sub(r'\\cls\s+(.+?)' + paraEnd, lambda m: u'﷓<closer>' + m.group(1) + u'﷓</closer>\n', osis, flags=re.DOTALL)

        # \ph#(_text...)
        # \li#(_text...)
        itemEnd = r'(?=(['+u'﷐﷑﷓﷔'+r']|\\li[\d\s]|<lb\b|<title\b|<item\b))'
        osis = re.sub(r'\\ph\b\s*', r'\\li ', osis)
        osis = re.sub(r'\\ph(\d)\b\s*', r'\\li\1 ', osis)
        osis = re.sub(r'\\li\b\s*(.*?)' + itemEnd, r'<item type="x-indent-1">\1</item>', osis, flags=re.DOTALL)
        osis = re.sub(r'\\li(\d)\b\s*(.*?)' + itemEnd, r'<item type="x-indent-\1">\2</item>', osis, flags=re.DOTALL)
        osis = osis.replace('\n</item>', '</item>\n')
        # The intro-list / intro-outline markers are excluded so that items
        # carrying them are not wrapped again. The template is assembled from
        # u'' and r'' pieces because ur'' does not exist in Python 3.
        osis = re.sub(u'(<item [^﷐﷑﷓﷔﷠﷡]+</item>)', u'﷓<list>' + r'\1' + u'</list>﷓', osis, flags=re.DOTALL)

        # \b
        osis = re.sub(r'\\b\b\s?', '<lb type="x-p"/>', osis)

        return osis


    def cvtPoetry(osis, relaxedConformance):
        r"""
        Poetry
        supported: \q#, \qr, \qc, \qs...\qs*, \qa, \qac...\qac*, \qm#, \b

        Converts poetic line markers into <l> elements, wraps runs of lines
        in <lg>, and splits a line group wherever a blank-line break
        (<lb type="x-p"/>) occurs inside it.

        osis -- the text being converted
        relaxedConformance -- unused here; accepted so that this function has
            the same signature as the other cvt* steps

        Returns the converted text.
        """
        # what ends a poetic line (shared by every line kind below)
        lineEnd = r'(?=(['+u'﷐﷑﷓﷔﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞'+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b|<list\b|</?div\b))'

        # \qs_(Selah)\qs*
        osis = re.sub(r'\\qs\b\s(.+?)\\qs\*', r'<l type="selah">\1</l>', osis, flags=re.DOTALL)

        # \q#(_text...)
        osis = re.sub(r'\\q\b\s*(.*?)' + lineEnd, r'<l level="1">\1</l>', osis, flags=re.DOTALL)
        osis = re.sub(r'\\q(\d)\b\s*(.*?)' + lineEnd, r'<l level="\1">\2</l>', osis, flags=re.DOTALL)

        # \qr_text...
        # \qc_text...
        # \qm#(_text...)
        # (the qm values close the type attribute themselves so that a level
        # attribute can follow it)
        qType = {'qr':'x-right', 'qc':'x-center', 'qm':'x-embedded" level="1', 'qm1':'x-embedded" level="1', 'qm2':'x-embedded" level="2', 'qm3':'x-embedded" level="3', 'qm4':'x-embedded" level="4', 'qm5':'x-embedded" level="5'}
        # the digit is optional: a bare \qm has its own entry in qType
        osis = re.sub(r'\\(qr|qc|qm\d?)\b\s*(.*?)' + lineEnd, lambda m: '<l type="' + qType[m.group(1)] + '">' + m.group(2) + '</l>', osis, flags=re.DOTALL)

        osis = osis.replace('\n</l>', '</l>\n')
        osis = re.sub(u'(<l [^﷐﷑﷓﷔﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞]+</l>)', r'<lg>\1</lg>', osis, flags=re.DOTALL)

        # \b
        osis = re.sub('(<lg>.+?</lg>)', lambda m: m.group(1).replace('<lb type="x-p"/>', '</lg><lg>'), osis, flags=re.DOTALL) # re-handle \b that occurs within <lg>

        # \qa_text...
        osis = re.sub(r'\\qa\s+(.+)', u'﷔<title type="acrostic">'+r'\1</title>', osis)

        # \qac_text...\qac*
        osis = re.sub(r'\\qac\s+(.+?)\\qac\*', r'<hi type="acrostic">\1</hi>', osis, flags=re.DOTALL)

        return osis


    def cvtTables(osis, relaxedConformance):
        r"""
        Tables
        supported: \tr, \th#, \thr#, \tc#, \tcr#

        Converts each \tr into a <row> and each cell marker inside it into a
        <cell>; heading cells get role="label" and right-aligned cells get
        type="x-right". The column number on a cell marker is dropped.

        osis -- the text being converted
        relaxedConformance -- unused here; accepted so that this function has
            the same signature as the other cvt* steps

        Returns the converted text.
        """
        # \tr_
        osis = re.sub(r'\\tr\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\tr\s|<lb\b|<title\b))', r'<row>\1</row>', osis, flags=re.DOTALL)

        # \th#_text...
        # \thr#_text...
        # \tc#_text...
        # \tcr#_text...
        # attribute text per cell marker; the tcr entry was missing its
        # closing quote, which produced malformed XML
        tType = {'th':' role="label"', 'thr':' role="label" type="x-right"', 'tc':'', 'tcr':' type="x-right"'}
        osis = re.sub(r'\\(thr?|tcr?)\d*\b\s*(.*?)(?=(\\t[hc]|</row))', lambda m: '<cell' + tType[m.group(1)] + '>' + m.group(2) + '</cell>', osis, flags=re.DOTALL)

        return osis


    def processNote(note):
        r"""Convert the USFM markers inside one footnote into OSIS markup.

        note -- the text of a single <note>...</note> element; it is expected
            to carry the notes marker character just before </note>, which
            ends the last field

        Reads the module-level relaxedConformance flag: when it is true,
        stray end markers such as \fq* or \ft* are removed.

        Returns the note on one line, with the marker characters removed.
        """
        def wrapField(text, marker, element):
            # A field runs from its marker to the next \f... marker or to
            # the notes marker character; the replacement starts with a
            # marker character so that it can end the field before it.
            pattern = r'\\' + marker + r'\b\s(.+?)(?=(\\f|' + u'﷟))'
            return re.sub(pattern, u'﷟' + element, text)

        note = note.replace('\n', ' ')

        # \fdc_refs...\fdc*
        note = re.sub(r'\\fdc\b\s(.+?)\\fdc\b\*', r'<seg editions="dc">\1</seg>', note)

        # \fq_ then \fqa_
        note = wrapField(note, 'fq', r'<catchWord>\1</catchWord>')
        note = wrapField(note, 'fqa', r'<rdg type="alternate">\1</rdg>')

        # \ft_ carries plain note text: drop the marker itself
        note = re.sub(r'\\ft\s', '', note)

        # \fr_##SEP##, \fk_, \fl_ (in this order)
        for marker, element in (
                ('fr', r'<reference type="annotateRef">\1</reference>'),
                ('fk', r'<catchWord>\1</catchWord>'),
                ('fl', r'<label>\1</label>')):
            note = wrapField(note, marker, element)

        # \fp_ : every additional paragraph becomes a <p>, after which the
        # text in front of the first <p> is wrapped in a <p> as well
        note = re.sub(r'\\fp\b\s(.+?)(?=(\\fp|$))', r'<p>\1</p>', note)
        note = re.sub(r'(<note\b[^>]*?>)(.*?)<p>', r'\1<p>\2</p><p>', note)

        # \fv_
        note = wrapField(note, 'fv', r'<hi type="super">\1</hi>')

        if relaxedConformance:
            # remove end markers that these fields should not have
            for marker in ('fq', 'fqa', 'ft', 'fr', 'fk', 'fl', 'fp', 'fv'):
                note = note.replace('\\' + marker + '*', '')

        return note.replace(u'﷟', '')


    def cvtFootnotes(osis, relaxedConformance):
        r"""
        Footnotes
        supported:\f...\f*, \fe...\fe*, \fr, \fk, \fq, \fqa, \fl, \fp, \fv, \ft, \fdc...\fdc*, \fm...\fm*

        osis               -- text being converted
        relaxedConformance -- accepted for a uniform converter signature;
                              not used directly in this function
        Returns the text with footnotes/endnotes wrapped in <note> elements,
        their contents converted by processNote, and \fm...\fm* converted.
        """
        def noteStart(caller, placement):
            # Caller '-' means "no caller" (n=""), '+' means auto-generated
            # (no n attribute); anything else is used literally.  The \f
            # pattern makes the caller optional, so it may be None: treat that
            # like '+' instead of failing on str + None.
            if caller == u'-':
                nAttr = ' n=""'
            elif caller is None or caller == '+':
                nAttr = ''
            else:
                nAttr = ' n="' + caller + '"'
            return '<note' + nAttr + ' placement="' + placement + '">'

        # \f_+_...\f*
        osis = re.sub(r'\\f\s+([^\s\\]+)?\s*(.+?)\s*\\f\*', lambda m: noteStart(m.group(1), 'foot') + m.group(2) + u'﷟</note>', osis, flags=re.DOTALL)

        # \fe_+_...\fe*
        osis = re.sub(r'\\fe\s+([^\s\\]+?)\s*(.+?)\s*\\fe\*', lambda m: noteStart(m.group(1), 'end') + m.group(2) + u'﷟</note>', osis, flags=re.DOTALL)

        # convert the markers inside every note element
        osis = re.sub(r'(<note\b[^>]*?>.*?</note>)', lambda m: processNote(m.group(1)), osis, flags=re.DOTALL)

        # \fm_...\fm*
        osis = re.sub(r'\\fm\b\s(.+?)\\fm\*', r'<hi type="super">\1</hi>', osis)

        return osis


    def processXref(note):
        r"""
        Convert the USFM cross-reference markers inside one <note> element.

        note -- text of a single cross-reference note element

        Newlines become spaces, the supported \x* markers are replaced by OSIS
        elements, and all boundary marker characters are stripped from the
        returned text.  relaxedConformance is read from the enclosing scope.
        """
        sep = u'﷟'
        note = note.replace('\n', ' ')

        # \xot_refs...\xot*, \xnt_refs...\xnt*, \xdc_refs...\xdc*
        for tag, edition in (('xot', 'ot'), ('xnt', 'nt'), ('xdc', 'dc')):
            paired = r'\\' + tag + r'\b\s(.+?)\\' + tag + r'\b\*'
            note = re.sub(paired, sep + r'<seg editions="' + edition + r'">\1</seg>', note)

        # Open-ended markers end at the next \x... marker or boundary marker.
        # \xt isn't guaranteed to be *the* reference, but it's a good guess.
        untilNext = r'(?=(\\x|' + sep + r'))'
        openEnded = (
            ('xq', r'<catchWord>\1</catchWord>'),
            ('xo', r'<reference type="annotateRef">\1</reference>'),
            ('xk', r'<catchWord>\1</catchWord>'),
            ('xt', r'<reference>\1</reference>'),
        )
        for tag, element in openEnded:
            note = re.sub(r'\\' + tag + r'\b\s(.+?)' + untilNext, sep + element, note)

        if relaxedConformance:
            # TODO: move this to a concordance/index-specific section?
            # \xtSee...\xtSee*: Concordance and Names Index markup for an alternate entry target reference.
            # \xtSeeAlso...\xtSeeAlso*: Concordance and Names Index markup for an additional entry target reference.
            for tag, label in (('xtSee', 'See: '), ('xtSeeAlso', 'See also: ')):
                paired = r'\\' + tag + r'\b\s(.+?)\\' + tag + r'\b\*'
                note = re.sub(paired, sep + r'<reference osisRef="\1">' + label + r'\1</reference>', note)

            # explicit closers of normally open-ended markers carry no content
            for tag in ('xq', 'xt', 'xo', 'xk'):
                note = note.replace('\\' + tag + '*', '')

        return note.replace(sep, '')


    def cvtCrossReferences(osis, relaxedConformance):
        r"""
        Cross References
        supported: \x...\x*, \xo, \xk, \xq, \xt, \xot...\xot*, \xnt...\xnt*, \xdc...\xdc*

        Wraps each \x caller ...\x* span in a crossReference <note> and then
        hands every crossReference note to processXref.
        """
        def wrapXref(match):
            # '-' => empty n attribute, '+' => no n attribute, else literal caller
            caller = match.group(1)
            if caller == u'-':
                nAttr = ' n=""'
            elif caller == '+':
                nAttr = ''
            else:
                nAttr = ' n="' + caller + '"'
            return '<note' + nAttr + ' type="crossReference">' + match.group(2) + u'﷟</note>'

        def convertContents(match):
            return processXref(match.group(1))

        # \x_+_...\x*
        osis = re.sub(r'\\x\s+([^\s]+?)\s+(.+?)\s*\\x\*', wrapXref, osis, flags=re.DOTALL)

        osis = re.sub(r'(<note [^>]*?type="crossReference"[^>]*>.*?</note>)', convertContents, osis, flags=re.DOTALL)

        return osis


    # Special Text and Character Styles
    def cvtSpecialText(osis, relaxedConformance):
        r"""
        Special Text
        supported: \add...\add*, \bk...\bk*, \dc...\dc*, \k...\k*, \lit, \nd...\nd*, \ord...\ord*, \pn...\pn*, \qt...\qt*, \sig...\sig*, \sls...\sls*, \tl...\tl*, \wj...\wj*

        osis               -- text being converted
        relaxedConformance -- when true, also convert the non-standard
                              \addpn...\addpn* and \k1...\k5 markers
        Returns the converted text.
        """
        # Simple paired markers: \TAG_...\TAG* -> one OSIS element each.
        pairedMarkers = (
            ('add', r'<transChange type="added">\1</transChange>'),
            ('wj', r'<q who="Jesus" marker="">\1</q>'),
            ('nd', r'<divineName>\1</divineName>'),
            ('pn', r'<name>\1</name>'),
            ('qt', r'<seg type="otPassage">\1</seg>'),  # TODO:should this be <q>?
            ('sig', r'<signed>\1</signed>'),
            ('ord', r'<hi type="super">\1</hi>'),  # semantic incongruity (ordinal -> superscript)
            ('tl', r'<foreign>\1</foreign>'),
            ('bk', r'<name type="x-workTitle">\1</name>'),
            ('k', r'<seg type="keyword">\1</seg>'),
        )
        for tag, element in pairedMarkers:
            osis = re.sub(r'\\' + tag + r'\s+(.+?)\\' + tag + r'\*', element, osis, flags=re.DOTALL)

        # \lit
        osis = re.sub(r'\\lit\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'﷓<p type="x-liturgical">\n' + m.group(1) + u'﷓</p>\n', osis, flags=re.DOTALL)

        # \dc_...\dc*  # TODO: Find an example---should this really be transChange?
        osis = re.sub(r'\\dc\b\s*(.+?)\\dc\*', r'<transChange type="added" editions="dc">\1</transChange>', osis, flags=re.DOTALL)

        # \sls_...\sls*
        # The replacement must reference group 1 as \1; a forward slash here
        # would discard the marked text and emit a literal "/1".
        osis = re.sub(r'\\sls\b\s*(.+?)\\sls\*', r'<foreign>\1</foreign>', osis, flags=re.DOTALL)  # find a better mapping than <foreign>?

        if relaxedConformance:
            # \addpn...\addpn*
            osis = re.sub(r'\\addpn\s+(.+?)\\addpn\*', r'<hi type="x-dotUnderline">\1</hi>', osis, flags=re.DOTALL)
            # \k# # TODO: unsure of this tag's purpose
            for level in ('1', '2', '3', '4', '5'):
                osis = re.sub(r'\\k' + level + r'\s+(.+?)\\k' + level + r'\*', r'<seg type="keyword" n="' + level + r'">\1</seg>', osis, flags=re.DOTALL)

        return osis


    def cvtCharacterStyling(osis, relaxedConformance):
        r"""
        Character Styling
        supported: \em...\em*, \bd...\bd*, \it...\it*, \bdit...\bdit*, \no...\no*, \sc...\sc*

        Each paired marker becomes one <hi> element (two nested ones for
        \bdit).  relaxedConformance is accepted but not consulted here.
        """
        # (USFM tag, OSIS replacement), applied in this order
        styles = (
            ('em', r'<hi type="emphasis">\1</hi>'),
            ('bd', r'<hi type="bold">\1</hi>'),
            ('it', r'<hi type="italic">\1</hi>'),
            ('bdit', r'<hi type="bold"><hi type="italic">\1</hi></hi>'),
            ('no', r'<hi type="normal">\1</hi>'),
            ('sc', r'<hi type="small-caps">\1</hi>'),
        )
        for tag, element in styles:
            # \TAG_...\TAG*
            pattern = r'\\' + tag + r'\s+(.+?)\\' + tag + r'\*'
            osis = re.sub(pattern, element, osis, flags=re.DOTALL)

        return osis


    def cvtSpacingAndBreaks(osis, relaxedConformance):
        r"""
        Spacing and Breaks
        supported: ~, //, \pb

        osis               -- text being converted
        relaxedConformance -- accepted for a uniform converter signature; unused
        Returns the text with ~ replaced by a no-break space, // removed and
        \pb turned into a page-break milestone.
        """
        # ~
        # Use a unicode literal with a full four-digit escape: in a plain
        # (byte) string '\uA0' is not an escape at all under Python 2 and is a
        # syntax error under Python 3.
        osis = osis.replace('~', u'\u00A0')

        # //
        osis = osis.replace('//', '')

        # \pb
        osis = re.sub(r'\\pb\s*', '<milestone type="pb"/>\n', osis, flags=re.DOTALL)

        return osis


    def cvtSpecialFeatures(osis, relaxedConformance):
        r"""
        Special Features
        supported: \fig...\fig*, \ndx...\ndx*, \pro...\pro*, \w...\w*, \wg...\wg*, \wh...\wh*

        With relaxedConformance set, \wr...\wr* is converted as well.
        """
        # \fig DESC|FILE|SIZE|LOC|COPY|CAP|REF\fig*
        def makeFigure(matchObject):
            desc, src, size, loc, rights, caption, ref = matchObject.groups()

            # only non-empty fields become attributes
            attributes = ''
            for attrName, attrValue in (('src', src), ('size', size), ('rights', rights)):
                if attrValue:
                    attributes += ' ' + attrName + '="' + attrValue + '"'
            # TODO: implement parsing in osisParse(Bible reference string)
            # if ref:
            #     add annotateRef="<osisParse(ref)>" to the attributes

            pieces = ['<figure' + attributes + '>\n']
            if caption:
                pieces.append('<caption>' + caption + '</caption>\n')
            if ref:
                pieces.append('<reference type="annotateRef">' + ref + '</reference>\n')
            # DESC and LOC have no OSIS home here; they are kept as XML comments
            if desc:
                pieces.append('<!-- fig DESC - ' + desc + ' -->\n')
            if loc:
                pieces.append('<!-- fig LOC - ' + loc + ' -->\n')
            pieces.append('</figure>')
            return ''.join(pieces)

        def indexTerm(text, tag, indexName):
            # \TAG_term\TAG* -> term followed by an <index/> entry for it
            pattern = r'\\' + tag + r'\s+(.+?)(\s*)\\' + tag + r'\*'
            element = r'\1<index index="' + indexName + r'" level1="\1"/>\2'
            return re.sub(pattern, element, text, flags=re.DOTALL)

        osis = re.sub(r'\\fig\b\s+([^\|]*)\s*\|([^\|]*)\s*\|([^\|]*)\s*\|([^\|]*)\s*\|([^\|]*)\s*\|([^\|]*)\s*\|([^\\]*)\s*\\fig\*', makeFigure, osis)

        # \ndx_...\ndx* # TODO tag with x-glossary instead of <index/>? Is <index/> containerable?
        osis = indexTerm(osis, 'ndx', 'Index')

        # \pro_...\pro*  (pronunciation is attached to the preceding word)
        osis = re.sub(r'([^\s]+)(\s*)\\pro\s+(.+?)(\s*)\\pro\*', r'<w xlit="\3">\1</w>\2\4', osis, flags=re.DOTALL)

        # \w_...\w*, \wg_...\wg*, \wh_...\wh*
        for tag, indexName in (('w', 'Glossary'), ('wg', 'Greek'), ('wh', 'Hebrew')):
            osis = indexTerm(osis, tag, indexName)

        if relaxedConformance:
            # \wr...\wr*
            osis = indexTerm(osis, 'wr', 'Reference')

        return osis


    def cvtPeripherals(osis, relaxedConformance):
        r"""
        Peripherals
        supported: \periph

        osis               -- text being converted
        relaxedConformance -- accepted for a uniform converter signature; unused
        Returns the text with each \periph section wrapped in a <div> whose
        type is looked up in the peripherals / introPeripherals tables
        (x-unknown when the name is in neither).
        """
        # \periph
        def tagPeriph(matchObject):
            # A match object cannot be unpacked directly, and the pattern has
            # a third (lookahead) group, so pick groups 1 and 2 explicitly.
            periphType, contents = matchObject.group(1, 2)
            # The name is captured up to the end of its line; drop trailing
            # whitespace (e.g. a stray space or '\r') so table lookups succeed.
            periphType = periphType.strip()
            periph = '<div type="'
            if periphType in peripherals:
                periph += peripherals[periphType]
            elif periphType in introPeripherals:
                periph += 'introduction" subType="x-' + introPeripherals[periphType]
            else:
                periph += 'x-unknown'
            periph += '">\n' +  contents + '</div>\n'
            return periph

        osis = re.sub(r'\\periph\s+([^'+'\n'+r']+)\s*'+'\n'+r'(.+?)(?=(</div type="book">|\\periph\s+))', tagPeriph, osis, flags=re.DOTALL)

        return osis


    def cvtStudyBibleContent(osis, relaxedConformance):
        r"""
        Study Bible Content
        supported: \ef...\ef*, \ex...\ex*, \esb...\esbe, \cat

        osis               -- text being converted
        relaxedConformance -- accepted for a uniform converter signature;
                              not used directly in this function
        Returns the converted text.
        """
        def nAttr(caller):
            # '-' => empty n attribute, '+' => no n attribute, else literal caller
            if caller == u'-':
                return ' n=""'
            if caller == '+':
                return ''
            return ' n="' + caller + '"'

        # \ef...\ef*
        osis = re.sub(r'\\ef\s+([^\s\\]+?)\s*(.+?)\s*\\ef\*', lambda m: '<note' + nAttr(m.group(1)) + ' type="study">' + m.group(2) + u'﷟</note>', osis, flags=re.DOTALL)
        osis = re.sub(r'(<note\b[^>]*?>.*?</note>)', lambda m: processNote(m.group(1)), osis, flags=re.DOTALL)

        # \ex...\ex*
        osis = re.sub(r'\\ex\s+([^\s]+?)\s+(.+?)\s*\\ex\*', lambda m: '<note' + nAttr(m.group(1)) + ' type="crossReference" subType="x-study"><reference>' + m.group(2) + u'</reference>﷟</note>', osis, flags=re.DOTALL)
        osis = re.sub(r'(<note [^>]*?type="crossReference"[^>]*>.*?</note>)', lambda m: processXref(m.group(1)), osis, flags=re.DOTALL)

        # \esb...\esbe  # TODO: this likely needs to go much earlier in the process
        # Built from u'' and r'' pieces (as elsewhere in this file) because the
        # combined ur'' prefix is a syntax error under Python 3.
        osis = re.sub(r'\\esb\b\s*(.+?)\\esbe\b\s*', u'﷕'+r'<div type="x-sidebar">\1</div>'+u'﷕'+'\n', osis, flags=re.DOTALL)

        # \cat_<TAG>\cat*
        osis = re.sub(r'\\cat\b\s+(.+?)\\cat\*', r'<index index="category" level1="\1"/>', osis)

        return osis


    def cvtPrivateUseExtensions(osis, relaxedConformance):
        r"""
        \z namespace
        supported: \z<Extension>
        We can't really know what these mean, but will preserve them as
        <seg> elements (paired markers) or <milestone/> elements (lone markers).

        osis               -- text being converted
        relaxedConformance -- accepted for a uniform converter signature; unused
        Returns the converted text.
        """
        # publishing assistant markers
        # \zpa-xb...\zpa-xb* : \periph Book
        # \zpa-xc...\zpa-xc* : \periph Chapter
        # \zpa-xv...\zpa-xv* : \periph Verse
        # \zpa-xd...\zpa-xd* : \periph Description
        # TODO: Decide how these should actually be encoded. In lieu of that,
        # these can all be handled by the default \z Namespace handlers:

        # \z{X}...\z{X}*
        # The backslash of the marker has to be escaped in the pattern: a bare
        # \z is not "backslash + z" to the regex engine (depending on the
        # Python version it matches a plain 'z', is rejected, or is an anchor).
        osis = re.sub(r'\\z([^\s]+)\s(.+?)\\z\1\*', r'<seg type="x-\1">\2</seg>', osis, flags=re.DOTALL)

        # \z{X}
        osis = re.sub(r'\\z([^\s]+)', r'<milestone type="x-usfm-z-\1"/>', osis)

        return osis


    def processOsisIDs(osis):
        """
        Expand verse ranges/series in osisIDs and fill in the $BOOK$ and
        $CHAP$ placeholders.

        The text is cut at the book marker character and, within a book, at
        the chapter marker character; each piece takes its value from the
        first book div / chapter osisID found inside it.  Pieces without one
        are passed through with their placeholders untouched.
        """
        # TODO: add support for subverses, including in ranges/series, e.g. Matt.1.1!b-Matt.2.5,Matt.2.7!a
        placeholder = '$BOOK$.$CHAP$.'

        # expand verse ranges, series
        def expandRange(vRange):
            numbers = re.findall(r'\d+', vRange)
            first = int(numbers[0])
            # a descending range collapses to its first verse
            last = max(first, int(numbers[1]))
            return ' '.join([placeholder + str(n) for n in range(first, last + 1)])

        def expandSeries(vSeries):
            return ' '.join([placeholder + str(n) for n in re.findall(r'\d+', vSeries)])

        osis = re.sub(r'\$BOOK\$\.\$CHAP\$\.(\d+-\d+)"', lambda m: expandRange(m.group(1))+'"', osis)
        osis = re.sub(r'\$BOOK\$\.\$CHAP\$\.(\d+(,\d+)+)"', lambda m: expandSeries(m.group(1))+'"', osis)

        # fill in book & chapter values
        def fillChapter(chapChunk):
            found = re.search(r'<chapter osisID="[^\."]+\.([^"]+)', chapChunk)
            if not found:
                return chapChunk
            return chapChunk.replace('$CHAP$', found.group(1))

        def fillBook(bookChunk):
            found = re.search(r'<div type="book" osisID="([^"]+?)"', bookChunk)
            if not found:
                return bookChunk
            bookChunk = bookChunk.replace('$BOOK$', found.group(1))
            return ''.join([fillChapter(cc) for cc in bookChunk.split(u'﷑')])

        return ''.join([fillBook(bc) for bc in osis.split(u'﷐')])


    def osisReorderAndCleanup(osis):
        """
        Final tidy-up pass over the converted text.

        Re-orders a few adjacent milestones/tags, strips attributes from end
        tags, removes every marker character listed below, and normalizes
        whitespace around block-level end tags.  The substitutions are applied
        strictly in the order written.  Returns the cleaned text.
        """
        # assorted re-orderings
        # a verse end milestone that follows a chapter end milestone line is moved in front of it
        osis = re.sub(u'(﷓<chapter eID=.+?\n)(<verse eID=.+?>﷒)\n?', r'\2'+'\n'+r'\1', osis)
        # a marked </div> followed by a chapter end milestone: the chapter end goes first
        osis = re.sub(u'([﷕﷖﷗﷘﷙]</div>)([^﷕﷖﷗﷘﷙]*<chapter eID.+?>)', r'\2\1', osis)
        # a verse end milestone right after a paragraph break is moved before the break
        osis = re.sub(u'(﷓</p>\n?﷓<p>)\n?(<verse eID=.+?>﷒)\n?', r'\2'+'\n'+r'\1'+'\n', osis)
        # pull a verse end milestone up onto the preceding line
        osis = re.sub(u'\n(<verse eID=.+?>﷒)', r'\1'+'\n', osis)
        # an <l...> tag directly before a verse end + verse start pair is moved after the pair
        osis = re.sub(u'\n*(<l.+?>)(<verse eID=.+?>[﷒\n]*<verse osisID=.+?>)', r'\2\1', osis)

        # delete attributes from end tags (since they are invalid)
        osis = re.sub(r'(</[^\s>]+) [^>]*>', r'\1>', osis)
        osis = osis.replace('<lb type="x-p"/>', '<lb/>')
        # delete Unicode tags
        # (the marker characters used in the patterns above; presumably inserted
        # by the earlier conversion steps -- none of them may reach the output)
        for c in u'﷐﷑﷒﷓﷔﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞﷟﷠﷡﷢﷣﷤﷥﷦﷧﷨﷩﷪﷫﷬﷭﷮﷯':
            osis = osis.replace(c, '')

        for endBlock in ['p', 'div', 'note', 'l', 'lg', 'chapter', 'verse', 'head', 'title', 'item', 'list']:
            # no whitespace before a block end tag, one newline after it
            osis = re.sub('\s+</'+endBlock+'>', '</'+endBlock+r'>\n', osis)
            # same treatment for the milestone (eID) form of the element
            osis = re.sub('\s+<'+endBlock+'( eID=[^/>]+/>)', '<'+endBlock+r'\1'+'\n', osis)
        # move spaces that precede a run of end tags to after the run
        osis = re.sub(' +((</[^>]+>)+) *', r'\1 ', osis)

        # strip extra spaces & newlines
        osis = re.sub('  +', ' ', osis)
        osis = re.sub(' ?\n\n+', '\n', osis)
        return osis


    ### Processing starts here
    # Read the input file.  An explicit encoding wins; otherwise read as UTF-8
    # first and re-read if the file declares a different encoding via \ide.
    if encoding:
        osis = codecs.open(sFile, 'r', encoding).read().strip() + '\n'
    else:
        encoding = 'utf-8'
        osis = codecs.open(sFile, 'r', encoding).read().strip() + '\n'
        # \ide_<ENCODING>
        # NOTE(review): `encoding` is rebound here, first to a match object and
        # then to the declared name; if it is the shared global, the value
        # carries over to the next file processed -- confirm this is intended.
        encoding = re.search(r'\\ide\s+(.+)'+'\n', osis)
        if encoding:
            encoding = encoding.group(1).lower().strip()
            if encoding != 'utf-8':
                # `aliases` is defined outside this excerpt; presumably the
                # table of encoding names known to Python -- TODO confirm
                if encoding in aliases:
                    osis = codecs.open(sFile, 'r', encoding).read().strip() + '\n'
                else:
                    print('WARNING: Encoding "' + encoding + '" unknown, processing ' + sFile + ' as UTF-8.')
                    encoding = 'utf-8'


    # call individual conversion processors in series
    # (each step takes the text produced by the previous one, so the order matters)
    osis = cvtPreprocess(osis, relaxedConformance)
    osis = cvtRelaxedConformanceRemaps(osis, relaxedConformance)
    osis = cvtIdentification(osis, relaxedConformance)
    osis = cvtIntroductions(osis, relaxedConformance)
    osis = cvtTitles(osis, relaxedConformance)
    osis = cvtChaptersAndVerses(osis, relaxedConformance)
    osis = cvtParagraphs(osis, relaxedConformance)
    osis = cvtPoetry(osis, relaxedConformance)
    osis = cvtTables(osis, relaxedConformance)
    osis = cvtFootnotes(osis, relaxedConformance)
    osis = cvtCrossReferences(osis, relaxedConformance)
    osis = cvtSpecialText(osis, relaxedConformance)
    osis = cvtCharacterStyling(osis, relaxedConformance)
    osis = cvtSpacingAndBreaks(osis, relaxedConformance)
    osis = cvtSpecialFeatures(osis, relaxedConformance)
    osis = cvtPeripherals(osis, relaxedConformance)
    osis = cvtStudyBibleContent(osis, relaxedConformance)
    osis = cvtPrivateUseExtensions(osis, relaxedConformance)

    # placeholders are filled and marker characters removed only after all
    # converters have run
    osis = processOsisIDs(osis)
    osis = osisReorderAndCleanup(osis)

    # change type on special books
    # (the book div of each listed book is replaced by a div typed with the
    # lower-cased book name)
    for sb in specialBooks:
        osis = osis.replace('<div type="book" osisID="' + sb  + '">', '<div type="' + sb.lower() + '">')

    if DEBUG:
        # anything still looking like a backslash marker was not converted
        localUnhandledTags = set(re.findall(r'(\\[^\s\*]+?\b\*?)', osis))
        if localUnhandledTags:
            print('Unhandled USFM tags in ' + sFile + ': ' + ', '.join(localUnhandledTags) + ' (' + str(len(localUnhandledTags)) + ' total)')

    return osis


def writeOSISHeader(oFile, workID, lang='en'):
    """
    Write the XML declaration, the opening <osis>/<osisText> tags and the
    <header> element to oFile in a single write.

    oFile  -- object with a write() method
    workID -- value used for osisIDWork and osisWork
    lang   -- value of xml:lang on <osisText>
    """
    namespace = 'http://www.bibletechnologies.net/2003/OSIS/namespace'
    schema = 'http://www.bibletechnologies.net/osisCore.' + OSISversion + '.xsd'
    headerLines = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<osis xmlns="' + namespace + '" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="' + namespace + ' ' + schema + '">',
        '<osisText osisRefWork="Bible" xml:lang="' + lang + '" osisIDWork="' + workID + '">',
        '<header>',
        '<work osisWork="' + workID + '"/>',
        '</header>',
        '',  # yields the newline after </header>
    ]
    oFile.write('\n'.join(headerLines))

def writeOSISFooter(oFile):
    """Write the closing </osisText> and </osis> tags to oFile in one write."""
    closingTags = ('</osisText>', '</osis>')
    oFile.write(''.join([tag + '\n' for tag in closingTags]))

def verbosePrint(text):
    """Print text only when the module-level `verbose` flag is set."""
    if verbose:
        # Function-call form, like every other print in this file; with a
        # single argument it behaves the same under Python 2 and Python 3.
        print(text)

def printUsage():
    """
    Print the command-line help text.

    The version banner and option summary are always printed; the list of
    supported encodings is printed through verbosePrint, i.e. only in verbose
    mode.
    """
    print('usfm2osis.py -- USFM ' + USFMversion + ' to OSIS ' + OSISversion + ' converter version ' + scriptVersion)
    print('                Revision: ' + rev + ' (' + date + ')')
    print('')
    print('Usage: usfm2osis.py <osisWork> [OPTION] ...  <USFM filename|wildcard> ...')
    print('')
    print('  -d               debug mode (single-threaded, verbose output)')
    print('  -e ENCODING      input encoding override (default is to read the USFM file\'s')
    print('                     \\ide value or assume UTF-8 encoding in its absence)')
    print('  -h, --help       print this usage information')
    print('  -o FILENAME      output filename (default is: <osisWork>.osis.xml)')
    print('  -r               enable relaxed markup processing (for non-standard USFM)')
    print('  -s mode          set book sorting mode: natural (default), alpha, canonical, none')
    print('  -v               verbose feedback')
    print('')
    print('As an example, if you want to generate the osisWork <Bible.KJV> and your USFM')
    print('  are located in the ./KJV folder, enter:')
    print('    python usfm2osis.py Bible.KJV ./KJV/*.usfm')
    verbosePrint('')
    verbosePrint('Supported encodings: ' + ', '.join(aliases))


class Worker(multiprocessing.Process):
    """
    Process that takes file names off a work queue, converts each one with
    convertToOSIS and puts (file name, OSIS text) pairs on a result queue.
    """

    def __init__(self, work_queue, result_queue):
        super(Worker, self).__init__()

        # job management stuff
        self.work_queue = work_queue
        self.result_queue = result_queue
        self.kill_received = False

    def run(self):
        """Convert jobs until the work queue reports empty or a kill is flagged."""
        while True:
            if self.kill_received:
                return

            # get a task; an empty queue means there is nothing left to do
            try:
                job = self.work_queue.get_nowait()
            except Queue.Empty:
                return

            # the actual processing
            converted = convertToOSIS(job)

            # store the result, keyed by the job it belongs to
            self.result_queue.put((job, converted))


if __name__ == "__main__":
    # Command-line driver: parse the options, convert every USFM file in
    # worker processes, and write the results into one OSIS document.
    # (Assignments at module level already create globals, so no `global`
    # statements are needed for encoding / relaxedConformance.)

    num_processes = multiprocessing.cpu_count()
    num_jobs = num_processes

    encoding = ''
    relaxedConformance = False
    inputFilesIdx = 2 # This marks the point in the sys.argv array, after which all values represent USFM files to be converted.
    usfmDocList = list()

    if '-v' in sys.argv:
        verbose = True
        inputFilesIdx += 1
    else:
        verbose = False

    if '-d' in sys.argv:
        DEBUG = True
        inputFilesIdx += 1
        num_processes = 1
        num_jobs = 1
        verbose = True
    else:
        DEBUG = False

    if '-h' in sys.argv or '--help' in sys.argv or len(sys.argv) < 3:
        printUsage()
    else:
        OSISwork = sys.argv[1]

        if '-o' in sys.argv:
            i = sys.argv.index('-o')+1
            if len(sys.argv) < i+1:
                # option given without its value: nothing sensible to do
                printUsage()
                sys.exit(1)
            OSISfileName = sys.argv[i]
            inputFilesIdx += 2 # increment 2, reflecting 2 args for -o
        else:
            OSISfileName = OSISwork + '.osis.xml'

        if '-e' in sys.argv:
            i = sys.argv.index('-e')+1
            if len(sys.argv) < i+1:
                printUsage()
                sys.exit(1)
            encoding = sys.argv[i]
            inputFilesIdx += 2 # increment 2, reflecting 2 args for -e

        if '-r' in sys.argv:
            relaxedConformance = True
            # merge without adding two items() results, which only works
            # where items() returns lists
            mergedBookDict = dict(bookDict)
            mergedBookDict.update(addBookDict)
            bookDict = mergedBookDict
            inputFilesIdx += 1

        if '-s' in sys.argv:
            i = sys.argv.index('-s')+1
            if len(sys.argv) < i+1:
                printUsage()
                sys.exit(1)
            sortMode = sys.argv[i]
            if sortMode.startswith('a'):
                sortHelper = keynat # TODO: write appropriate helpers
                print('Sorting book files alphanumerically.')
            elif sortMode.startswith('na'):
                sortHelper = keynat
                print('Sorting book files naturally.')
            elif sortMode.startswith('c'):
                sortHelper = keynat # TODO: write appropriate helpers
                print('Sorting book files canonically.')
            else:
                # resolved below, once the real file list is known
                sortHelper = None
                print('Leaving book files unsorted.')
            inputFilesIdx += 2 # increment 2, reflecting 2 args for -s
        else:
            sortHelper = keynat
            print('Sorting book files naturally.')


        usfmDocList = sys.argv[inputFilesIdx:]
        if sortHelper is None:
            # "unsorted" = keep command-line order
            sortHelper = usfmDocList.index
        sortedDocList = sorted(usfmDocList, key=sortHelper)

        OSISfile = codecs.open(OSISfileName, 'w', 'utf-8')
        try:
            writeOSISHeader(OSISfile, OSISwork)


            # run
            # load up work queue
            work_queue = multiprocessing.Queue()
            for job in sortedDocList:
                work_queue.put(job)

            # create a queue to pass to workers to store the results
            result_queue = multiprocessing.Queue()

            # spawn workers
            for i in range(num_processes):
                worker = Worker(work_queue, result_queue)
                worker.start()

            # collect the results off the queue
            # (results arrive in completion order, so key them by file name)
            osisSegment = dict()
            for i in usfmDocList:
                k,v=result_queue.get()
                osisSegment[k]=v

            # write the books in the order selected with -s
            unhandledTags = set()
            for doc in sortedDocList:
                unhandledTags |= set(re.findall(r'(\\[^\s\*]+?\b\*?)', osisSegment[doc]))
                OSISfile.write(osisSegment[doc])

            writeOSISFooter(OSISfile)
        finally:
            OSISfile.close()

        if unhandledTags:
            if verbose:
                print('')
            print('Unhandled USFM tags: ' + ', '.join(sorted(unhandledTags)) + ' (' + str(len(unhandledTags)) + ' total)')
            if not relaxedConformance:
                print('Consider using the -r option for relaxed markup processing.')