summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChris Little <chrislit@crosswire.org>2012-08-26 22:25:30 +0000
committerChris Little <chrislit@crosswire.org>2012-08-26 22:25:30 +0000
commitfdb561136ce47be309d220cde81561bf02c0884d (patch)
tree94041754e125b0af184add173887d0d698655c42
parentd7bfe310c10142745931dd93deedc4fe96e6ffad (diff)
downloadsword-tools-fdb561136ce47be309d220cde81561bf02c0884d.tar.gz
added a pre-processing scan of the USFM files that captures the \id and \toc3 fields into global lookup tables, allowing book sorting to work correctly
made book sorting a one-time event that is applied immediately after this scan and that now also determines the order of the processing stage, so that printed feedback about processing is an indicator of the eventual output book order git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@397 07627401-56e2-0310-80f4-f8cd0041bdcd
-rwxr-xr-xmodules/python/usfm2osis.py73
1 files changed, 48 insertions, 25 deletions
diff --git a/modules/python/usfm2osis.py b/modules/python/usfm2osis.py
index 3ca8325..872726a 100755
--- a/modules/python/usfm2osis.py
+++ b/modules/python/usfm2osis.py
@@ -319,18 +319,13 @@ END PSF-licened segment
"""
def keycanon(filename):
- if filename2osis:
- return canonicalOrder.index(filename2osis[filename])
- else:
- return keynat(filename)
+ global filename2osis
+ return canonicalOrder.index(filename2osis[filename])
def keyusfm(filename):
- if filename2osis:
- return usfmNumericOrder.index(filename2osis[filename])
- else:
- return keynat(filename)
+ return usfmNumericOrder.index(filename2osis[filename])
-def convertToOSIS(sFile):
+def convertToOsis(sFile):
global encoding
global relaxedConformance
@@ -393,14 +388,9 @@ def convertToOSIS(sFile):
Identification
supported: \id, \ide, \sts, \rem, \h, \toc1, \toc2, \toc3
"""
- global loc2osisBk, osis2locBk, filename2osis
+
# \id_<CODE>_(Name of file, Book name, Language, Last edited, Date etc.)
osis = re.sub(r'\\id\s+([A-Z0-9]{3})\b\s*([^\\'+'\n'+']*?)'+'\n'+r'(.*)(?=\\id|$)', lambda m: u'﷐<div type="book" osisID="' + bookDict[m.group(1)] + '">\n' + (('<!-- id comment - ' + m.group(2) + ' -->\n') if m.group(2) else '') + m.group(3) + u'</div type="book">﷐\n' , osis, flags=re.DOTALL)
- # keep a copy of the OSIS book abbreviation for below (\toc3 processing) to store for mapping localized book names to/from OSIS
- osisBook = re.search(r'\\id\s+([A-Z0-9]{3})', osis)
- if osisBook:
- osisBook = bookDict[osisBook.group(1)]
- filename2osis[filename] = osisBook
# \ide_<ENCODING>
osis = re.sub(r'\\ide\b.*'+'\n', '', osis) # delete, since this was handled above
@@ -426,12 +416,6 @@ def convertToOSIS(sFile):
osis = re.sub(r'\\toc2\b\s+(.+)\s*'+'\n', r'<milestone type="x-usfm-toc2" n="\1"/>'+'\n', osis)
# \toc3_text...
- locBook = re.search(r'\\toc3\b\s+(.+)\s*'+'\n', osis)
- if locBook:
- locBook = locBook.group(1)
- if osisBook:
- osis2locBk[osisBook]=locBook
- loc2osisBk[locBook]=osisBook
osis = re.sub(r'\\toc3\b\s+(.+)\s*'+'\n', lambda m: r'<milestone type="x-usfm-toc3" n="\1"/>'+'\n', osis)
return osis
@@ -1173,6 +1157,40 @@ def convertToOSIS(sFile):
return osis
+def readIdentifiersFromOsis(filename):
+ global encoding
+ global loc2osisBk, osis2locBk, filename2osis
+
+ ### Processing starts here
+ if encoding:
+ osis = codecs.open(filename, 'r', encoding).read().strip() + '\n'
+ else:
+ encoding = 'utf-8'
+ osis = codecs.open(filename, 'r', encoding).read().strip() + '\n'
+ # \ide_<ENCODING>
+ encoding = re.search(r'\\ide\s+(.+)'+'\n', osis)
+ if encoding:
+ encoding = encoding.group(1).lower().strip()
+ if encoding != 'utf-8':
+ if encoding in aliases:
+ osis = codecs.open(filename, 'r', encoding).read().strip() + '\n'
+ else:
+ #print(('WARNING: Encoding "' + encoding + '" unknown, processing ' + filename + ' as UTF-8.'))
+ encoding = 'utf-8'
+
+ # keep a copy of the OSIS book abbreviation for below (\toc3 processing) to store for mapping localized book names to/from OSIS
+ osisBook = re.search(r'\\id\s+([A-Z0-9]+)', osis)
+ if osisBook:
+ osisBook = bookDict[osisBook.group(1)]
+ filename2osis[filename] = osisBook
+
+ locBook = re.search(r'\\toc3\b\s+(.+)\s*'+'\n', osis)
+ if locBook:
+ locBook = locBook.group(1)
+ if osisBook:
+ osis2locBk[osisBook]=locBook
+ loc2osisBk[locBook]=osisBook
+
def verbosePrint(text):
if verbose:
print(text)
@@ -1222,7 +1240,7 @@ class Worker(multiprocessing.Process):
break
# the actual processing
- osis = convertToOSIS(job)
+ osis = convertToOsis(job)
# store the result
self.result_queue.put((job,osis))
@@ -1315,12 +1333,14 @@ if __name__ == "__main__":
usfmDocList = sys.argv[inputFilesIdx:]
- osisDoc = '<osis xmlns="http://www.bibletechnologies.net/2003/OSIS/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.bibletechnologies.net/2003/OSIS/namespace http://www.bibletechnologies.net/osisCore.'+osisVersion+'.xsd">\n<osisText osisRefWork="Bible" xml:lang="und" osisIDWork="' + osisWork + '">\n<header>\n<work osisWork="' + osisWork + '"/>\n</header>\n'
+ for filename in usfmDocList:
+ readIdentifiersFromOsis(filename)
+ usfmDocList = sorted(usfmDocList, key=sortKey)
# run
# load up work queue
work_queue = multiprocessing.Queue()
- for job in sorted(usfmDocList, key=sortKey):
+ for job in usfmDocList:
work_queue.put(job)
# create a queue to pass to workers to store the results
@@ -1338,8 +1358,11 @@ if __name__ == "__main__":
osisSegment[k]=v
+ verbosePrint('Assembling OSIS document...')
+ osisDoc = '<osis xmlns="http://www.bibletechnologies.net/2003/OSIS/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.bibletechnologies.net/2003/OSIS/namespace http://www.bibletechnologies.net/osisCore.'+osisVersion+'.xsd">\n<osisText osisRefWork="Bible" xml:lang="und" osisIDWork="' + osisWork + '">\n<header>\n<work osisWork="' + osisWork + '"/>\n</header>\n'
+
unhandledTags = set()
- for doc in sorted(usfmDocList, key=sortKey):
+ for doc in usfmDocList:
unhandledTags |= set(re.findall(r'(\\[^\s\*]+?\b\*?)', osisSegment[doc]))
osisDoc += osisSegment[doc]