diff options
author | Chris Little <chrislit@crosswire.org> | 2012-08-26 22:25:30 +0000 |
---|---|---|
committer | Chris Little <chrislit@crosswire.org> | 2012-08-26 22:25:30 +0000 |
commit | fdb561136ce47be309d220cde81561bf02c0884d (patch) | |
tree | 94041754e125b0af184add173887d0d698655c42 /modules/python | |
parent | d7bfe310c10142745931dd93deedc4fe96e6ffad (diff) | |
download | sword-tools-fdb561136ce47be309d220cde81561bf02c0884d.tar.gz |
added a scan of the USFM to capture id & toc3 fields for a global variable, allowing book sorting to work correctly
made book sorting a one-time event that applies immediately after this scan and now applies to the processing stage so that printed feedback about processing is an indicator of the eventual output book order
git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@397 07627401-56e2-0310-80f4-f8cd0041bdcd
Diffstat (limited to 'modules/python')
-rwxr-xr-x | modules/python/usfm2osis.py | 73 |
1 files changed, 48 insertions, 25 deletions
diff --git a/modules/python/usfm2osis.py b/modules/python/usfm2osis.py index 3ca8325..872726a 100755 --- a/modules/python/usfm2osis.py +++ b/modules/python/usfm2osis.py @@ -319,18 +319,13 @@ END PSF-licened segment """ def keycanon(filename): - if filename2osis: - return canonicalOrder.index(filename2osis[filename]) - else: - return keynat(filename) + global filename2osis + return canonicalOrder.index(filename2osis[filename]) def keyusfm(filename): - if filename2osis: - return usfmNumericOrder.index(filename2osis[filename]) - else: - return keynat(filename) + return usfmNumericOrder.index(filename2osis[filename]) -def convertToOSIS(sFile): +def convertToOsis(sFile): global encoding global relaxedConformance @@ -393,14 +388,9 @@ def convertToOSIS(sFile): Identification supported: \id, \ide, \sts, \rem, \h, \toc1, \toc2, \toc3 """ - global loc2osisBk, osis2locBk, filename2osis + # \id_<CODE>_(Name of file, Book name, Language, Last edited, Date etc.) osis = re.sub(r'\\id\s+([A-Z0-9]{3})\b\s*([^\\'+'\n'+']*?)'+'\n'+r'(.*)(?=\\id|$)', lambda m: u'<div type="book" osisID="' + bookDict[m.group(1)] + '">\n' + (('<!-- id comment - ' + m.group(2) + ' -->\n') if m.group(2) else '') + m.group(3) + u'</div type="book">\n' , osis, flags=re.DOTALL) - # keep a copy of the OSIS book abbreviation for below (\toc3 processing) to store for mapping localized book names to/from OSIS - osisBook = re.search(r'\\id\s+([A-Z0-9]{3})', osis) - if osisBook: - osisBook = bookDict[osisBook.group(1)] - filename2osis[filename] = osisBook # \ide_<ENCODING> osis = re.sub(r'\\ide\b.*'+'\n', '', osis) # delete, since this was handled above @@ -426,12 +416,6 @@ def convertToOSIS(sFile): osis = re.sub(r'\\toc2\b\s+(.+)\s*'+'\n', r'<milestone type="x-usfm-toc2" n="\1"/>'+'\n', osis) # \toc3_text... 
- locBook = re.search(r'\\toc3\b\s+(.+)\s*'+'\n', osis) - if locBook: - locBook = locBook.group(1) - if osisBook: - osis2locBk[osisBook]=locBook - loc2osisBk[locBook]=osisBook osis = re.sub(r'\\toc3\b\s+(.+)\s*'+'\n', lambda m: r'<milestone type="x-usfm-toc3" n="\1"/>'+'\n', osis) return osis @@ -1173,6 +1157,40 @@ def convertToOSIS(sFile): return osis +def readIdentifiersFromOsis(filename): + global encoding + global loc2osisBk, osis2locBk, filename2osis + + ### Processing starts here + if encoding: + osis = codecs.open(filename, 'r', encoding).read().strip() + '\n' + else: + encoding = 'utf-8' + osis = codecs.open(filename, 'r', encoding).read().strip() + '\n' + # \ide_<ENCODING> + encoding = re.search(r'\\ide\s+(.+)'+'\n', osis) + if encoding: + encoding = encoding.group(1).lower().strip() + if encoding != 'utf-8': + if encoding in aliases: + osis = codecs.open(filename, 'r', encoding).read().strip() + '\n' + else: + #print(('WARNING: Encoding "' + encoding + '" unknown, processing ' + filename + ' as UTF-8.')) + encoding = 'utf-8' + + # keep a copy of the OSIS book abbreviation for below (\toc3 processing) to store for mapping localized book names to/from OSIS + osisBook = re.search(r'\\id\s+([A-Z0-9]+)', osis) + if osisBook: + osisBook = bookDict[osisBook.group(1)] + filename2osis[filename] = osisBook + + locBook = re.search(r'\\toc3\b\s+(.+)\s*'+'\n', osis) + if locBook: + locBook = locBook.group(1) + if osisBook: + osis2locBk[osisBook]=locBook + loc2osisBk[locBook]=osisBook + def verbosePrint(text): if verbose: print(text) @@ -1222,7 +1240,7 @@ class Worker(multiprocessing.Process): break # the actual processing - osis = convertToOSIS(job) + osis = convertToOsis(job) # store the result self.result_queue.put((job,osis)) @@ -1315,12 +1333,14 @@ if __name__ == "__main__": usfmDocList = sys.argv[inputFilesIdx:] - osisDoc = '<osis xmlns="http://www.bibletechnologies.net/2003/OSIS/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.bibletechnologies.net/2003/OSIS/namespace http://www.bibletechnologies.net/osisCore.'+osisVersion+'.xsd">\n<osisText osisRefWork="Bible" xml:lang="und" osisIDWork="' + osisWork + '">\n<header>\n<work osisWork="' + osisWork + '"/>\n</header>\n' + for filename in usfmDocList: + readIdentifiersFromOsis(filename) + usfmDocList = sorted(usfmDocList, key=sortKey) # run # load up work queue work_queue = multiprocessing.Queue() - for job in sorted(usfmDocList, key=sortKey): + for job in usfmDocList: work_queue.put(job) # create a queue to pass to workers to store the results @@ -1338,8 +1358,11 @@ if __name__ == "__main__": osisSegment[k]=v + verbosePrint('Assembling OSIS document...') + osisDoc = '<osis xmlns="http://www.bibletechnologies.net/2003/OSIS/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.bibletechnologies.net/2003/OSIS/namespace http://www.bibletechnologies.net/osisCore.'+osisVersion+'.xsd">\n<osisText osisRefWork="Bible" xml:lang="und" osisIDWork="' + osisWork + '">\n<header>\n<work osisWork="' + osisWork + '"/>\n</header>\n' + unhandledTags = set() - for doc in sorted(usfmDocList, key=sortKey): + for doc in usfmDocList: unhandledTags |= set(re.findall(r'(\\[^\s\*]+?\b\*?)', osisSegment[doc])) osisDoc += osisSegment[doc] |