added a scan of the USFM to capture id & toc3 fields for a global variable, allowing book sorting to work correctly

made book sorting a one-time event that is applied immediately after this scan; the sorted order now also applies to the processing stage, so that printed feedback about processing is an indicator of the eventual output book order. git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@397 07627401-56e2-0310-80f4-f8cd0041bdcd
author: Chris Little <chrislit@crosswire.org> 2012-08-26 22:25:30 +0000
committer: Chris Little <chrislit@crosswire.org> 2012-08-26 22:25:30 +0000
commit: fdb561136ce47be309d220cde81561bf02c0884d (patch)
tree: 94041754e125b0af184add173887d0d698655c42
parent: d7bfe310c10142745931dd93deedc4fe96e6ffad (diff)
download: sword-tools-fdb561136ce47be309d220cde81561bf02c0884d.tar.gz
1 files changed, 48 insertions, 25 deletions
diff --git a/modules/python/usfm2osis.py b/modules/python/usfm2osis.py
index 3ca8325..872726a 100755
--- a/modules/python/usfm2osis.py
+++ b/modules/python/usfm2osis.py
@@ -319,18 +319,13 @@ END PSF-licened segment
 """
 
 def keycanon(filename):
-    if filename2osis:
-        return canonicalOrder.index(filename2osis[filename])
-    else:
-        return keynat(filename)
+    global filename2osis
+    return canonicalOrder.index(filename2osis[filename])
 
 def keyusfm(filename):
-    if filename2osis:
-        return usfmNumericOrder.index(filename2osis[filename])
-    else:
-        return keynat(filename)
+    return usfmNumericOrder.index(filename2osis[filename])
 
-def convertToOSIS(sFile):
+def convertToOsis(sFile):
     global encoding
     global relaxedConformance
 
@@ -393,14 +388,9 @@ def convertToOSIS(sFile):
         Identification
         supported: \id, \ide, \sts, \rem, \h, \toc1, \toc2, \toc3
         """
-        global loc2osisBk, osis2locBk, filename2osis
+
         # \id_<CODE>_(Name of file, Book name, Language, Last edited, Date etc.)
         osis = re.sub(r'\\id\s+([A-Z0-9]{3})\b\s*([^\\'+'\n'+']*?)'+'\n'+r'(.*)(?=\\id|$)', lambda m: u'﷐<div type="book" osisID="' + bookDict[m.group(1)] + '">\n' + (('<!-- id comment - ' + m.group(2) + ' -->\n') if m.group(2) else '') +  m.group(3) + u'</div type="book">﷐\n' , osis, flags=re.DOTALL)
-        # keep a copy of the OSIS book abbreviation for below (\toc3 processing) to store for mapping localized book names to/from OSIS
-        osisBook = re.search(r'\\id\s+([A-Z0-9]{3})', osis)
-        if osisBook:
-            osisBook = bookDict[osisBook.group(1)]
-            filename2osis[filename] = osisBook
 
         # \ide_<ENCODING>
         osis = re.sub(r'\\ide\b.*'+'\n', '', osis) # delete, since this was handled above
@@ -426,12 +416,6 @@ def convertToOSIS(sFile):
         osis = re.sub(r'\\toc2\b\s+(.+)\s*'+'\n', r'<milestone type="x-usfm-toc2" n="\1"/>'+'\n', osis)
 
         # \toc3_text...
-        locBook = re.search(r'\\toc3\b\s+(.+)\s*'+'\n', osis)
-        if locBook:
-            locBook = locBook.group(1)
-            if osisBook:
-                osis2locBk[osisBook]=locBook
-                loc2osisBk[locBook]=osisBook
         osis = re.sub(r'\\toc3\b\s+(.+)\s*'+'\n', lambda m: r'<milestone type="x-usfm-toc3" n="\1"/>'+'\n', osis)
 
         return osis
@@ -1173,6 +1157,40 @@ def convertToOSIS(sFile):
 
     return osis
 
+def readIdentifiersFromOsis(filename):
+    global encoding
+    global loc2osisBk, osis2locBk, filename2osis
+
+    ### Processing starts here
+    if encoding:
+        osis = codecs.open(filename, 'r', encoding).read().strip() + '\n'
+    else:
+        encoding = 'utf-8'
+        osis = codecs.open(filename, 'r', encoding).read().strip() + '\n'
+        # \ide_<ENCODING>
+        encoding = re.search(r'\\ide\s+(.+)'+'\n', osis)
+        if encoding:
+            encoding = encoding.group(1).lower().strip()
+            if encoding != 'utf-8':
+                if encoding in aliases:
+                    osis = codecs.open(filename, 'r', encoding).read().strip() + '\n'
+                else:
+                    #print(('WARNING: Encoding "' + encoding + '" unknown, processing ' + filename + ' as UTF-8.'))
+                    encoding = 'utf-8'
+
+    # keep a copy of the OSIS book abbreviation for below (\toc3 processing) to store for mapping localized book names to/from OSIS
+    osisBook = re.search(r'\\id\s+([A-Z0-9]+)', osis)
+    if osisBook:
+        osisBook = bookDict[osisBook.group(1)]
+        filename2osis[filename] = osisBook
+
+    locBook = re.search(r'\\toc3\b\s+(.+)\s*'+'\n', osis)
+    if locBook:
+        locBook = locBook.group(1)
+        if osisBook:
+            osis2locBk[osisBook]=locBook
+            loc2osisBk[locBook]=osisBook
+
 def verbosePrint(text):
     if verbose:
         print(text)
@@ -1222,7 +1240,7 @@ class Worker(multiprocessing.Process):
                 break
 
             # the actual processing
-            osis = convertToOSIS(job)
+            osis = convertToOsis(job)
 
             # store the result
             self.result_queue.put((job,osis))
@@ -1315,12 +1333,14 @@ if __name__ == "__main__":
 
         usfmDocList = sys.argv[inputFilesIdx:]
 
-        osisDoc = '<osis xmlns="http://www.bibletechnologies.net/2003/OSIS/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.bibletechnologies.net/2003/OSIS/namespace http://www.bibletechnologies.net/osisCore.'+osisVersion+'.xsd">\n<osisText osisRefWork="Bible" xml:lang="und" osisIDWork="' + osisWork + '">\n<header>\n<work osisWork="' + osisWork + '"/>\n</header>\n'
+        for filename in usfmDocList:
+            readIdentifiersFromOsis(filename)
+        usfmDocList = sorted(usfmDocList, key=sortKey)
 
         # run
         # load up work queue
         work_queue = multiprocessing.Queue()
-        for job in sorted(usfmDocList, key=sortKey):
+        for job in usfmDocList:
             work_queue.put(job)
 
         # create a queue to pass to workers to store the results
@@ -1338,8 +1358,11 @@ if __name__ == "__main__":
             osisSegment[k]=v
 
         
+        verbosePrint('Assembling OSIS document...')
+        osisDoc = '<osis xmlns="http://www.bibletechnologies.net/2003/OSIS/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.bibletechnologies.net/2003/OSIS/namespace http://www.bibletechnologies.net/osisCore.'+osisVersion+'.xsd">\n<osisText osisRefWork="Bible" xml:lang="und" osisIDWork="' + osisWork + '">\n<header>\n<work osisWork="' + osisWork + '"/>\n</header>\n'
+
         unhandledTags = set()
-        for doc in sorted(usfmDocList, key=sortKey):
+        for doc in usfmDocList:
             unhandledTags |= set(re.findall(r'(\\[^\s\*]+?\b\*?)', osisSegment[doc]))
             osisDoc += osisSegment[doc]
author	Chris Little <chrislit@crosswire.org>	2012-08-26 22:25:30 +0000
committer	Chris Little <chrislit@crosswire.org>	2012-08-26 22:25:30 +0000
commit	fdb561136ce47be309d220cde81561bf02c0884d (patch)
tree	94041754e125b0af184add173887d0d698655c42
parent	d7bfe310c10142745931dd93deedc4fe96e6ffad (diff)
download	sword-tools-fdb561136ce47be309d220cde81561bf02c0884d.tar.gz