summaryrefslogtreecommitdiffstats
path: root/versification
diff options
context:
space:
mode:
authorGreg Hellings <greg.hellings@gmail.com>2012-01-08 17:30:58 +0000
committerGreg Hellings <greg.hellings@gmail.com>2012-01-08 17:30:58 +0000
commit511fe89c20f06ff240dcf2d20f2b309bd6325ee5 (patch)
tree1eb2aff920e274cbe8e6ebd0a8cd3d358eb2f1b2 /versification
parentf4f7a746e3373c4e887598cecc33224a9af41feb (diff)
downloadsword-tools-511fe89c20f06ff240dcf2d20f2b309bd6325ee5.tar.gz
Added a script which compares word frequencies between two modules
and reports verses which differ greatly from the mean word ratio. Can be useful for helping to check that a module is in the correct versification scheme and that it does not have too many words crammed into the last verse in a chapter.
git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@343 07627401-56e2-0310-80f4-f8cd0041bdcd
Diffstat (limited to 'versification')
-rwxr-xr-xversification/v11n_check.py107
1 files changed, 107 insertions, 0 deletions
diff --git a/versification/v11n_check.py b/versification/v11n_check.py
new file mode 100755
index 0000000..6530f2e
--- /dev/null
+++ b/versification/v11n_check.py
@@ -0,0 +1,107 @@
+#! /usr/bin/env python
+# Distributed under the "Here, have it" license
+# Written by Greg Hellings, 2011, all rights reserved
+
+def usage(name):
+ '''
+ Helpful hints for the user. Let them know what it is
+ that this script expects of them.
+ '''
+ print "Usage: %s <original mod> <mod to check> <OT|NT>" % (name,)
+ print "Requires the SWORD Python library to operate."
+ print "Also requires the Numpy library (fast array calculations)"
+ sys.exit(-1)
+
+def increment(v, *args):
+ '''
+ Copies the SWKey out of the first module and into any
+ other modules down the row.
+ '''
+ v.increment()
+ k = v.getKey()
+ for m in args:
+ m.setKey(k)
+
+def main(argv):
+ '''
+ Main loops and the like for the program.
+ '''
+ if len(argv) != 4:
+ usage(argv[0])
+
+ if argv[3] not in ('OT', 'NT'):
+ usage(argv[0])
+
+ # Which testament are we comparing against
+ if argv[3] == 'NT':
+ checkNT = True
+ else:
+ checkNT = False
+
+ # Fetch the original source language module
+ mgr = Sword.SWMgr()
+ original = mgr.getModule(argv[1])
+ if original == None:
+ print "You might want to pick a translation that exists."
+ sys.exit(-1)
+ trans = mgr.getModule(argv[2])
+ if trans == None:
+ print "You might want to pick a translation that exists."
+ sys.exit(-1)
+
+ counts = []
+ NT = False
+ # Iterate the whole selection
+ print"""\t**********************************
+ ******** Building tables ********
+ **********************************"""
+ while original.Error() == '\x00' and trans.Error() == '\x00':
+ oWords = original.StripText().decode('utf-8').split()
+ tWords = trans.StripText().decode('utf-8').split()
+ key = Sword.VerseKey(original.getKey())
+
+ # Check if we've entered the NT
+ if not NT and key.getOSISRef().startswith('Matt'):
+ NT = True
+
+ # Only check one testament
+ if checkNT != NT:
+ increment(original, trans)
+ continue
+
+ if len(oWords) == 0:
+ print 'Unable to check verse %s - no content in source' % (key.getText(),)
+ elif len(tWords) == 0:
+ print 'Unable to check verse %s - no content in target' % (key.getText(),)
+ else:
+ counts.append((key.getOSISRef(), len(oWords) / float(len(tWords)), key.getVerse() == key.getChapterMax()))
+
+ increment(original, trans)
+
+ # Now that we have all the values, let's see if there
+ # are any that seem way out of whack
+ vals = [x for k, x, b in counts]
+ mean = numpy.average(vals)
+ std = numpy.std(vals)
+
+ print """\t***********************************
+ ****** Beginning comparisons ******
+ ***********************************"""
+ print "Average ratio: %0.4f" % (mean,)
+ print "Standard dev: %0.4f" % (std,)
+
+ # Now iterate the target translation
+ print "The following references fall outside of the target standard deviation"
+ for ref, ratio, b in counts:
+ if abs(ratio - mean) >= 2 * std and b:
+ print '%s' % (ref,)
+
+try:
+ import Sword
+ import sys
+ import numpy
+except m:
+ usage(sys.argv[0])
+
+if __name__ == '__main__':
+ main(sys.argv)