#! /usr/bin/env python
# Distributed under the "Here, have it" license
# Written by Greg Hellings, 2011, all rights reserved
#
# Provenance: sword-tools trunk r343
# (git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@343),
# commit 511fe89c20f06ff240dcf2d20f2b309bd6325ee5, 2012-01-08.
'''
Compare word frequencies between two SWORD modules and report verses
which differ greatly from the mean word ratio.

Can be useful for helping to check that a module is in the correct
versification scheme and that it does not have too many words crammed
into the last verse in a chapter.

Command line: v11n_check.py <original module> <translation module> <OT|NT>
'''

import sys

# The third-party dependencies are probed here but only enforced in
# main(), so that a missing library yields the usage text instead of a
# traceback and so that importing this file never terminates the caller.
try:
    import Sword
except ImportError:
    Sword = None

try:
    import numpy
except ImportError:
    numpy = None


def usage(name):
    '''
    Helpful hints for the user. Let them know what it is
    that this script expects of them.

    name -- program name to show in the usage line (normally argv[0]).

    Does not return: always terminates through sys.exit(-1).
    '''
    print("Usage: %s <original module> <translation module> <OT|NT>" % (name,))
    print("Requires the SWORD Python library to operate.")
    print("Also requires the Numpy library (fast array calculations)")
    sys.exit(-1)


def increment(v, *args):
    '''
    Advance the first module by one entry, then copy its key into
    every other module passed in, so they all sit on the same entry.

    v    -- module that drives the iteration.
    args -- any number of further modules to keep in step with v.
    '''
    v.increment()
    k = v.getKey()
    for m in args:
        m.setKey(k)


def _split_words(text):
    '''
    Split text on whitespace, decoding it from UTF-8 first when it
    arrives as a byte string (Python 2 bindings) rather than as text.
    '''
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    return text.split()


def _load_module(mgr, name):
    '''
    Look up the module called name in mgr; complain and exit with
    status -1 when the manager does not know it.
    '''
    module = mgr.getModule(name)
    if module is None:
        print("You might want to pick a translation that exists.")
        sys.exit(-1)
    return module


def _collect_ratios(original, trans, checkNT):
    '''
    Walk both modules in lockstep and build the comparison table.

    Returns a list of tuples (OSIS reference, words in original divided
    by words in translation, True when the verse is the last one of its
    chapter). Verses outside the requested testament are skipped and
    verses that are empty on either side are reported and left out.
    '''
    counts = []
    NT = False
    while original.Error() == '\x00' and trans.Error() == '\x00':
        oWords = _split_words(original.StripText())
        tWords = _split_words(trans.StripText())
        key = Sword.VerseKey(original.getKey())

        # Check if we've entered the NT
        if not NT and key.getOSISRef().startswith('Matt'):
            NT = True

        # Only check one testament
        if checkNT == NT:
            if not oWords:
                print('Unable to check verse %s - no content in source' % (key.getText(),))
            elif not tWords:
                print('Unable to check verse %s - no content in target' % (key.getText(),))
            else:
                counts.append((key.getOSISRef(),
                               len(oWords) / float(len(tWords)),
                               key.getVerse() == key.getChapterMax()))

        increment(original, trans)
    return counts


def main(argv):
    '''
    Main loops and the like for the program.

    argv -- full argument vector: program name, original-language
            module name, translation module name and 'OT' or 'NT'.

    Prints the mean and standard deviation of the per-verse word ratio,
    then every chapter-final verse whose ratio lies two or more standard
    deviations away from the mean. Exits with status -1 (after printing
    the usage text where appropriate) on bad arguments, a missing
    library or an unknown module.
    '''
    if Sword is None or numpy is None:
        usage(argv[0])

    if len(argv) != 4 or argv[3] not in ('OT', 'NT'):
        usage(argv[0])

    # Which testament are we comparing against
    checkNT = argv[3] == 'NT'

    # Fetch the original source language module and the translation
    mgr = Sword.SWMgr()
    original = _load_module(mgr, argv[1])
    trans = _load_module(mgr, argv[2])

    # Iterate the whole selection
    # NOTE(review): the continuation lines of both banners are assumed
    # to be tab-indented like their first line - confirm.
    print("""\t**********************************
\t******** Building tables ********
\t**********************************""")
    counts = _collect_ratios(original, trans, checkNT)

    # Without a single comparable verse there is no mean to measure
    # against; say so instead of printing nan statistics.
    if not counts:
        print("No verses could be compared; nothing to report.")
        return

    # Now that we have all the values, let's see if there
    # are any that seem way out of whack
    vals = [ratio for _ref, ratio, _last in counts]
    mean = numpy.average(vals)
    std = numpy.std(vals)

    print("""\t***********************************
\t****** Beginning comparisons ******
\t***********************************""")
    print("Average ratio: %0.4f" % (mean,))
    print("Standard dev: %0.4f" % (std,))

    # Now iterate the target translation; only the last verse of each
    # chapter is reported.
    print("The following references fall outside of the target standard deviation")
    for ref, ratio, is_last in counts:
        if is_last and abs(ratio - mean) >= 2 * std:
            print('%s' % (ref,))


if __name__ == '__main__':
    main(sys.argv)