From 511fe89c20f06ff240dcf2d20f2b309bd6325ee5 Mon Sep 17 00:00:00 2001
From: Greg Hellings <greg.hellings@gmail.com>
Date: Sun, 8 Jan 2012 17:30:58 +0000
Subject: Added a script which compares word frequencies between two modules
 and reports verses which differ greatly from the mean word ratio.

Can be useful for helping to check that a module is in the correct
versification scheme and that it does not have too many words
crammed into the last verse in a chapter.

git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@343 07627401-56e2-0310-80f4-f8cd0041bdcd
---
 versification/v11n_check.py | 107 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 107 insertions(+)
 create mode 100755 versification/v11n_check.py

(limited to 'versification')
diff --git a/versification/v11n_check.py b/versification/v11n_check.py
new file mode 100755
index 0000000..6530f2e
--- /dev/null
+++ b/versification/v11n_check.py
@@ -0,0 +1,107 @@
+#! /usr/bin/env python
+# Distributed under the "Here, have it" license
+# Written by Greg Hellings, 2011, all rights reserved
+
+def usage(name):
+	'''
+	Print the invocation synopsis and library requirements, then exit with -1.
+	'''
+	for hint in ("Usage: %s <original mod> <mod to check> <OT|NT>" % (name,),
+			"Requires the SWORD Python library to operate.",
+			"Also requires the Numpy library (fast array calculations)"):
+		print hint
+	sys.exit(-1)
+
+def increment(v, *args):
+	'''
+	Advance v by one entry, then hand the key v now holds to
+	every module in args so that they all stay in step with it.
+	'''
+	v.increment()
+	current = v.getKey()
+	for follower in args:
+		follower.setKey(current)
+
+def main(argv):
+	'''
+	Compare per-verse word counts of two SWORD modules and list the
+	chapter-final verses whose ratio strays far from the mean ratio.
+
+	argv: [script name, original module, module to check, 'OT' or 'NT'].
+	Bad arguments end in usage(); an unknown module exits with status -1.
+	'''
+	if len(argv) != 4 or argv[3] not in ('OT', 'NT'):
+		usage(argv[0])
+
+	# Which testament are we comparing against
+	checkNT = argv[3] == 'NT'
+
+	# Fetch the original source language module and the one to check
+	mgr = Sword.SWMgr()
+	original = mgr.getModule(argv[1])
+	trans = mgr.getModule(argv[2])
+	if original is None or trans is None:
+		print "You might want to pick a translation that exists."
+		sys.exit(-1)
+
+	counts = []
+	NT = False
+	# Iterate the whole selection
+	print"""\t**********************************
+	********  Building tables ********
+	**********************************"""
+	while original.Error() == '\x00' and trans.Error() == '\x00':
+		oWords = original.StripText().decode('utf-8').split()
+		tWords = trans.StripText().decode('utf-8').split()
+		key = Sword.VerseKey(original.getKey())
+
+		# Check if we've entered the NT
+		if not NT and key.getOSISRef().startswith('Matt'):
+			NT = True
+
+		# Only check one testament
+		if checkNT != NT:
+			increment(original, trans)
+			continue
+
+		if len(oWords) == 0:
+			print 'Unable to check verse %s - no content in source' % (key.getText(),)
+		elif len(tWords) == 0:
+			print 'Unable to check verse %s - no content in target' % (key.getText(),)
+		else:
+			# Last verse of its chapter? getChapterMax() counts chapters, not verses
+			isLast = key.getVerse() == key.getVerseMax()
+			counts.append((key.getOSISRef(), len(oWords) / float(len(tWords)), isLast))
+
+		increment(original, trans)
+
+	if not counts:  # mean/std of an empty list would only yield nan
+		print "No verses had content in both modules; nothing to compare."
+		return
+	# Now that we have all the values, let's see if there
+	# are any that seem way out of whack
+	vals = [x for k, x, b in counts]
+	mean = numpy.average(vals)
+	std  = numpy.std(vals)
+
+	print """\t***********************************
+	****** Beginning comparisons ******
+	***********************************"""
+	print "Average ratio: %0.4f" % (mean,)
+	print "Standard dev:  %0.4f" % (std,)
+
+	# Report only chapter-final verses at least two standard deviations out
+	print "The following references fall outside of the target standard deviation"
+	for ref, ratio, b in counts:
+		if abs(ratio - mean) >= 2 * std and b:
+			print '%s' % (ref,)
+
+import sys  # needed by usage() even when the imports below fail
+try:
+	import Sword
+	import numpy
+except ImportError:  # a missing library gets the usage hints, not a traceback
+	usage(sys.argv[0])
+
+if __name__ == '__main__':
+	main(sys.argv)
-- 
cgit