Updated from NA28 strongs migration effort. Added GNTMatcher

git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@533 07627401-56e2-0310-80f4-f8cd0041bdcd
author: Troy A. Griffitts <scribe@crosswire.org> 2020-05-06 03:50:10 +0000
committer: Troy A. Griffitts <scribe@crosswire.org> 2020-05-06 03:50:10 +0000
commit: bba76bf0652ec85b97124ca94959bdd2623c1bd0 (patch)
tree: d1b7dcdfaa3d06b541bd17c5cdad37acf730fbcf
parent: bafd4fb4ad4652362e47471c422c1ebecd3be0bb (diff)
download: sword-tools-bba76bf0652ec85b97124ca94959bdd2623c1bd0.tar.gz
4 files changed, 176 insertions, 36 deletions
diff --git a/migratetags/matchers/defaultmatcher.h b/migratetags/matchers/defaultmatcher.h
index 592dbf5..b74ed38 100644
--- a/migratetags/matchers/defaultmatcher.h
+++ b/migratetags/matchers/defaultmatcher.h
@@ -6,22 +6,15 @@
 class DefaultMatcher : public Matcher {
 public:
 
+	DefaultMatcher() {	// empty body: no setup is performed here
+	}
+
 // Compares 2 words and tries to give a percentage assurance of a match
 // TODO: could use more smarts here
 //
 virtual int compare(const SWBuf &s1, const SWBuf &s2) {
-	SWBuf t1 = s1;
-	SWBuf t2 = s2;
-	UTF8GreekAccents filter;
-	filter.setOptionValue("off");
-
-	// remove greek accents
-	filter.processText(t1);
-	filter.processText(t2);
-
-	// change to uppercase to match
-	StringMgr::getSystemStringMgr()->upperUTF8(t1.getRawData());
-	StringMgr::getSystemStringMgr()->upperUTF8(t2.getRawData());
+	SWBuf t1 = sanitizeWord(s1);
+	SWBuf t2 = sanitizeWord(s2);
 
 	int retVal = 0;
 	SWBuf largest  = (t1.length() > t2.length()) ? t1 : t2;
@@ -38,6 +31,7 @@ virtual int compare(const SWBuf &s1, const SWBuf &s2) {
 	}
 	return (((float)matches) / largest.length()) * 100;
 }
+
 // 
 // This is where the magic happens
 //
@@ -76,5 +70,10 @@ virtual void matchWords(vector<int> &targetWordTags, const vector<SWBuf> &target
 		}
 	}
 }
+virtual SWBuf sanitizeWord(const SWBuf &word) {
+	SWBuf normalized(word);	// work on a copy; the caller's word is const
+	normalized.toUpper();
+	return normalized;
+}
 };
 #endif
diff --git a/migratetags/matchers/gntmatcher.h b/migratetags/matchers/gntmatcher.h
new file mode 100644
index 0000000..aa8f296
--- /dev/null
+++ b/migratetags/matchers/gntmatcher.h
@@ -0,0 +1,114 @@
+#include "matcher.h"
+#include <utf8greekaccents.h>
+
+#ifndef gntmatcher_h
+#define gntmatcher_h
+
+// Matcher that compares Greek words after normalizing them with
+// sanitizeWord(); only a compare() score above 99 is accepted as a match.
+class GNTMatcher : public Matcher {
+	// configured in the constructor; sanitizeWord() runs each word through it
+	UTF8GreekAccents sanitizeGreekAccentFilter;
+
+// Returns the index of the first fromWords entry whose tag is still
+// unclaimed (!= -1) and whose compare() score against word is above 99,
+// or -1 when there is none.
+int findUnclaimedMatch(const SWBuf &word, const vector<SWBuf> &fromWords, const vector<int> &fromWordTags) {
+	for (size_t j = 0; j < fromWords.size(); ++j) {
+		if (fromWordTags[j] == -1) continue;
+		if (compare(word, fromWords[j]) > 99) return (int)j;
+	}
+	return -1;
+}
+
+public:
+
+	// with the option set to "off" the filter removes accents
+	GNTMatcher() : sanitizeGreekAccentFilter() {
+		sanitizeGreekAccentFilter.setOptionValue("off");
+	}
+
+// Compares 2 words and tries to give a percentage assurance of a match:
+// both are sanitized, the shorter one's bytes are looked for, in order,
+// in the longer one, and the hits are scaled by the longer length.
+// Two words that are both empty after sanitizing score 0.
+// TODO: could use more smarts here
+//
+virtual int compare(const SWBuf &s1, const SWBuf &s2) {
+	SWBuf t1 = sanitizeWord(s1);
+	SWBuf t2 = sanitizeWord(s2);
+
+	SWBuf largest  = (t1.length() > t2.length()) ? t1 : t2;
+	SWBuf smallest = (t1.length() > t2.length()) ? t2 : t1;
+	if (!largest.length()) return 0;	// both empty: avoid 0 / 0 below
+
+	int matches = 0;
+	int j = 0;
+	for (int i = 0; i < smallest.length() && j < largest.length(); i++) {
+		while (j < largest.length()) {
+			if (smallest[i] == largest[j++]) {
+				matches++;
+				break;
+			}
+		}
+	}
+	return (((float)matches) / largest.length()) * 100;
+}
+
+//
+// This is where the magic happens
+//
+// we must point each targetMod word to an XMLTag
+//
+// when the magic is done, and your guess is made
+// populate targetWordTags with the integer offset
+// into wordTags for which XMLTag you think it should
+// be.
+//
+// One entry per target word is appended to targetWordTags; -1 means no
+// match was found.  fromWordTags is taken by value so entries can be set
+// to -1 as they are claimed without touching the caller's vector.
+//
+virtual void matchWords(vector<int> &targetWordTags, const vector<SWBuf> &targetWords, const vector<SWBuf> &fromWords, vector<int> fromWordTags) {
+
+	// initialize our results to all -1 so we can pop around and set
+	// words as we find them, and know which ones we haven't yet set
+	for (size_t i = 0; i < targetWords.size(); ++i) targetWordTags.push_back(-1);
+
+	// poor effort attempt
+	for (size_t i = 0; i < targetWords.size(); ++i) {
+		SWBuf w1 = targetWords[i];
+		// only a score above 99 counts as a match
+		int found = findUnclaimedMatch(w1, fromWords, fromWordTags);
+		// didn't match
+		if (found < 0) {
+			// TOTRY: maybe check one word before and after?
+			// be creative!
+			// let's see if we have common misses, regularize and recheck
+			if (w1 == "ἀλλ" || w1 == "Ἀλλ") {
+				w1 = "αλλα";
+				found = findUnclaimedMatch(w1, fromWords, fromWordTags);
+			}
+		}
+
+		if (found >= 0) {
+			targetWordTags[i] = fromWordTags[found];
+			fromWordTags[found] = -1;	// claimed: can't be matched again
+		}
+	}
+}
+
+// Returns a normalized copy of word for comparing: run through the accent
+// filter, uppercased, and with replaceBytes("[]", 0) applied (presumably
+// dropping square brackets -- confirm against SWBuf).
+virtual SWBuf sanitizeWord(const SWBuf &word) {
+	SWBuf t1 = word;
+	// remove greek accents
+	sanitizeGreekAccentFilter.processText(t1);
+	t1.toUpper();
+	t1.replaceBytes("[]", 0);
+	return t1;
+}
+
+};
+#endif
diff --git a/migratetags/matchers/matcher.h b/migratetags/matchers/matcher.h
index 1448c2e..ec41e3c 100644
--- a/migratetags/matchers/matcher.h
+++ b/migratetags/matchers/matcher.h
@@ -20,6 +20,8 @@ virtual int compare(const SWBuf &s1, const SWBuf &s2) = 0;
 //
 virtual void matchWords(vector<int> &targetWordTags, const vector<SWBuf> &targetWords, const vector<SWBuf> &fromWords, vector<int> fromWordTags) = 0;
 
+// return a sanitized copy of word for comparing (e.g., toUpper, strip accents, etc.)
+virtual SWBuf sanitizeWord(const SWBuf &word) = 0;
 
 };
 #endif
diff --git a/migratetags/migratetags.cpp b/migratetags/migratetags.cpp
index 2051a22..689640a 100644
--- a/migratetags/migratetags.cpp
+++ b/migratetags/migratetags.cpp
@@ -50,7 +50,7 @@ void usage(const char *progName, const char *error = 0) {
 
 int main(int argc, char **argv) {
 	const char *progName   = argv[0];
-	for (int i = 1; i < argc; i++) {
+	for (int i = 1; i < argc; ++i) {
 		if (!strcmp(argv[i], "-v")) {
 			optionDebug = true;
 		}
@@ -62,7 +62,7 @@ int main(int argc, char **argv) {
 		}
 		else usage(progName, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
 	}
-	VerseKey vk;
+
 	SWMgr lib;
 	lib.setGlobalOption("Textual Variants", "Secondary Reading");
 	SWModule *m = lib.getModule(targetModuleName);
@@ -246,15 +246,40 @@ if (optionDebug) {
 		cout << "\nTargetMod Words: " << endl;
 }
 		bool warned = false;
-		for (int i = 0; i < targetWords.size(); i++) {
+		for (int i = 0; i < targetWords.size(); ++i) {
 			if (targetWordTags[i] == -1 && !strstr(ignoreSeries, targetWords[i])) {
-				if (!warned) cerr << "*** Error: didn't match all words: " << targetMod.getKeyText() << endl;
-				warned = true;
+				if (!warned) {
+					cerr << "*** Error: didn't match all words: " << targetMod.getKeyText() << endl;
+					cerr << strongsSourceModuleName << ":";
+					for (int j = 0; j < fromWords.size(); ++j) {
+						cerr << " " << fromWords[j];
+					}
+					cerr << endl;
+					cerr << targetModuleName << ":";
+					for (int j = 0; j < targetWords.size(); ++j) {
+						cerr << " " << targetWords[j];
+					}
+					cerr << endl;
+					cerr << endl;
+					cerr << "Unmatched Words:" << endl;
+					warned = true;
+				}
+				cerr << "  " << i << ": " <<  targetWords[i] << " (" << matcher->sanitizeWord(targetWords[i]) << ")" << endl;
 			}
 if (optionDebug) {
 			cout << targetWords[i] << " : " << targetWordTags[i] << " => " << (targetWordTags[i] != -1 ? wordTags[targetWordTags[i]] : "") << endl;
 }
 		}
+		if (warned) {
+			cerr << "\n" << targetModuleName << " Tags:\n";
+			VerseKey *vk = (VerseKey *)targetMod.getKey();
+			for (int j = 0; j < targetWords.size(); ++j) {
+				if (!strstr(ignoreSeries, targetWords[j])) {
+					cerr << targetWords[j] << "\t\t " << vk->getOSISRef() << "." << j << "=" << (targetWordTags[j] != -1 ? wordTags[targetWordTags[j]] : "") << endl;
+				}
+			}
+			cerr << "---------------------" << endl;
+		}
 if (optionDebug) {
 		cout << "---------------------" << endl;
 		
@@ -279,7 +304,7 @@ SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) {
 	int tagLevel = 0;
 	int wTag = -1;
 	int inTag = 0;
-	for (int i = 0; i < orig.length(); i++) {
+	for (int i = 0; i < orig.length(); ++i) {
 		if (orig[i] == '<') {
 			inTag = true;
 		}
@@ -330,7 +355,7 @@ void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags
 	}
 	if (!after || wTags[bibPos] == -1) {
 		out.insert(to, addText);
-		for (int i = bibPos+((after)?1:0); i < bibMap.size(); i++) {
+		for (int i = bibPos+((after)?1:0); i < bibMap.size(); ++i) {
 			bibMap[i] += addText.length();
 			if (wTags[i] != -1) wTags[i] += addText.length();
 		}
@@ -338,7 +363,6 @@ void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags
 }
 
 
-
 SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds) {
 	SWBuf bibWord = "";
 	SWBuf fromWord = "";
@@ -386,36 +410,38 @@ void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &
 		// this is our new <w> XMLTag.
 		// attributes will be added below
 		XMLTag w("w");
+		// PartCount only gives us the word count, not whether a word has multiple entries,
+		// so don't use it as the loop bound
 		int parts = atoi(it->second["PartCount"]);
 		SWBuf lemma = "";
 		SWBuf morph = "";
-		for (int i = 1; i <= parts; i++) {
+		bool found = true;
+		for (int i = 1; found; ++i) {
+			found = false;
 			SWBuf key = "";
-			key = (parts == 1) ? "Lemma" : SWBuf().setFormatted("Lemma.%d", i);
+			key = SWBuf().setFormatted("Lemma.%d", i);
 			AttributeValue::iterator li = it->second.find(key);
+			if (i == 1 && li == it->second.end()) li = it->second.find("Lemma");
 			if (li != it->second.end()) {
+				found = true;
 				if (i > 1) lemma += " ";
-				key = (parts == 1) ? "LemmaClass" : SWBuf().setFormatted("LemmaClass.%d", i);
+				key = SWBuf().setFormatted("LemmaClass.%d", i);
 				AttributeValue::iterator lci = it->second.find(key);
+				if (i == 1 && lci == it->second.end()) lci = it->second.find("LemmaClass");
 				if (lci != it->second.end()) {
 					lemma += lci->second + ":";
 				}
 				lemma += li->second;
 			}
-			key = (parts == 1) ? "Morph" : SWBuf().setFormatted("Morph.%d", i);
+			key = SWBuf().setFormatted("Morph.%d", i);
 			li = it->second.find(key);
-			// silly.  sometimes morph counts don't equal lemma counts
-			if (i == 1 && parts != 1 && li == it->second.end()) {
-				li = it->second.find("Morph");
-			}
+			if (i == 1 && li == it->second.end()) li = it->second.find("Morph");
 			if (li != it->second.end()) {
+				found = true;
 				if (i > 1) morph += " ";
-				key = (parts == 1) ? "MorphClass" : SWBuf().setFormatted("MorphClass.%d", i);
+				key = SWBuf().setFormatted("MorphClass.%d", i);
 				AttributeValue::iterator lci = it->second.find(key);
-				// silly.  sometimes morph counts don't equal lemma counts
-				if (i == 1 && parts != 1 && lci == it->second.end()) {
-					lci = it->second.find("MorphClass");
-				}
+				if (i == 1 && lci == it->second.end()) lci = it->second.find("MorphClass");
 				if (lci != it->second.end()) {
 					morph += lci->second + ":";
 				}
@@ -430,9 +456,8 @@ void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &
 
 		fromWord = it->second["Text"];
 		bibWord = "";
-		for (int j = 0; j < fromWord.length(); j++) {
+		for (int j = 0; j < fromWord.length(); ++j) {
 			char c = fromWord[j];
-//			if (!strchr(ignoreSeries, c)) {
 			if (c != ' ' && c != '.' && c != ';' && c != ',') {
 				bibWord += c;
 			}
@@ -458,7 +483,7 @@ void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, const vecto
 	// TODO: this method needs some work,
 	// like putting multiple consecutive words
 	// together in one tag
-	for (int i = 0; i < targetWordTags.size(); i++) {
+	for (int i = 0; i < targetWordTags.size(); ++i) {
 		if (targetWordTags[i] > -1) {
 			insert((const char *)wordTags[targetWordTags[i]], markupBuf, targetWordStarts[i], bibMap, wTags);
 			insert("</w>", markupBuf, targetWordEnds[i], bibMap, wTags, true);
author	Troy A. Griffitts <scribe@crosswire.org>	2020-05-06 03:50:10 +0000
committer	Troy A. Griffitts <scribe@crosswire.org>	2020-05-06 03:50:10 +0000
commit	bba76bf0652ec85b97124ca94959bdd2623c1bd0 (patch)
tree	d1b7dcdfaa3d06b541bd17c5cdad37acf730fbcf
parent	bafd4fb4ad4652362e47471c422c1ebecd3be0bb (diff)
download	sword-tools-bba76bf0652ec85b97124ca94959bdd2623c1bd0.tar.gz