1 files changed, 114 insertions, 0 deletions
diff --git a/migratetags/matchers/gntmatcher.h b/migratetags/matchers/gntmatcher.h
new file mode 100644
index 0000000..aa8f296
--- /dev/null
+++ b/migratetags/matchers/gntmatcher.h
@@ -0,0 +1,114 @@
+#include "matcher.h"
+#include <utf8greekaccents.h>
+
+#ifndef gntmatcher_h
+#define gntmatcher_h
+
+class GNTMatcher : public Matcher {
+	// Filter applied by sanitizeWord(); its option is set to "off" in the constructor.
+	UTF8GreekAccents sanitizeGreekAccentFilter;
+
+	// compare() score (a percentage) that must be exceeded for two words to pair.
+	// compare() only yields 100 when both sanitized words have the same length and
+	// the same bytes, so in practice this demands an exact sanitized match.
+	enum { MATCH_THRESHOLD = 99 };
+
+	// Returns the index of the first entry of fromWords that still owns a tag
+	// (fromWordTags[j] != -1) and scores above MATCH_THRESHOLD against word,
+	// or -1 when there is none.  The scan is bounded by both vectors so a tag
+	// list shorter than the word list is never read out of range.
+	int findMatch(const SWBuf &word, const vector<SWBuf> &fromWords, const vector<int> &fromWordTags) {
+		for (unsigned long j = 0; j < fromWords.size() && j < fromWordTags.size(); ++j) {
+			if (fromWordTags[j] == -1) continue;
+			if (compare(word, fromWords[j]) > MATCH_THRESHOLD) return static_cast<int>(j);
+		}
+		return -1;
+	}
+
+public:
+
+	// Sets the accent filter's option to "off"; sanitizeWord() relies on the
+	// filter to remove Greek accents before words are compared.
+	GNTMatcher() : sanitizeGreekAccentFilter() {
+		sanitizeGreekAccentFilter.setOptionValue("off");
+	}
+
+	// Compares 2 words and tries to give a percentage assurance of a match (0-100).
+	// Both words are sanitized, then the bytes of the shorter one are searched
+	// for, in order, in the longer one.  Every comparison consumes a byte of the
+	// longer word, so a skipped byte is never revisited.  The score is the number
+	// of matched bytes relative to the longer length.  Returns 0 when both
+	// sanitized words are empty instead of converting a 0/0 division to int.
+	// TODO: could use more smarts here
+	virtual int compare(const SWBuf &s1, const SWBuf &s2) {
+		SWBuf t1 = sanitizeWord(s1);
+		SWBuf t2 = sanitizeWord(s2);
+
+		// on equal lengths t2 takes the "largest" role and t1 the "smallest" one
+		SWBuf &largest  = (t1.length() > t2.length()) ? t1 : t2;
+		SWBuf &smallest = (t1.length() > t2.length()) ? t2 : t1;
+		const int largestLen  = static_cast<int>(largest.length());
+		const int smallestLen = static_cast<int>(smallest.length());
+		// nothing to compare, and the division below would be 0/0
+		if (largestLen == 0) return 0;
+
+		int matches = 0;
+		int j = 0;
+		for (int i = 0; i < smallestLen && j < largestLen; i++) {
+			while (j < largestLen) {
+				if (smallest[i] == largest[j++]) {
+					matches++;
+					break;
+				}
+			}
+		}
+		return static_cast<int>((static_cast<float>(matches) / largestLen) * 100);
+	}
+
+	//
+	// This is where the magic happens
+	//
+	// Points each targetMod word to an XMLTag: after the call targetWordTags
+	// holds one entry per word of targetWords, either the tag taken from
+	// fromWordTags of the from-word it was paired with, or -1 for no guess.
+	// fromWordTags is taken by value: a tag that has been handed out is
+	// overwritten with -1 in the local copy so it is never assigned twice.
+	virtual void matchWords(vector<int> &targetWordTags, const vector<SWBuf> &targetWords, const vector<SWBuf> &fromWords, vector<int> fromWordTags) {
+
+		// reset the results to all -1 (dropping any previous content) so unmatched words stay recognizable
+		targetWordTags.assign(targetWords.size(), -1);
+
+		for (unsigned long i = 0; i < targetWords.size(); ++i) {
+			SWBuf w1 = targetWords[i];
+			int found = findMatch(w1, fromWords, fromWordTags);
+			if (found < 0) {
+				// TOTRY: maybe check one word before and after?  be creative!
+				// let's see if we have common misses, regularize and recheck
+				if (w1 == "ἀλλ" || w1 == "Ἀλλ") {
+					w1 = "αλλα";
+					found = findMatch(w1, fromWords, fromWordTags);
+				}
+			}
+
+			if (found >= 0) {
+				targetWordTags[i] = fromWordTags[found];
+				fromWordTags[found] = -1;
+			}
+		}
+	}
+
+	// Normalizes a word for comparison: runs it through the accent filter,
+	// upper-cases it and applies replaceBytes("[]", 0) to the bracket bytes
+	// (NOTE(review): presumably a 0 replacement drops them - confirm in SWBuf).
+	// Returns the normalized copy; the argument itself is left untouched.
+	virtual SWBuf sanitizeWord(const SWBuf &word) {
+		SWBuf t1 = word;
+		// remove greek accents
+		sanitizeGreekAccentFilter.processText(t1);
+		t1.toUpper();
+		t1.replaceBytes("[]", 0);
+		return t1;
+	}
+
+};
+#endif