// migratetags/matchers/gntmatcher.h

#include "matcher.h"
#include <utf8greekaccents.h>

#ifndef gntmatcher_h
#define gntmatcher_h

class GNTMatcher : public Matcher {
	UTF8GreekAccents sanitizeGreekAccentFilter;
public:

	GNTMatcher() : sanitizeGreekAccentFilter() {
		sanitizeGreekAccentFilter.setOptionValue("off");
	}

// Compares 2 words and tries to give a percentage assurance of a match
// TODO: could use more smarts here
//
virtual int compare(const SWBuf &s1, const SWBuf &s2) {
	SWBuf t1 = sanitizeWord(s1);
	SWBuf t2 = sanitizeWord(s2);

	int retVal = 0;
	SWBuf largest  = (t1.length() > t2.length()) ? t1 : t2;
	SWBuf smallest = (t1.length() > t2.length()) ? t2 : t1;
	int matches = 0;
	int j = 0;
	for (int i = 0; i < smallest.length() && j < largest.length(); i++) {
		while (j < largest.length()) {
			if (smallest[i] == largest[j++]) {
				matches++;
				break;
			}
		}
	}
	return (((float)matches) / largest.length()) * 100;
}

// 
// This is where the magic happens
//
// we must point each targetMod word to an XMLTag
//
// when the magic is done, and your guess is made
// populate targetWordTags with the integer offset
// into wordTags for which XMLTag you think it should
// be.
//

virtual void matchWords(vector<int> &targetWordTags, const vector<SWBuf> &targetWords, const vector<SWBuf> &fromWords, vector<int> fromWordTags) {

	// percentage of word match
	// Set very high to output every word which doesn't match exactly.
	const int WORD_MATCH_PERCENT = 99;

	// initialize our results to all -1 so we can pop around and set
	// words as we find them, and know which ones we haven't yet set
	for (int i = 0; i < targetWords.size(); i++) targetWordTags.push_back(-1);


	// poor effort attempt
	int j = 0;
	for (int i = 0; i < targetWords.size(); ++i) {
		SWBuf w1 = targetWords[i];
		int j = 0;
		for (; j < fromWords.size(); ++j) {
			if (fromWordTags[j] == -1) continue;

			SWBuf w2 = fromWords[j];
			int match = compare(w1, w2);
			// if we have a better than n% match of sequencial characters
			// then we'll say we have a match
			if (match > WORD_MATCH_PERCENT) {
				targetWordTags[i] = fromWordTags[j];
				fromWordTags[j] = -1;
				break;
			}
		}
		// didn't match
		if (j == fromWords.size()) {
			// TOTRY: maybe check one word before and after?
			//
			// be creative!
			//
			
			// let's see if we have common misses, regularize and recheck
			SWBuf w1Orig = w1;
			if (w1 == "ἀλλ" || w1 == "Ἀλλ") w1 = "αλλα";

			if (w1 != w1Orig) {
				for (int j = 0; j < fromWords.size(); ++j) {
					if (fromWordTags[j] == -1) continue;

					SWBuf w2 = fromWords[j];
					int match = compare(w1, w2);
					// if we have a better than n% match of sequencial characters
					// then we'll say we have a match
					if (match > WORD_MATCH_PERCENT) {
						targetWordTags[i] = fromWordTags[j];
						fromWordTags[j] = -1;
						break;
					}
				}
			}
		}
	}
}
 
virtual SWBuf sanitizeWord(const SWBuf &word) {
	SWBuf t1 = word;
	// remove greek accents
	sanitizeGreekAccentFilter.processText(t1);
	t1.toUpper();
	t1.replaceBytes("[]", 0);
	return t1;
}

};
#endif