migratetags/matchers/gntmatcher.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131

#include "matcher.h"
#include <utf8greekaccents.h>

#ifndef gntmatcher_h
#define gntmatcher_h

class GNTMatcher : public Matcher {
	UTF8GreekAccents sanitizeGreekAccentFilter;
public:

	GNTMatcher() : sanitizeGreekAccentFilter() {
		sanitizeGreekAccentFilter.setOptionValue("off");
	}

// Compares 2 words and tries to give a percentage assurance of a match
// TODO: could use more smarts here
//
virtual int compare(const SWBuf &s1, const SWBuf &s2) {
	SWBuf t1 = sanitizeWord(s1);
	SWBuf t2 = sanitizeWord(s2);

	int retVal = 0;
	SWBuf largest  = (t1.length() > t2.length()) ? t1 : t2;
	SWBuf smallest = (t1.length() > t2.length()) ? t2 : t1;
	int matches = 0;
	int j = 0;
	for (int i = 0; i < smallest.length() && j < largest.length(); i++) {
		while (j < largest.length()) {
			if (smallest[i] == largest[j++]) {
				matches++;
				break;
			}
		}
	}
	return (((float)matches) / largest.length()) * 100;
}

// 
// This is where the magic happens
//
// we must point each targetMod word to an XMLTag
//
// when the magic is done, and your guess is made
// populate targetWordTags with the integer offset
// into wordTags for which XMLTag you think it should
// be.
//

virtual void matchWords(vector<int> &targetWordTags, const vector<SWBuf> &targetWords, const vector<SWBuf> &fromWords, vector<int> fromWordTags) {

	// percentage of word match
	// Set very high to output every word which doesn't match exactly.
	const int WORD_MATCH_PERCENT = 99;

	// initialize our results to all -1 so we can pop around and set
	// words as we find them, and know which ones we haven't yet set
	for (int i = 0; i < targetWords.size(); i++) targetWordTags.push_back(-1);


	// poor effort attempt
	for (int i = 0; i < targetWords.size(); ++i) {
		SWBuf w1 = targetWords[i];
		int j = 0;
		for (; j < fromWords.size(); ++j) {
			if (fromWordTags[j] == -1) continue;

			SWBuf w2 = fromWords[j];
			int match = compare(w1, w2);
			// if we have a better than n% match of sequencial characters
			// then we'll say we have a match
			if (match > WORD_MATCH_PERCENT) {
				targetWordTags[i] = fromWordTags[j];
				fromWordTags[j] = -1;
				break;
			}
		}
		// didn't match
		if (j == fromWords.size()) {
			// TOTRY: maybe check one word before and after?
			//
			// be creative!
			//
			
			// let's see if we have common misses, regularize and recheck
			SWBuf w1Orig = w1;
			if (w1 == "ἀλλ" || w1 == "Ἀλλ") w1 = "αλλα";

			if (w1 != w1Orig) {
				for (j = 0; j < fromWords.size(); ++j) {
					if (fromWordTags[j] == -1) continue;

					SWBuf w2 = fromWords[j];
					int match = compare(w1, w2);
					// if we have a better than n% match of sequencial characters
					// then we'll say we have a match
					if (match > WORD_MATCH_PERCENT) {
						targetWordTags[i] = fromWordTags[j];
						fromWordTags[j] = -1;
						break;
					}
				}
			}
		}
	}
}
 
const char *ignoreSeries = "[]\nʼ‾̷‾";
virtual SWBuf sanitizeWord(const SWBuf &word) {
	SWBuf t1 = word;
	// remove greek accents
	sanitizeGreekAccentFilter.processText(t1);
	t1.toUpper();

	// remove ignoreSeries characters
	SWBuf o = t1;
	const unsigned char* from = (unsigned char*)o.c_str();
	t1 = "";
	while (*from) {		
		SW_u32 ch = getUniCharFromUTF8(&from, true);
		// if ch is bad, then convert to replacement char
		if (!ch) ch = 0xFFFD;
		SWBuf checkChar;
		getUTF8FromUniChar(ch, &checkChar);
		if (checkChar != " " && strstr(ignoreSeries, checkChar.c_str())) continue;
		t1.append(checkChar);
	}
	return t1;
}

};
#endif