diff options
Diffstat (limited to 'migratetags/matchers/gntmatcher.h')
-rw-r--r-- | migratetags/matchers/gntmatcher.h | 114 |
1 files changed, 114 insertions, 0 deletions
diff --git a/migratetags/matchers/gntmatcher.h b/migratetags/matchers/gntmatcher.h new file mode 100644 index 0000000..aa8f296 --- /dev/null +++ b/migratetags/matchers/gntmatcher.h @@ -0,0 +1,114 @@ +#include "matcher.h" +#include <utf8greekaccents.h> + +#ifndef gntmatcher_h +#define gntmatcher_h + +class GNTMatcher : public Matcher { + UTF8GreekAccents sanitizeGreekAccentFilter; +public: + + GNTMatcher() : sanitizeGreekAccentFilter() { + sanitizeGreekAccentFilter.setOptionValue("off"); + } + +// Compares 2 words and tries to give a percentage assurance of a match +// TODO: could use more smarts here +// +virtual int compare(const SWBuf &s1, const SWBuf &s2) { + SWBuf t1 = sanitizeWord(s1); + SWBuf t2 = sanitizeWord(s2); + + int retVal = 0; + SWBuf largest = (t1.length() > t2.length()) ? t1 : t2; + SWBuf smallest = (t1.length() > t2.length()) ? t2 : t1; + int matches = 0; + int j = 0; + for (int i = 0; i < smallest.length() && j < largest.length(); i++) { + while (j < largest.length()) { + if (smallest[i] == largest[j++]) { + matches++; + break; + } + } + } + return (((float)matches) / largest.length()) * 100; +} + +// +// This is where the magic happens +// +// we must point each targetMod word to an XMLTag +// +// when the magic is done, and your guess is made +// populate targetWordTags with the integer offset +// into wordTags for which XMLTag you think it should +// be. +// + +virtual void matchWords(vector<int> &targetWordTags, const vector<SWBuf> &targetWords, const vector<SWBuf> &fromWords, vector<int> fromWordTags) { + + // initialize our results to all -1 so we can pop around and set + // words as we find them, and know which ones we haven't yet set + for (int i = 0; i < targetWords.size(); i++) targetWordTags.push_back(-1); + + + // poor effort attempt + int j = 0; + for (int i = 0; i < targetWords.size(); ++i) { + SWBuf w1 = targetWords[i]; + int j = 0; + for (; j < fromWords.size(); ++j) { + if (fromWordTags[j] == -1) continue; + + SWBuf w2 = fromWords[j]; + int match = compare(w1, w2); + // if we have a better than 75% match of sequencial characters + // then we'll say we have a match + if (match > 99) { + targetWordTags[i] = fromWordTags[j]; + fromWordTags[j] = -1; + break; + } + } + // didn't match + if (j == fromWords.size()) { + // TOTRY: maybe check one word before and after? + // + // be creative! + // + + // let's see if we have common misses, regularize and recheck + SWBuf w1Orig = w1; + if (w1 == "ἀλλ" || w1 == "Ἀλλ") w1 = "αλλα"; + + if (w1 != w1Orig) { + for (int j = 0; j < fromWords.size(); ++j) { + if (fromWordTags[j] == -1) continue; + + SWBuf w2 = fromWords[j]; + int match = compare(w1, w2); + // if we have a better than 75% match of sequencial characters + // then we'll say we have a match + if (match > 99) { + targetWordTags[i] = fromWordTags[j]; + fromWordTags[j] = -1; + break; + } + } + } + } + } +} + +virtual SWBuf sanitizeWord(const SWBuf &word) { + SWBuf t1 = word; + // remove greek accents + sanitizeGreekAccentFilter.processText(t1); + t1.toUpper(); + t1.replaceBytes("[]", 0); + return t1; +} + +}; +#endif |