#include "matcher.h"
// NOTE(review): the original include target was lost in extraction;
// <utf8greekaccents.h> is assumed because UTF8GreekAccents is used below.
#include <utf8greekaccents.h>

#ifndef defaultmatcher_h
#define defaultmatcher_h

// Simple Matcher implementation: pairs target words with source words
// using a greedy, case-insensitive, in-order character comparison.
class DefaultMatcher : public Matcher {
	// Greek accent filter; its option is set to "off" in the constructor.
	// It is not referenced by any other method in this class.
	UTF8GreekAccents sanitizeGreekAccentFilter;

public:

	DefaultMatcher() : sanitizeGreekAccentFilter() {
		sanitizeGreekAccentFilter.setOptionValue("off");
	}

	// Compares 2 words and tries to give a percentage assurance of a match.
	//
	// Both words are first passed through sanitizeWord(). The shorter word
	// is then walked character by character, and each character is searched
	// for in the longer word starting just after the previous hit (a greedy
	// in-order scan). The result is
	//     (characters found / length of the longer word) * 100
	// truncated to an int, i.e. a value in 0..100.
	//
	// Returns 0 when both sanitized words are empty, which avoids a
	// division by zero.
	//
	// TODO: could use more smarts here
	//
	virtual int compare(const SWBuf &s1, const SWBuf &s2) {
		SWBuf t1 = sanitizeWord(s1);
		SWBuf t2 = sanitizeWord(s2);

		// On equal lengths t2 is treated as the larger word.
		const bool firstIsLonger = t1.length() > t2.length();
		SWBuf &largest  = firstIsLonger ? t1 : t2;
		SWBuf &smallest = firstIsLonger ? t2 : t1;

		const unsigned long largeLen = largest.length();
		const unsigned long smallLen = smallest.length();

		// Nothing to compare; 0/0 would yield NaN, and converting NaN to
		// int is undefined behaviour.
		if (largeLen == 0) return 0;

		unsigned long matches = 0;
		unsigned long j = 0;	// scan position in largest; never moves backwards
		for (unsigned long i = 0; i < smallLen && j < largeLen; ++i) {
			while (j < largeLen) {
				if (smallest[i] == largest[j++]) {
					++matches;
					break;
				}
			}
		}
		return static_cast<int>((static_cast<float>(matches) / largeLen) * 100);
	}

	//
	// This is where the magic happens
	//
	// we must point each targetMod word to an XMLTag
	//
	// when the magic is done, and your guess is made
	// populate targetWordTags with the integer offset
	// into wordTags for which XMLTag you think it should
	// be.
	//
	// targetWordTags - output; reset to one entry per target word, -1
	//                  meaning "no match found"
	// targetWords    - words that need a tag assigned
	// fromWords      - candidate source words
	// fromWordTags   - tag for each source word; an entry of -1 means the
	//                  source word is skipped. Taken by value because it
	//                  is used as scratch space: a tag is set to -1 once
	//                  it has been handed out, so each tag is used at
	//                  most once.
	//
	virtual void matchWords(vector<int> &targetWordTags, const vector<SWBuf> &targetWords, const vector<SWBuf> &fromWords, vector<int> fromWordTags) {

		// initialize our results to all -1 so we can pop around and set
		// words as we find them, and know which ones we haven't yet set.
		// assign() (rather than appending) keeps targetWordTags[i] aligned
		// with targetWords[i] even if the caller passes a non-empty vector.
		targetWordTags.assign(targetWords.size(), -1);

		// Never index past the end of either source vector.
		const size_t fromCount = (fromWords.size() < fromWordTags.size())
			? fromWords.size()
			: fromWordTags.size();

		// poor effort attempt
		for (size_t i = 0; i < targetWords.size(); ++i) {
			for (size_t j = 0; j < fromCount; ++j) {
				// no tag, or tag already given to an earlier target word
				if (fromWordTags[j] == -1) continue;

				int match = compare(targetWords[i], fromWords[j]);
				// if we have a better than XX% match of sequential characters
				// then we'll say we have a match
				if (match > 49) {
					targetWordTags[i] = fromWordTags[j];
					fromWordTags[j] = -1;
					break;
				}
				// TOTRY: maybe check one word before and after?
				//
				// be creative!
				//
			}
		}
	}

	// Normalizes a word before comparison: returns an upper-cased copy.
	virtual SWBuf sanitizeWord(const SWBuf &word) {
		SWBuf t1 = word;
		t1.toUpper();
		return t1;
	}
};

#endif