From bba76bf0652ec85b97124ca94959bdd2623c1bd0 Mon Sep 17 00:00:00 2001 From: "Troy A. Griffitts" Date: Wed, 6 May 2020 03:50:10 +0000 Subject: Updated from NA28 strongs migration effort. Added GNTMatcher git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@533 07627401-56e2-0310-80f4-f8cd0041bdcd --- migratetags/matchers/defaultmatcher.h | 23 ++++--- migratetags/matchers/gntmatcher.h | 114 ++++++++++++++++++++++++++++++++++ migratetags/matchers/matcher.h | 2 + migratetags/migratetags.cpp | 73 +++++++++++++++------- 4 files changed, 176 insertions(+), 36 deletions(-) create mode 100644 migratetags/matchers/gntmatcher.h diff --git a/migratetags/matchers/defaultmatcher.h b/migratetags/matchers/defaultmatcher.h index 592dbf5..b74ed38 100644 --- a/migratetags/matchers/defaultmatcher.h +++ b/migratetags/matchers/defaultmatcher.h @@ -6,22 +6,15 @@ class DefaultMatcher : public Matcher { public: + DefaultMatcher() { + } + // Compares 2 words and tries to give a percentage assurance of a match // TODO: could use more smarts here // virtual int compare(const SWBuf &s1, const SWBuf &s2) { - SWBuf t1 = s1; - SWBuf t2 = s2; - UTF8GreekAccents filter; - filter.setOptionValue("off"); - - // remove greek accents - filter.processText(t1); - filter.processText(t2); - - // change to uppercase to match - StringMgr::getSystemStringMgr()->upperUTF8(t1.getRawData()); - StringMgr::getSystemStringMgr()->upperUTF8(t2.getRawData()); + SWBuf t1 = sanitizeWord(s1); + SWBuf t2 = sanitizeWord(s2); int retVal = 0; SWBuf largest = (t1.length() > t2.length()) ? t1 : t2; @@ -38,6 +31,7 @@ virtual int compare(const SWBuf &s1, const SWBuf &s2) { } return (((float)matches) / largest.length()) * 100; } + // // This is where the magic happens // @@ -76,5 +70,10 @@ virtual void matchWords(vector &targetWordTags, const vector &target } } } +virtual SWBuf sanitizeWord(const SWBuf &word) { + SWBuf t1 = word; + t1.toUpper(); + return t1; +} }; #endif diff --git a/migratetags/matchers/gntmatcher.h b/migratetags/matchers/gntmatcher.h new file mode 100644 index 0000000..aa8f296 --- /dev/null +++ b/migratetags/matchers/gntmatcher.h @@ -0,0 +1,114 @@ +#include "matcher.h" +#include + +#ifndef gntmatcher_h +#define gntmatcher_h + +class GNTMatcher : public Matcher { + UTF8GreekAccents sanitizeGreekAccentFilter; +public: + + GNTMatcher() : sanitizeGreekAccentFilter() { + sanitizeGreekAccentFilter.setOptionValue("off"); + } + +// Compares 2 words and tries to give a percentage assurance of a match +// TODO: could use more smarts here +// +virtual int compare(const SWBuf &s1, const SWBuf &s2) { + SWBuf t1 = sanitizeWord(s1); + SWBuf t2 = sanitizeWord(s2); + + int retVal = 0; + SWBuf largest = (t1.length() > t2.length()) ? t1 : t2; + SWBuf smallest = (t1.length() > t2.length()) ? t2 : t1; + int matches = 0; + int j = 0; + for (int i = 0; i < smallest.length() && j < largest.length(); i++) { + while (j < largest.length()) { + if (smallest[i] == largest[j++]) { + matches++; + break; + } + } + } + return (((float)matches) / largest.length()) * 100; +} + +// +// This is where the magic happens +// +// we must point each targetMod word to an XMLTag +// +// when the magic is done, and your guess is made +// populate targetWordTags with the integer offset +// into wordTags for which XMLTag you think it should +// be. +// + +virtual void matchWords(vector &targetWordTags, const vector &targetWords, const vector &fromWords, vector fromWordTags) { + + // initialize our results to all -1 so we can pop around and set + // words as we find them, and know which ones we haven't yet set + for (int i = 0; i < targetWords.size(); i++) targetWordTags.push_back(-1); + + + // poor effort attempt + int j = 0; + for (int i = 0; i < targetWords.size(); ++i) { + SWBuf w1 = targetWords[i]; + int j = 0; + for (; j < fromWords.size(); ++j) { + if (fromWordTags[j] == -1) continue; + + SWBuf w2 = fromWords[j]; + int match = compare(w1, w2); + // if we have a better than 75% match of sequencial characters + // then we'll say we have a match + if (match > 99) { + targetWordTags[i] = fromWordTags[j]; + fromWordTags[j] = -1; + break; + } + } + // didn't match + if (j == fromWords.size()) { + // TOTRY: maybe check one word before and after? + // + // be creative! + // + + // let's see if we have common misses, regularize and recheck + SWBuf w1Orig = w1; + if (w1 == "ἀλλ" || w1 == "Ἀλλ") w1 = "αλλα"; + + if (w1 != w1Orig) { + for (int j = 0; j < fromWords.size(); ++j) { + if (fromWordTags[j] == -1) continue; + + SWBuf w2 = fromWords[j]; + int match = compare(w1, w2); + // if we have a better than 75% match of sequencial characters + // then we'll say we have a match + if (match > 99) { + targetWordTags[i] = fromWordTags[j]; + fromWordTags[j] = -1; + break; + } + } + } + } + } +} + +virtual SWBuf sanitizeWord(const SWBuf &word) { + SWBuf t1 = word; + // remove greek accents + sanitizeGreekAccentFilter.processText(t1); + t1.toUpper(); + t1.replaceBytes("[]", 0); + return t1; +} + +}; +#endif diff --git a/migratetags/matchers/matcher.h b/migratetags/matchers/matcher.h index 1448c2e..ec41e3c 100644 --- a/migratetags/matchers/matcher.h +++ b/migratetags/matchers/matcher.h @@ -20,6 +20,8 @@ virtual int compare(const SWBuf &s1, const SWBuf &s2) = 0; // virtual void matchWords(vector &targetWordTags, const vector &targetWords, const vector &fromWords, vector fromWordTags) = 0; +// sanitize word for comparing (e.g., toUpper, strip accents, etc) +virtual SWBuf sanitizeWord(const SWBuf &word) = 0; }; #endif diff --git a/migratetags/migratetags.cpp b/migratetags/migratetags.cpp index 2051a22..689640a 100644 --- a/migratetags/migratetags.cpp +++ b/migratetags/migratetags.cpp @@ -50,7 +50,7 @@ void usage(const char *progName, const char *error = 0) { int main(int argc, char **argv) { const char *progName = argv[0]; - for (int i = 1; i < argc; i++) { + for (int i = 1; i < argc; ++i) { if (!strcmp(argv[i], "-v")) { optionDebug = true; } @@ -62,7 +62,7 @@ int main(int argc, char **argv) { } else usage(progName, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str()); } - VerseKey vk; + SWMgr lib; lib.setGlobalOption("Textual Variants", "Secondary Reading"); SWModule *m = lib.getModule(targetModuleName); @@ -246,15 +246,40 @@ if (optionDebug) { cout << "\nTargetMod Words: " << endl; } bool warned = false; - for (int i = 0; i < targetWords.size(); i++) { + for (int i = 0; i < targetWords.size(); ++i) { if (targetWordTags[i] == -1 && !strstr(ignoreSeries, targetWords[i])) { - if (!warned) cerr << "*** Error: didn't match all words: " << targetMod.getKeyText() << endl; - warned = true; + if (!warned) { + cerr << "*** Error: didn't match all words: " << targetMod.getKeyText() << endl; + cerr << strongsSourceModuleName << ":"; + for (int j = 0; j < fromWords.size(); ++j) { + cerr << " " << fromWords[j]; + } + cerr << endl; + cerr << targetModuleName << ":"; + for (int j = 0; j < targetWords.size(); ++j) { + cerr << " " << targetWords[j]; + } + cerr << endl; + cerr << endl; + cerr << "Unmatched Words:" << endl; + warned = true; + } + cerr << " " << i << ": " << targetWords[i] << " (" << matcher->sanitizeWord(targetWords[i]) << ")" << endl; } if (optionDebug) { cout << targetWords[i] << " : " << targetWordTags[i] << " => " << (targetWordTags[i] != -1 ? wordTags[targetWordTags[i]] : "") << endl; } } + if (warned) { + cerr << "\n" << targetModuleName << " Tags:\n"; + VerseKey *vk = (VerseKey *)targetMod.getKey(); + for (int j = 0; j < targetWords.size(); ++j) { + if (!strstr(ignoreSeries, targetWords[j])) { + cerr << targetWords[j] << "\t\t " << vk->getOSISRef() << "." << j << "=" << (targetWordTags[j] != -1 ? wordTags[targetWordTags[j]] : "") << endl; + } + } + cerr << "---------------------" << endl; + } if (optionDebug) { cout << "---------------------" << endl; @@ -279,7 +304,7 @@ SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) { int tagLevel = 0; int wTag = -1; int inTag = 0; - for (int i = 0; i < orig.length(); i++) { + for (int i = 0; i < orig.length(); ++i) { if (orig[i] == '<') { inTag = true; } @@ -330,7 +355,7 @@ void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags } if (!after || wTags[bibPos] == -1) { out.insert(to, addText); - for (int i = bibPos+((after)?1:0); i < bibMap.size(); i++) { + for (int i = bibPos+((after)?1:0); i < bibMap.size(); ++i) { bibMap[i] += addText.length(); if (wTags[i] != -1) wTags[i] += addText.length(); } @@ -338,7 +363,6 @@ void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags } - SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector &targetWords, vector &targetWordStarts, vector &targetWordEnds) { SWBuf bibWord = ""; SWBuf fromWord = ""; @@ -386,36 +410,38 @@ void pullFromModData(SWModule &fromMod, vector&wordTags, vector & // this is our new XMLTag. // attributes will be added below XMLTag w("w"); + // this only gives us word count, not if we have multiple entries per word + // don't use as loop int parts = atoi(it->second["PartCount"]); SWBuf lemma = ""; SWBuf morph = ""; - for (int i = 1; i <= parts; i++) { + bool found = true; + for (int i = 1; found; ++i) { + found = false; SWBuf key = ""; - key = (parts == 1) ? "Lemma" : SWBuf().setFormatted("Lemma.%d", i); + key = SWBuf().setFormatted("Lemma.%d", i); AttributeValue::iterator li = it->second.find(key); + if (i == 1 && li == it->second.end()) li = it->second.find("Lemma"); if (li != it->second.end()) { + found = true; if (i > 1) lemma += " "; - key = (parts == 1) ? "LemmaClass" : SWBuf().setFormatted("LemmaClass.%d", i); + key = SWBuf().setFormatted("LemmaClass.%d", i); AttributeValue::iterator lci = it->second.find(key); + if (i == 1 && lci == it->second.end()) lci = it->second.find("LemmaClass"); if (lci != it->second.end()) { lemma += lci->second + ":"; } lemma += li->second; } - key = (parts == 1) ? "Morph" : SWBuf().setFormatted("Morph.%d", i); + key = SWBuf().setFormatted("Morph.%d", i); li = it->second.find(key); - // silly. sometimes morph counts don't equal lemma counts - if (i == 1 && parts != 1 && li == it->second.end()) { - li = it->second.find("Morph"); - } + if (i == 1 && li == it->second.end()) li = it->second.find("Morph"); if (li != it->second.end()) { + found = true; if (i > 1) morph += " "; - key = (parts == 1) ? "MorphClass" : SWBuf().setFormatted("MorphClass.%d", i); + key = SWBuf().setFormatted("MorphClass.%d", i); AttributeValue::iterator lci = it->second.find(key); - // silly. sometimes morph counts don't equal lemma counts - if (i == 1 && parts != 1 && lci == it->second.end()) { - lci = it->second.find("MorphClass"); - } + if (i == 1 && lci == it->second.end()) lci = it->second.find("MorphClass"); if (lci != it->second.end()) { morph += lci->second + ":"; } @@ -430,9 +456,8 @@ void pullFromModData(SWModule &fromMod, vector&wordTags, vector & fromWord = it->second["Text"]; bibWord = ""; - for (int j = 0; j < fromWord.length(); j++) { + for (int j = 0; j < fromWord.length(); ++j) { char c = fromWord[j]; -// if (!strchr(ignoreSeries, c)) { if (c != ' ' && c != '.' && c != ';' && c != ',') { bibWord += c; } @@ -458,7 +483,7 @@ void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, const vecto // TODO: this method needs some work, // like putting multiple consecutive words // together in one tag - for (int i = 0; i < targetWordTags.size(); i++) { + for (int i = 0; i < targetWordTags.size(); ++i) { if (targetWordTags[i] > -1) { insert((const char *)wordTags[targetWordTags[i]], markupBuf, targetWordStarts[i], bibMap, wTags); insert("", markupBuf, targetWordEnds[i], bibMap, wTags, true); -- cgit