summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Troy A. Griffitts <scribe@crosswire.org> 2020-05-06 03:50:10 +0000
committer Troy A. Griffitts <scribe@crosswire.org> 2020-05-06 03:50:10 +0000
commit bba76bf0652ec85b97124ca94959bdd2623c1bd0 (patch)
tree d1b7dcdfaa3d06b541bd17c5cdad37acf730fbcf
parent bafd4fb4ad4652362e47471c422c1ebecd3be0bb (diff)
download sword-tools-bba76bf0652ec85b97124ca94959bdd2623c1bd0.tar.gz
Updated from NA28 strongs migration effort. Added GNTMatcher
git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@533 07627401-56e2-0310-80f4-f8cd0041bdcd
-rw-r--r-- migratetags/matchers/defaultmatcher.h 23
-rw-r--r-- migratetags/matchers/gntmatcher.h 114
-rw-r--r-- migratetags/matchers/matcher.h 2
-rw-r--r-- migratetags/migratetags.cpp 73
4 files changed, 176 insertions, 36 deletions
diff --git a/migratetags/matchers/defaultmatcher.h b/migratetags/matchers/defaultmatcher.h
index 592dbf5..b74ed38 100644
--- a/migratetags/matchers/defaultmatcher.h
+++ b/migratetags/matchers/defaultmatcher.h
@@ -6,22 +6,15 @@
class DefaultMatcher : public Matcher {
public:
+ DefaultMatcher() { // intentionally empty: this constructor performs no setup
+ }
+
// Compares 2 words and tries to give a percentage assurance of a match
// TODO: could use more smarts here
//
virtual int compare(const SWBuf &s1, const SWBuf &s2) {
- SWBuf t1 = s1;
- SWBuf t2 = s2;
- UTF8GreekAccents filter;
- filter.setOptionValue("off");
-
- // remove greek accents
- filter.processText(t1);
- filter.processText(t2);
-
- // change to uppercase to match
- StringMgr::getSystemStringMgr()->upperUTF8(t1.getRawData());
- StringMgr::getSystemStringMgr()->upperUTF8(t2.getRawData());
+ SWBuf t1 = sanitizeWord(s1);
+ SWBuf t2 = sanitizeWord(s2);
int retVal = 0;
SWBuf largest = (t1.length() > t2.length()) ? t1 : t2;
@@ -38,6 +31,7 @@ virtual int compare(const SWBuf &s1, const SWBuf &s2) {
}
return (((float)matches) / largest.length()) * 100;
}
+
//
// This is where the magic happens
//
@@ -76,5 +70,10 @@ virtual void matchWords(vector<int> &targetWordTags, const vector<SWBuf> &target
}
}
}
+virtual SWBuf sanitizeWord(const SWBuf &word) { // returns a copy of word after toUpper(); the argument itself is untouched
+ SWBuf t1 = word;
+ t1.toUpper();
+ return t1;
+}
};
#endif
diff --git a/migratetags/matchers/gntmatcher.h b/migratetags/matchers/gntmatcher.h
new file mode 100644
index 0000000..aa8f296
--- /dev/null
+++ b/migratetags/matchers/gntmatcher.h
@@ -0,0 +1,114 @@
+#include "matcher.h"
+#include <utf8greekaccents.h>
+
+#ifndef gntmatcher_h
+#define gntmatcher_h
+
+class GNTMatcher : public Matcher {
+ UTF8GreekAccents sanitizeGreekAccentFilter;
+public:
+ // set the accent filter's option value to "off" once here; sanitizeWord() reuses this same filter instance
+ GNTMatcher() : sanitizeGreekAccentFilter() {
+ sanitizeGreekAccentFilter.setOptionValue("off");
+ }
+
+// Compares 2 words and returns a 0-100 percentage assurance of a match:
+// after sanitizeWord(), the count of chars of the shorter word found, in order, in the longer word, over the longer word's length.
+// TODO: could use more smarts here
+virtual int compare(const SWBuf &s1, const SWBuf &s2) {
+ SWBuf t1 = sanitizeWord(s1);
+ SWBuf t2 = sanitizeWord(s2);
+
+ int retVal = 0; // NOTE(review): never read in this function
+ SWBuf largest = (t1.length() > t2.length()) ? t1 : t2;
+ SWBuf smallest = (t1.length() > t2.length()) ? t2 : t1;
+ int matches = 0;
+ int j = 0;
+ for (int i = 0; i < smallest.length() && j < largest.length(); i++) {
+ while (j < largest.length()) {
+ if (smallest[i] == largest[j++]) { // j only ever advances, so hits must occur in the same order in both words
+ matches++;
+ break;
+ }
+ }
+ }
+ return (((float)matches) / largest.length()) * 100; // NOTE(review): if both sanitized words are empty, largest.length() is 0 and this divides by zero
+}
+
+//
+// This is where the magic happens
+//
+// we must point each targetMod word to an XMLTag
+//
+// when the magic is done, and your guess is made
+// populate targetWordTags with the integer offset
+// into wordTags for which XMLTag you think it should
+// be.
+// fromWordTags is taken by value; an entry of this local copy is set to -1 once it has been assigned to a target word.
+
+virtual void matchWords(vector<int> &targetWordTags, const vector<SWBuf> &targetWords, const vector<SWBuf> &fromWords, vector<int> fromWordTags) {
+
+ // initialize our results to all -1 so we can pop around and set
+ // words as we find them, and know which ones we haven't yet set
+ for (int i = 0; i < targetWords.size(); i++) targetWordTags.push_back(-1);
+
+
+ // poor effort attempt
+ int j = 0; // NOTE(review): shadowed by the loop-local j declared below and never read
+ for (int i = 0; i < targetWords.size(); ++i) {
+ SWBuf w1 = targetWords[i];
+ int j = 0;
+ for (; j < fromWords.size(); ++j) {
+ if (fromWordTags[j] == -1) continue;
+
+ SWBuf w2 = fromWords[j];
+ int match = compare(w1, w2);
+ // only accept a score above 99, i.e. 100: every char of the longer
+ // sanitized word matched in order, so the sanitized words are identical
+ if (match > 99) {
+ targetWordTags[i] = fromWordTags[j];
+ fromWordTags[j] = -1;
+ break;
+ }
+ }
+ // didn't match
+ if (j == fromWords.size()) {
+ // TOTRY: maybe check one word before and after?
+ //
+ // be creative!
+ //
+
+ // let's see if we have common misses, regularize and recheck
+ SWBuf w1Orig = w1;
+ if (w1 == "ἀλλ" || w1 == "Ἀλλ") w1 = "αλλα"; // tested against the raw target word; sanitizing only happens inside compare()
+
+ if (w1 != w1Orig) {
+ for (int j = 0; j < fromWords.size(); ++j) { // this j shadows the loop-local j checked above
+ if (fromWordTags[j] == -1) continue;
+
+ SWBuf w2 = fromWords[j];
+ int match = compare(w1, w2);
+ // same acceptance rule as the first pass: the score must be above 99,
+ // i.e. the regularized word and the source word sanitize to the same text
+ if (match > 99) {
+ targetWordTags[i] = fromWordTags[j];
+ fromWordTags[j] = -1;
+ break;
+ }
+ }
+ }
+ }
+ }
+}
+// Returns a normalized copy of word for comparing: accent filter applied, then toUpper(), then replaceBytes("[]", 0).
+virtual SWBuf sanitizeWord(const SWBuf &word) {
+ SWBuf t1 = word;
+ // remove greek accents
+ sanitizeGreekAccentFilter.processText(t1);
+ t1.toUpper();
+ t1.replaceBytes("[]", 0); // NOTE(review): presumably strips '[' and ']' from the word; confirm against SWBuf::replaceBytes
+ return t1;
+}
+
+};
+#endif
diff --git a/migratetags/matchers/matcher.h b/migratetags/matchers/matcher.h
index 1448c2e..ec41e3c 100644
--- a/migratetags/matchers/matcher.h
+++ b/migratetags/matchers/matcher.h
@@ -20,6 +20,8 @@ virtual int compare(const SWBuf &s1, const SWBuf &s2) = 0;
//
virtual void matchWords(vector<int> &targetWordTags, const vector<SWBuf> &targetWords, const vector<SWBuf> &fromWords, vector<int> fromWordTags) = 0;
+// sanitize word for comparing (e.g., toUpper, strip accents, etc)
+virtual SWBuf sanitizeWord(const SWBuf &word) = 0;
};
#endif
diff --git a/migratetags/migratetags.cpp b/migratetags/migratetags.cpp
index 2051a22..689640a 100644
--- a/migratetags/migratetags.cpp
+++ b/migratetags/migratetags.cpp
@@ -50,7 +50,7 @@ void usage(const char *progName, const char *error = 0) {
int main(int argc, char **argv) {
const char *progName = argv[0];
- for (int i = 1; i < argc; i++) {
+ for (int i = 1; i < argc; ++i) {
if (!strcmp(argv[i], "-v")) {
optionDebug = true;
}
@@ -62,7 +62,7 @@ int main(int argc, char **argv) {
}
else usage(progName, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
}
- VerseKey vk;
+
SWMgr lib;
lib.setGlobalOption("Textual Variants", "Secondary Reading");
SWModule *m = lib.getModule(targetModuleName);
@@ -246,15 +246,40 @@ if (optionDebug) {
cout << "\nTargetMod Words: " << endl;
}
bool warned = false;
- for (int i = 0; i < targetWords.size(); i++) {
+ for (int i = 0; i < targetWords.size(); ++i) {
if (targetWordTags[i] == -1 && !strstr(ignoreSeries, targetWords[i])) {
- if (!warned) cerr << "*** Error: didn't match all words: " << targetMod.getKeyText() << endl;
- warned = true;
+ if (!warned) {
+ cerr << "*** Error: didn't match all words: " << targetMod.getKeyText() << endl;
+ cerr << strongsSourceModuleName << ":";
+ for (int j = 0; j < fromWords.size(); ++j) {
+ cerr << " " << fromWords[j];
+ }
+ cerr << endl;
+ cerr << targetModuleName << ":";
+ for (int j = 0; j < targetWords.size(); ++j) {
+ cerr << " " << targetWords[j];
+ }
+ cerr << endl;
+ cerr << endl;
+ cerr << "Unmatched Words:" << endl;
+ warned = true;
+ }
+ cerr << " " << i << ": " << targetWords[i] << " (" << matcher->sanitizeWord(targetWords[i]) << ")" << endl;
}
if (optionDebug) {
cout << targetWords[i] << " : " << targetWordTags[i] << " => " << (targetWordTags[i] != -1 ? wordTags[targetWordTags[i]] : "") << endl;
}
}
+ if (warned) {
+ cerr << "\n" << targetModuleName << " Tags:\n";
+ VerseKey *vk = (VerseKey *)targetMod.getKey();
+ for (int j = 0; j < targetWords.size(); ++j) {
+ if (!strstr(ignoreSeries, targetWords[j])) {
+ cerr << targetWords[j] << "\t\t " << vk->getOSISRef() << "." << j << "=" << (targetWordTags[j] != -1 ? wordTags[targetWordTags[j]] : "") << endl;
+ }
+ }
+ cerr << "---------------------" << endl;
+ }
if (optionDebug) {
cout << "---------------------" << endl;
@@ -279,7 +304,7 @@ SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) {
int tagLevel = 0;
int wTag = -1;
int inTag = 0;
- for (int i = 0; i < orig.length(); i++) {
+ for (int i = 0; i < orig.length(); ++i) {
if (orig[i] == '<') {
inTag = true;
}
@@ -330,7 +355,7 @@ void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags
}
if (!after || wTags[bibPos] == -1) {
out.insert(to, addText);
- for (int i = bibPos+((after)?1:0); i < bibMap.size(); i++) {
+ for (int i = bibPos+((after)?1:0); i < bibMap.size(); ++i) {
bibMap[i] += addText.length();
if (wTags[i] != -1) wTags[i] += addText.length();
}
@@ -338,7 +363,6 @@ void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags
}
-
SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds) {
SWBuf bibWord = "";
SWBuf fromWord = "";
@@ -386,36 +410,38 @@ void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &
// this is our new <w> XMLTag.
// attributes will be added below
XMLTag w("w");
+ // PartCount only gives us the word count, not whether there are multiple entries per word,
+ // so don't use it as the loop bound
int parts = atoi(it->second["PartCount"]);
SWBuf lemma = "";
SWBuf morph = "";
- for (int i = 1; i <= parts; i++) {
+ bool found = true;
+ for (int i = 1; found; ++i) {
+ found = false;
SWBuf key = "";
- key = (parts == 1) ? "Lemma" : SWBuf().setFormatted("Lemma.%d", i);
+ key = SWBuf().setFormatted("Lemma.%d", i);
AttributeValue::iterator li = it->second.find(key);
+ if (i == 1 && li == it->second.end()) li = it->second.find("Lemma");
if (li != it->second.end()) {
+ found = true;
if (i > 1) lemma += " ";
- key = (parts == 1) ? "LemmaClass" : SWBuf().setFormatted("LemmaClass.%d", i);
+ key = SWBuf().setFormatted("LemmaClass.%d", i);
AttributeValue::iterator lci = it->second.find(key);
+ if (i == 1 && lci == it->second.end()) lci = it->second.find("LemmaClass");
if (lci != it->second.end()) {
lemma += lci->second + ":";
}
lemma += li->second;
}
- key = (parts == 1) ? "Morph" : SWBuf().setFormatted("Morph.%d", i);
+ key = SWBuf().setFormatted("Morph.%d", i);
li = it->second.find(key);
- // silly. sometimes morph counts don't equal lemma counts
- if (i == 1 && parts != 1 && li == it->second.end()) {
- li = it->second.find("Morph");
- }
+ if (i == 1 && li == it->second.end()) li = it->second.find("Morph");
if (li != it->second.end()) {
+ found = true;
if (i > 1) morph += " ";
- key = (parts == 1) ? "MorphClass" : SWBuf().setFormatted("MorphClass.%d", i);
+ key = SWBuf().setFormatted("MorphClass.%d", i);
AttributeValue::iterator lci = it->second.find(key);
- // silly. sometimes morph counts don't equal lemma counts
- if (i == 1 && parts != 1 && lci == it->second.end()) {
- lci = it->second.find("MorphClass");
- }
+ if (i == 1 && lci == it->second.end()) lci = it->second.find("MorphClass");
if (lci != it->second.end()) {
morph += lci->second + ":";
}
@@ -430,9 +456,8 @@ void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &
fromWord = it->second["Text"];
bibWord = "";
- for (int j = 0; j < fromWord.length(); j++) {
+ for (int j = 0; j < fromWord.length(); ++j) {
char c = fromWord[j];
-// if (!strchr(ignoreSeries, c)) {
if (c != ' ' && c != '.' && c != ';' && c != ',') {
bibWord += c;
}
@@ -458,7 +483,7 @@ void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, const vecto
// TODO: this method needs some work,
// like putting multiple consecutive words
// together in one tag
- for (int i = 0; i < targetWordTags.size(); i++) {
+ for (int i = 0; i < targetWordTags.size(); ++i) {
if (targetWordTags[i] > -1) {
insert((const char *)wordTags[targetWordTags[i]], markupBuf, targetWordStarts[i], bibMap, wTags);
insert("</w>", markupBuf, targetWordEnds[i], bibMap, wTags, true);