From bafd4fb4ad4652362e47471c422c1ebecd3be0bb Mon Sep 17 00:00:00 2001 From: "Troy A. Griffitts" Date: Sun, 14 Apr 2019 22:36:24 +0000 Subject: Generalized migratetags and extracted matcher logic git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@532 07627401-56e2-0310-80f4-f8cd0041bdcd --- migratetags/Makefile | 3 +- migratetags/esvtag.cpp | 389 ---------------------------- migratetags/matchers/defaultmatcher.h | 80 ++++++ migratetags/matchers/matcher.h | 25 ++ migratetags/migratetags.cpp | 467 ++++++++++++++++++++++++++++++++++ 5 files changed, 574 insertions(+), 390 deletions(-) delete mode 100644 migratetags/esvtag.cpp create mode 100644 migratetags/matchers/defaultmatcher.h create mode 100644 migratetags/matchers/matcher.h create mode 100644 migratetags/migratetags.cpp diff --git a/migratetags/Makefile b/migratetags/Makefile index 8958367..19f4335 100644 --- a/migratetags/Makefile +++ b/migratetags/Makefile @@ -1,4 +1,5 @@ -TARGETS= esvtag + +TARGETS= migratetags all: $(TARGETS) diff --git a/migratetags/esvtag.cpp b/migratetags/esvtag.cpp deleted file mode 100644 index 3b86f70..0000000 --- a/migratetags/esvtag.cpp +++ /dev/null @@ -1,389 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -using namespace sword; -using namespace std; - -typedef vector BibMap; - -void insert(const SWBuf &addText, SWBuf &out, int bibPos, BibMap &bibMap, bool after = false); -int compare(const SWBuf &s1, const SWBuf &s2); - -SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap); -SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector &esvWords, vector &esvWordStarts, vector &esvWordEnds); -void pullKJVData(SWModule &kjv, vector&wordTags, vector &kjvWords, vector &kjvWordTags); -void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, const vector &esvWordTags, const vector &wordTags, const vector &esvWordStarts, const vector &esvWordEnds); - - -// -// This is where the magic happens -// -// we must point each esv word to an XMLTag -// -// when the magic is done, and your guess is made -// populate esvWordTags with the integer offset -// into wordTags for which XMLTag you think it should -// be. -// -void matchWords(vector &esvWordTags, const vector &esvWords, const vector &kjvWords, const vector &kjvWordTags) { - - // initialize our results to all -1 so we can pop around and set - // words as we find them, and know which ones we haven't yet set - for (int i = 0; i < esvWords.size(); i++) esvWordTags.push_back(-1); - - - // poor effort attempt - int j = 0; - for (int i = 0; i < esvWords.size(); i++) { - while (true) { - int match = compare(esvWords[i], kjvWords[j]); - // if we have a better than 75% match of sequencial characters - // then we'll say we have a match - if (match > 75) { - esvWordTags[i] = kjvWordTags[j++]; - break; - } - // TOTRY: maybe check one word before and after? - // - // be creative! - // - } - } -} - - -int main(int argc, char **argv) { - VerseKey vk; - SWMgr lib; - SWModule &esv = *lib.getModule("ESV"); - SWModule &kjv = *lib.getModule("KJV"); - - // we'll do the whole Bible eventually, but let's just get one verse - // working well. - esv.setKey("gen1.1"); // lets try this verse -// for (esv = TOP; !esv.Error(); esv++) { - - // XML word tags which should be placed in this verse (start tag) - // eg., - // pulled from KJV - vector wordTags; - - // Just the raw canonical Bible text of this verse with no tags - // eg., "In the beginning God created the heavens and the earth." - SWBuf justESVBibleText = ""; - - // a mapping for each character in justESVBibleText to the real location - // in our out buffer. This allows us to insert our and - // tags in the correct place amongst the fully marked up - // ESV out buffer. This work is all done in the insert() method - // above - BibMap bibMap; - - // justESVBibleText (above) broken down into separate words - // ie. all words in the ESV from this verse - // eg. [0] = "In"; [1] = "the"; [2] = "beginning"; ... - vector esvWords; - - // where each corresponding esvWords[x] starts in justESVBibleText - // eg. for "In the beginning..." - // [0] = 0; [1] = 3; [2] = 7; ... - // Needed to pass to insert method so we know where - // to insert the start tag - vector esvWordStarts; - - // same as esvWordStarts, but the end of each word - // eg. [0] = 1; [1] = 5; [2] = 15 - // Needed to pass to insert method so we know where - // to insert the end tag - vector esvWordEnds; - - // This is the doozy. This maps each ESV word to the correct - // wordTags entry. - vector esvWordTags; - - // Equivalent to esvWords above, but for the KJV. - // Useful for helping determine matches to ESV words - vector kjvWords; - - // Equivalent to esvWordTag which we need to produce, - // but this one is produced for us from the KJV data - // If we can match a kjvWords[x] entry, then we can assign - // esvWorkTags[ourMatch] = kjvWordTags[x] - vector kjvWordTags; - - bibMap.clear(); - - kjv.setKey(esv.getKey()); - - cout << "\nProcessing Verse: " << esv.getKeyText() << endl; - cout << "---------------------" << endl; - - cout << "\nOur KJV Verse Markup" << endl; - cout << "---------------------" << endl; - cout << kjv.getRawEntry() << endl; - cout << "---------------------" << endl; - - - // grab our raw, fully marked up ESV text for this verse - SWBuf orig = esv.getRawEntryBuf(); - - cout << "\nOur Original ESV Markup" << endl; - cout << "---------------------" << endl; - cout << orig << endl; - cout << "---------------------" << endl; - - // let's find where just the canonical text is amongst - // all our markup - // newESVMarkup will eventually hold our updated markup with - // the new tags, but we'll start here by setting it to - // the processed original markup. - // on return, bibMap will be populated with each character - // and the corresponding location into newESVMarkup where - // the character resides. - SWBuf newESVMarkup = findCanonicalBibleText(orig, bibMap); - - cout << "\nOur Original ESV Markup After XMLTag-ifying" << endl; - cout << "---------------------" << endl; - cout << newESVMarkup << endl; - cout << "---------------------" << endl; - - // let's populate or ESV word data and fill in our - // justESVBibleText buffer - justESVBibleText = buildWordMaps(newESVMarkup, bibMap, esvWords, esvWordStarts, esvWordEnds); - - cout << "\nJust ESV Bible Text" << endl; - cout << "---------------------" << endl; - cout << justESVBibleText << endl; - cout << "---------------------" << endl; - - - // ok, now lets grab out the groovy data from the KJV module - pullKJVData(kjv, wordTags, kjvWords, kjvWordTags); - - - // - // ok, here's the real work. - // - // This method needs to guess which ESV words match which KJV - // words and then point them to their same original language - // word tag by populating esvWordTags - // - matchWords(esvWordTags, esvWords, kjvWords, kjvWordTags); - - // ok, now that we have our esvWordTags magically populated - // let's do the grunt work of inserting the and tags - insertWordTags(newESVMarkup, bibMap, esvWordTags, wordTags, esvWordStarts, esvWordEnds); - - - cout << "\nHere's how you mapped things..." << endl; - cout << "---------------------" << endl; - cout << "Total wordTags: " << wordTags.size() << endl; - cout << "\nESV Words: " << endl; - for (int i = 0; i < esvWords.size(); i++) { - cout << esvWords[i] << " : " << esvWordTags[i] << " => " << wordTags[esvWordTags[i]] << endl; - } - cout << "---------------------" << endl; - - cout << "\nAND... Here's your final output" << endl; - cout << "---------------------" << endl; - cout << newESVMarkup << endl; - cout << endl; -// } - return 0; -} - - -// builds up bibMap to contain only characters of Biblical text -// and each character's corresponding real location in our output -// buffer (returned value) -SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap) { - SWBuf out = ""; - SWBuf tag = ""; - int tagLevel = 0; - int inTag = 0; - for (int i = 0; i < orig.length(); i++) { - if (orig[i] == '<') { - inTag = true; - } - else if (orig[i] == '>') { - inTag = false; - XMLTag t = tag.c_str(); - if (!t.isEmpty()) { - if (t.isEndTag()) { - tagLevel--; - } - else { - tagLevel++; - } - } - out += t; - tag = ""; - } - else if (inTag) { - tag += orig[i]; - } - else { - if (!tagLevel) { - bibMap.push_back(out.size()); - } - out += orig[i]; - } - } - return out; -} - - -// Inserts addText into out buffer and adjusts Bible character pointers accordingly -// -void insert(const SWBuf &addText, SWBuf &out, int bibPos, BibMap &bibMap, bool after) { - out.insert(bibMap[bibPos]+((after)?1:0), addText); - for (int i = bibPos+((after)?1:0); i < bibMap.size(); i++) { - bibMap[i] += addText.length(); - } -} - - -// Compares 2 words and tries to give a percentage assurance of a match -// TODO: could use more smarts here -// -int compare(const SWBuf &s1, const SWBuf &s2) { - int retVal = 0; - SWBuf largest = (s1.length() > s2.length()) ? s1 : s2; - SWBuf smallest = (s1.length() > s2.length()) ? s2 : s1; - int matches = 0; - int j = 0; - for (int i = 0; i < smallest.length() && j < largest.length(); i++) { - while (j < largest.length()) { - if (smallest[i] == largest[j++]) { - matches++; - break; - } - } - } - return (((float)matches) / largest.length()) * 100; -} - - -SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector &esvWords, vector &esvWordStarts, vector &esvWordEnds) { - SWBuf bibWord = ""; - SWBuf kjvWord = ""; - SWBuf bibText = ""; - for (BibMap::const_iterator it = bibMap.begin(); it != bibMap.end(); it++) { - char c = markupBuf[*it]; - if ((c >= 'a' && c <='z') || - (c >= 'A' && c <='Z') - ) { - if (!bibWord.length()) esvWordStarts.push_back(bibText.length()); - bibWord += c; - } - else { - if (bibWord.length()) { - esvWordEnds.push_back(bibText.length()-1); - esvWords.push_back(bibWord); - bibWord = ""; - } - } - bibText += c; - } - if (bibWord.length()) { - esvWordEnds.push_back(bibText.length()-1); - esvWords.push_back(bibWord); - } - return bibText; -} - - -void pullKJVData(SWModule &kjv, vector&wordTags, vector &kjvWords, vector &kjvWordTags) { - kjv.RenderText(); // be sure KJV has processed entry attributes - AttributeList &words = kjv.getEntryAttributes()["Word"]; - SWBuf kjvWord = ""; - SWBuf bibWord = ""; - for (AttributeList::iterator it = words.begin(); it != words.end(); it++) { - // this is our new XMLTag. - // attributes will be added below - XMLTag w("w"); - int parts = atoi(it->second["PartCount"]); - SWBuf lemma = ""; - SWBuf morph = ""; - for (int i = 1; i <= parts; i++) { - SWBuf key = ""; - key = (parts == 1) ? "Lemma" : SWBuf().setFormatted("Lemma.%d", i); - AttributeValue::iterator li = it->second.find(key); - if (li != it->second.end()) { - if (i > 1) lemma += " "; - key = (parts == 1) ? "LemmaClass" : SWBuf().setFormatted("LemmaClass.%d", i); - AttributeValue::iterator lci = it->second.find(key); - if (lci != it->second.end()) { - lemma += lci->second + ":"; - } - lemma += li->second; - } - key = (parts == 1) ? "Morph" : SWBuf().setFormatted("Morph.%d", i); - li = it->second.find(key); - // silly. sometimes morph counts don't equal lemma counts - if (i == 1 && parts != 1 && li == it->second.end()) { - li = it->second.find("Morph"); - } - if (li != it->second.end()) { - if (i > 1) morph += " "; - key = (parts == 1) ? "MorphClass" : SWBuf().setFormatted("MorphClass.%d", i); - AttributeValue::iterator lci = it->second.find(key); - // silly. sometimes morph counts don't equal lemma counts - if (i == 1 && parts != 1 && lci == it->second.end()) { - lci = it->second.find("MorphClass"); - } - if (lci != it->second.end()) { - morph += lci->second + ":"; - } - morph += li->second; - } - // TODO: add src tags and maybe other attributes - } - - if (lemma.length()) w.setAttribute("lemma", lemma); - if (morph.length()) w.setAttribute("morph", morph); - - - kjvWord = it->second["Text"]; - bibWord = ""; - for (int j = 0; j < kjvWord.length(); j++) { - char c = kjvWord[j]; - if ((c >= 'a' && c <='z') || - (c >= 'A' && c <='Z') - ) { - bibWord += c; - } - else { - if (bibWord.length()) { - kjvWords.push_back(bibWord); - kjvWordTags.push_back(wordTags.size()); - bibWord = ""; - } - } - } - if (bibWord.length()) { - kjvWords.push_back(bibWord); - kjvWordTags.push_back(wordTags.size()); - } - - wordTags.push_back(w); - } -} - - -void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, const vector &esvWordTags, const vector &wordTags, const vector &esvWordStarts, const vector &esvWordEnds) { - // TODO: this method needs some work, - // like putting multiple consecutive words - // together in one tag - for (int i = 0; i < esvWordTags.size(); i++) { - if (esvWordTags[i] > -1) { - insert((const char *)wordTags[esvWordTags[i]], markupBuf, esvWordStarts[i], bibMap); - insert("", markupBuf, esvWordEnds[i], bibMap, true); - } - } -} diff --git a/migratetags/matchers/defaultmatcher.h b/migratetags/matchers/defaultmatcher.h new file mode 100644 index 0000000..592dbf5 --- /dev/null +++ b/migratetags/matchers/defaultmatcher.h @@ -0,0 +1,80 @@ +#include "matcher.h" + +#ifndef defaultmatcher_h +#define defaultmatcher_h + +class DefaultMatcher : public Matcher { +public: + +// Compares 2 words and tries to give a percentage assurance of a match +// TODO: could use more smarts here +// +virtual int compare(const SWBuf &s1, const SWBuf &s2) { + SWBuf t1 = s1; + SWBuf t2 = s2; + UTF8GreekAccents filter; + filter.setOptionValue("off"); + + // remove greek accents + filter.processText(t1); + filter.processText(t2); + + // change to uppercase to match + StringMgr::getSystemStringMgr()->upperUTF8(t1.getRawData()); + StringMgr::getSystemStringMgr()->upperUTF8(t2.getRawData()); + + int retVal = 0; + SWBuf largest = (t1.length() > t2.length()) ? t1 : t2; + SWBuf smallest = (t1.length() > t2.length()) ? t2 : t1; + int matches = 0; + int j = 0; + for (int i = 0; i < smallest.length() && j < largest.length(); i++) { + while (j < largest.length()) { + if (smallest[i] == largest[j++]) { + matches++; + break; + } + } + } + return (((float)matches) / largest.length()) * 100; +} +// +// This is where the magic happens +// +// we must point each targetMod word to an XMLTag +// +// when the magic is done, and your guess is made +// populate targetWordTags with the integer offset +// into wordTags for which XMLTag you think it should +// be. +// + +virtual void matchWords(vector &targetWordTags, const vector &targetWords, const vector &fromWords, vector fromWordTags) { + + // initialize our results to all -1 so we can pop around and set + // words as we find them, and know which ones we haven't yet set + for (int i = 0; i < targetWords.size(); i++) targetWordTags.push_back(-1); + + + // poor effort attempt + int j = 0; + for (int i = 0; i < targetWords.size(); ++i) { + for (int j = 0; j < fromWords.size(); ++j) { + if (fromWordTags[j] == -1) continue; + int match = compare(targetWords[i], fromWords[j]); + // if we have a better than XX% match of sequencial characters + // then we'll say we have a match + if (match > 49) { + targetWordTags[i] = fromWordTags[j]; + fromWordTags[j] = -1; + break; + } + // TOTRY: maybe check one word before and after? + // + // be creative! + // + } + } +} +}; +#endif diff --git a/migratetags/matchers/matcher.h b/migratetags/matchers/matcher.h new file mode 100644 index 0000000..1448c2e --- /dev/null +++ b/migratetags/matchers/matcher.h @@ -0,0 +1,25 @@ +#ifndef matcher_h +#define matcher_h + +class Matcher { +public: + +// Compares 2 words and tries to give a percentage assurance of a match +// TODO: could use more smarts here +// +virtual int compare(const SWBuf &s1, const SWBuf &s2) = 0; + +// This is where the magic happens +// +// we must point each targetMod word to an XMLTag +// +// when the magic is done, and your guess is made +// populate targetWordTags with the integer offset +// into wordTags for which XMLTag you think it should +// be. +// +virtual void matchWords(vector &targetWordTags, const vector &targetWords, const vector &fromWords, vector fromWordTags) = 0; + + +}; +#endif diff --git a/migratetags/migratetags.cpp b/migratetags/migratetags.cpp new file mode 100644 index 0000000..2051a22 --- /dev/null +++ b/migratetags/migratetags.cpp @@ -0,0 +1,467 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace sword; +using namespace std; + +#include "matchers/matcher.h" +#include "matchers/defaultmatcher.h" + +// select your matcher here +Matcher *matcher = new DefaultMatcher(); +const char *targetModuleName="NA28"; +const char *strongsSourceModuleName="WHNU"; + + +const char *ignoreSeries = "⸆¹⸆²⸆⸇᾿˸¹˸²˸³˸·¹²⟦–ʹ°¹°²⸋¹⸋²⸋⸌¹⸌°*[];⸀¹⸀²⸀³⸁⸀◆⟧ ⸂¹⸂²⸄⸂⸅⸃⸉¹⸈⸉⸊ "; + +typedef vector BibMap; + +void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags, bool after = false); + +SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &tTags); +SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector &targetWords, vector &targetWordStarts, vector &targetWordEnds); +void pullFromModData(SWModule &fromMod, vector&wordTags, vector &fromWords, vector &fromWordTags); +void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, const vector &targetWordTags, const vector &wordTags, const vector &targetWordStarts, const vector &targetWordEnds); + +// app options +bool optionFilterAccents = false; +bool optionFilterAppCrit = false; +bool optionDebug = false; + +void usage(const char *progName, const char *error = 0) { + if (error) fprintf(stderr, "\n%s: %s\n", progName, error); + fprintf(stderr, "\n=== migratetags (Revision $Rev$) Migrate word morphology from one module to another.\n"); + fprintf(stderr, "\nusage: %s [options]\n", progName); + fprintf(stderr, " -v\t\t\t verbose: print lots of information while processing\n"); + fprintf(stderr, " -fa\t\t\t filter accents: remove Greek accents from final text\n"); + fprintf(stderr, "\n\n"); + exit(-1); +} + + + +int main(int argc, char **argv) { + const char *progName = argv[0]; + for (int i = 1; i < argc; i++) { + if (!strcmp(argv[i], "-v")) { + optionDebug = true; + } + else if (!strcmp(argv[i], "-fa")) { + optionFilterAccents = true; + } + else if (!strcmp(argv[i], "-fc")) { + optionFilterAppCrit = true; + } + else usage(progName, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str()); + } + VerseKey vk; + SWMgr lib; + lib.setGlobalOption("Textual Variants", "Secondary Reading"); + SWModule *m = lib.getModule(targetModuleName); + if (!m) { + cerr << "couldn't find target module: " << targetModuleName << ".\n"; + exit(1); + } + SWModule &targetMod = *m; + m = lib.getModule(strongsSourceModuleName); + if (!m) { + cerr << "couldn't find source module: " << strongsSourceModuleName << ".\n"; + exit(1); + } + SWModule &fromMod = *m; + + // we'll do the whole Bible eventually, but let's just get one verse + // working well. + targetMod.setKey("mat1.1"); // let's try this verse + int z = 0; + for (; +//!z && +!targetMod.popError(); targetMod++) { + z++; + + // XML word tags which should be placed in this verse (start tag) + // eg., + // pulled from FromMod + vector wordTags; + + // Just the raw canonical Bible text of this verse with no tags + // eg., "In the beginning God created the heavens and the earth." + SWBuf justTargetModBibleText = ""; + + // a mapping for each character in justTargetModBibleText to the real location + // in our out buffer. This allows us to insert our and + // tags in the correct place amongst the fully marked up + // TargetMod out buffer. This work is all done in the insert() method + // above + BibMap bibMap; + BibMap wTags; + + // justTargetModBibleText (above) broken down into separate words + // ie. all words in the TargetMod from this verse + // eg. [0] = "In"; [1] = "the"; [2] = "beginning"; ... + vector targetWords; + + // where each corresponding targetWords[x] starts in justTargetModBibleText + // eg. for "In the beginning..." + // [0] = 0; [1] = 3; [2] = 7; ... + // Needed to pass to insert method so we know where + // to insert the start tag + vector targetWordStarts; + + // same as targetWordStarts, but the end of each word + // eg. [0] = 1; [1] = 5; [2] = 15 + // Needed to pass to insert method so we know where + // to insert the end tag + vector targetWordEnds; + + // This is the doozy. This maps each TargetMod word to the correct + // wordTags entry. + vector targetWordTags; + + // Equivalent to targetWords above, but for the FromMod. + // Useful for helping determine matches to TargetMod words + vector fromWords; + + // Equivalent to targetWordTag which we need to produce, + // but this one is produced for us from the FromMod data + // If we can match a fromWords[x] entry, then we can assign + // targetWorkTags[ourMatch] = fromWordTags[x] + vector fromWordTags; + + bibMap.clear(); + wTags.clear(); + + fromMod.setKey(targetMod.getKey()); + cout << "$$$ " << targetMod.getKeyText() << endl; + +if (optionDebug) { + cout << "\nProcessing Verse: " << targetMod.getKeyText() << endl; + cout << "---------------------" << endl; + + cout << "\nOur FromMod Verse Markup" << endl; + cout << "---------------------" << endl; + cout << fromMod.getRawEntry() << endl; + cout << "---------------------" << endl; +} + + + // grab our raw, fully marked up TargetMod text for this verse + SWBuf orig = targetMod.getRawEntryBuf(); + + if (optionFilterAccents) { + UTF8GreekAccents filter; + filter.setOptionValue("off"); + filter.processText(orig); + } + + if (optionFilterAppCrit) { + SWBuf o = orig; + const unsigned char* from = (unsigned char*)o.c_str(); + orig = ""; + while (*from) { + __u32 ch = getUniCharFromUTF8(&from, true); + // if ch is bad, then convert to replacement char + if (!ch) ch = 0xFFFD; + SWBuf checkChar; + getUTF8FromUniChar(ch, &checkChar); + if (checkChar != " " && strstr(ignoreSeries, checkChar.c_str())) continue; + orig.append(checkChar); + } + } + +if (optionDebug) { + cout << "\nOur Original TargetMod Markup" << endl; + cout << "---------------------" << endl; + cout << orig << endl; + cout << "---------------------" << endl; +} + + // let's find where just the canonical text is amongst + // all our markup + // newTargetModMarkup will eventually hold our updated markup with + // the new tags, but we'll start here by setting it to + // the processed original markup. + // on return, bibMap will be populated with each character + // and the corresponding location into newTargetModMarkup where + // the character resides. + SWBuf newTargetModMarkup = findCanonicalBibleText(orig, bibMap, wTags); + +if (optionDebug) { + cout << "\nOur Original TargetMod Markup After XMLTag-ifying" << endl; + cout << "---------------------" << endl; + cout << newTargetModMarkup << endl; + cout << "---------------------" << endl; + + cout << "\nOur bibMap" << endl; + cout << "---------------------" << endl; + for (BibMap::iterator it = bibMap.begin(); it != bibMap.end(); ++it) { + cout << *it << " "; + } + cout << "\n---------------------" << endl; +} + + // let's populate our TargetMod word data and fill in our + // justTargetModBibleText buffer + justTargetModBibleText = buildWordMaps(newTargetModMarkup, bibMap, targetWords, targetWordStarts, targetWordEnds); + +if (optionDebug) { + cout << "\nJust TargetMod Bible Text" << endl; + cout << "---------------------" << endl; + cout << justTargetModBibleText << endl; + cout << "---------------------" << endl; +} + + + // ok, now lets grab out the groovy data from the FromMod module + pullFromModData(fromMod, wordTags, fromWords, fromWordTags); + + + // + // ok, here's the real work. + // + // This method needs to guess which TargetMod words match which FromMod + // words and then point them to their same original language + // word tag by populating targetWordTags + // + matcher->matchWords(targetWordTags, targetWords, fromWords, fromWordTags); + + + // ok, now that we have our targetWordTags magically populated + // let's do the grunt work of inserting the and tags + insertWordTags(newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds); + + +if (optionDebug) { + cout << "\nHere's how you mapped things..." << endl; + cout << "---------------------" << endl; + cout << "Total wordTags: " << wordTags.size() << endl; + cout << "\nTargetMod Words: " << endl; +} + bool warned = false; + for (int i = 0; i < targetWords.size(); i++) { + if (targetWordTags[i] == -1 && !strstr(ignoreSeries, targetWords[i])) { + if (!warned) cerr << "*** Error: didn't match all words: " << targetMod.getKeyText() << endl; + warned = true; + } +if (optionDebug) { + cout << targetWords[i] << " : " << targetWordTags[i] << " => " << (targetWordTags[i] != -1 ? wordTags[targetWordTags[i]] : "") << endl; +} + } +if (optionDebug) { + cout << "---------------------" << endl; + + cout << "\nAND... Here's your final output" << endl; + cout << "---------------------" << endl; +} + cout << newTargetModMarkup << endl; +if (optionDebug) { + cout << endl; +} + } + return 0; +} + + +// builds up bibMap to contain only characters of Biblical text +// and each character's corresponding real location in our output +// buffer (returned value) +SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) { + SWBuf out = ""; + SWBuf tag = ""; + int tagLevel = 0; + int wTag = -1; + int inTag = 0; + for (int i = 0; i < orig.length(); i++) { + if (orig[i] == '<') { + inTag = true; + } + else if (orig[i] == '>') { + inTag = false; + XMLTag t = tag.c_str(); + if (!t.isEmpty()) { + if (t.isEndTag()) { + tagLevel--; + wTag = -1; + } + else { + tagLevel++; + wTag = (t.getName() && !strcmp("w", t.getName())) ? out.size() : -1; + } + } + out += t; + tag = ""; + } + else if (inTag) { + tag += orig[i]; + } + else { +// for texts without tags +// if (!tagLevel || wTag != -1) { + if (wTag != -1 || orig[i] == ' ') { + bibMap.push_back(out.size()); + wTags.push_back(wTag); + } + out += orig[i]; + } + } + return out; +} + + +// Inserts addText into out buffer and adjusts Bible character pointers accordingly +// +void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags, bool after) { + int to = 0; + if (!after && wTags[bibPos] != -1) { + to = wTags[bibPos] + 2; + addText--; // discard the '>' + addText << 2; // discard the ' &targetWords, vector &targetWordStarts, vector &targetWordEnds) { + SWBuf bibWord = ""; + SWBuf fromWord = ""; + SWBuf bibText = ""; + for (BibMap::const_iterator it = bibMap.begin(); it != bibMap.end(); it++) { +/* + char *b1 = markupBuf.getRawData()+*it; + char *b2 = b1; + __u32 uc = getUniCharFromUTF8(&b2); + bool wordBreak = false; + if (uc) { + SWBuf u8c; + u8c.append(b1, b2-b1); + if (strstr(ignoreSeries, u8c.getRawData())) + } +*/ + char c = markupBuf[*it]; + if (c != ' ' && c != '.' && c != ';' && c != ',') { + if (!bibWord.length()) targetWordStarts.push_back(bibText.length()); + bibWord += c; + } + else { + if (bibWord.length()) { + targetWordEnds.push_back(bibText.length()-1); + targetWords.push_back(bibWord); + bibWord = ""; + } + } + bibText += c; + } + if (bibWord.length()) { + targetWordEnds.push_back(bibText.length()-1); + targetWords.push_back(bibWord); + } + return bibText; +} + + +void pullFromModData(SWModule &fromMod, vector&wordTags, vector &fromWords, vector &fromWordTags) { + fromMod.renderText(); // be sure FromMod has processed entry attributes + AttributeList &words = fromMod.getEntryAttributes()["Word"]; + SWBuf fromWord = ""; + SWBuf bibWord = ""; + for (AttributeList::iterator it = words.begin(); it != words.end(); it++) { + // this is our new XMLTag. + // attributes will be added below + XMLTag w("w"); + int parts = atoi(it->second["PartCount"]); + SWBuf lemma = ""; + SWBuf morph = ""; + for (int i = 1; i <= parts; i++) { + SWBuf key = ""; + key = (parts == 1) ? "Lemma" : SWBuf().setFormatted("Lemma.%d", i); + AttributeValue::iterator li = it->second.find(key); + if (li != it->second.end()) { + if (i > 1) lemma += " "; + key = (parts == 1) ? "LemmaClass" : SWBuf().setFormatted("LemmaClass.%d", i); + AttributeValue::iterator lci = it->second.find(key); + if (lci != it->second.end()) { + lemma += lci->second + ":"; + } + lemma += li->second; + } + key = (parts == 1) ? "Morph" : SWBuf().setFormatted("Morph.%d", i); + li = it->second.find(key); + // silly. sometimes morph counts don't equal lemma counts + if (i == 1 && parts != 1 && li == it->second.end()) { + li = it->second.find("Morph"); + } + if (li != it->second.end()) { + if (i > 1) morph += " "; + key = (parts == 1) ? "MorphClass" : SWBuf().setFormatted("MorphClass.%d", i); + AttributeValue::iterator lci = it->second.find(key); + // silly. sometimes morph counts don't equal lemma counts + if (i == 1 && parts != 1 && lci == it->second.end()) { + lci = it->second.find("MorphClass"); + } + if (lci != it->second.end()) { + morph += lci->second + ":"; + } + morph += li->second; + } + // TODO: add src tags and maybe other attributes + } + + if (lemma.length()) w.setAttribute("lemma", lemma); + if (morph.length()) w.setAttribute("morph", morph); + + + fromWord = it->second["Text"]; + bibWord = ""; + for (int j = 0; j < fromWord.length(); j++) { + char c = fromWord[j]; +// if (!strchr(ignoreSeries, c)) { + if (c != ' ' && c != '.' && c != ';' && c != ',') { + bibWord += c; + } + else { + if (bibWord.length()) { + fromWords.push_back(bibWord); + fromWordTags.push_back(wordTags.size()); + bibWord = ""; + } + } + } + if (bibWord.length()) { + fromWords.push_back(bibWord); + fromWordTags.push_back(wordTags.size()); + } + + wordTags.push_back(w); + } +} + + +void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, const vector &targetWordTags, const vector &wordTags, const vector &targetWordStarts, const vector &targetWordEnds) { + // TODO: this method needs some work, + // like putting multiple consecutive words + // together in one tag + for (int i = 0; i < targetWordTags.size(); i++) { + if (targetWordTags[i] > -1) { + insert((const char *)wordTags[targetWordTags[i]], markupBuf, targetWordStarts[i], bibMap, wTags); + insert("", markupBuf, targetWordEnds[i], bibMap, wTags, true); + } + } +} -- cgit