diff options
Diffstat (limited to 'migratetags/migratetags.cpp')
-rw-r--r-- | migratetags/migratetags.cpp | 467 |
1 files changed, 467 insertions, 0 deletions
diff --git a/migratetags/migratetags.cpp b/migratetags/migratetags.cpp new file mode 100644 index 0000000..2051a22 --- /dev/null +++ b/migratetags/migratetags.cpp @@ -0,0 +1,467 @@ +#include <versekey.h> +#include <utf8greekaccents.h> +#include <swmgr.h> +#include <utilxml.h> +#include <swbuf.h> +#include <swmodule.h> +#include <stringmgr.h> +#include <iostream> +#include <vector> + +using namespace sword; +using namespace std; + +#include "matchers/matcher.h" +#include "matchers/defaultmatcher.h" + +// select your matcher here +Matcher *matcher = new DefaultMatcher(); +const char *targetModuleName="NA28"; +const char *strongsSourceModuleName="WHNU"; + + +const char *ignoreSeries = "⸆¹⸆²⸆⸇᾿˸¹˸²˸³˸·¹²⟦–ʹ°¹°²⸋¹⸋²⸋⸌¹⸌°*[];⸀¹⸀²⸀³⸁⸀◆⟧ ⸂¹⸂²⸄⸂⸅⸃⸉¹⸈⸉⸊ "; + +typedef vector<unsigned long> BibMap; + +void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags, bool after = false); + +SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &tTags); +SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds); +void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &fromWords, vector<int> &fromWordTags); +void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, const vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds); + +// app options +bool optionFilterAccents = false; +bool optionFilterAppCrit = false; +bool optionDebug = false; + +void usage(const char *progName, const char *error = 0) { + if (error) fprintf(stderr, "\n%s: %s\n", progName, error); + fprintf(stderr, "\n=== migratetags (Revision $Rev$) Migrate word morphology from one module to another.\n"); + fprintf(stderr, "\nusage: %s [options]\n", progName); + fprintf(stderr, " -v\t\t\t verbose: print lots of information while processing\n"); + fprintf(stderr, " -fa\t\t\t filter accents: remove Greek accents from final text\n"); + fprintf(stderr, "\n\n"); + exit(-1); +} + + + +int main(int argc, char **argv) { + const char *progName = argv[0]; + for (int i = 1; i < argc; i++) { + if (!strcmp(argv[i], "-v")) { + optionDebug = true; + } + else if (!strcmp(argv[i], "-fa")) { + optionFilterAccents = true; + } + else if (!strcmp(argv[i], "-fc")) { + optionFilterAppCrit = true; + } + else usage(progName, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str()); + } + VerseKey vk; + SWMgr lib; + lib.setGlobalOption("Textual Variants", "Secondary Reading"); + SWModule *m = lib.getModule(targetModuleName); + if (!m) { + cerr << "couldn't find target module: " << targetModuleName << ".\n"; + exit(1); + } + SWModule &targetMod = *m; + m = lib.getModule(strongsSourceModuleName); + if (!m) { + cerr << "couldn't find source module: " << strongsSourceModuleName << ".\n"; + exit(1); + } + SWModule &fromMod = *m; + + // we'll do the whole Bible eventually, but let's just get one verse + // working well. + targetMod.setKey("mat1.1"); // let's try this verse + int z = 0; + for (; +//!z && +!targetMod.popError(); targetMod++) { + z++; + + // XML word tags which should be placed in this verse (start tag) + // eg., <w lemma=...> + // pulled from FromMod + vector<XMLTag> wordTags; + + // Just the raw canonical Bible text of this verse with no tags + // eg., "In the beginning God created the heavens and the earth." + SWBuf justTargetModBibleText = ""; + + // a mapping for each character in justTargetModBibleText to the real location + // in our out buffer. This allows us to insert our <w> and </w> + // tags in the correct place amongst the fully marked up + // TargetMod out buffer. This work is all done in the insert() method + // above + BibMap bibMap; + BibMap wTags; + + // justTargetModBibleText (above) broken down into separate words + // ie. all words in the TargetMod from this verse + // eg. [0] = "In"; [1] = "the"; [2] = "beginning"; ... + vector<SWBuf> targetWords; + + // where each corresponding targetWords[x] starts in justTargetModBibleText + // eg. for "In the beginning..." + // [0] = 0; [1] = 3; [2] = 7; ... + // Needed to pass to insert method so we know where + // to insert the <w> start tag + vector<int> targetWordStarts; + + // same as targetWordStarts, but the end of each word + // eg. [0] = 1; [1] = 5; [2] = 15 + // Needed to pass to insert method so we know where + // to insert the </w> end tag + vector<int> targetWordEnds; + + // This is the doozy. This maps each TargetMod word to the correct + // wordTags entry. + vector<int> targetWordTags; + + // Equivalent to targetWords above, but for the FromMod. + // Useful for helping determine matches to TargetMod words + vector<SWBuf> fromWords; + + // Equivalent to targetWordTag which we need to produce, + // but this one is produced for us from the FromMod data + // If we can match a fromWords[x] entry, then we can assign + // targetWorkTags[ourMatch] = fromWordTags[x] + vector<int> fromWordTags; + + bibMap.clear(); + wTags.clear(); + + fromMod.setKey(targetMod.getKey()); + cout << "$$$ " << targetMod.getKeyText() << endl; + +if (optionDebug) { + cout << "\nProcessing Verse: " << targetMod.getKeyText() << endl; + cout << "---------------------" << endl; + + cout << "\nOur FromMod Verse Markup" << endl; + cout << "---------------------" << endl; + cout << fromMod.getRawEntry() << endl; + cout << "---------------------" << endl; +} + + + // grab our raw, fully marked up TargetMod text for this verse + SWBuf orig = targetMod.getRawEntryBuf(); + + if (optionFilterAccents) { + UTF8GreekAccents filter; + filter.setOptionValue("off"); + filter.processText(orig); + } + + if (optionFilterAppCrit) { + SWBuf o = orig; + const unsigned char* from = (unsigned char*)o.c_str(); + orig = ""; + while (*from) { + __u32 ch = getUniCharFromUTF8(&from, true); + // if ch is bad, then convert to replacement char + if (!ch) ch = 0xFFFD; + SWBuf checkChar; + getUTF8FromUniChar(ch, &checkChar); + if (checkChar != " " && strstr(ignoreSeries, checkChar.c_str())) continue; + orig.append(checkChar); + } + } + +if (optionDebug) { + cout << "\nOur Original TargetMod Markup" << endl; + cout << "---------------------" << endl; + cout << orig << endl; + cout << "---------------------" << endl; +} + + // let's find where just the canonical text is amongst + // all our markup + // newTargetModMarkup will eventually hold our updated markup with + // the new <w> tags, but we'll start here by setting it to + // the processed original markup. + // on return, bibMap will be populated with each character + // and the corresponding location into newTargetModMarkup where + // the character resides. + SWBuf newTargetModMarkup = findCanonicalBibleText(orig, bibMap, wTags); + +if (optionDebug) { + cout << "\nOur Original TargetMod Markup After XMLTag-ifying" << endl; + cout << "---------------------" << endl; + cout << newTargetModMarkup << endl; + cout << "---------------------" << endl; + + cout << "\nOur bibMap" << endl; + cout << "---------------------" << endl; + for (BibMap::iterator it = bibMap.begin(); it != bibMap.end(); ++it) { + cout << *it << " "; + } + cout << "\n---------------------" << endl; +} + + // let's populate our TargetMod word data and fill in our + // justTargetModBibleText buffer + justTargetModBibleText = buildWordMaps(newTargetModMarkup, bibMap, targetWords, targetWordStarts, targetWordEnds); + +if (optionDebug) { + cout << "\nJust TargetMod Bible Text" << endl; + cout << "---------------------" << endl; + cout << justTargetModBibleText << endl; + cout << "---------------------" << endl; +} + + + // ok, now lets grab out the groovy data from the FromMod module + pullFromModData(fromMod, wordTags, fromWords, fromWordTags); + + + // + // ok, here's the real work. + // + // This method needs to guess which TargetMod words match which FromMod + // words and then point them to their same original language + // word tag by populating targetWordTags + // + matcher->matchWords(targetWordTags, targetWords, fromWords, fromWordTags); + + + // ok, now that we have our targetWordTags magically populated + // let's do the grunt work of inserting the <w> and </w> tags + insertWordTags(newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds); + + +if (optionDebug) { + cout << "\nHere's how you mapped things..." << endl; + cout << "---------------------" << endl; + cout << "Total wordTags: " << wordTags.size() << endl; + cout << "\nTargetMod Words: " << endl; +} + bool warned = false; + for (int i = 0; i < targetWords.size(); i++) { + if (targetWordTags[i] == -1 && !strstr(ignoreSeries, targetWords[i])) { + if (!warned) cerr << "*** Error: didn't match all words: " << targetMod.getKeyText() << endl; + warned = true; + } +if (optionDebug) { + cout << targetWords[i] << " : " << targetWordTags[i] << " => " << (targetWordTags[i] != -1 ? wordTags[targetWordTags[i]] : "") << endl; +} + } +if (optionDebug) { + cout << "---------------------" << endl; + + cout << "\nAND... Here's your final output" << endl; + cout << "---------------------" << endl; +} + cout << newTargetModMarkup << endl; +if (optionDebug) { + cout << endl; +} + } + return 0; +} + + +// builds up bibMap to contain only characters of Biblical text +// and each character's corresponding real location in our output +// buffer (returned value) +SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) { + SWBuf out = ""; + SWBuf tag = ""; + int tagLevel = 0; + int wTag = -1; + int inTag = 0; + for (int i = 0; i < orig.length(); i++) { + if (orig[i] == '<') { + inTag = true; + } + else if (orig[i] == '>') { + inTag = false; + XMLTag t = tag.c_str(); + if (!t.isEmpty()) { + if (t.isEndTag()) { + tagLevel--; + wTag = -1; + } + else { + tagLevel++; + wTag = (t.getName() && !strcmp("w", t.getName())) ? out.size() : -1; + } + } + out += t; + tag = ""; + } + else if (inTag) { + tag += orig[i]; + } + else { +// for texts without <w> tags +// if (!tagLevel || wTag != -1) { + if (wTag != -1 || orig[i] == ' ') { + bibMap.push_back(out.size()); + wTags.push_back(wTag); + } + out += orig[i]; + } + } + return out; +} + + +// Inserts addText into out buffer and adjusts Bible character pointers accordingly +// +void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags, bool after) { + int to = 0; + if (!after && wTags[bibPos] != -1) { + to = wTags[bibPos] + 2; + addText--; // discard the '>' + addText << 2; // discard the '<w' + } + else { + to = bibMap[bibPos]+((after)?1:0); + } + if (!after || wTags[bibPos] == -1) { + out.insert(to, addText); + for (int i = bibPos+((after)?1:0); i < bibMap.size(); i++) { + bibMap[i] += addText.length(); + if (wTags[i] != -1) wTags[i] += addText.length(); + } + } +} + + + +SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds) { + SWBuf bibWord = ""; + SWBuf fromWord = ""; + SWBuf bibText = ""; + for (BibMap::const_iterator it = bibMap.begin(); it != bibMap.end(); it++) { +/* + char *b1 = markupBuf.getRawData()+*it; + char *b2 = b1; + __u32 uc = getUniCharFromUTF8(&b2); + bool wordBreak = false; + if (uc) { + SWBuf u8c; + u8c.append(b1, b2-b1); + if (strstr(ignoreSeries, u8c.getRawData())) + } +*/ + char c = markupBuf[*it]; + if (c != ' ' && c != '.' && c != ';' && c != ',') { + if (!bibWord.length()) targetWordStarts.push_back(bibText.length()); + bibWord += c; + } + else { + if (bibWord.length()) { + targetWordEnds.push_back(bibText.length()-1); + targetWords.push_back(bibWord); + bibWord = ""; + } + } + bibText += c; + } + if (bibWord.length()) { + targetWordEnds.push_back(bibText.length()-1); + targetWords.push_back(bibWord); + } + return bibText; +} + + +void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &fromWords, vector<int> &fromWordTags) { + fromMod.renderText(); // be sure FromMod has processed entry attributes + AttributeList &words = fromMod.getEntryAttributes()["Word"]; + SWBuf fromWord = ""; + SWBuf bibWord = ""; + for (AttributeList::iterator it = words.begin(); it != words.end(); it++) { + // this is our new <w> XMLTag. + // attributes will be added below + XMLTag w("w"); + int parts = atoi(it->second["PartCount"]); + SWBuf lemma = ""; + SWBuf morph = ""; + for (int i = 1; i <= parts; i++) { + SWBuf key = ""; + key = (parts == 1) ? "Lemma" : SWBuf().setFormatted("Lemma.%d", i); + AttributeValue::iterator li = it->second.find(key); + if (li != it->second.end()) { + if (i > 1) lemma += " "; + key = (parts == 1) ? "LemmaClass" : SWBuf().setFormatted("LemmaClass.%d", i); + AttributeValue::iterator lci = it->second.find(key); + if (lci != it->second.end()) { + lemma += lci->second + ":"; + } + lemma += li->second; + } + key = (parts == 1) ? "Morph" : SWBuf().setFormatted("Morph.%d", i); + li = it->second.find(key); + // silly. sometimes morph counts don't equal lemma counts + if (i == 1 && parts != 1 && li == it->second.end()) { + li = it->second.find("Morph"); + } + if (li != it->second.end()) { + if (i > 1) morph += " "; + key = (parts == 1) ? "MorphClass" : SWBuf().setFormatted("MorphClass.%d", i); + AttributeValue::iterator lci = it->second.find(key); + // silly. sometimes morph counts don't equal lemma counts + if (i == 1 && parts != 1 && lci == it->second.end()) { + lci = it->second.find("MorphClass"); + } + if (lci != it->second.end()) { + morph += lci->second + ":"; + } + morph += li->second; + } + // TODO: add src tags and maybe other attributes + } + + if (lemma.length()) w.setAttribute("lemma", lemma); + if (morph.length()) w.setAttribute("morph", morph); + + + fromWord = it->second["Text"]; + bibWord = ""; + for (int j = 0; j < fromWord.length(); j++) { + char c = fromWord[j]; +// if (!strchr(ignoreSeries, c)) { + if (c != ' ' && c != '.' && c != ';' && c != ',') { + bibWord += c; + } + else { + if (bibWord.length()) { + fromWords.push_back(bibWord); + fromWordTags.push_back(wordTags.size()); + bibWord = ""; + } + } + } + if (bibWord.length()) { + fromWords.push_back(bibWord); + fromWordTags.push_back(wordTags.size()); + } + + wordTags.push_back(w); + } +} + + +void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, const vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds) { + // TODO: this method needs some work, + // like putting multiple consecutive words + // together in one tag + for (int i = 0; i < targetWordTags.size(); i++) { + if (targetWordTags[i] > -1) { + insert((const char *)wordTags[targetWordTags[i]], markupBuf, targetWordStarts[i], bibMap, wTags); + insert("</w>", markupBuf, targetWordEnds[i], bibMap, wTags, true); + } + } +} |