Diffstat (limited to 'migratetags/migratetags.cpp'):
 migratetags/migratetags.cpp | 467
 1 file changed, 467 insertions(+), 0 deletions(-)
diff --git a/migratetags/migratetags.cpp b/migratetags/migratetags.cpp
new file mode 100644
index 0000000..2051a22
--- /dev/null
+++ b/migratetags/migratetags.cpp
@@ -0,0 +1,467 @@
+#include <versekey.h>
+#include <utf8greekaccents.h>
+#include <swmgr.h>
+#include <utilxml.h>
+#include <swbuf.h>
+#include <swmodule.h>
+#include <stringmgr.h>
+#include <iostream>
+#include <vector>
+
+using namespace sword;
+using namespace std;
+
+#include "matchers/matcher.h"
+#include "matchers/defaultmatcher.h"
+
+// select your matcher here
+Matcher *matcher = new DefaultMatcher();
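+// targetModuleName: module that will receive the migrated <w> tags
+// strongsSourceModuleName: module supplying the lemma/morph tag data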
+const char *targetModuleName="NA28";
+const char *strongsSourceModuleName="WHNU";
+
+
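+// text-critical sigla and punctuation which are never part of a word; used by
+// the -fc filter and when deciding which unmatched words are safe to ignore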
+const char *ignoreSeries = "⸆¹⸆²⸆⸇᾿˸¹˸²˸³˸·¹²⟦–ʹ°¹°²⸋¹⸋²⸋⸌¹⸌°*[];⸀¹⸀²⸀³⸁⸀◆⟧ ⸂¹⸂²⸄⸂⸅⸃⸉¹⸈⸉⸊ ";
+
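+// For each character of extracted Bible text, a BibMap records an offset into
+// the marked-up output buffer (bibMap: where the character itself lives;
+// wTags: where its enclosing <w> start tag begins, or -1)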
+typedef vector<unsigned long> BibMap;
+
+void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags, bool after = false);
+
+SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &tTags);
+SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds);
+void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &fromWords, vector<int> &fromWordTags);
+void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, const vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds);
+
+// app options
+bool optionFilterAccents = false;
+bool optionFilterAppCrit = false;
+bool optionDebug = false;
+
+void usage(const char *progName, const char *error = 0) {
+ if (error) fprintf(stderr, "\n%s: %s\n", progName, error);
+ fprintf(stderr, "\n=== migratetags (Revision $Rev$) Migrate word morphology from one module to another.\n");
+ fprintf(stderr, "\nusage: %s [options]\n", progName);
+ fprintf(stderr, " -v\t\t\t verbose: print lots of information while processing\n");
+	fprintf(stderr, " -fa\t\t\t filter accents: remove Greek accents from final text\n");
+	fprintf(stderr, " -fc\t\t\t filter apparatus: remove text-critical sigla from final text\n");
+	fprintf(stderr, "\n\n");
+ exit(-1);
+}
+
+
+
+int main(int argc, char **argv) {
+ const char *progName = argv[0];
+ for (int i = 1; i < argc; i++) {
+ if (!strcmp(argv[i], "-v")) {
+ optionDebug = true;
+ }
+ else if (!strcmp(argv[i], "-fa")) {
+ optionFilterAccents = true;
+ }
+ else if (!strcmp(argv[i], "-fc")) {
+ optionFilterAppCrit = true;
+ }
+ else usage(progName, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
+ }
+ VerseKey vk;
+ SWMgr lib;
+ lib.setGlobalOption("Textual Variants", "Secondary Reading");
+ SWModule *m = lib.getModule(targetModuleName);
+ if (!m) {
+ cerr << "couldn't find target module: " << targetModuleName << ".\n";
+ exit(1);
+ }
+ SWModule &targetMod = *m;
+ m = lib.getModule(strongsSourceModuleName);
+ if (!m) {
+ cerr << "couldn't find source module: " << strongsSourceModuleName << ".\n";
+ exit(1);
+ }
+ SWModule &fromMod = *m;
+
+	// start at Matthew 1:1 and walk through the rest of the module
+	// (uncomment "!z &&" in the loop below to restrict processing to a
+	// single verse while testing)
+	targetMod.setKey("mat1.1");
+ int z = 0;
+	for (; /* !z && */ !targetMod.popError(); targetMod++) {
+ z++;
+
+ // XML word tags which should be placed in this verse (start tag)
+ // eg., <w lemma=...>
+ // pulled from FromMod
+ vector<XMLTag> wordTags;
+
+ // Just the raw canonical Bible text of this verse with no tags
+ // eg., "In the beginning God created the heavens and the earth."
+ SWBuf justTargetModBibleText = "";
+
+ // a mapping for each character in justTargetModBibleText to the real location
+ // in our out buffer. This allows us to insert our <w> and </w>
+ // tags in the correct place amongst the fully marked up
+ // TargetMod out buffer. This work is all done in the insert() method
+ // above
+ BibMap bibMap;
+ BibMap wTags;
+
+ // justTargetModBibleText (above) broken down into separate words
+ // ie. all words in the TargetMod from this verse
+ // eg. [0] = "In"; [1] = "the"; [2] = "beginning"; ...
+ vector<SWBuf> targetWords;
+
+ // where each corresponding targetWords[x] starts in justTargetModBibleText
+ // eg. for "In the beginning..."
+ // [0] = 0; [1] = 3; [2] = 7; ...
+ // Needed to pass to insert method so we know where
+ // to insert the <w> start tag
+ vector<int> targetWordStarts;
+
+ // same as targetWordStarts, but the end of each word
+ // eg. [0] = 1; [1] = 5; [2] = 15
+ // Needed to pass to insert method so we know where
+ // to insert the </w> end tag
+ vector<int> targetWordEnds;
+
+ // This is the doozy. This maps each TargetMod word to the correct
+ // wordTags entry.
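+	// eg. targetWordTags[5] = 3 means targetWords[5] is covered by wordTags[3];
+	// -1 means no match was found for that word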
+ vector<int> targetWordTags;
+
+ // Equivalent to targetWords above, but for the FromMod.
+ // Useful for helping determine matches to TargetMod words
+ vector<SWBuf> fromWords;
+
+	// Equivalent to the targetWordTags vector which we need to produce,
+	// but this one is produced for us from the FromMod data.
+	// If we can match a fromWords[x] entry, then we can assign
+	// targetWordTags[ourMatch] = fromWordTags[x]
+ vector<int> fromWordTags;
+
+ bibMap.clear();
+ wTags.clear();
+
+ fromMod.setKey(targetMod.getKey());
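+	// emit an imp-style verse header; the retagged markup itself is printed
+	// at the end of this loop iteration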
+ cout << "$$$ " << targetMod.getKeyText() << endl;
+
+if (optionDebug) {
+ cout << "\nProcessing Verse: " << targetMod.getKeyText() << endl;
+ cout << "---------------------" << endl;
+
+ cout << "\nOur FromMod Verse Markup" << endl;
+ cout << "---------------------" << endl;
+ cout << fromMod.getRawEntry() << endl;
+ cout << "---------------------" << endl;
+}
+
+
+ // grab our raw, fully marked up TargetMod text for this verse
+ SWBuf orig = targetMod.getRawEntryBuf();
+
+ if (optionFilterAccents) {
+ UTF8GreekAccents filter;
+ filter.setOptionValue("off");
+ filter.processText(orig);
+ }
+
+ if (optionFilterAppCrit) {
+ SWBuf o = orig;
+ const unsigned char* from = (unsigned char*)o.c_str();
+ orig = "";
+ while (*from) {
+ __u32 ch = getUniCharFromUTF8(&from, true);
+ // if ch is bad, then convert to replacement char
+ if (!ch) ch = 0xFFFD;
+ SWBuf checkChar;
+ getUTF8FromUniChar(ch, &checkChar);
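+			// ignoreSeries itself contains a space, but spaces must be kept
+			// as word separators, so only strip non-space matches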
+ if (checkChar != " " && strstr(ignoreSeries, checkChar.c_str())) continue;
+ orig.append(checkChar);
+ }
+ }
+
+if (optionDebug) {
+ cout << "\nOur Original TargetMod Markup" << endl;
+ cout << "---------------------" << endl;
+ cout << orig << endl;
+ cout << "---------------------" << endl;
+}
+
+ // let's find where just the canonical text is amongst
+ // all our markup
+ // newTargetModMarkup will eventually hold our updated markup with
+ // the new <w> tags, but we'll start here by setting it to
+ // the processed original markup.
+ // on return, bibMap will be populated with each character
+ // and the corresponding location into newTargetModMarkup where
+ // the character resides.
+ SWBuf newTargetModMarkup = findCanonicalBibleText(orig, bibMap, wTags);
+
+if (optionDebug) {
+ cout << "\nOur Original TargetMod Markup After XMLTag-ifying" << endl;
+ cout << "---------------------" << endl;
+ cout << newTargetModMarkup << endl;
+ cout << "---------------------" << endl;
+
+ cout << "\nOur bibMap" << endl;
+ cout << "---------------------" << endl;
+ for (BibMap::iterator it = bibMap.begin(); it != bibMap.end(); ++it) {
+ cout << *it << " ";
+ }
+ cout << "\n---------------------" << endl;
+}
+
+ // let's populate our TargetMod word data and fill in our
+ // justTargetModBibleText buffer
+ justTargetModBibleText = buildWordMaps(newTargetModMarkup, bibMap, targetWords, targetWordStarts, targetWordEnds);
+
+if (optionDebug) {
+ cout << "\nJust TargetMod Bible Text" << endl;
+ cout << "---------------------" << endl;
+ cout << justTargetModBibleText << endl;
+ cout << "---------------------" << endl;
+}
+
+
+	// ok, now let's grab the groovy data from the FromMod module
+ pullFromModData(fromMod, wordTags, fromWords, fromWordTags);
+
+
+ //
+ // ok, here's the real work.
+ //
+ // This method needs to guess which TargetMod words match which FromMod
+ // words and then point them to their same original language
+ // word tag by populating targetWordTags
+ //
+ matcher->matchWords(targetWordTags, targetWords, fromWords, fromWordTags);
+
+
+ // ok, now that we have our targetWordTags magically populated
+ // let's do the grunt work of inserting the <w> and </w> tags
+ insertWordTags(newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds);
+
+
+if (optionDebug) {
+ cout << "\nHere's how you mapped things..." << endl;
+ cout << "---------------------" << endl;
+ cout << "Total wordTags: " << wordTags.size() << endl;
+ cout << "\nTargetMod Words: " << endl;
+}
+ bool warned = false;
+ for (int i = 0; i < targetWords.size(); i++) {
+ if (targetWordTags[i] == -1 && !strstr(ignoreSeries, targetWords[i])) {
+ if (!warned) cerr << "*** Error: didn't match all words: " << targetMod.getKeyText() << endl;
+ warned = true;
+ }
+if (optionDebug) {
+ cout << targetWords[i] << " : " << targetWordTags[i] << " => " << (targetWordTags[i] != -1 ? wordTags[targetWordTags[i]] : "") << endl;
+}
+ }
+if (optionDebug) {
+ cout << "---------------------" << endl;
+
+ cout << "\nAND... Here's your final output" << endl;
+ cout << "---------------------" << endl;
+}
+ cout << newTargetModMarkup << endl;
+if (optionDebug) {
+ cout << endl;
+}
+ }
+ return 0;
+}
+
+
+// Builds up bibMap to hold, for each character of Biblical text, that
+// character's real location in our output buffer (returned value).
+// wTags records, for each of those characters, where its enclosing <w>
+// start tag begins in the output buffer (-1 if it is not inside a <w> tag).
+SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) {
+ SWBuf out = "";
+ SWBuf tag = "";
+ int tagLevel = 0;
+ int wTag = -1;
+	bool inTag = false;
+ for (int i = 0; i < orig.length(); i++) {
+ if (orig[i] == '<') {
+ inTag = true;
+ }
+ else if (orig[i] == '>') {
+ inTag = false;
+ XMLTag t = tag.c_str();
+ if (!t.isEmpty()) {
+ if (t.isEndTag()) {
+ tagLevel--;
+ wTag = -1;
+ }
+ else {
+ tagLevel++;
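+				// remember where this <w> start tag begins in the output so
+				// insert() can later merge new attributes into it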
+ wTag = (t.getName() && !strcmp("w", t.getName())) ? out.size() : -1;
+ }
+ }
+ out += t;
+ tag = "";
+ }
+ else if (inTag) {
+ tag += orig[i];
+ }
+ else {
+// for texts without <w> tags
+// if (!tagLevel || wTag != -1) {
+ if (wTag != -1 || orig[i] == ' ') {
+ bibMap.push_back(out.size());
+ wTags.push_back(wTag);
+ }
+ out += orig[i];
+ }
+ }
+ return out;
+}
+
+
+// Inserts addText into out buffer and adjusts Bible character pointers accordingly
+//
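+// If the insertion point falls inside an existing <w> start tag
+// (wTags[bibPos] != -1), the new attributes are spliced into that tag rather
+// than nesting a second <w>; the matching "</w>" insert is skipped in that case.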
+void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags, bool after) {
+ int to = 0;
+ if (!after && wTags[bibPos] != -1) {
+ to = wTags[bibPos] + 2;
+ addText--; // discard the '>'
+ addText << 2; // discard the '<w'
+ }
+ else {
+ to = bibMap[bibPos]+((after)?1:0);
+ }
+ if (!after || wTags[bibPos] == -1) {
+ out.insert(to, addText);
+ for (int i = bibPos+((after)?1:0); i < bibMap.size(); i++) {
+ bibMap[i] += addText.length();
+ if (wTags[i] != -1) wTags[i] += addText.length();
+ }
+ }
+}
+
+
+
+SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds) {
+ SWBuf bibWord = "";
+ SWBuf fromWord = "";
+ SWBuf bibText = "";
+ for (BibMap::const_iterator it = bibMap.begin(); it != bibMap.end(); it++) {
+/*
+ char *b1 = markupBuf.getRawData()+*it;
+ char *b2 = b1;
+ __u32 uc = getUniCharFromUTF8(&b2);
+ bool wordBreak = false;
+ if (uc) {
+ SWBuf u8c;
+ u8c.append(b1, b2-b1);
+ if (strstr(ignoreSeries, u8c.getRawData()))
+ }
+*/
+ char c = markupBuf[*it];
+ if (c != ' ' && c != '.' && c != ';' && c != ',') {
+ if (!bibWord.length()) targetWordStarts.push_back(bibText.length());
+ bibWord += c;
+ }
+ else {
+ if (bibWord.length()) {
+ targetWordEnds.push_back(bibText.length()-1);
+ targetWords.push_back(bibWord);
+ bibWord = "";
+ }
+ }
+ bibText += c;
+ }
+ if (bibWord.length()) {
+ targetWordEnds.push_back(bibText.length()-1);
+ targetWords.push_back(bibWord);
+ }
+ return bibText;
+}
+
+
+void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &fromWords, vector<int> &fromWordTags) {
+ fromMod.renderText(); // be sure FromMod has processed entry attributes
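+	// each "Word" attribute entry describes one tagged word, with sub-values
+	// such as Text, PartCount, Lemma[.n], LemmaClass[.n], Morph[.n], MorphClass[.n]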
+ AttributeList &words = fromMod.getEntryAttributes()["Word"];
+ SWBuf fromWord = "";
+ SWBuf bibWord = "";
+ for (AttributeList::iterator it = words.begin(); it != words.end(); it++) {
+ // this is our new <w> XMLTag.
+ // attributes will be added below
+ XMLTag w("w");
+ int parts = atoi(it->second["PartCount"]);
+ SWBuf lemma = "";
+ SWBuf morph = "";
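+		// assemble space-separated "class:value" pairs for each part,
+		// eg. lemma="strong:G2424" morph="robinson:N-GSM" (values illustrative)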
+ for (int i = 1; i <= parts; i++) {
+ SWBuf key = "";
+ key = (parts == 1) ? "Lemma" : SWBuf().setFormatted("Lemma.%d", i);
+ AttributeValue::iterator li = it->second.find(key);
+ if (li != it->second.end()) {
+ if (i > 1) lemma += " ";
+ key = (parts == 1) ? "LemmaClass" : SWBuf().setFormatted("LemmaClass.%d", i);
+ AttributeValue::iterator lci = it->second.find(key);
+ if (lci != it->second.end()) {
+ lemma += lci->second + ":";
+ }
+ lemma += li->second;
+ }
+ key = (parts == 1) ? "Morph" : SWBuf().setFormatted("Morph.%d", i);
+ li = it->second.find(key);
+ // silly. sometimes morph counts don't equal lemma counts
+ if (i == 1 && parts != 1 && li == it->second.end()) {
+ li = it->second.find("Morph");
+ }
+ if (li != it->second.end()) {
+ if (i > 1) morph += " ";
+ key = (parts == 1) ? "MorphClass" : SWBuf().setFormatted("MorphClass.%d", i);
+ AttributeValue::iterator lci = it->second.find(key);
+ // silly. sometimes morph counts don't equal lemma counts
+ if (i == 1 && parts != 1 && lci == it->second.end()) {
+ lci = it->second.find("MorphClass");
+ }
+ if (lci != it->second.end()) {
+ morph += lci->second + ":";
+ }
+ morph += li->second;
+ }
+ // TODO: add src tags and maybe other attributes
+ }
+
+ if (lemma.length()) w.setAttribute("lemma", lemma);
+ if (morph.length()) w.setAttribute("morph", morph);
+
+
+ fromWord = it->second["Text"];
+ bibWord = "";
+ for (int j = 0; j < fromWord.length(); j++) {
+ char c = fromWord[j];
+// if (!strchr(ignoreSeries, c)) {
+ if (c != ' ' && c != '.' && c != ';' && c != ',') {
+ bibWord += c;
+ }
+ else {
+ if (bibWord.length()) {
+ fromWords.push_back(bibWord);
+ fromWordTags.push_back(wordTags.size());
+ bibWord = "";
+ }
+ }
+ }
+ if (bibWord.length()) {
+ fromWords.push_back(bibWord);
+ fromWordTags.push_back(wordTags.size());
+ }
+
+ wordTags.push_back(w);
+ }
+}
+
+
+void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, const vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds) {
+ // TODO: this method needs some work,
+ // like putting multiple consecutive words
+ // together in one tag
+ for (int i = 0; i < targetWordTags.size(); i++) {
+ if (targetWordTags[i] > -1) {
+ insert((const char *)wordTags[targetWordTags[i]], markupBuf, targetWordStarts[i], bibMap, wTags);
+ insert("</w>", markupBuf, targetWordEnds[i], bibMap, wTags, true);
+ }
+ }
+}