1 files changed, 467 insertions, 0 deletions
diff --git a/migratetags/migratetags.cpp b/migratetags/migratetags.cpp
new file mode 100644
index 0000000..2051a22
--- /dev/null
+++ b/migratetags/migratetags.cpp
@@ -0,0 +1,467 @@
+#include <versekey.h>
+#include <utf8greekaccents.h>
+#include <swmgr.h>
+#include <utilxml.h>
+#include <swbuf.h>
+#include <swmodule.h>
+#include <stringmgr.h>
+#include <iostream>
+#include <vector>
+
+using namespace sword;
+using namespace std;
+
+#include "matchers/matcher.h"
+#include "matchers/defaultmatcher.h"
+
+// select your matcher here: the Matcher main() calls to pair target words with source words
+Matcher *matcher = new DefaultMatcher();
+const char *targetModuleName="NA28";	// module whose markup receives the new <w> tags
+const char *strongsSourceModuleName="WHNU";	// module the lemma/morph word tags are pulled from
+
+// characters the -fc option strips from the target text; unmatched words found in this list are not reported
+const char *ignoreSeries = "⸆¹⸆²⸆⸇᾿˸¹˸²˸³˸·¹²⟦–ʹ°¹°²⸋¹⸋²⸋⸌¹⸌°*[];⸀¹⸀²⸀³⸁⸀◆⟧ ⸂¹⸂²⸄⸂⸅⸃⸉¹⸈⸉⸊  ";
+// list of offsets into a markup buffer, one entry per mapped character (used for both bibMap and wTags)
+typedef vector<unsigned long> BibMap;
+// forward declarations; the definitions follow main()
+void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags, bool after = false);
+
+SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &tTags);
+SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds);
+void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &fromWords, vector<int> &fromWordTags);
+void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, const vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds);
+
+// app options, set from the command line in main()
+bool optionFilterAccents = false;	// -fa: run the Greek accent filter over the target text
+bool optionFilterAppCrit = false;	// -fc: drop characters listed in ignoreSeries from the target text
+bool optionDebug         = false;	// -v:  print the intermediate buffers and mappings
+
+void usage(const char *progName, const char *error = 0) {
+	if (error) fprintf(stderr, "\n%s: %s\n", progName, error);
+	fprintf(stderr, "\n=== migratetags (Revision $Rev$) Migrate word morphology from one module to another.\n");
+	fprintf(stderr, "\nusage: %s [options]\n", progName);
+	fprintf(stderr, "  -v\t\t\t verbose: print lots of information while processing\n");
+	fprintf(stderr, "  -fa\t\t\t filter accents: remove Greek accents from final text\n");
+	fprintf(stderr, "\n\n");
+	exit(-1);
+}
+
+// Entry point: parses the flags, looks up the target and source modules, then for each verse from "mat1.1"
+// onward copies <w> word tags from the source module into the target module's markup and prints the result.
+int main(int argc, char **argv) {
+	const char *progName   = argv[0];
+	for (int i = 1; i < argc; i++) {
+		if (!strcmp(argv[i], "-v")) {
+			optionDebug = true;
+		}
+		else if (!strcmp(argv[i], "-fa")) {
+			optionFilterAccents = true;
+		}
+		else if (!strcmp(argv[i], "-fc")) {
+			optionFilterAppCrit = true;
+		}
+		else usage(progName, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());	// usage() exits
+	}
+	VerseKey vk;	// NOTE(review): not referenced again in main()
+	SWMgr lib;
+	lib.setGlobalOption("Textual Variants", "Secondary Reading");
+	SWModule *m = lib.getModule(targetModuleName);
+	if (!m) {
+		cerr << "couldn't find target module: " << targetModuleName << ".\n";
+		exit(1);
+	}
+	SWModule &targetMod = *m;
+	m = lib.getModule(strongsSourceModuleName);
+	if (!m) {
+		cerr << "couldn't find source module: " << strongsSourceModuleName << ".\n";
+		exit(1);
+	}
+	SWModule &fromMod = *m;
+
+	// start at the verse below and keep advancing targetMod until popError() reports an error.
+	// (re-enabling the commented-out "!z &&" guard limits the run to that single verse.)
+	targetMod.setKey("mat1.1");		// first verse processed
+	int z = 0;	// verses processed so far; only read by the commented-out guard
+	for (;
+//!z &&
+!targetMod.popError(); targetMod++) {
+	z++;
+
+		// XML word tags which should be placed in this verse (start tag)
+		// eg., <w lemma=...>
+		// pulled from FromMod
+		vector<XMLTag> wordTags;
+
+		// Just the raw canonical Bible text of this verse with no tags
+		// eg., "In the beginning God created the heavens and the earth."
+		SWBuf justTargetModBibleText = "";
+
+		// a mapping for each character in justTargetModBibleText to the real location
+		// in our out buffer.  This allows us to insert our <w> and </w>
+		// tags in the correct place amongst the fully marked up
+		// TargetMod out buffer.  This work is all done in the insert() function
+		// defined later in this file
+		BibMap bibMap;
+		BibMap wTags;
+
+		// justTargetModBibleText (above) broken down into separate words
+		// ie. all words in the TargetMod from this verse
+		// eg. [0] = "In"; [1] = "the"; [2] = "beginning"; ...
+		vector<SWBuf> targetWords;
+
+		// where each corresponding targetWords[x] starts in justTargetModBibleText
+		// eg. for "In the beginning..."
+		//         [0] = 0; [1] = 3; [2] = 7; ...
+		// Needed to pass to insert method so we know where
+		// to insert the <w> start tag
+		vector<int> targetWordStarts;
+
+		// same as targetWordStarts, but the end of each word
+		// eg. [0] = 1; [1] = 5; [2] = 15
+		// Needed to pass to insert method so we know where
+		// to insert the </w> end tag
+		vector<int> targetWordEnds;
+
+		// This is the doozy.  This maps each TargetMod word to the correct
+		// wordTags entry.
+		vector<int> targetWordTags;
+
+		// Equivalent to targetWords above, but for the FromMod.
+		// Useful for helping determine matches to TargetMod words
+		vector<SWBuf> fromWords;
+
+		// Equivalent to targetWordTags which we need to produce,
+		// but this one is produced for us from the FromMod data
+		// If we can match a fromWords[x] entry, then we can assign
+		// targetWordTags[ourMatch] = fromWordTags[x]
+		vector<int> fromWordTags;
+
+		bibMap.clear();
+		wTags.clear();
+
+		fromMod.setKey(targetMod.getKey());	// position the source module on the same verse
+		cout << "$$$ " << targetMod.getKeyText() << endl;	// verse marker line, printed even without -v
+
+if (optionDebug) {
+		cout << "\nProcessing Verse: " << targetMod.getKeyText() << endl;
+		cout << "---------------------" << endl;
+
+		cout << "\nOur FromMod Verse Markup" << endl;
+		cout << "---------------------" << endl;
+		cout << fromMod.getRawEntry() << endl;
+		cout << "---------------------" << endl;
+}
+
+
+		// grab our raw, fully marked up TargetMod text for this verse
+		SWBuf orig = targetMod.getRawEntryBuf();
+
+		if (optionFilterAccents) {
+			UTF8GreekAccents filter;
+			filter.setOptionValue("off");
+			filter.processText(orig);
+		}
+		// -fc: rebuild orig one UTF-8 character at a time, leaving out the characters listed in ignoreSeries
+		if (optionFilterAppCrit) {
+			SWBuf o = orig;
+			const unsigned char* from = (unsigned char*)o.c_str();
+			orig = "";
+			while (*from) {		
+				__u32 ch = getUniCharFromUTF8(&from, true);
+				// if ch is bad, then convert to replacement char
+				if (!ch) ch = 0xFFFD;
+				SWBuf checkChar;
+				getUTF8FromUniChar(ch, &checkChar);
+				if (checkChar != " " && strstr(ignoreSeries, checkChar.c_str())) continue;	// spaces are always kept
+				orig.append(checkChar);
+			}
+		}
+
+if (optionDebug) {
+		cout << "\nOur Original TargetMod Markup" << endl;
+		cout << "---------------------" << endl;
+		cout << orig << endl;
+		cout << "---------------------" << endl;
+}
+
+		// let's find where just the canonical text is amongst
+		// all our markup
+		// newTargetModMarkup will eventually hold our updated markup with
+		// the new <w> tags, but we'll start here by setting it to
+		// the processed original markup.
+		// on return, bibMap will be populated with each character
+		// and the corresponding location into newTargetModMarkup where
+		// the character resides.
+		SWBuf newTargetModMarkup = findCanonicalBibleText(orig, bibMap, wTags);
+
+if (optionDebug) {
+		cout << "\nOur Original TargetMod Markup After XMLTag-ifying" << endl;
+		cout << "---------------------" << endl;
+		cout << newTargetModMarkup << endl;
+		cout << "---------------------" << endl;
+
+		cout << "\nOur bibMap" << endl;
+		cout << "---------------------" << endl;
+		for (BibMap::iterator it = bibMap.begin(); it != bibMap.end(); ++it) {
+			cout << *it << " ";
+		}
+		cout << "\n---------------------" << endl;
+}
+
+		// let's populate our TargetMod word data and fill in our
+		// justTargetModBibleText buffer
+		justTargetModBibleText = buildWordMaps(newTargetModMarkup, bibMap, targetWords, targetWordStarts, targetWordEnds);
+
+if (optionDebug) {
+		cout << "\nJust TargetMod Bible Text" << endl;
+		cout << "---------------------" << endl;
+		cout << justTargetModBibleText << endl;
+		cout << "---------------------" << endl;
+}
+
+ 
+		// ok, now lets grab out the groovy data from the FromMod module
+		pullFromModData(fromMod, wordTags, fromWords, fromWordTags);
+
+
+		// 
+		// ok, here's the real work.
+		//
+		// This method needs to guess which TargetMod words match which FromMod
+		// words and then point them to their same original language
+		// word tag by populating targetWordTags
+		//
+		matcher->matchWords(targetWordTags, targetWords, fromWords, fromWordTags);
+
+
+		// ok, now that we have our targetWordTags magically populated
+		// let's do the grunt work of inserting the <w> and </w> tags
+		insertWordTags(newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds);
+
+
+if (optionDebug) {
+		cout << "\nHere's how you mapped things..." << endl;
+		cout << "---------------------" << endl;
+		cout << "Total wordTags: " << wordTags.size() << endl;
+		cout << "\nTargetMod Words: " << endl;
+}
+		bool warned = false;	// report unmatched words at most once per verse
+		for (int i = 0; i < targetWords.size(); i++) {	// NOTE(review): assumes matchWords() left one targetWordTags entry per target word - confirm
+			if (targetWordTags[i] == -1 && !strstr(ignoreSeries, targetWords[i])) {	// unmatched, and not found in ignoreSeries
+				if (!warned) cerr << "*** Error: didn't match all words: " << targetMod.getKeyText() << endl;
+				warned = true;
+			}
+if (optionDebug) {
+			cout << targetWords[i] << " : " << targetWordTags[i] << " => " << (targetWordTags[i] != -1 ? wordTags[targetWordTags[i]] : "") << endl;
+}
+		}
+if (optionDebug) {
+		cout << "---------------------" << endl;
+		
+		cout << "\nAND... Here's your final output" << endl;
+		cout << "---------------------" << endl;
+}
+		cout << newTargetModMarkup << endl;	// the migrated markup for this verse, printed even without -v
+if (optionDebug) {
+		cout << endl;
+}
+	}
+	return 0;
+}
+
+// Copies orig into the returned buffer, re-emitting every <...> tag through XMLTag.
+// For each text character that follows a <w> start tag (and for every space), bibMap
+// receives the character's offset in the returned buffer and wTags receives the offset
+// of that <w> start tag, or -1 when there is none (BibMap is unsigned, so -1 wraps).
+SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) {
+	SWBuf out = "";
+	SWBuf tag = "";	// text collected between '<' and '>'
+	int tagLevel = 0;	// maintained below, but only read by the commented-out condition
+	int wTag = -1;	// offset in out of the <w> start tag the current text follows, else -1
+	int inTag = 0;	// nonzero while between '<' and '>'
+	for (int i = 0; i < orig.length(); i++) {
+		if (orig[i] == '<') {
+			inTag = true;
+		}
+		else if (orig[i] == '>') {
+			inTag = false;
+			XMLTag t = tag.c_str();
+			if (!t.isEmpty()) {	// tags reported as empty leave tagLevel and wTag untouched
+				if (t.isEndTag()) {
+					tagLevel--;
+					wTag = -1;	// any end tag ends the current <w> context
+				}
+				else {
+					tagLevel++;
+					wTag = (t.getName() && !strcmp("w", t.getName())) ? out.size() : -1;	// remember where this <w> starts in out
+				}
+			}
+			out += t;
+			tag = "";
+		}
+		else if (inTag) {
+			tag += orig[i];
+		}
+		else {
+// for texts without <w> tags
+//			if (!tagLevel || wTag != -1) {
+			if (wTag != -1 || orig[i] == ' ') {	// other text outside <w> is copied to out but not mapped
+				bibMap.push_back(out.size());
+				wTags.push_back(wTag);
+			}
+			out += orig[i];
+		}
+	}
+	return out;
+}
+
+// Inserts addText into out buffer and adjusts Bible character pointers accordingly.
+// bibPos indexes bibMap/wTags; after == false inserts before that character, after == true just past it.
+// When wTags[bibPos] != -1, a start tag is folded into the existing <w and an "after" insert is skipped.
+void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags, bool after) {
+	int to = 0;	// offset in out where addText goes
+	if (!after && wTags[bibPos] != -1) {
+		to = wTags[bibPos] + 2;	// 2 past the recorded <w start tag offset, i.e. right after "<w"
+		addText--; // discard the '>'
+		addText << 2; // discard the '<w'
+	}
+	else {
+		to = bibMap[bibPos]+((after)?1:0);
+	}
+	if (!after || wTags[bibPos] == -1) {	// false only for an "after" insert on a character that has a <w> offset
+		out.insert(to, addText);
+		for (int i = bibPos+((after)?1:0); i < bibMap.size(); i++) {	// NOTE(review): entries before bibPos are never shifted - assumes none lie past `to`; confirm
+			bibMap[i] += addText.length();
+			if (wTags[i] != -1) wTags[i] += addText.length();
+		}
+	}
+}
+
+// Walks bibMap, appending each mapped char of markupBuf to the returned text and splitting it into words at
+// ' ', '.', ';' and ','; targetWordStarts/targetWordEnds get each word's first/last char index in that text.
+SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds) {
+	SWBuf bibWord = "";	// word currently being collected
+	SWBuf fromWord = "";	// unused in this function
+	SWBuf bibText = "";	// all mapped chars, separators included
+	for (BibMap::const_iterator it = bibMap.begin(); it != bibMap.end(); it++) {
+/*
+		char *b1 = markupBuf.getRawData()+*it;
+		char *b2 = b1;
+		__u32 uc = getUniCharFromUTF8(&b2);
+		bool wordBreak = false;
+		if (uc) {
+			SWBuf u8c;
+			u8c.append(b1, b2-b1);
+			if (strstr(ignoreSeries, u8c.getRawData()))
+		}
+*/
+		char c = markupBuf[*it];	// one char (byte) at a time, so all indexes here are byte indexes
+		if (c != ' ' && c != '.' && c != ';' && c != ',') {
+			if (!bibWord.length()) targetWordStarts.push_back(bibText.length());	// first char of a new word
+			bibWord += c;
+		}
+		else {
+			if (bibWord.length()) {
+				targetWordEnds.push_back(bibText.length()-1);	// index of the char before this separator
+				targetWords.push_back(bibWord);
+				bibWord = "";
+			}
+		}
+		bibText += c;
+	}
+	if (bibWord.length()) {	// flush a word that runs to the end of the text
+		targetWordEnds.push_back(bibText.length()-1);
+		targetWords.push_back(bibWord);
+	}
+	return bibText;
+}
+
+// Appends one <w> tag to wordTags per "Word" entry attribute of fromMod, and to fromWords/fromWordTags each word of its Text with that tag's index.
+void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &fromWords, vector<int> &fromWordTags) {
+	fromMod.renderText();	// be sure FromMod has processed entry attributes
+	AttributeList &words = fromMod.getEntryAttributes()["Word"];
+	SWBuf fromWord = "";
+	SWBuf bibWord = "";
+	for (AttributeList::iterator it = words.begin(); it != words.end(); it++) {
+		// this is our new <w> XMLTag.
+		// attributes will be added below
+		XMLTag w("w");
+		int parts = atoi(it->second["PartCount"]);	// when parts is 0 the loop below adds nothing
+		SWBuf lemma = "";
+		SWBuf morph = "";
+		for (int i = 1; i <= parts; i++) {
+			SWBuf key = "";
+			key = (parts == 1) ? "Lemma" : SWBuf().setFormatted("Lemma.%d", i);	// keys carry a ".N" suffix unless there is exactly one part
+			AttributeValue::iterator li = it->second.find(key);
+			if (li != it->second.end()) {
+				if (i > 1) lemma += " ";	// parts are space separated
+				key = (parts == 1) ? "LemmaClass" : SWBuf().setFormatted("LemmaClass.%d", i);
+				AttributeValue::iterator lci = it->second.find(key);
+				if (lci != it->second.end()) {
+					lemma += lci->second + ":";	// "class:" prefix when a class is present
+				}
+				lemma += li->second;
+			}
+			key = (parts == 1) ? "Morph" : SWBuf().setFormatted("Morph.%d", i);
+			li = it->second.find(key);
+			// silly.  sometimes morph counts don't equal lemma counts
+			if (i == 1 && parts != 1 && li == it->second.end()) {
+				li = it->second.find("Morph");
+			}
+			if (li != it->second.end()) {
+				if (i > 1) morph += " ";
+				key = (parts == 1) ? "MorphClass" : SWBuf().setFormatted("MorphClass.%d", i);
+				AttributeValue::iterator lci = it->second.find(key);
+				// silly.  sometimes morph counts don't equal lemma counts
+				if (i == 1 && parts != 1 && lci == it->second.end()) {
+					lci = it->second.find("MorphClass");
+				}
+				if (lci != it->second.end()) {
+					morph += lci->second + ":";
+				}
+				morph += li->second;
+			}
+			// TODO: add src tags and maybe other attributes
+		}
+
+		if (lemma.length()) w.setAttribute("lemma", lemma);
+		if (morph.length()) w.setAttribute("morph", morph);
+
+		// split this entry's Text into words; each word records the index this tag is about to get in wordTags
+		fromWord = it->second["Text"];
+		bibWord = "";
+		for (int j = 0; j < fromWord.length(); j++) {
+			char c = fromWord[j];
+//			if (!strchr(ignoreSeries, c)) {
+			if (c != ' ' && c != '.' && c != ';' && c != ',') {
+				bibWord += c;
+			}
+			else {
+				if (bibWord.length()) {
+					fromWords.push_back(bibWord);
+					fromWordTags.push_back(wordTags.size());
+					bibWord = "";
+				}
+			}
+		}
+		if (bibWord.length()) {	// flush a word that runs to the end of Text
+			fromWords.push_back(bibWord);
+			fromWordTags.push_back(wordTags.size());
+		}
+
+		wordTags.push_back(w);
+	}
+}
+
+// For each target word whose targetWordTags entry is > -1, calls insert() to add that tag at the word's start and "</w>" after its end.
+void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, const vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds) {
+	// TODO: this method needs some work,
+	// like putting multiple consecutive words
+	// together in one tag
+	for (int i = 0; i < targetWordTags.size(); i++) {
+		if (targetWordTags[i] > -1) {	// -1 marks a word left unmatched
+			insert((const char *)wordTags[targetWordTags[i]], markupBuf, targetWordStarts[i], bibMap, wTags);	// start tag, before the word's first char
+			insert("</w>", markupBuf, targetWordEnds[i], bibMap, wTags, true);	// end tag, just past the word's last char
+		}
+	}
+}