Generalized migratetags and extracted matcher logic

git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@532 07627401-56e2-0310-80f4-f8cd0041bdcd
author: Troy A. Griffitts <scribe@crosswire.org> 2019-04-14 22:36:24 +0000
committer: Troy A. Griffitts <scribe@crosswire.org> 2019-04-14 22:36:24 +0000
commit: bafd4fb4ad4652362e47471c422c1ebecd3be0bb (patch)
tree: fbccbba5ebbfef63bf4f60baa987f5337c290d65
parent: 90a9565d52b69c8001dae59d043d31295ac7640e (diff)
download: sword-tools-bafd4fb4ad4652362e47471c422c1ebecd3be0bb.tar.gz
5 files changed, 574 insertions, 390 deletions
diff --git a/migratetags/Makefile b/migratetags/Makefile
index 8958367..19f4335 100644
--- a/migratetags/Makefile
+++ b/migratetags/Makefile
@@ -1,4 +1,5 @@
-TARGETS= esvtag
+
+TARGETS= migratetags
 
 all: $(TARGETS)
 
diff --git a/migratetags/esvtag.cpp b/migratetags/esvtag.cpp
deleted file mode 100644
index 3b86f70..0000000
--- a/migratetags/esvtag.cpp
+++ /dev/null
@@ -1,389 +0,0 @@
-#include <versekey.h>
-#include <swmgr.h>
-#include <utilxml.h>
-#include <swbuf.h>
-#include <swmodule.h>
-#include <iostream>
-#include <vector>
-
-using namespace sword;
-using namespace std;
-
-typedef vector<unsigned long> BibMap;
-
-void insert(const SWBuf &addText, SWBuf &out, int bibPos, BibMap &bibMap, bool after = false);
-int compare(const SWBuf &s1, const SWBuf &s2);
-
-SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap);
-SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &esvWords, vector<int> &esvWordStarts, vector<int> &esvWordEnds);
-void pullKJVData(SWModule &kjv, vector<XMLTag>&wordTags, vector<SWBuf> &kjvWords, vector<int> &kjvWordTags);
-void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, const vector<int> &esvWordTags, const vector<XMLTag> &wordTags, const vector<int> &esvWordStarts, const vector<int> &esvWordEnds);
-
-
-// 
-// This is where the magic happens
-//
-// we must point each esv word to an XMLTag
-//
-// when the magic is done, and your guess is made
-// populate esvWordTags with the integer offset
-// into wordTags for which XMLTag you think it should
-// be.
-//
-void matchWords(vector<int> &esvWordTags, const vector<SWBuf> &esvWords, const vector<SWBuf> &kjvWords, const vector<int> &kjvWordTags) {
-
-	// initialize our results to all -1 so we can pop around and set
-	// words as we find them, and know which ones we haven't yet set
-	for (int i = 0; i < esvWords.size(); i++) esvWordTags.push_back(-1);
-
-
-	// poor effort attempt
-	int j = 0;
-	for (int i = 0; i < esvWords.size(); i++) {
-		while (true) {
-			int match = compare(esvWords[i], kjvWords[j]);
-			// if we have a better than 75% match of sequencial characters
-			// then we'll say we have a match
-			if (match > 75) {
-				esvWordTags[i] = kjvWordTags[j++];
-				break;
-			}
-			// TOTRY: maybe check one word before and after?
-			//
-			// be creative!
-			//
-		}
-	}
-}
-
-
-int main(int argc, char **argv) {
-	VerseKey vk;
-	SWMgr lib;
-	SWModule &esv = *lib.getModule("ESV");
-	SWModule &kjv = *lib.getModule("KJV");
-
-	// we'll do the whole Bible eventually, but let's just get one verse
-	// working well.
-	esv.setKey("gen1.1");		// lets try this verse
-//	for (esv = TOP; !esv.Error(); esv++) {
-
-		// XML word tags which should be placed in this verse (start tag)
-		// eg., <w lemma=...>
-		// pulled from KJV
-		vector<XMLTag> wordTags;
-
-		// Just the raw canonical Bible text of this verse with no tags
-		// eg., "In the beginning God created the heavens and the earth."
-		SWBuf justESVBibleText = "";
-
-		// a mapping for each character in justESVBibleText to the real location
-		// in our out buffer.  This allows us to insert our <w> and </w>
-		// tags in the correct place amongst the fully marked up
-		// ESV out buffer.  This work is all done in the insert() method
-		// above
-		BibMap bibMap;
-
-		// justESVBibleText (above) broken down into separate words
-		// ie. all words in the ESV from this verse
-		// eg. [0] = "In"; [1] = "the"; [2] = "beginning"; ...
-		vector<SWBuf> esvWords;
-
-		// where each corresponding esvWords[x] starts in justESVBibleText
-		// eg. for "In the beginning..."
-		//         [0] = 0; [1] = 3; [2] = 7; ...
-		// Needed to pass to insert method so we know where
-		// to insert the <w> start tag
-		vector<int> esvWordStarts;
-
-		// same as esvWordStarts, but the end of each word
-		// eg. [0] = 1; [1] = 5; [2] = 15
-		// Needed to pass to insert method so we know where
-		// to insert the </w> end tag
-		vector<int> esvWordEnds;
-
-		// This is the doozy.  This maps each ESV word to the correct
-		// wordTags entry.
-		vector<int> esvWordTags;
-
-		// Equivalent to esvWords above, but for the KJV.
-		// Useful for helping determine matches to ESV words
-		vector<SWBuf> kjvWords;
-
-		// Equivalent to esvWordTag which we need to produce,
-		// but this one is produced for us from the KJV data
-		// If we can match a kjvWords[x] entry, then we can assign
-		// esvWorkTags[ourMatch] = kjvWordTags[x]
-		vector<int> kjvWordTags;
-
-		bibMap.clear();
-
-		kjv.setKey(esv.getKey());
-
-		cout << "\nProcessing Verse: " << esv.getKeyText() << endl;
-		cout << "---------------------" << endl;
-
-		cout << "\nOur KJV Verse Markup" << endl;
-		cout << "---------------------" << endl;
-		cout << kjv.getRawEntry() << endl;
-		cout << "---------------------" << endl;
-
-
-		// grab our raw, fully marked up ESV text for this verse
-		SWBuf orig = esv.getRawEntryBuf();
-
-		cout << "\nOur Original ESV Markup" << endl;
-		cout << "---------------------" << endl;
-		cout << orig << endl;
-		cout << "---------------------" << endl;
-
-		// let's find where just the canonical text is amongst
-		// all our markup
-		// newESVMarkup will eventually hold our updated markup with
-		// the new <w> tags, but we'll start here by setting it to
-		// the processed original markup.
-		// on return, bibMap will be populated with each character
-		// and the corresponding location into newESVMarkup where
-		// the character resides.
-		SWBuf newESVMarkup = findCanonicalBibleText(orig, bibMap);
-
-		cout << "\nOur Original ESV Markup After XMLTag-ifying" << endl;
-		cout << "---------------------" << endl;
-		cout << newESVMarkup << endl;
-		cout << "---------------------" << endl;
-
-		// let's populate or ESV word data and fill in our
-		// justESVBibleText buffer
-		justESVBibleText = buildWordMaps(newESVMarkup, bibMap, esvWords, esvWordStarts, esvWordEnds);
-
-		cout << "\nJust ESV Bible Text" << endl;
-		cout << "---------------------" << endl;
-		cout << justESVBibleText << endl;
-		cout << "---------------------" << endl;
-
- 
-		// ok, now lets grab out the groovy data from the KJV module
-		pullKJVData(kjv, wordTags, kjvWords, kjvWordTags);
-
-
-		// 
-		// ok, here's the real work.
-		//
-		// This method needs to guess which ESV words match which KJV
-		// words and then point them to their same original language
-		// word tag by populating esvWordTags
-		//
-		matchWords(esvWordTags, esvWords, kjvWords, kjvWordTags);
-
-		// ok, now that we have our esvWordTags magically populated
-		// let's do the grunt work of inserting the <w> and </w> tags
-		insertWordTags(newESVMarkup, bibMap, esvWordTags, wordTags, esvWordStarts, esvWordEnds);
-
-
-		cout << "\nHere's how you mapped things..." << endl;
-		cout << "---------------------" << endl;
-		cout << "Total wordTags: " << wordTags.size() << endl;
-		cout << "\nESV Words: " << endl;
-		for (int i = 0; i < esvWords.size(); i++) {
-			cout << esvWords[i] << " : " << esvWordTags[i] << " => " << wordTags[esvWordTags[i]] << endl;
-		}
-		cout << "---------------------" << endl;
-		
-		cout << "\nAND... Here's your final output" << endl;
-		cout << "---------------------" << endl;
-		cout << newESVMarkup << endl;
-		cout << endl;
-//	}
-	return 0;
-}
-
-
-// builds up bibMap to contain only characters of Biblical text
-// and each character's corresponding real location in our output
-// buffer (returned value)
-SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap) {
-	SWBuf out = "";
-	SWBuf tag = "";
-	int tagLevel = 0;
-	int inTag = 0;
-	for (int i = 0; i < orig.length(); i++) {
-		if (orig[i] == '<') {
-			inTag = true;
-		}
-		else if (orig[i] == '>') {
-			inTag = false;
-			XMLTag t = tag.c_str();
-			if (!t.isEmpty()) {
-				if (t.isEndTag()) {
-					tagLevel--;
-				}
-				else {
-					tagLevel++;
-				}
-			}
-			out += t;
-			tag = "";
-		}
-		else if (inTag) {
-			tag += orig[i];
-		}
-		else {
-			if (!tagLevel) {
-				bibMap.push_back(out.size());
-			}
-			out += orig[i];
-		}
-	}
-	return out;
-}
-
-
-// Inserts addText into out buffer and adjusts Bible character pointers accordingly
-//
-void insert(const SWBuf &addText, SWBuf &out, int bibPos, BibMap &bibMap, bool after) {
-	out.insert(bibMap[bibPos]+((after)?1:0), addText);
-	for (int i = bibPos+((after)?1:0); i < bibMap.size(); i++) {
-		bibMap[i] += addText.length();
-	}
-}
-
-
-// Compares 2 words and tries to give a percentage assurance of a match
-// TODO: could use more smarts here
-//
-int compare(const SWBuf &s1, const SWBuf &s2) {
-	int retVal = 0;
-	SWBuf largest  = (s1.length() > s2.length()) ? s1 : s2;
-	SWBuf smallest = (s1.length() > s2.length()) ? s2 : s1;
-	int matches = 0;
-	int j = 0;
-	for (int i = 0; i < smallest.length() && j < largest.length(); i++) {
-		while (j < largest.length()) {
-			if (smallest[i] == largest[j++]) {
-				matches++;
-				break;
-			}
-		}
-	}
-	return (((float)matches) / largest.length()) * 100;
-}
-
-
-SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &esvWords, vector<int> &esvWordStarts, vector<int> &esvWordEnds) {
-	SWBuf bibWord = "";
-	SWBuf kjvWord = "";
-	SWBuf bibText = "";
-	for (BibMap::const_iterator it = bibMap.begin(); it != bibMap.end(); it++) {
-		char c = markupBuf[*it];
-		if ((c >= 'a' && c <='z') ||
-		    (c >= 'A' && c <='Z')
-		) {
-			if (!bibWord.length()) esvWordStarts.push_back(bibText.length());
-			bibWord += c;
-		}
-		else {
-			if (bibWord.length()) {
-				esvWordEnds.push_back(bibText.length()-1);
-				esvWords.push_back(bibWord);
-				bibWord = "";
-			}
-		}
-		bibText += c;
-	}
-	if (bibWord.length()) {
-		esvWordEnds.push_back(bibText.length()-1);
-		esvWords.push_back(bibWord);
-	}
-	return bibText;
-}
-
-
-void pullKJVData(SWModule &kjv, vector<XMLTag>&wordTags, vector<SWBuf> &kjvWords, vector<int> &kjvWordTags) {
-	kjv.RenderText();	// be sure KJV has processed entry attributes
-	AttributeList &words = kjv.getEntryAttributes()["Word"];
-	SWBuf kjvWord = "";
-	SWBuf bibWord = "";
-	for (AttributeList::iterator it = words.begin(); it != words.end(); it++) {
-		// this is our new <w> XMLTag.
-		// attributes will be added below
-		XMLTag w("w");
-		int parts = atoi(it->second["PartCount"]);
-		SWBuf lemma = "";
-		SWBuf morph = "";
-		for (int i = 1; i <= parts; i++) {
-			SWBuf key = "";
-			key = (parts == 1) ? "Lemma" : SWBuf().setFormatted("Lemma.%d", i);
-			AttributeValue::iterator li = it->second.find(key);
-			if (li != it->second.end()) {
-				if (i > 1) lemma += " ";
-				key = (parts == 1) ? "LemmaClass" : SWBuf().setFormatted("LemmaClass.%d", i);
-				AttributeValue::iterator lci = it->second.find(key);
-				if (lci != it->second.end()) {
-					lemma += lci->second + ":";
-				}
-				lemma += li->second;
-			}
-			key = (parts == 1) ? "Morph" : SWBuf().setFormatted("Morph.%d", i);
-			li = it->second.find(key);
-			// silly.  sometimes morph counts don't equal lemma counts
-			if (i == 1 && parts != 1 && li == it->second.end()) {
-				li = it->second.find("Morph");
-			}
-			if (li != it->second.end()) {
-				if (i > 1) morph += " ";
-				key = (parts == 1) ? "MorphClass" : SWBuf().setFormatted("MorphClass.%d", i);
-				AttributeValue::iterator lci = it->second.find(key);
-				// silly.  sometimes morph counts don't equal lemma counts
-				if (i == 1 && parts != 1 && lci == it->second.end()) {
-					lci = it->second.find("MorphClass");
-				}
-				if (lci != it->second.end()) {
-					morph += lci->second + ":";
-				}
-				morph += li->second;
-			}
-			// TODO: add src tags and maybe other attributes
-		}
-
-		if (lemma.length()) w.setAttribute("lemma", lemma);
-		if (morph.length()) w.setAttribute("morph", morph);
-
-
-		kjvWord = it->second["Text"];
-		bibWord = "";
-		for (int j = 0; j < kjvWord.length(); j++) {
-			char c = kjvWord[j];
-			if ((c >= 'a' && c <='z') ||
-			    (c >= 'A' && c <='Z')
-			) {
-				bibWord += c;
-			}
-			else {
-				if (bibWord.length()) {
-					kjvWords.push_back(bibWord);
-					kjvWordTags.push_back(wordTags.size());
-					bibWord = "";
-				}
-			}
-		}
-		if (bibWord.length()) {
-			kjvWords.push_back(bibWord);
-			kjvWordTags.push_back(wordTags.size());
-		}
-
-		wordTags.push_back(w);
-	}
-}
-
-
-void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, const vector<int> &esvWordTags, const vector<XMLTag> &wordTags, const vector<int> &esvWordStarts, const vector<int> &esvWordEnds) {
-	// TODO: this method needs some work,
-	// like putting multiple consecutive words
-	// together in one tag
-	for (int i = 0; i < esvWordTags.size(); i++) {
-		if (esvWordTags[i] > -1) {
-			insert((const char *)wordTags[esvWordTags[i]], markupBuf, esvWordStarts[i], bibMap);
-			insert("</w>", markupBuf, esvWordEnds[i], bibMap, true);
-		}
-	}
-}
diff --git a/migratetags/matchers/defaultmatcher.h b/migratetags/matchers/defaultmatcher.h
new file mode 100644
index 0000000..592dbf5
--- /dev/null
+++ b/migratetags/matchers/defaultmatcher.h
@@ -0,0 +1,80 @@
+#include "matcher.h"
+
+#ifndef defaultmatcher_h
+#define defaultmatcher_h
+
+class DefaultMatcher : public Matcher {
+public:
+
+// Scores how well two words match: the count of the shorter word's bytes found,
+// in order, in the longer word, as a percentage of the longer word's length.
+// Greek accents are stripped and both words upper-cased first.  TODO: more smarts
+virtual int compare(const SWBuf &s1, const SWBuf &s2) {
+	SWBuf t1 = s1;
+	SWBuf t2 = s2;
+	UTF8GreekAccents filter;
+	filter.setOptionValue("off");
+
+	// remove greek accents
+	filter.processText(t1);
+	filter.processText(t2);
+
+	// change to uppercase to match
+	StringMgr::getSystemStringMgr()->upperUTF8(t1.getRawData());
+	StringMgr::getSystemStringMgr()->upperUTF8(t2.getRawData());
+
+	SWBuf largest  = (t1.length() > t2.length()) ? t1 : t2;
+	SWBuf smallest = (t1.length() > t2.length()) ? t2 : t1;
+	if (!largest.length()) return 0;	// both words empty: nothing to score, and no 0/0 below
+	int matches = 0;
+	int j = 0;
+	for (int i = 0; i < smallest.length() && j < largest.length(); i++) {
+		while (j < largest.length()) {
+			if (smallest[i] == largest[j++]) {
+				matches++;
+				break;
+			}
+		}
+	}
+	return (((float)matches) / largest.length()) * 100;
+}
+//
+// This is where the magic happens
+//
+// we must point each targetMod word to an XMLTag
+//
+// one entry is appended to targetWordTags per targetWords entry: the integer
+// offset into wordTags of the XMLTag guessed for that word, or -1 if no guess.
+// fromWordTags is taken by value on purpose: entries are overwritten with -1
+// as they are used so that no source word is handed out twice.
+//
+
+virtual void matchWords(vector<int> &targetWordTags, const vector<SWBuf> &targetWords, const vector<SWBuf> &fromWords, vector<int> fromWordTags) {
+
+	// initialize our results to all -1 so we can pop around and set
+	// words as we find them, and know which ones we haven't yet set
+	for (int i = 0; i < targetWords.size(); i++) targetWordTags.push_back(-1);
+
+
+	// poor effort attempt: each target word takes the first source word
+	// that is still unused and scores above the threshold
+	for (int i = 0; i < targetWords.size(); ++i) {
+		for (int j = 0; j < fromWords.size(); ++j) {
+			if (fromWordTags[j] == -1) continue;
+			int match = compare(targetWords[i], fromWords[j]);
+			// if we have a better than XX% match of sequential characters
+			// then we'll say we have a match
+			if (match > 49) {
+				targetWordTags[i] = fromWordTags[j];
+				fromWordTags[j] = -1;
+				break;
+			}
+			// TOTRY: maybe check one word before and after?
+			//
+			// be creative!
+			//
+		}
+	}
+}
+};
+#endif
diff --git a/migratetags/matchers/matcher.h b/migratetags/matchers/matcher.h
new file mode 100644
index 0000000..1448c2e
--- /dev/null
+++ b/migratetags/matchers/matcher.h
@@ -0,0 +1,25 @@
+#ifndef matcher_h
+#define matcher_h
+
+class Matcher {
+public:
+
+// virtual so a matcher can be safely destroyed through a Matcher pointer
+virtual ~Matcher() {}
+
+// Compares 2 words and tries to give a percentage assurance of a match
+virtual int compare(const SWBuf &s1, const SWBuf &s2) = 0;
+
+// This is where the magic happens
+//
+// we must point each targetMod word to an XMLTag
+//
+// when the magic is done, and your guess is made
+// populate targetWordTags with the integer offset
+// into wordTags for which XMLTag you think it should
+// be, or -1 for a word you could not match.
+//
+virtual void matchWords(vector<int> &targetWordTags, const vector<SWBuf> &targetWords, const vector<SWBuf> &fromWords, vector<int> fromWordTags) = 0;
+
+};
+#endif
diff --git a/migratetags/migratetags.cpp b/migratetags/migratetags.cpp
new file mode 100644
index 0000000..2051a22
--- /dev/null
+++ b/migratetags/migratetags.cpp
@@ -0,0 +1,467 @@
+#include <versekey.h>
+#include <utf8greekaccents.h>
+#include <swmgr.h>
+#include <utilxml.h>
+#include <swbuf.h>
+#include <swmodule.h>
+#include <stringmgr.h>
+#include <iostream>
+#include <vector>
+
+using namespace sword;
+using namespace std;
+
+#include "matchers/matcher.h"
+#include "matchers/defaultmatcher.h"
+
+// select your matcher here
+Matcher *matcher = new DefaultMatcher();
+const char *targetModuleName="NA28";
+const char *strongsSourceModuleName="WHNU";
+
+
+const char *ignoreSeries = "⸆¹⸆²⸆⸇᾿˸¹˸²˸³˸·¹²⟦–ʹ°¹°²⸋¹⸋²⸋⸌¹⸌°*[];⸀¹⸀²⸀³⸁⸀◆⟧ ⸂¹⸂²⸄⸂⸅⸃⸉¹⸈⸉⸊  ";
+
+typedef vector<unsigned long> BibMap;
+
+void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags, bool after = false);
+
+SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &tTags);
+SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds);
+void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &fromWords, vector<int> &fromWordTags);
+void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, const vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds);
+
+// app options
+bool optionFilterAccents = false;
+bool optionFilterAppCrit = false;
+bool optionDebug         = false;
+
+// Prints the optional error message and the option summary to stderr, then exits with status -1.
+void usage(const char *progName, const char *error = 0) {
+	if (error) fprintf(stderr, "\n%s: %s\n", progName, error);
+	fprintf(stderr, "\n=== migratetags (Revision $Rev$) Migrate word morphology from one module to another.\n");
+	fprintf(stderr, "\nusage: %s [options]\n", progName);
+	fprintf(stderr, "  -v\t\t\t verbose: print lots of information while processing\n");
+	fprintf(stderr, "  -fa\t\t\t filter accents: remove Greek accents from final text\n");
+	fprintf(stderr, "  -fc\t\t\t filter apparatus: remove text-critical marks from final text\n");
+	fprintf(stderr, "\n\n");
+	exit(-1);
+}
+
+int main(int argc, char **argv) {
+	const char *progName   = argv[0];
+	for (int i = 1; i < argc; i++) {
+		if (!strcmp(argv[i], "-v")) {
+			optionDebug = true;
+		}
+		else if (!strcmp(argv[i], "-fa")) {
+			optionFilterAccents = true;
+		}
+		else if (!strcmp(argv[i], "-fc")) {
+			optionFilterAppCrit = true;
+		}
+		else usage(progName, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
+	}
+	VerseKey vk;
+	SWMgr lib;
+	lib.setGlobalOption("Textual Variants", "Secondary Reading");
+	SWModule *m = lib.getModule(targetModuleName);
+	if (!m) {
+		cerr << "couldn't find target module: " << targetModuleName << ".\n";
+		exit(1);
+	}
+	SWModule &targetMod = *m;
+	m = lib.getModule(strongsSourceModuleName);
+	if (!m) {
+		cerr << "couldn't find source module: " << strongsSourceModuleName << ".\n";
+		exit(1);
+	}
+	SWModule &fromMod = *m;
+
+	// we'll do the whole Bible eventually, but let's just get one verse
+	// working well.
+	targetMod.setKey("mat1.1");		// let's try this verse
+	int z = 0;
+	for (;
+//!z &&
+!targetMod.popError(); targetMod++) {
+	z++;
+
+		// XML word tags which should be placed in this verse (start tag)
+		// eg., <w lemma=...>
+		// pulled from FromMod
+		vector<XMLTag> wordTags;
+
+		// Just the raw canonical Bible text of this verse with no tags
+		// eg., "In the beginning God created the heavens and the earth."
+		SWBuf justTargetModBibleText = "";
+
+		// a mapping for each character in justTargetModBibleText to the real location
+		// in our out buffer.  This allows us to insert our <w> and </w>
+		// tags in the correct place amongst the fully marked up
+		// TargetMod out buffer.  This work is all done in the insert() method
+		// above
+		BibMap bibMap;
+		BibMap wTags;
+
+		// justTargetModBibleText (above) broken down into separate words
+		// ie. all words in the TargetMod from this verse
+		// eg. [0] = "In"; [1] = "the"; [2] = "beginning"; ...
+		vector<SWBuf> targetWords;
+
+		// where each corresponding targetWords[x] starts in justTargetModBibleText
+		// eg. for "In the beginning..."
+		//         [0] = 0; [1] = 3; [2] = 7; ...
+		// Needed to pass to insert method so we know where
+		// to insert the <w> start tag
+		vector<int> targetWordStarts;
+
+		// same as targetWordStarts, but the end of each word
+		// eg. [0] = 1; [1] = 5; [2] = 15
+		// Needed to pass to insert method so we know where
+		// to insert the </w> end tag
+		vector<int> targetWordEnds;
+
+		// This is the doozy.  This maps each TargetMod word to the correct
+		// wordTags entry.
+		vector<int> targetWordTags;
+
+		// Equivalent to targetWords above, but for the FromMod.
+		// Useful for helping determine matches to TargetMod words
+		vector<SWBuf> fromWords;
+
+		// Equivalent to targetWordTags which we need to produce,
+		// but this one is produced for us from the FromMod data
+		// If we can match a fromWords[x] entry, then we can assign
+		// targetWordTags[ourMatch] = fromWordTags[x]
+		vector<int> fromWordTags;
+
+		bibMap.clear();
+		wTags.clear();
+
+		fromMod.setKey(targetMod.getKey());
+		cout << "$$$ " << targetMod.getKeyText() << endl;
+
+if (optionDebug) {
+		cout << "\nProcessing Verse: " << targetMod.getKeyText() << endl;
+		cout << "---------------------" << endl;
+
+		cout << "\nOur FromMod Verse Markup" << endl;
+		cout << "---------------------" << endl;
+		cout << fromMod.getRawEntry() << endl;
+		cout << "---------------------" << endl;
+}
+
+
+		// grab our raw, fully marked up TargetMod text for this verse
+		SWBuf orig = targetMod.getRawEntryBuf();
+
+		if (optionFilterAccents) {
+			UTF8GreekAccents filter;
+			filter.setOptionValue("off");
+			filter.processText(orig);
+		}
+
+		if (optionFilterAppCrit) {
+			SWBuf o = orig;
+			const unsigned char* from = (unsigned char*)o.c_str();
+			orig = "";
+			while (*from) {		
+				__u32 ch = getUniCharFromUTF8(&from, true);
+				// if ch is bad, then convert to replacement char
+				if (!ch) ch = 0xFFFD;
+				SWBuf checkChar;
+				getUTF8FromUniChar(ch, &checkChar);
+				if (checkChar != " " && strstr(ignoreSeries, checkChar.c_str())) continue;
+				orig.append(checkChar);
+			}
+		}
+
+if (optionDebug) {
+		cout << "\nOur Original TargetMod Markup" << endl;
+		cout << "---------------------" << endl;
+		cout << orig << endl;
+		cout << "---------------------" << endl;
+}
+
+		// let's find where just the canonical text is amongst
+		// all our markup
+		// newTargetModMarkup will eventually hold our updated markup with
+		// the new <w> tags, but we'll start here by setting it to
+		// the processed original markup.
+		// on return, bibMap will be populated with each character
+		// and the corresponding location into newTargetModMarkup where
+		// the character resides.
+		SWBuf newTargetModMarkup = findCanonicalBibleText(orig, bibMap, wTags);
+
+if (optionDebug) {
+		cout << "\nOur Original TargetMod Markup After XMLTag-ifying" << endl;
+		cout << "---------------------" << endl;
+		cout << newTargetModMarkup << endl;
+		cout << "---------------------" << endl;
+
+		cout << "\nOur bibMap" << endl;
+		cout << "---------------------" << endl;
+		for (BibMap::iterator it = bibMap.begin(); it != bibMap.end(); ++it) {
+			cout << *it << " ";
+		}
+		cout << "\n---------------------" << endl;
+}
+
+		// let's populate our TargetMod word data and fill in our
+		// justTargetModBibleText buffer
+		justTargetModBibleText = buildWordMaps(newTargetModMarkup, bibMap, targetWords, targetWordStarts, targetWordEnds);
+
+if (optionDebug) {
+		cout << "\nJust TargetMod Bible Text" << endl;
+		cout << "---------------------" << endl;
+		cout << justTargetModBibleText << endl;
+		cout << "---------------------" << endl;
+}
+
+ 
+		// ok, now lets grab out the groovy data from the FromMod module
+		pullFromModData(fromMod, wordTags, fromWords, fromWordTags);
+
+
+		// 
+		// ok, here's the real work.
+		//
+		// This method needs to guess which TargetMod words match which FromMod
+		// words and then point them to their same original language
+		// word tag by populating targetWordTags
+		//
+		matcher->matchWords(targetWordTags, targetWords, fromWords, fromWordTags);
+
+
+		// ok, now that we have our targetWordTags magically populated
+		// let's do the grunt work of inserting the <w> and </w> tags
+		insertWordTags(newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds);
+
+
+if (optionDebug) {
+		cout << "\nHere's how you mapped things..." << endl;
+		cout << "---------------------" << endl;
+		cout << "Total wordTags: " << wordTags.size() << endl;
+		cout << "\nTargetMod Words: " << endl;
+}
+		bool warned = false;
+		for (int i = 0; i < targetWords.size(); i++) {
+			if (targetWordTags[i] == -1 && !strstr(ignoreSeries, targetWords[i])) {
+				if (!warned) cerr << "*** Error: didn't match all words: " << targetMod.getKeyText() << endl;
+				warned = true;
+			}
+if (optionDebug) {
+			cout << targetWords[i] << " : " << targetWordTags[i] << " => " << (targetWordTags[i] != -1 ? wordTags[targetWordTags[i]] : "") << endl;
+}
+		}
+if (optionDebug) {
+		cout << "---------------------" << endl;
+		
+		cout << "\nAND... Here's your final output" << endl;
+		cout << "---------------------" << endl;
+}
+		cout << newTargetModMarkup << endl;
+if (optionDebug) {
+		cout << endl;
+}
+	}
+	return 0;
+}
+
+
+// builds up bibMap to contain only characters of Biblical text
+// and each character's corresponding real location in our output
+// buffer (returned value)
+SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) {
+	SWBuf out = "";
+	SWBuf tag = "";
+	int tagLevel = 0;
+	int wTag = -1;
+	int inTag = 0;
+	for (int i = 0; i < orig.length(); i++) {
+		if (orig[i] == '<') {
+			inTag = true;
+		}
+		else if (orig[i] == '>') {
+			inTag = false;
+			XMLTag t = tag.c_str();
+			if (!t.isEmpty()) {
+				if (t.isEndTag()) {
+					tagLevel--;
+					wTag = -1;
+				}
+				else {
+					tagLevel++;
+					wTag = (t.getName() && !strcmp("w", t.getName())) ? out.size() : -1;
+				}
+			}
+			out += t;
+			tag = "";
+		}
+		else if (inTag) {
+			tag += orig[i];
+		}
+		else {
+// for texts without <w> tags
+//			if (!tagLevel || wTag != -1) {
+			if (wTag != -1 || orig[i] == ' ') {
+				bibMap.push_back(out.size());
+				wTags.push_back(wTag);
+			}
+			out += orig[i];
+		}
+	}
+	return out;
+}
+
+
+// Inserts addText into out buffer and adjusts Bible character pointers accordingly
+//
+void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags, bool after) {
+	int to = 0;
+	if (!after && wTags[bibPos] != -1) {
+		to = wTags[bibPos] + 2;
+		addText--; // discard the '>'
+		addText << 2; // discard the '<w'
+	}
+	else {
+		to = bibMap[bibPos]+((after)?1:0);
+	}
+	if (!after || wTags[bibPos] == -1) {
+		out.insert(to, addText);
+		for (int i = bibPos+((after)?1:0); i < bibMap.size(); i++) {
+			bibMap[i] += addText.length();
+			if (wTags[i] != -1) wTags[i] += addText.length();
+		}
+	}
+}
+
+
+
+SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds) {
+	SWBuf bibWord = "";
+	SWBuf fromWord = "";
+	SWBuf bibText = "";
+	for (BibMap::const_iterator it = bibMap.begin(); it != bibMap.end(); it++) {
+/*
+		char *b1 = markupBuf.getRawData()+*it;
+		char *b2 = b1;
+		__u32 uc = getUniCharFromUTF8(&b2);
+		bool wordBreak = false;
+		if (uc) {
+			SWBuf u8c;
+			u8c.append(b1, b2-b1);
+			if (strstr(ignoreSeries, u8c.getRawData()))
+		}
+*/
+		char c = markupBuf[*it];
+		if (c != ' ' && c != '.' && c != ';' && c != ',') {
+			if (!bibWord.length()) targetWordStarts.push_back(bibText.length());
+			bibWord += c;
+		}
+		else {
+			if (bibWord.length()) {
+				targetWordEnds.push_back(bibText.length()-1);
+				targetWords.push_back(bibWord);
+				bibWord = "";
+			}
+		}
+		bibText += c;
+	}
+	if (bibWord.length()) {
+		targetWordEnds.push_back(bibText.length()-1);
+		targetWords.push_back(bibWord);
+	}
+	return bibText;
+}
+
+
+void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &fromWords, vector<int> &fromWordTags) {
+	fromMod.renderText();	// be sure FromMod has processed entry attributes
+	AttributeList &words = fromMod.getEntryAttributes()["Word"];
+	SWBuf fromWord = "";
+	SWBuf bibWord = "";
+	for (AttributeList::iterator it = words.begin(); it != words.end(); it++) {
+		// this is our new <w> XMLTag.
+		// attributes will be added below
+		XMLTag w("w");
+		int parts = atoi(it->second["PartCount"]);
+		SWBuf lemma = "";
+		SWBuf morph = "";
+		for (int i = 1; i <= parts; i++) {
+			SWBuf key = "";
+			key = (parts == 1) ? "Lemma" : SWBuf().setFormatted("Lemma.%d", i);
+			AttributeValue::iterator li = it->second.find(key);
+			if (li != it->second.end()) {
+				if (i > 1) lemma += " ";
+				key = (parts == 1) ? "LemmaClass" : SWBuf().setFormatted("LemmaClass.%d", i);
+				AttributeValue::iterator lci = it->second.find(key);
+				if (lci != it->second.end()) {
+					lemma += lci->second + ":";
+				}
+				lemma += li->second;
+			}
+			key = (parts == 1) ? "Morph" : SWBuf().setFormatted("Morph.%d", i);
+			li = it->second.find(key);
+			// silly.  sometimes morph counts don't equal lemma counts
+			if (i == 1 && parts != 1 && li == it->second.end()) {
+				li = it->second.find("Morph");
+			}
+			if (li != it->second.end()) {
+				if (i > 1) morph += " ";
+				key = (parts == 1) ? "MorphClass" : SWBuf().setFormatted("MorphClass.%d", i);
+				AttributeValue::iterator lci = it->second.find(key);
+				// silly.  sometimes morph counts don't equal lemma counts
+				if (i == 1 && parts != 1 && lci == it->second.end()) {
+					lci = it->second.find("MorphClass");
+				}
+				if (lci != it->second.end()) {
+					morph += lci->second + ":";
+				}
+				morph += li->second;
+			}
+			// TODO: add src tags and maybe other attributes
+		}
+
+		if (lemma.length()) w.setAttribute("lemma", lemma);
+		if (morph.length()) w.setAttribute("morph", morph);
+
+
+		fromWord = it->second["Text"];
+		bibWord = "";
+		for (int j = 0; j < fromWord.length(); j++) {
+			char c = fromWord[j];
+//			if (!strchr(ignoreSeries, c)) {
+			if (c != ' ' && c != '.' && c != ';' && c != ',') {
+				bibWord += c;
+			}
+			else {
+				if (bibWord.length()) {
+					fromWords.push_back(bibWord);
+					fromWordTags.push_back(wordTags.size());
+					bibWord = "";
+				}
+			}
+		}
+		if (bibWord.length()) {
+			fromWords.push_back(bibWord);
+			fromWordTags.push_back(wordTags.size());
+		}
+
+		wordTags.push_back(w);
+	}
+}
+
+
+void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, const vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds) {
+	// TODO: this method needs some work,
+	// like putting multiple consecutive words
+	// together in one tag
+	for (int i = 0; i < targetWordTags.size(); i++) {
+		if (targetWordTags[i] > -1) {
+			insert((const char *)wordTags[targetWordTags[i]], markupBuf, targetWordStarts[i], bibMap, wTags);
+			insert("</w>", markupBuf, targetWordEnds[i], bibMap, wTags, true);
+		}
+	}
+}
author	Troy A. Griffitts <scribe@crosswire.org>	2019-04-14 22:36:24 +0000
committer	Troy A. Griffitts <scribe@crosswire.org>	2019-04-14 22:36:24 +0000
commit	bafd4fb4ad4652362e47471c422c1ebecd3be0bb (patch)
tree	fbccbba5ebbfef63bf4f60baa987f5337c290d65
parent	90a9565d52b69c8001dae59d043d31295ac7640e (diff)
download	sword-tools-bafd4fb4ad4652362e47471c422c1ebecd3be0bb.tar.gz