committed first cut at tag migration tool to move <w> tags from one module to another

git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@470 07627401-56e2-0310-80f4-f8cd0041bdcd
author: Troy A. Griffitts <scribe@crosswire.org> 2013-10-23 14:33:38 +0000
committer: Troy A. Griffitts <scribe@crosswire.org> 2013-10-23 14:33:38 +0000
commit: e9f75761bbd24bb89a1c031d6bf6749e022e2549 (patch)
tree: 0f87f2d7c8fb370b98547edf76b68c42d7554eee
parent: 52bd2b309a8fe80357e35f018807a24fc8575042 (diff)
download: sword-tools-e9f75761bbd24bb89a1c031d6bf6749e022e2549.tar.gz
2 files changed, 400 insertions, 0 deletions
diff --git a/migratetags/Makefile b/migratetags/Makefile
new file mode 100644
index 0000000..8958367
--- /dev/null
+++ b/migratetags/Makefile
@@ -0,0 +1,11 @@
+TARGETS= esvtag
+
+all: $(TARGETS)
+
+# 'all' and 'clean' name actions, not files, so never let a stray
+# file with one of those names satisfy them
+.PHONY: all clean
+
+# -f: cleaning a tree in which nothing has been built yet is not an error
+clean:
+	rm -f $(TARGETS)
+
+# build each target straight from its single .cpp file against libsword
+.cpp:
+	g++ -g `pkg-config --cflags sword` $< -o $@ `pkg-config --libs sword`
+
+
diff --git a/migratetags/esvtag.cpp b/migratetags/esvtag.cpp
new file mode 100644
index 0000000..3b86f70
--- /dev/null
+++ b/migratetags/esvtag.cpp
@@ -0,0 +1,389 @@
+#include <versekey.h>
+#include <swmgr.h>
+#include <utilxml.h>
+#include <swbuf.h>
+#include <swmodule.h>
+#include <iostream>
+#include <vector>
+
+using namespace sword;
+using namespace std;
+
+typedef vector<unsigned long> BibMap;
+
+void insert(const SWBuf &addText, SWBuf &out, int bibPos, BibMap &bibMap, bool after = false);
+int compare(const SWBuf &s1, const SWBuf &s2);
+
+SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap);
+SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &esvWords, vector<int> &esvWordStarts, vector<int> &esvWordEnds);
+void pullKJVData(SWModule &kjv, vector<XMLTag>&wordTags, vector<SWBuf> &kjvWords, vector<int> &kjvWordTags);
+void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, const vector<int> &esvWordTags, const vector<XMLTag> &wordTags, const vector<int> &esvWordStarts, const vector<int> &esvWordEnds);
+
+
+//
+// This is where the magic happens
+//
+// We must point each ESV word to an XMLTag.
+//
+// On return esvWordTags holds exactly one entry per entry of esvWords:
+// the integer offset into wordTags of the XMLTag we guess belongs to
+// that word, or -1 if we could not come up with a guess.
+//
+// esvWordTags - output; any previous content is discarded
+// esvWords    - the words of the ESV verse, in verse order
+// kjvWords    - the words of the KJV verse, in verse order
+// kjvWordTags - for each kjvWords[x], its offset into wordTags
+//
+void matchWords(vector<int> &esvWordTags, const vector<SWBuf> &esvWords, const vector<SWBuf> &kjvWords, const vector<int> &kjvWordTags) {
+
+	// initialize our results to all -1 so we can pop around and set
+	// words as we find them, and know which ones we haven't yet set
+	esvWordTags.assign(esvWords.size(), -1);
+
+	// kjvWords and kjvWordTags are parallel; never read past the end
+	// of the shorter one should they ever disagree
+	const size_t kjvCount = (kjvWords.size() < kjvWordTags.size()) ? kjvWords.size() : kjvWordTags.size();
+
+	// poor effort attempt:
+	// walk both verses front to back.  For each ESV word look forward,
+	// starting at the first KJV word we have not yet used, for a word
+	// which is similar enough.
+	size_t nextKJV = 0;
+	for (size_t i = 0; i < esvWords.size(); i++) {
+		for (size_t k = nextKJV; k < kjvCount; k++) {
+			// if we have a better than 75% match of sequential characters
+			// then we'll say we have a match
+			if (compare(esvWords[i], kjvWords[k]) > 75) {
+				esvWordTags[i] = kjvWordTags[k];
+				nextKJV = k + 1;
+				break;
+			}
+		}
+		// if nothing matched, esvWordTags[i] stays -1 and nextKJV stays
+		// put, so the following ESV word gets to try the same KJV words
+		// (this also guarantees we always terminate)
+		//
+		// TOTRY: maybe check one word before and after?
+		//
+		// be creative!
+		//
+	}
+}
+
+
+// Migrates the <w> word tags of one verse from the KJV module to the
+// ESV module and prints every intermediate step to stdout.
+//
+// Returns 0 on success, 1 if either the ESV or the KJV module can not
+// be found by the SWORD installation manager.
+int main(int argc, char **argv) {
+	SWMgr lib;
+
+	// getModule hands back a null pointer for a module which is not
+	// installed, so check before we turn these into references
+	SWModule *esvModule = lib.getModule("ESV");
+	SWModule *kjvModule = lib.getModule("KJV");
+	if (!esvModule || !kjvModule) {
+		cerr << "Both the ESV and the KJV modules must be installed to run this tool." << endl;
+		return 1;
+	}
+	SWModule &esv = *esvModule;
+	SWModule &kjv = *kjvModule;
+
+	// we'll do the whole Bible eventually, but let's just get one verse
+	// working well.
+	esv.setKey("gen1.1");		// lets try this verse
+//	for (esv = TOP; !esv.Error(); esv++) {
+
+		// XML word tags which should be placed in this verse (start tag)
+		// eg., <w lemma=...>
+		// pulled from KJV
+		vector<XMLTag> wordTags;
+
+		// Just the raw canonical Bible text of this verse with no tags
+		// eg., "In the beginning God created the heavens and the earth."
+		SWBuf justESVBibleText = "";
+
+		// a mapping for each character in justESVBibleText to the real location
+		// in our out buffer.  This allows us to insert our <w> and </w>
+		// tags in the correct place amongst the fully marked up
+		// ESV out buffer.  This work is all done in the insert() method
+		// below
+		BibMap bibMap;
+
+		// justESVBibleText (above) broken down into separate words
+		// ie. all words in the ESV from this verse
+		// eg. [0] = "In"; [1] = "the"; [2] = "beginning"; ...
+		vector<SWBuf> esvWords;
+
+		// where each corresponding esvWords[x] starts in justESVBibleText
+		// eg. for "In the beginning..."
+		//         [0] = 0; [1] = 3; [2] = 7; ...
+		// Needed to pass to insert method so we know where
+		// to insert the <w> start tag
+		vector<int> esvWordStarts;
+
+		// same as esvWordStarts, but the end of each word
+		// eg. [0] = 1; [1] = 5; [2] = 15
+		// Needed to pass to insert method so we know where
+		// to insert the </w> end tag
+		vector<int> esvWordEnds;
+
+		// This is the doozy.  This maps each ESV word to the correct
+		// wordTags entry, or to -1 if no entry could be found for it.
+		vector<int> esvWordTags;
+
+		// Equivalent to esvWords above, but for the KJV.
+		// Useful for helping determine matches to ESV words
+		vector<SWBuf> kjvWords;
+
+		// Equivalent to esvWordTags which we need to produce,
+		// but this one is produced for us from the KJV data
+		// If we can match a kjvWords[x] entry, then we can assign
+		// esvWordTags[ourMatch] = kjvWordTags[x]
+		vector<int> kjvWordTags;
+
+		bibMap.clear();
+
+		kjv.setKey(esv.getKey());
+
+		cout << "\nProcessing Verse: " << esv.getKeyText() << endl;
+		cout << "---------------------" << endl;
+
+		cout << "\nOur KJV Verse Markup" << endl;
+		cout << "---------------------" << endl;
+		cout << kjv.getRawEntry() << endl;
+		cout << "---------------------" << endl;
+
+
+		// grab our raw, fully marked up ESV text for this verse
+		SWBuf orig = esv.getRawEntryBuf();
+
+		cout << "\nOur Original ESV Markup" << endl;
+		cout << "---------------------" << endl;
+		cout << orig << endl;
+		cout << "---------------------" << endl;
+
+		// let's find where just the canonical text is amongst
+		// all our markup
+		// newESVMarkup will eventually hold our updated markup with
+		// the new <w> tags, but we'll start here by setting it to
+		// the processed original markup.
+		// on return, bibMap will be populated with each character
+		// and the corresponding location into newESVMarkup where
+		// the character resides.
+		SWBuf newESVMarkup = findCanonicalBibleText(orig, bibMap);
+
+		cout << "\nOur Original ESV Markup After XMLTag-ifying" << endl;
+		cout << "---------------------" << endl;
+		cout << newESVMarkup << endl;
+		cout << "---------------------" << endl;
+
+		// let's populate our ESV word data and fill in our
+		// justESVBibleText buffer
+		justESVBibleText = buildWordMaps(newESVMarkup, bibMap, esvWords, esvWordStarts, esvWordEnds);
+
+		cout << "\nJust ESV Bible Text" << endl;
+		cout << "---------------------" << endl;
+		cout << justESVBibleText << endl;
+		cout << "---------------------" << endl;
+
+
+		// ok, now lets grab out the groovy data from the KJV module
+		pullKJVData(kjv, wordTags, kjvWords, kjvWordTags);
+
+
+		//
+		// ok, here's the real work.
+		//
+		// This method needs to guess which ESV words match which KJV
+		// words and then point them to their same original language
+		// word tag by populating esvWordTags
+		//
+		matchWords(esvWordTags, esvWords, kjvWords, kjvWordTags);
+
+		// ok, now that we have our esvWordTags magically populated
+		// let's do the grunt work of inserting the <w> and </w> tags
+		insertWordTags(newESVMarkup, bibMap, esvWordTags, wordTags, esvWordStarts, esvWordEnds);
+
+
+		cout << "\nHere's how you mapped things..." << endl;
+		cout << "---------------------" << endl;
+		cout << "Total wordTags: " << wordTags.size() << endl;
+		cout << "\nESV Words: " << endl;
+		for (size_t i = 0; i < esvWords.size(); i++) {
+			// a word which was never matched carries -1; there is no
+			// wordTags entry to show for it, so don't go looking for one
+			const int tagIndex = (i < esvWordTags.size()) ? esvWordTags[i] : -1;
+			cout << esvWords[i] << " : " << tagIndex << " => ";
+			if (tagIndex > -1 && (size_t)tagIndex < wordTags.size()) {
+				cout << wordTags[tagIndex];
+			}
+			else {
+				cout << "(no match)";
+			}
+			cout << endl;
+		}
+		cout << "---------------------" << endl;
+
+		cout << "\nAND... Here's your final output" << endl;
+		cout << "---------------------" << endl;
+		cout << newESVMarkup << endl;
+		cout << endl;
+//	}
+	return 0;
+}
+
+
+// Walks the raw markup in orig one character at a time and returns a
+// copy of it in which every tag has been passed through XMLTag.
+//
+// Along the way bibMap is built up: for each character which sits
+// outside of any tag and outside of any still-open (non-empty) element,
+// that character's offset in the returned buffer is appended.
+SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap) {
+	SWBuf result = "";
+	SWBuf pending = "";		// what we have seen since the last '<'
+	int depth = 0;			// start tags seen minus end tags seen
+	bool insideTag = false;
+
+	for (int pos = 0; pos < orig.length(); pos++) {
+		const char ch = orig[pos];
+
+		if (ch == '<') {
+			insideTag = true;
+			continue;
+		}
+
+		if (ch == '>') {
+			insideTag = false;
+			XMLTag parsed = pending.c_str();
+			// empty (self closing) tags neither open nor close anything
+			if (!parsed.isEmpty()) {
+				depth += parsed.isEndTag() ? -1 : 1;
+			}
+			result += parsed;
+			pending = "";
+			continue;
+		}
+
+		if (insideTag) {
+			pending += ch;
+			continue;
+		}
+
+		// plain text; only text at the top level is recorded in bibMap
+		if (depth == 0) bibMap.push_back(result.size());
+		result += ch;
+	}
+
+	return result;
+}
+
+
+// Splices addText into out at the buffer offset of Bible character
+// bibPos, or one past that offset when after is true.  Every bibMap
+// entry from the insertion point on is then moved along by the length
+// of addText, so each keeps pointing at the character it pointed at before.
+//
+void insert(const SWBuf &addText, SWBuf &out, int bibPos, BibMap &bibMap, bool after) {
+	const int offset = after ? 1 : 0;
+	out.insert(bibMap[bibPos] + offset, addText);
+
+	const unsigned long grownBy = addText.length();
+	for (BibMap::size_type idx = bibPos + offset; idx < bibMap.size(); ++idx) {
+		bibMap[idx] += grownBy;
+	}
+}
+
+
+// Compares 2 words and tries to give a percentage assurance of a match
+//
+// The characters of the shorter word are looked for, in order, in the
+// longer word; each character of the longer word can be used only once.
+// The result is the number of characters found, as a percentage of the
+// length of the longer word (0..100).  Comparison is case sensitive.
+// Two empty words yield 0.
+//
+// TODO: could use more smarts here
+//
+int compare(const SWBuf &s1, const SWBuf &s2) {
+	// on equal length s2 counts as the "largest", s1 as the "smallest"
+	const SWBuf &largest  = (s1.length() > s2.length()) ? s1 : s2;
+	const SWBuf &smallest = (s1.length() > s2.length()) ? s2 : s1;
+
+	const unsigned long longLen  = largest.length();
+	const unsigned long shortLen = smallest.length();
+
+	// nothing to compare, and we must not divide by zero below
+	if (!longLen) return 0;
+
+	const char *longText  = largest.c_str();
+	const char *shortText = smallest.c_str();
+
+	int matches = 0;
+	unsigned long j = 0;
+	for (unsigned long i = 0; i < shortLen && j < longLen; i++) {
+		// scan forward in the longer word for this character;
+		// j is never rewound, which is what keeps the match sequential
+		while (j < longLen) {
+			if (shortText[i] == longText[j++]) {
+				matches++;
+				break;
+			}
+		}
+	}
+	return (int)((((float)matches) / longLen) * 100);
+}
+
+
+// Reads, via bibMap, every canonical Bible character out of markupBuf
+// and returns them joined together as one plain text buffer.
+//
+// Each run of ASCII letters in that text counts as a word.  For every
+// word three parallel entries are appended:
+//   esvWords      - the word itself
+//   esvWordStarts - offset of its first letter in the returned text
+//   esvWordEnds   - offset of its last letter in the returned text
+SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &esvWords, vector<int> &esvWordStarts, vector<int> &esvWordEnds) {
+	SWBuf plainText = "";
+	SWBuf current = "";		// the word we are in the middle of, if any
+
+	for (BibMap::size_type n = 0; n < bibMap.size(); ++n) {
+		const char ch = markupBuf[bibMap[n]];
+		const bool letter = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
+
+		if (letter) {
+			// first letter of a new word: remember where it starts
+			if (current.length() == 0) esvWordStarts.push_back(plainText.length());
+			current += ch;
+		}
+		else if (current.length() != 0) {
+			// a non-letter ends the word; its last letter is the
+			// character most recently added to plainText
+			esvWordEnds.push_back(plainText.length() - 1);
+			esvWords.push_back(current);
+			current = "";
+		}
+
+		plainText += ch;
+	}
+
+	// the text may end in the middle of a word
+	if (current.length() != 0) {
+		esvWordEnds.push_back(plainText.length() - 1);
+		esvWords.push_back(current);
+	}
+
+	return plainText;
+}
+
+
+// Gathers the word data of the entry kjv is currently positioned at.
+//
+// For each entry found under the module's "Word" entry attributes one
+// XMLTag named "w" is appended to wordTags.  Its "lemma" and "morph"
+// attributes are assembled from the entry's Lemma / LemmaClass and
+// Morph / MorphClass values (one set per part, space separated, each
+// value prefixed by "<class>:" when a class is present).
+//
+// The entry's "Text" value is split into runs of ASCII letters.  Each
+// run is appended to kjvWords and, in parallel, the offset of the
+// entry's tag in wordTags is appended to kjvWordTags.
+void pullKJVData(SWModule &kjv, vector<XMLTag>&wordTags, vector<SWBuf> &kjvWords, vector<int> &kjvWordTags) {
+	kjv.RenderText();	// be sure KJV has processed entry attributes
+	AttributeList &wordAttrs = kjv.getEntryAttributes()["Word"];
+
+	for (AttributeList::iterator word = wordAttrs.begin(); word != wordAttrs.end(); ++word) {
+		AttributeValue &attrs = word->second;
+
+		const int partCount = atoi(attrs["PartCount"]);
+		// with a single part the attribute names carry no ".N" suffix
+		const bool single = (partCount == 1);
+
+		SWBuf lemmaValue = "";
+		SWBuf morphValue = "";
+		SWBuf name = "";
+		for (int part = 1; part <= partCount; part++) {
+
+			// lemma for this part
+			if (single) name = "Lemma";
+			else name.setFormatted("Lemma.%d", part);
+			AttributeValue::iterator found = attrs.find(name);
+			if (found != attrs.end()) {
+				if (part > 1) lemmaValue += " ";
+				if (single) name = "LemmaClass";
+				else name.setFormatted("LemmaClass.%d", part);
+				AttributeValue::iterator cls = attrs.find(name);
+				if (cls != attrs.end()) {
+					lemmaValue += cls->second + ":";
+				}
+				lemmaValue += found->second;
+			}
+
+			// morph for this part
+			if (single) name = "Morph";
+			else name.setFormatted("Morph.%d", part);
+			found = attrs.find(name);
+			// silly.  sometimes morph counts don't equal lemma counts
+			if (part == 1 && !single && found == attrs.end()) {
+				found = attrs.find("Morph");
+			}
+			if (found != attrs.end()) {
+				if (part > 1) morphValue += " ";
+				if (single) name = "MorphClass";
+				else name.setFormatted("MorphClass.%d", part);
+				AttributeValue::iterator cls = attrs.find(name);
+				// silly.  sometimes morph counts don't equal lemma counts
+				if (part == 1 && !single && cls == attrs.end()) {
+					cls = attrs.find("MorphClass");
+				}
+				if (cls != attrs.end()) {
+					morphValue += cls->second + ":";
+				}
+				morphValue += found->second;
+			}
+			// TODO: add src tags and maybe other attributes
+		}
+
+		// this is our new <w> XMLTag.
+		XMLTag w("w");
+		if (lemmaValue.length()) w.setAttribute("lemma", lemmaValue);
+		if (morphValue.length()) w.setAttribute("morph", morphValue);
+
+		// every word of this entry's text points at the tag we are
+		// about to append, ie. at the current end of wordTags
+		const int tagIndex = wordTags.size();
+
+		SWBuf text = attrs["Text"];
+		SWBuf current = "";
+		for (int pos = 0; pos < text.length(); pos++) {
+			const char ch = text[pos];
+			const bool letter = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
+			if (letter) {
+				current += ch;
+			}
+			else if (current.length()) {
+				kjvWords.push_back(current);
+				kjvWordTags.push_back(tagIndex);
+				current = "";
+			}
+		}
+		if (current.length()) {
+			kjvWords.push_back(current);
+			kjvWordTags.push_back(tagIndex);
+		}
+
+		wordTags.push_back(w);
+	}
+}
+
+
+// Wraps each matched ESV word in markupBuf with its word tag.
+//
+// For every word whose esvWordTags entry is not negative, the start tag
+// wordTags[entry] is inserted in front of the word's first character and
+// "</w>" right after its last character.  bibMap is kept up to date by
+// insert() as the buffer grows.
+void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, const vector<int> &esvWordTags, const vector<XMLTag> &wordTags, const vector<int> &esvWordStarts, const vector<int> &esvWordEnds) {
+	// TODO: this method needs some work,
+	// like putting multiple consecutive words
+	// together in one tag
+	for (vector<int>::size_type n = 0; n < esvWordTags.size(); ++n) {
+		const int tagIndex = esvWordTags[n];
+		if (tagIndex < 0) continue;	// no tag was found for this word
+
+		insert((const char *)wordTags[tagIndex], markupBuf, esvWordStarts[n], bibMap);
+		insert("</w>", markupBuf, esvWordEnds[n], bibMap, true);
+	}
+}
author	Troy A. Griffitts <scribe@crosswire.org>	2013-10-23 14:33:38 +0000
committer	Troy A. Griffitts <scribe@crosswire.org>	2013-10-23 14:33:38 +0000
commit	e9f75761bbd24bb89a1c031d6bf6749e022e2549 (patch)
tree	0f87f2d7c8fb370b98547edf76b68c42d7554eee
parent	52bd2b309a8fe80357e35f018807a24fc8575042 (diff)
download	sword-tools-e9f75761bbd24bb89a1c031d6bf6749e022e2549.tar.gz