Committed latest version of migrate tags

Made it work both with modules that already contain <w> tags and with modules without <w> tags; added parameter -t to specify the target module. git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@540 07627401-56e2-0310-80f4-f8cd0041bdcd
author: Troy A. Griffitts <scribe@crosswire.org> 2022-02-17 17:42:32 +0000
committer: Troy A. Griffitts <scribe@crosswire.org> 2022-02-17 17:42:32 +0000
commit: baa326c09201d76f02c28168ab4eaad427596b48 (patch)
tree: 28152621d237eebed45ca9f1e4fab4e6596c348f /migratetags/migratetags.cpp
parent: a05a43ad3ea9aef59466d4bb600e26c4cb777784 (diff)
download: sword-tools-baa326c09201d76f02c28168ab4eaad427596b48.tar.gz
1 files changed, 112 insertions, 35 deletions
diff --git a/migratetags/migratetags.cpp b/migratetags/migratetags.cpp
index 2b6b92b..ecb1ed0 100644
--- a/migratetags/migratetags.cpp
+++ b/migratetags/migratetags.cpp
@@ -3,6 +3,7 @@
 #include <swmgr.h>
 #include <utilxml.h>
 #include <swbuf.h>
+#include <swconfig.h>
 #include <swmodule.h>
 #include <stringmgr.h>
 #include <iostream>
@@ -14,11 +15,13 @@ using namespace std;
 #include "matchers/matcher.h"
 
 // select your matcher here
+//#include "matchers/gntmatcher.h"
 #include "matchers/defaultmatcher.h"
 Matcher *matcher = new DefaultMatcher();
 
-const char *targetModuleName="NA28";
-const char *strongsSourceModuleName="WHNU";
+// hard code your from and to modules here or pass them on the command line with -ss and -t
+SWBuf strongsSourceModuleName = "WHNU";
+SWBuf targetModuleName = "NA28FromImp";
 
 
 const char *ignoreSeries = "⸆¹⸆²⸆⸇᾿˸¹˸²˸³˸·¹²⟦–ʹ°¹°²⸋¹⸋²⸋⸌¹⸌°*[];⸀¹⸀²⸀³⸁⸀◆⟧ ⸂¹⸂²⸄⸂⸅⸃⸉¹⸈⸉⸊  ";
@@ -30,12 +33,14 @@ void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags
 SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &tTags);
 SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds);
 void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &fromWords, vector<int> &fromWordTags);
-void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, const vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds);
+void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds);
 
 // app options
 bool optionFilterAccents = false;
 bool optionFilterAppCrit = false;
 bool optionDebug         = false;
+vector<SWBuf> optionExceptionFile;
+SWConfig *exceptionFile = 0;
 
 void usage(const char *progName, const char *error = 0) {
 	if (error) fprintf(stderr, "\n%s: %s\n", progName, error);
@@ -43,6 +48,10 @@ void usage(const char *progName, const char *error = 0) {
 	fprintf(stderr, "\nusage: %s [options]\n", progName);
 	fprintf(stderr, "  -v\t\t\t verbose: print lots of information while processing\n");
 	fprintf(stderr, "  -fa\t\t\t filter accents: remove Greek accents from final text\n");
+	fprintf(stderr, "  -fc\t\t\t filter critical apparatus markers from final text\n");
+	fprintf(stderr, "  -ss <moduleName>\t provide the Strong's source module name\n");
+	fprintf(stderr, "  -t <moduleName>\t provide the target module name\n");
+	fprintf(stderr, "  -e <exception file>\t provide an ini-style .conf file with overriding tag exceptions.\n");
 	fprintf(stderr, "\n\n");
 	exit(-1);
 }
@@ -61,6 +70,24 @@ int main(int argc, char **argv) {
 		else if (!strcmp(argv[i], "-fc")) {
 			optionFilterAppCrit = true;
 		}
+		else if (!strcmp(argv[i], "-ss")) {
+			if ((i + 1) < argc) {
+				strongsSourceModuleName = argv[++i];
+			}
+			else usage(progName, "-ss argument requires a module name.");
+		}
+		else if (!strcmp(argv[i], "-t")) {
+			if ((i + 1) < argc) {
+				targetModuleName = argv[++i];
+			}
+			else usage(progName, "-t argument requires a module name.");
+		}
+		else if (!strcmp(argv[i], "-e")) {
+			if (i+1 < argc) {
+				optionExceptionFile.push_back(argv[++i]);
+			}
+			else usage(progName, "-e argument requires a file name.");
+		}
 		else usage(progName, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
 	}
 
@@ -72,16 +99,23 @@ int main(int argc, char **argv) {
 		exit(1);
 	}
 	SWModule &targetMod = *m;
-	m = lib.getModule(strongsSourceModuleName);
+	m = lib.getModule(strongsSourceModuleName.c_str());
 	if (!m) {
-		cerr << "couldn't find source module: " << strongsSourceModuleName << ".\n";
+		cerr << "couldn't find source module: " << strongsSourceModuleName.c_str() << ".\n";
 		exit(1);
 	}
 	SWModule &fromMod = *m;
 
+	for (int i = 0; i < optionExceptionFile.size(); ++i) {
+		SWBuf fileName = optionExceptionFile[i];
+		if (!i) exceptionFile = new SWConfig(fileName);
+		else (*exceptionFile) += SWConfig(fileName);
+	}
+
 	// we'll do the whole Bible eventually, but let's just get one verse
 	// working well.
-	targetMod.setKey("mat1.1");		// let's try this verse
+	((VerseKey *)targetMod.getKey())->setIntros(true);
+	targetMod.getKey()->setText("mat0.0");		// let's try this verse
 	int z = 0;
 	for (;
 //!z &&
@@ -147,7 +181,7 @@ if (optionDebug) {
 		cout << "\nProcessing Verse: " << targetMod.getKeyText() << endl;
 		cout << "---------------------" << endl;
 
-		cout << "\nOur FromMod Verse Markup" << endl;
+		cout << "\nOur strongsSourceModule Markup" << endl;
 		cout << "---------------------" << endl;
 		cout << fromMod.getRawEntry() << endl;
 		cout << "---------------------" << endl;
@@ -157,18 +191,20 @@ if (optionDebug) {
 		// grab our raw, fully marked up TargetMod text for this verse
 		SWBuf orig = targetMod.getRawEntryBuf();
 
-		if (optionFilterAccents) {
-			UTF8GreekAccents filter;
-			filter.setOptionValue("off");
-			filter.processText(orig);
-		}
+
+if (optionDebug) {
+		cout << "\nOur Original targetModule Markup" << endl;
+		cout << "---------------------" << endl;
+		cout << orig << endl;
+		cout << "---------------------" << endl;
+}
 
 		if (optionFilterAppCrit) {
 			SWBuf o = orig;
 			const unsigned char* from = (unsigned char*)o.c_str();
 			orig = "";
 			while (*from) {		
-				__u32 ch = getUniCharFromUTF8(&from, true);
+				SW_u32 ch = getUniCharFromUTF8(&from, true);
 				// if ch is bad, then convert to replacement char
 				if (!ch) ch = 0xFFFD;
 				SWBuf checkChar;
@@ -178,13 +214,6 @@ if (optionDebug) {
 			}
 		}
 
-if (optionDebug) {
-		cout << "\nOur Original TargetMod Markup" << endl;
-		cout << "---------------------" << endl;
-		cout << orig << endl;
-		cout << "---------------------" << endl;
-}
-
 		// let's find where just the canonical text is amongst
 		// all our markup
 		// newTargetModMarkup will eventually hold our updated markup with
@@ -196,7 +225,7 @@ if (optionDebug) {
 		SWBuf newTargetModMarkup = findCanonicalBibleText(orig, bibMap, wTags);
 
 if (optionDebug) {
-		cout << "\nOur Original TargetMod Markup After XMLTag-ifying" << endl;
+		cout << "\nOur Original targetModule Markup After XMLTag-ifying" << endl;
 		cout << "---------------------" << endl;
 		cout << newTargetModMarkup << endl;
 		cout << "---------------------" << endl;
@@ -214,7 +243,7 @@ if (optionDebug) {
 		justTargetModBibleText = buildWordMaps(newTargetModMarkup, bibMap, targetWords, targetWordStarts, targetWordEnds);
 
 if (optionDebug) {
-		cout << "\nJust TargetMod Bible Text" << endl;
+		cout << "\nJust targetModule Bible Text" << endl;
 		cout << "---------------------" << endl;
 		cout << justTargetModBibleText << endl;
 		cout << "---------------------" << endl;
@@ -234,14 +263,13 @@ if (optionDebug) {
 		//
 		matcher->matchWords(targetWordTags, targetWords, fromWords, fromWordTags);
 
-
 		// ok, now that we have our targetWordTags magically populated
 		// let's do the grunt work of inserting the <w> and </w> tags
-		insertWordTags(newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds);
+		insertWordTags((VerseKey *)targetMod.getKey(), newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds);
 
 
 if (optionDebug) {
-		cout << "\nHere's how you mapped things..." << endl;
+		cout << "\nHere's how we mapped things..." << endl;
 		cout << "---------------------" << endl;
 		cout << "Total wordTags: " << wordTags.size() << endl;
 		cout << "\nTargetMod Words: " << endl;
@@ -251,7 +279,7 @@ if (optionDebug) {
 			if (targetWordTags[i] == -1 && !strstr(ignoreSeries, targetWords[i])) {
 				if (!warned) {
 					cerr << "*** Error: didn't match all words: " << targetMod.getKeyText() << endl;
-					cerr << strongsSourceModuleName << ":";
+					cerr << strongsSourceModuleName.c_str() << ":";
 					for (int j = 0; j < fromWords.size(); ++j) {
 						cerr << " " << fromWords[j];
 					}
@@ -268,7 +296,7 @@ if (optionDebug) {
 				cerr << "  " << i << ": " <<  targetWords[i] << " (" << matcher->sanitizeWord(targetWords[i]) << ")" << endl;
 			}
 if (optionDebug) {
-			cout << targetWords[i] << " : " << targetWordTags[i] << " => " << (targetWordTags[i] != -1 ? wordTags[targetWordTags[i]] : "") << endl;
+			cout << targetWords[i] << " : " << targetWordTags[i] << " => " << (targetWordTags[i] > -1 ? wordTags[targetWordTags[i]] : "") << endl;
 }
 		}
 		if (warned) {
@@ -276,15 +304,22 @@ if (optionDebug) {
 			VerseKey *vk = (VerseKey *)targetMod.getKey();
 			for (int j = 0; j < targetWords.size(); ++j) {
 				if (!strstr(ignoreSeries, targetWords[j])) {
-					cerr << targetWords[j] << "\t\t " << vk->getOSISRef() << "." << j << "=" << (targetWordTags[j] != -1 ? wordTags[targetWordTags[j]] : "") << endl;
+					cerr << targetWords[j] << "\t\t " << vk->getOSISRef() << "." << j << "=" << (targetWordTags[j] > -1 ? (const char *)wordTags[targetWordTags[j]] : (targetWordTags[j] == -2 ? "{Using Exception}" : "")) << endl;
 				}
 			}
 			cerr << "---------------------" << endl;
 		}
+
+		if (optionFilterAccents) {
+			UTF8GreekAccents filter;
+			filter.setOptionValue("off");
+			filter.processText(newTargetModMarkup);
+		}
+
 if (optionDebug) {
 		cout << "---------------------" << endl;
 		
-		cout << "\nAND... Here's your final output" << endl;
+		cout << "\nAND... Here's our final output" << endl;
 		cout << "---------------------" << endl;
 }
 		cout << newTargetModMarkup << endl;
@@ -292,6 +327,9 @@ if (optionDebug) {
 		cout << endl;
 }
 	}
+
+	delete exceptionFile;
+
 	return 0;
 }
 
@@ -305,6 +343,8 @@ SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) {
 	int tagLevel = 0;
 	int wTag = -1;
 	int inTag = 0;
+	bool wTagsPresent = orig.indexOf("<w") > -1;
+	SWBuf lastElementText = "";
 	for (int i = 0; i < orig.length(); ++i) {
 		if (orig[i] == '<') {
 			inTag = true;
@@ -312,30 +352,48 @@ SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) {
 		else if (orig[i] == '>') {
 			inTag = false;
 			XMLTag t = tag.c_str();
+			bool skipTag = false;
 			if (!t.isEmpty()) {
 				if (t.isEndTag()) {
+					// clear out empty w tags
+					if (t.getName() && !strcmp("w", t.getName())) {
+						if (!lastElementText.size()) {
+							out.setSize(wTag);
+							if (out.endsWith(' ')) { // && i < (orig.length() - 1) && orig[i+1] == ' ') {
+								out.setSize(out.size() - 1);
+								bibMap.pop_back();
+								wTags.pop_back();
+							}
+							skipTag = true;
+						}
+					}
 					tagLevel--;
 					wTag = -1;
 				}
 				else {
+					lastElementText = "";
 					tagLevel++;
 					wTag = (t.getName() && !strcmp("w", t.getName())) ? out.size() : -1;
 				}
 			}
-			out += t;
+			if (!skipTag) out += t;
 			tag = "";
 		}
 		else if (inTag) {
 			tag += orig[i];
 		}
 		else {
-// for texts without <w> tags
-//			if (!tagLevel || wTag != -1) {
-			if (wTag != -1 || orig[i] == ' ') {
+			if (
+				   // for texts without <w> tags
+				   (!wTagsPresent && (!tagLevel || wTag != -1))
+				   // for texts with <w> tags
+				|| ( wTagsPresent && (wTag != -1 || orig[i] == ' '))
+			) {
 				bibMap.push_back(out.size());
 				wTags.push_back(wTag);
 			}
 			out += orig[i];
+			lastElementText += orig[i];
 		}
 	}
 	return out;
@@ -480,13 +538,32 @@ void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &
 }
 
 
-void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, const vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds) {
+void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds) {
 	// TODO: this method needs some work,
 	// like putting multiple consecutive words
 	// together in one tag
+
+	ConfigEntMap exceptions;
+
+	if (exceptionFile) {
+		exceptions = exceptionFile->getSection("exceptions");
+	}
+
 	for (int i = 0; i < targetWordTags.size(); ++i) {
+		SWBuf wordTag = "";
 		if (targetWordTags[i] > -1) {
-			insert((const char *)wordTags[targetWordTags[i]], markupBuf, targetWordStarts[i], bibMap, wTags);
+			wordTag = wordTags[targetWordTags[i]];
+		}
+		if (exceptionFile) {
+			SWBuf key; key.setFormatted("%s.%d", vk->getOSISRef(), i);
+			ConfigEntMap::const_iterator it = exceptions.find(key);
+			if (it != exceptions.end()) {
+				targetWordTags[i] = -2;	// note that we are using an exception, not a mapping, not unset (-1)
+				wordTag = it->second;
+			}
+		}
+		if (wordTag.length()) {
+			insert((const char *)wordTag, markupBuf, targetWordStarts[i], bibMap, wTags);
 			insert("</w>", markupBuf, targetWordEnds[i], bibMap, wTags, true);
 		}
 	}
author	Troy A. Griffitts <scribe@crosswire.org>	2022-02-17 17:42:32 +0000
committer	Troy A. Griffitts <scribe@crosswire.org>	2022-02-17 17:42:32 +0000
commit	baa326c09201d76f02c28168ab4eaad427596b48 (patch)
tree	28152621d237eebed45ca9f1e4fab4e6596c348f /migratetags/migratetags.cpp
parent	a05a43ad3ea9aef59466d4bb600e26c4cb777784 (diff)
download	sword-tools-baa326c09201d76f02c28168ab4eaad427596b48.tar.gz