From f304a5a5fe1d4dd2ff7efab2ee07e272ece87840 Mon Sep 17 00:00:00 2001
From: "Troy A. Griffitts" <scribe@crosswire.org>
Date: Tue, 18 Apr 2023 09:35:52 +0000
Subject: migratetags: ignore marker characters when sanitizing words and leave source markup less changed

git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@559 07627401-56e2-0310-80f4-f8cd0041bdcd
---
 migratetags/matchers/gntmatcher.h |  16 ++++-
 migratetags/migratetags.cpp       | 127 ++++++++++++++++++++++++--------------
 2 files changed, 96 insertions(+), 47 deletions(-)
diff --git a/migratetags/matchers/gntmatcher.h b/migratetags/matchers/gntmatcher.h
index 8c8f3e4..d2fcbd8 100644
--- a/migratetags/matchers/gntmatcher.h
+++ b/migratetags/matchers/gntmatcher.h
@@ -104,12 +104,26 @@ virtual void matchWords(vector<int> &targetWordTags, const vector<SWBuf> &target
 	}
 }
  
+const char *ignoreSeries = "[]\nʼ‾̷‾";
 virtual SWBuf sanitizeWord(const SWBuf &word) {
 	SWBuf t1 = word;
 	// remove greek accents
 	sanitizeGreekAccentFilter.processText(t1);
 	t1.toUpper();
-	t1.replaceBytes("[]", 0);
+
+	// rebuild t1 one UTF-8 character at a time, leaving out every character that appears in ignoreSeries
+	SWBuf o = t1;  // copy to read from while t1 is emptied and refilled
+	const unsigned char* from = (unsigned char*)o.c_str();  // read cursor over o's bytes
+	t1 = "";
+	while (*from) {		// stop at o's terminating NUL
+		SW_u32 ch = getUniCharFromUTF8(&from, true);  // presumably advances `from` past the decoded character; the loop relies on that to terminate
+		// a zero result is treated as a bad sequence and replaced with U+FFFD (replacement character)
+		if (!ch) ch = 0xFFFD;
+		SWBuf checkChar;
+		getUTF8FromUniChar(ch, &checkChar);  // re-encode as UTF-8 so the character can be searched for as a substring
+		if (checkChar != " " && strstr(ignoreSeries, checkChar.c_str())) continue;  // drop listed characters; a space is never dropped
+		t1.append(checkChar);
+	}
 	return t1;
 }
 
diff --git a/migratetags/migratetags.cpp b/migratetags/migratetags.cpp
index 3dec240..0ea3ac7 100644
--- a/migratetags/migratetags.cpp
+++ b/migratetags/migratetags.cpp
@@ -36,7 +36,7 @@ void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags
 SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &tTags);
 SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds);
 void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &fromWords, vector<int> &fromWordTags);
-void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds, SWConfig *lex = 0);
+void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds, vector<SWBuf> &fromWords, SWConfig *lex = 0);
 
 // app options
 bool optionFilterAccents = false;
@@ -99,13 +99,24 @@ bool getNextVerseTEI(VerseKey *targetModKey, SWBuf *targetModText) {
 
 	while (!fileEnd || line.size()) {
 		if (!line.size()) {
-			fileEnd = !FileMgr::getLine(targetInput, line);
+			fileEnd = !FileMgr::getLine(targetInput, line, false);
 			if (!fileEnd) line.append("\n");
 		}
 		int offset = line.indexOf("<ab ");
 		int endOffset = line.indexOf("</ab>");
 		if (offset < 0) offset = endOffset;
 		else if (endOffset > -1 && endOffset < offset) offset = endOffset;
+		// <ab> was found and was before </ab>
+		else {
+			// this is for when we have interverse data we've output before we hit an <ab>
+			if (targetModText->length() || offset > 0) {
+				targetModText->append(line, offset);
+				line << offset;
+				break;
+			}
+		}
+
+
 		if (offset > -1) {
 			targetModText->append(line, offset);
 			line << offset;
@@ -129,7 +140,7 @@ bool getNextVerseTEI(VerseKey *targetModKey, SWBuf *targetModText) {
 	}
 
 	// assert our source is in good condition to give us more data
-	if (fileEnd && !line.size()) return false;
+	if (fileEnd && !line.size() && !targetModText->size()) return false;
 
 	// grab our raw, fully marked up TargetMod text for this verse
 	if (lastAB.isEndTag()) {
@@ -332,13 +343,13 @@ int main(int argc, char **argv) {
 		}
 
 if (optionDebug) {
-		cout << "\nProcessing Verse: " << targetModKey->getText() << endl;
-		cout << "---------------------" << endl;
+		cerr << "\nProcessing Verse: " << targetModKey->getText() << endl;
+		cerr << "---------------------" << endl;
 
-		cout << "\nOur strongsSourceModule Markup" << endl;
-		cout << "---------------------" << endl;
-		cout << fromMod.getRawEntry() << endl;
-		cout << "---------------------" << endl;
+		cerr << "\nOur strongsSourceModule Markup" << endl;
+		cerr << "---------------------" << endl;
+		cerr << fromMod.getRawEntry() << endl;
+		cerr << "---------------------" << endl;
 }
 
 
@@ -347,10 +358,10 @@ if (optionDebug) {
 
 
 if (optionDebug) {
-		cout << "\nOur Original targetModule Markup" << endl;
-		cout << "---------------------" << endl;
-		cout << orig << endl;
-		cout << "---------------------" << endl;
+		cerr << "\nOur Original targetModule Markup" << endl;
+		cerr << "---------------------" << endl;
+		cerr << orig << endl;
+		cerr << "---------------------" << endl;
 }
 
 		if (optionFilterAppCrit) {
@@ -366,6 +377,12 @@ if (optionDebug) {
 				if (checkChar != " " && strstr(ignoreSeries, checkChar.c_str())) continue;
 				orig.append(checkChar);
 			}
+if (optionDebug) {
+			cerr << "\nOur Original targetModule Markup after FilterAppCrit" << endl;
+			cerr << "---------------------" << endl;
+			cerr << orig << endl;
+			cerr << "---------------------" << endl;
+}
 		}
 
 		// let's find where just the canonical text is amongst
@@ -379,17 +396,17 @@ if (optionDebug) {
 		SWBuf newTargetModMarkup = findCanonicalBibleText(orig, bibMap, wTags);
 
 if (optionDebug) {
-		cout << "\nOur Original targetModule Markup After XMLTag-ifying" << endl;
-		cout << "---------------------" << endl;
-		cout << newTargetModMarkup << endl;
-		cout << "---------------------" << endl;
+		cerr << "\nOur Original targetModule Markup After XMLTag-ifying" << endl;
+		cerr << "---------------------" << endl;
+		cerr << newTargetModMarkup << endl;
+		cerr << "---------------------" << endl;
 
-		cout << "\nOur bibMap" << endl;
-		cout << "---------------------" << endl;
+		cerr << "\nOur bibMap" << endl;
+		cerr << "---------------------" << endl;
 		for (BibMap::iterator it = bibMap.begin(); it != bibMap.end(); ++it) {
-			cout << *it << " ";
+			cerr << *it << " ";
 		}
-		cout << "\n---------------------" << endl;
+		cerr << "\n---------------------" << endl;
 }
 
 		// let's populate our TargetMod word data and fill in our
@@ -397,10 +414,10 @@ if (optionDebug) {
 		justTargetModBibleText = buildWordMaps(newTargetModMarkup, bibMap, targetWords, targetWordStarts, targetWordEnds);
 
 if (optionDebug) {
-		cout << "\nJust targetModule Bible Text" << endl;
-		cout << "---------------------" << endl;
-		cout << justTargetModBibleText << endl;
-		cout << "---------------------" << endl;
+		cerr << "\nJust targetModule Bible Text" << endl;
+		cerr << "---------------------" << endl;
+		cerr << justTargetModBibleText << endl;
+		cerr << "---------------------" << endl;
 }
 
  
@@ -420,14 +437,14 @@ if (optionDebug) {
 
 		// ok, now that we have our targetWordTags magically populated
 		// let's do the grunt work of inserting the <w> and </w> tags
-		insertWordTags((VerseKey *)targetModKey, newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds, lex);
+		insertWordTags((VerseKey *)targetModKey, newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds, fromWords, lex);
 
 
 if (optionDebug) {
-		cout << "\nHere's how we mapped things..." << endl;
-		cout << "---------------------" << endl;
-		cout << "Total wordTags: " << wordTags.size() << endl;
-		cout << "\nTargetMod Words: " << endl;
+		cerr << "\nHere's how we mapped things..." << endl;
+		cerr << "---------------------" << endl;
+		cerr << "Total wordTags: " << wordTags.size() << endl;
+		cerr << "\nTargetMod Words: " << endl;
 }
 		bool warned = false;
 		for (int i = 0; i < targetWords.size(); ++i) {
@@ -451,7 +468,7 @@ if (optionDebug) {
 				cerr << "  " << i << ": " <<  targetWords[i] << " (" << matcher->sanitizeWord(targetWords[i]) << ")" << endl;
 			}
 if (optionDebug) {
-			cout << targetWords[i] << " : " << targetWordTags[i] << " => " << (targetWordTags[i] > -1 ? wordTags[targetWordTags[i]] : "") << endl;
+			cerr << targetWords[i] << " : " << targetWordTags[i] << " => " << (targetWordTags[i] > -1 ? wordTags[targetWordTags[i]] : "") << endl;
 }
 		}
 		if (warned) {
@@ -472,17 +489,20 @@ if (optionDebug) {
 		}
 
 if (optionDebug) {
-		cout << "---------------------" << endl;
+		cerr << "---------------------" << endl;
 		
-		cout << "\nAND... Here's our final output" << endl;
-		cout << "---------------------" << endl;
+		cerr << "\nAND... Here's our final output" << endl;
+		cerr << "---------------------" << endl;
 }
 		cout << newTargetModMarkup;
+if (optionDebug) {
+		cerr << newTargetModMarkup << endl;
+}
 		if (!targetTEIFile.size()) {
 			cout << endl;
 		}
 if (optionDebug) {
-		cout << endl;
+		cerr << endl;
 }
 	}
 
@@ -497,13 +517,15 @@ if (optionDebug) {
 // and each character's corresponding real location in our output
 // buffer (returned value)
 SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) {
+	bool XML_TAGGIFY = false;
 	SWBuf out = "";
 	SWBuf tag = "";
 	int tagLevel = 0;
 	int wTag = -1;
 	int inTag = 0;
 	bool wTagsPresent = orig.indexOf("<w") > -1;
-	SWBuf lastElementText = "";
+	SWBuf lastWElementText = "";
+	bool lastLBBreak = false;
 	for (int i = 0; i < orig.length(); ++i) {
 		if (orig[i] == '<') {
 			inTag = true;
@@ -516,9 +538,9 @@ SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) {
 				if (t.isEndTag()) {
 					// clear out empty w tags
 					if (t.getName() && !strcmp("w", t.getName())) {
-						if (!lastElementText.size()) {
+						if (!lastWElementText.size()) {
 							out.setSize(wTag);
-							if (out.endsWith(' ')) { // && i < (orig.length() - 1) && orig[i+1] == ' ') {
+							if (out.endsWith(' ')) {
 								out.setSize(out.size() - 1);
 								bibMap.pop_back();
 								wTags.pop_back();
@@ -527,15 +549,23 @@ SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) {
 						}
 					}
 					tagLevel--;
-					wTag = -1;
+					if (t.getName() && !strcmp("w", t.getName())) wTag = -1;
 				}
 				else {
-					lastElementText = "";
 					tagLevel++;
-					wTag = (t.getName() && !strcmp("w", t.getName())) ? out.size() : -1;
+					if (t.getName() && !strcmp("w", t.getName())) {
+						wTag = out.size();
+						lastWElementText = "";
+					}
 				}
 			}
-			if (!skipTag) out += t;
+			else {
+				if (SWBuf(t.getName()) == "lb") {
+				       lastLBBreak = !(SWBuf(t.getAttribute("break")) == "no");
+				}
+			}
+
+			if (!skipTag) out += (XML_TAGGIFY ? t : SWBuf("<") + tag + ">");
 			tag = "";
 		}
 		else if (inTag) {
@@ -552,7 +582,7 @@ SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) {
 				wTags.push_back(wTag);
 			}
 			out += orig[i];
-			lastElementText += orig[i];
+			lastWElementText += orig[i];
 		}
 	}
 	return out;
@@ -697,7 +727,7 @@ void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &
 }
 
 
-void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds, SWConfig *lex) {
+void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds, vector<SWBuf> &fromWords, SWConfig *lex) {
 	// TODO: this method needs some work,
 	// like putting multiple consecutive words
 	// together in one tag
@@ -710,8 +740,12 @@ void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTag
 
 	for (int i = 0; i < targetWordTags.size(); ++i) {
 		SWBuf wordTag = "";
+		SWBuf norm = "";
 		if (targetWordTags[i] > -1) {
 			wordTag = wordTags[targetWordTags[i]];
+			if (lex) {
+				norm = fromWords[targetWordTags[i]];
+			}
 		}
 		if (exceptionFile) {
 			SWBuf key; key.setFormatted("%s.%d", vk->getOSISRef(), i);
@@ -733,8 +767,9 @@ void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTag
 						if (a.startsWith("G") || a.startsWith("H")) a << 1;
 						SWBuf dict = (*lex)[a]["UTF8"];
 						SWBuf gloss = (*lex)[a]["Meaning"];
-						w.setAttribute("corresp", dict);
-						w.setAttribute("gloss", gloss);
+						//w.setAttribute("corresp", dict);
+						if (norm.length()) w.setAttribute("norm", norm);
+						//w.setAttribute("gloss", gloss);
 						wordTag = w.toString();
 					}
 				}
-- 
cgit