summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--migratetags/matchers/gntmatcher.h16
-rw-r--r--migratetags/migratetags.cpp127
2 files changed, 96 insertions, 47 deletions
diff --git a/migratetags/matchers/gntmatcher.h b/migratetags/matchers/gntmatcher.h
index 8c8f3e4..d2fcbd8 100644
--- a/migratetags/matchers/gntmatcher.h
+++ b/migratetags/matchers/gntmatcher.h
@@ -104,12 +104,26 @@ virtual void matchWords(vector<int> &targetWordTags, const vector<SWBuf> &target
}
}
+const char *ignoreSeries = "[]\nʼ‾̷‾";
virtual SWBuf sanitizeWord(const SWBuf &word) {
SWBuf t1 = word;
// remove greek accents
sanitizeGreekAccentFilter.processText(t1);
t1.toUpper();
- t1.replaceBytes("[]", 0);
+
+ // remove ignoreSeries characters
+ SWBuf o = t1;
+ const unsigned char* from = (unsigned char*)o.c_str();
+ t1 = "";
+ while (*from) {
+ SW_u32 ch = getUniCharFromUTF8(&from, true);
+ // if ch is bad, then convert to replacement char
+ if (!ch) ch = 0xFFFD;
+ SWBuf checkChar;
+ getUTF8FromUniChar(ch, &checkChar);
+ if (checkChar != " " && strstr(ignoreSeries, checkChar.c_str())) continue;
+ t1.append(checkChar);
+ }
return t1;
}
diff --git a/migratetags/migratetags.cpp b/migratetags/migratetags.cpp
index 3dec240..0ea3ac7 100644
--- a/migratetags/migratetags.cpp
+++ b/migratetags/migratetags.cpp
@@ -36,7 +36,7 @@ void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags
SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &tTags);
SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds);
void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &fromWords, vector<int> &fromWordTags);
-void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds, SWConfig *lex = 0);
+void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds, vector<SWBuf> &fromWords, SWConfig *lex = 0);
// app options
bool optionFilterAccents = false;
@@ -99,13 +99,24 @@ bool getNextVerseTEI(VerseKey *targetModKey, SWBuf *targetModText) {
while (!fileEnd || line.size()) {
if (!line.size()) {
- fileEnd = !FileMgr::getLine(targetInput, line);
+ fileEnd = !FileMgr::getLine(targetInput, line, false);
if (!fileEnd) line.append("\n");
}
int offset = line.indexOf("<ab ");
int endOffset = line.indexOf("</ab>");
if (offset < 0) offset = endOffset;
else if (endOffset > -1 && endOffset < offset) offset = endOffset;
+ // <ab> was found and was before </ab>
+ else {
+ // this is for when we have interverse data we've output before we hit an <ab>
+ if (targetModText->length() || offset > 0) {
+ targetModText->append(line, offset);
+ line << offset;
+ break;
+ }
+ }
+
+
if (offset > -1) {
targetModText->append(line, offset);
line << offset;
@@ -129,7 +140,7 @@ bool getNextVerseTEI(VerseKey *targetModKey, SWBuf *targetModText) {
}
// assert our source is in good condition to give us more data
- if (fileEnd && !line.size()) return false;
+ if (fileEnd && !line.size() && !targetModText->size()) return false;
// grab our raw, fully marked up TargetMod text for this verse
if (lastAB.isEndTag()) {
@@ -332,13 +343,13 @@ int main(int argc, char **argv) {
}
if (optionDebug) {
- cout << "\nProcessing Verse: " << targetModKey->getText() << endl;
- cout << "---------------------" << endl;
+ cerr << "\nProcessing Verse: " << targetModKey->getText() << endl;
+ cerr << "---------------------" << endl;
- cout << "\nOur strongsSourceModule Markup" << endl;
- cout << "---------------------" << endl;
- cout << fromMod.getRawEntry() << endl;
- cout << "---------------------" << endl;
+ cerr << "\nOur strongsSourceModule Markup" << endl;
+ cerr << "---------------------" << endl;
+ cerr << fromMod.getRawEntry() << endl;
+ cerr << "---------------------" << endl;
}
@@ -347,10 +358,10 @@ if (optionDebug) {
if (optionDebug) {
- cout << "\nOur Original targetModule Markup" << endl;
- cout << "---------------------" << endl;
- cout << orig << endl;
- cout << "---------------------" << endl;
+ cerr << "\nOur Original targetModule Markup" << endl;
+ cerr << "---------------------" << endl;
+ cerr << orig << endl;
+ cerr << "---------------------" << endl;
}
if (optionFilterAppCrit) {
@@ -366,6 +377,12 @@ if (optionDebug) {
if (checkChar != " " && strstr(ignoreSeries, checkChar.c_str())) continue;
orig.append(checkChar);
}
+if (optionDebug) {
+ cerr << "\nOur Original targetModule Markup after FilterAppCrit" << endl;
+ cerr << "---------------------" << endl;
+ cerr << orig << endl;
+ cerr << "---------------------" << endl;
+}
}
// let's find where just the canonical text is amongst
@@ -379,17 +396,17 @@ if (optionDebug) {
SWBuf newTargetModMarkup = findCanonicalBibleText(orig, bibMap, wTags);
if (optionDebug) {
- cout << "\nOur Original targetModule Markup After XMLTag-ifying" << endl;
- cout << "---------------------" << endl;
- cout << newTargetModMarkup << endl;
- cout << "---------------------" << endl;
+ cerr << "\nOur Original targetModule Markup After XMLTag-ifying" << endl;
+ cerr << "---------------------" << endl;
+ cerr << newTargetModMarkup << endl;
+ cerr << "---------------------" << endl;
- cout << "\nOur bibMap" << endl;
- cout << "---------------------" << endl;
+ cerr << "\nOur bibMap" << endl;
+ cerr << "---------------------" << endl;
for (BibMap::iterator it = bibMap.begin(); it != bibMap.end(); ++it) {
- cout << *it << " ";
+ cerr << *it << " ";
}
- cout << "\n---------------------" << endl;
+ cerr << "\n---------------------" << endl;
}
// let's populate our TargetMod word data and fill in our
@@ -397,10 +414,10 @@ if (optionDebug) {
justTargetModBibleText = buildWordMaps(newTargetModMarkup, bibMap, targetWords, targetWordStarts, targetWordEnds);
if (optionDebug) {
- cout << "\nJust targetModule Bible Text" << endl;
- cout << "---------------------" << endl;
- cout << justTargetModBibleText << endl;
- cout << "---------------------" << endl;
+ cerr << "\nJust targetModule Bible Text" << endl;
+ cerr << "---------------------" << endl;
+ cerr << justTargetModBibleText << endl;
+ cerr << "---------------------" << endl;
}
@@ -420,14 +437,14 @@ if (optionDebug) {
// ok, now that we have our targetWordTags magically populated
// let's do the grunt work of inserting the <w> and </w> tags
- insertWordTags((VerseKey *)targetModKey, newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds, lex);
+ insertWordTags((VerseKey *)targetModKey, newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds, fromWords, lex);
if (optionDebug) {
- cout << "\nHere's how we mapped things..." << endl;
- cout << "---------------------" << endl;
- cout << "Total wordTags: " << wordTags.size() << endl;
- cout << "\nTargetMod Words: " << endl;
+ cerr << "\nHere's how we mapped things..." << endl;
+ cerr << "---------------------" << endl;
+ cerr << "Total wordTags: " << wordTags.size() << endl;
+ cerr << "\nTargetMod Words: " << endl;
}
bool warned = false;
for (int i = 0; i < targetWords.size(); ++i) {
@@ -451,7 +468,7 @@ if (optionDebug) {
cerr << " " << i << ": " << targetWords[i] << " (" << matcher->sanitizeWord(targetWords[i]) << ")" << endl;
}
if (optionDebug) {
- cout << targetWords[i] << " : " << targetWordTags[i] << " => " << (targetWordTags[i] > -1 ? wordTags[targetWordTags[i]] : "") << endl;
+ cerr << targetWords[i] << " : " << targetWordTags[i] << " => " << (targetWordTags[i] > -1 ? wordTags[targetWordTags[i]] : "") << endl;
}
}
if (warned) {
@@ -472,17 +489,20 @@ if (optionDebug) {
}
if (optionDebug) {
- cout << "---------------------" << endl;
+ cerr << "---------------------" << endl;
- cout << "\nAND... Here's our final output" << endl;
- cout << "---------------------" << endl;
+ cerr << "\nAND... Here's our final output" << endl;
+ cerr << "---------------------" << endl;
}
cout << newTargetModMarkup;
+if (optionDebug) {
+ cerr << newTargetModMarkup << endl;
+}
if (!targetTEIFile.size()) {
cout << endl;
}
if (optionDebug) {
- cout << endl;
+ cerr << endl;
}
}
@@ -497,13 +517,15 @@ if (optionDebug) {
// and each character's corresponding real location in our output
// buffer (returned value)
SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) {
+ bool XML_TAGGIFY = false;
SWBuf out = "";
SWBuf tag = "";
int tagLevel = 0;
int wTag = -1;
int inTag = 0;
bool wTagsPresent = orig.indexOf("<w") > -1;
- SWBuf lastElementText = "";
+ SWBuf lastWElementText = "";
+ bool lastLBBreak = false;
for (int i = 0; i < orig.length(); ++i) {
if (orig[i] == '<') {
inTag = true;
@@ -516,9 +538,9 @@ SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) {
if (t.isEndTag()) {
// clear out empty w tags
if (t.getName() && !strcmp("w", t.getName())) {
- if (!lastElementText.size()) {
+ if (!lastWElementText.size()) {
out.setSize(wTag);
- if (out.endsWith(' ')) { // && i < (orig.length() - 1) && orig[i+1] == ' ') {
+ if (out.endsWith(' ')) {
out.setSize(out.size() - 1);
bibMap.pop_back();
wTags.pop_back();
@@ -527,15 +549,23 @@ SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) {
}
}
tagLevel--;
- wTag = -1;
+ if (t.getName() && !strcmp("w", t.getName())) wTag = -1;
}
else {
- lastElementText = "";
tagLevel++;
- wTag = (t.getName() && !strcmp("w", t.getName())) ? out.size() : -1;
+ if (t.getName() && !strcmp("w", t.getName())) {
+ wTag = out.size();
+ lastWElementText = "";
+ }
}
}
- if (!skipTag) out += t;
+ else {
+ if (SWBuf(t.getName()) == "lb") {
+ lastLBBreak = !(SWBuf(t.getAttribute("break")) == "no");
+ }
+ }
+
+ if (!skipTag) out += (XML_TAGGIFY ? t : SWBuf("<") + tag + ">");
tag = "";
}
else if (inTag) {
@@ -552,7 +582,7 @@ SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) {
wTags.push_back(wTag);
}
out += orig[i];
- lastElementText += orig[i];
+ lastWElementText += orig[i];
}
}
return out;
@@ -697,7 +727,7 @@ void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &
}
-void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds, SWConfig *lex) {
+void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds, vector<SWBuf> &fromWords, SWConfig *lex) {
// TODO: this method needs some work,
// like putting multiple consecutive words
// together in one tag
@@ -710,8 +740,12 @@ void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTag
for (int i = 0; i < targetWordTags.size(); ++i) {
SWBuf wordTag = "";
+ SWBuf norm = "";
if (targetWordTags[i] > -1) {
wordTag = wordTags[targetWordTags[i]];
+ if (lex) {
+ norm = fromWords[targetWordTags[i]];
+ }
}
if (exceptionFile) {
SWBuf key; key.setFormatted("%s.%d", vk->getOSISRef(), i);
@@ -733,8 +767,9 @@ void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTag
if (a.startsWith("G") || a.startsWith("H")) a << 1;
SWBuf dict = (*lex)[a]["UTF8"];
SWBuf gloss = (*lex)[a]["Meaning"];
- w.setAttribute("corresp", dict);
- w.setAttribute("gloss", gloss);
+ //w.setAttribute("corresp", dict);
+ if (norm.length()) w.setAttribute("norm", norm);
+ //w.setAttribute("gloss", gloss);
wordTag = w.toString();
}
}