diff options
-rw-r--r-- | migratetags/matchers/gntmatcher.h | 16 | ||||
-rw-r--r-- | migratetags/migratetags.cpp | 127 |
2 files changed, 96 insertions, 47 deletions
diff --git a/migratetags/matchers/gntmatcher.h b/migratetags/matchers/gntmatcher.h index 8c8f3e4..d2fcbd8 100644 --- a/migratetags/matchers/gntmatcher.h +++ b/migratetags/matchers/gntmatcher.h @@ -104,12 +104,26 @@ virtual void matchWords(vector<int> &targetWordTags, const vector<SWBuf> &target } } +const char *ignoreSeries = "[]\nʼ‾̷‾"; virtual SWBuf sanitizeWord(const SWBuf &word) { SWBuf t1 = word; // remove greek accents sanitizeGreekAccentFilter.processText(t1); t1.toUpper(); - t1.replaceBytes("[]", 0); + + // remove ignoreSeries characters + SWBuf o = t1; + const unsigned char* from = (unsigned char*)o.c_str(); + t1 = ""; + while (*from) { + SW_u32 ch = getUniCharFromUTF8(&from, true); + // if ch is bad, then convert to replacement char + if (!ch) ch = 0xFFFD; + SWBuf checkChar; + getUTF8FromUniChar(ch, &checkChar); + if (checkChar != " " && strstr(ignoreSeries, checkChar.c_str())) continue; + t1.append(checkChar); + } return t1; } diff --git a/migratetags/migratetags.cpp b/migratetags/migratetags.cpp index 3dec240..0ea3ac7 100644 --- a/migratetags/migratetags.cpp +++ b/migratetags/migratetags.cpp @@ -36,7 +36,7 @@ void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &tTags); SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds); void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &fromWords, vector<int> &fromWordTags); -void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds, SWConfig *lex = 0); +void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds, vector<SWBuf> &fromWords, SWConfig *lex = 0); // app options bool optionFilterAccents = false; @@ -99,13 +99,24 @@ bool getNextVerseTEI(VerseKey *targetModKey, SWBuf *targetModText) { while (!fileEnd || line.size()) { if (!line.size()) { - fileEnd = !FileMgr::getLine(targetInput, line); + fileEnd = !FileMgr::getLine(targetInput, line, false); if (!fileEnd) line.append("\n"); } int offset = line.indexOf("<ab "); int endOffset = line.indexOf("</ab>"); if (offset < 0) offset = endOffset; else if (endOffset > -1 && endOffset < offset) offset = endOffset; + // <ab> was found and was before </ab> + else { + // this is for when we have interverse data we've output before we hit an <ab> + if (targetModText->length() || offset > 0) { + targetModText->append(line, offset); + line << offset; + break; + } + } + + if (offset > -1) { targetModText->append(line, offset); line << offset; @@ -129,7 +140,7 @@ bool getNextVerseTEI(VerseKey *targetModKey, SWBuf *targetModText) { } // assert our source is in good condition to give us more data - if (fileEnd && !line.size()) return false; + if (fileEnd && !line.size() && !targetModText->size()) return false; // grab our raw, fully marked up TargetMod text for this verse if (lastAB.isEndTag()) { @@ -332,13 +343,13 @@ int main(int argc, char **argv) { } if (optionDebug) { - cout << "\nProcessing Verse: " << targetModKey->getText() << endl; - cout << "---------------------" << endl; + cerr << "\nProcessing Verse: " << targetModKey->getText() << endl; + cerr << "---------------------" << endl; - cout << "\nOur strongsSourceModule Markup" << endl; - cout << "---------------------" << endl; - cout << fromMod.getRawEntry() << endl; - cout << "---------------------" << endl; + cerr << "\nOur strongsSourceModule Markup" << endl; + cerr << "---------------------" << endl; + cerr << fromMod.getRawEntry() << endl; + cerr << "---------------------" << endl; } @@ -347,10 +358,10 @@ if (optionDebug) { if (optionDebug) { - cout << "\nOur Original targetModule Markup" << endl; - cout << "---------------------" << endl; - cout << orig << endl; - cout << "---------------------" << endl; + cerr << "\nOur Original targetModule Markup" << endl; + cerr << "---------------------" << endl; + cerr << orig << endl; + cerr << "---------------------" << endl; } if (optionFilterAppCrit) { @@ -366,6 +377,12 @@ if (optionDebug) { if (checkChar != " " && strstr(ignoreSeries, checkChar.c_str())) continue; orig.append(checkChar); } +if (optionDebug) { + cerr << "\nOur Original targetModule Markup after FilterAppCrit" << endl; + cerr << "---------------------" << endl; + cerr << orig << endl; + cerr << "---------------------" << endl; +} } // let's find where just the canonical text is amongst @@ -379,17 +396,17 @@ if (optionDebug) { SWBuf newTargetModMarkup = findCanonicalBibleText(orig, bibMap, wTags); if (optionDebug) { - cout << "\nOur Original targetModule Markup After XMLTag-ifying" << endl; - cout << "---------------------" << endl; - cout << newTargetModMarkup << endl; - cout << "---------------------" << endl; + cerr << "\nOur Original targetModule Markup After XMLTag-ifying" << endl; + cerr << "---------------------" << endl; + cerr << newTargetModMarkup << endl; + cerr << "---------------------" << endl; - cout << "\nOur bibMap" << endl; - cout << "---------------------" << endl; + cerr << "\nOur bibMap" << endl; + cerr << "---------------------" << endl; for (BibMap::iterator it = bibMap.begin(); it != bibMap.end(); ++it) { - cout << *it << " "; + cerr << *it << " "; } - cout << "\n---------------------" << endl; + cerr << "\n---------------------" << endl; } // let's populate our TargetMod word data and fill in our @@ -397,10 +414,10 @@ if (optionDebug) { justTargetModBibleText = buildWordMaps(newTargetModMarkup, bibMap, targetWords, targetWordStarts, targetWordEnds); if (optionDebug) { - cout << "\nJust targetModule Bible Text" << endl; - cout << "---------------------" << endl; - cout << justTargetModBibleText << endl; - cout << "---------------------" << endl; + cerr << "\nJust targetModule Bible Text" << endl; + cerr << "---------------------" << endl; + cerr << justTargetModBibleText << endl; + cerr << "---------------------" << endl; } @@ -420,14 +437,14 @@ if (optionDebug) { // ok, now that we have our targetWordTags magically populated // let's do the grunt work of inserting the <w> and </w> tags - insertWordTags((VerseKey *)targetModKey, newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds, lex); + insertWordTags((VerseKey *)targetModKey, newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds, fromWords, lex); if (optionDebug) { - cout << "\nHere's how we mapped things..." << endl; - cout << "---------------------" << endl; - cout << "Total wordTags: " << wordTags.size() << endl; - cout << "\nTargetMod Words: " << endl; + cerr << "\nHere's how we mapped things..." << endl; + cerr << "---------------------" << endl; + cerr << "Total wordTags: " << wordTags.size() << endl; + cerr << "\nTargetMod Words: " << endl; } bool warned = false; for (int i = 0; i < targetWords.size(); ++i) { @@ -451,7 +468,7 @@ if (optionDebug) { cerr << " " << i << ": " << targetWords[i] << " (" << matcher->sanitizeWord(targetWords[i]) << ")" << endl; } if (optionDebug) { - cout << targetWords[i] << " : " << targetWordTags[i] << " => " << (targetWordTags[i] > -1 ? wordTags[targetWordTags[i]] : "") << endl; + cerr << targetWords[i] << " : " << targetWordTags[i] << " => " << (targetWordTags[i] > -1 ? wordTags[targetWordTags[i]] : "") << endl; } } if (warned) { @@ -472,17 +489,20 @@ if (optionDebug) { } if (optionDebug) { - cout << "---------------------" << endl; + cerr << "---------------------" << endl; - cout << "\nAND... Here's our final output" << endl; - cout << "---------------------" << endl; + cerr << "\nAND... Here's our final output" << endl; + cerr << "---------------------" << endl; } cout << newTargetModMarkup; +if (optionDebug) { + cerr << newTargetModMarkup << endl; +} if (!targetTEIFile.size()) { cout << endl; } if (optionDebug) { - cout << endl; + cerr << endl; } } @@ -497,13 +517,15 @@ if (optionDebug) { // and each character's corresponding real location in our output // buffer (returned value) SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) { + bool XML_TAGGIFY = false; SWBuf out = ""; SWBuf tag = ""; int tagLevel = 0; int wTag = -1; int inTag = 0; bool wTagsPresent = orig.indexOf("<w") > -1; - SWBuf lastElementText = ""; + SWBuf lastWElementText = ""; + bool lastLBBreak = false; for (int i = 0; i < orig.length(); ++i) { if (orig[i] == '<') { inTag = true; @@ -516,9 +538,9 @@ SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) { if (t.isEndTag()) { // clear out empty w tags if (t.getName() && !strcmp("w", t.getName())) { - if (!lastElementText.size()) { + if (!lastWElementText.size()) { out.setSize(wTag); - if (out.endsWith(' ')) { // && i < (orig.length() - 1) && orig[i+1] == ' ') { + if (out.endsWith(' ')) { out.setSize(out.size() - 1); bibMap.pop_back(); wTags.pop_back(); @@ -527,15 +549,23 @@ SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) { } } tagLevel--; - wTag = -1; + if (t.getName() && !strcmp("w", t.getName())) wTag = -1; } else { - lastElementText = ""; tagLevel++; - wTag = (t.getName() && !strcmp("w", t.getName())) ? out.size() : -1; + if (t.getName() && !strcmp("w", t.getName())) { + wTag = out.size(); + lastWElementText = ""; + } } } - if (!skipTag) out += t; + else { + if (SWBuf(t.getName()) == "lb") { + lastLBBreak = !(SWBuf(t.getAttribute("break")) == "no"); + } + } + + if (!skipTag) out += (XML_TAGGIFY ? t : SWBuf("<") + tag + ">"); tag = ""; } else if (inTag) { @@ -552,7 +582,7 @@ SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) { wTags.push_back(wTag); } out += orig[i]; - lastElementText += orig[i]; + lastWElementText += orig[i]; } } return out; @@ -697,7 +727,7 @@ void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> & } -void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds, SWConfig *lex) { +void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds, vector<SWBuf> &fromWords, SWConfig *lex) { // TODO: this method needs some work, // like putting multiple consecutive words // together in one tag @@ -710,8 +740,12 @@ void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTag for (int i = 0; i < targetWordTags.size(); ++i) { SWBuf wordTag = ""; + SWBuf norm = ""; if (targetWordTags[i] > -1) { wordTag = wordTags[targetWordTags[i]]; + if (lex) { + norm = fromWords[targetWordTags[i]]; + } } if (exceptionFile) { SWBuf key; key.setFormatted("%s.%d", vk->getOSISRef(), i); @@ -733,8 +767,9 @@ void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTag if (a.startsWith("G") || a.startsWith("H")) a << 1; SWBuf dict = (*lex)[a]["UTF8"]; SWBuf gloss = (*lex)[a]["Meaning"]; - w.setAttribute("corresp", dict); - w.setAttribute("gloss", gloss); + //w.setAttribute("corresp", dict); + if (norm.length()) w.setAttribute("norm", norm); + //w.setAttribute("gloss", gloss); wordTag = w.toString(); } } |