diff options
author | Troy A. Griffitts <scribe@crosswire.org> | 2022-02-17 17:42:32 +0000 |
---|---|---|
committer | Troy A. Griffitts <scribe@crosswire.org> | 2022-02-17 17:42:32 +0000 |
commit | baa326c09201d76f02c28168ab4eaad427596b48 (patch) | |
tree | 28152621d237eebed45ca9f1e4fab4e6596c348f /migratetags/migratetags.cpp | |
parent | a05a43ad3ea9aef59466d4bb600e26c4cb777784 (diff) | |
download | sword-tools-baa326c09201d76f02c28168ab4eaad427596b48.tar.gz |
Committed latest version of migrate tags
made it work with both <w> modules and modules without <w> tags
added parameter -t to specify target module
git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@540 07627401-56e2-0310-80f4-f8cd0041bdcd
Diffstat (limited to 'migratetags/migratetags.cpp')
-rw-r--r-- | migratetags/migratetags.cpp | 147 |
1 files changed, 112 insertions, 35 deletions
diff --git a/migratetags/migratetags.cpp b/migratetags/migratetags.cpp index 2b6b92b..ecb1ed0 100644 --- a/migratetags/migratetags.cpp +++ b/migratetags/migratetags.cpp @@ -3,6 +3,7 @@ #include <swmgr.h> #include <utilxml.h> #include <swbuf.h> +#include <swconfig.h> #include <swmodule.h> #include <stringmgr.h> #include <iostream> @@ -14,11 +15,13 @@ using namespace std; #include "matchers/matcher.h" // select your matcher here +//#include "matchers/gntmatcher.h" #include "matchers/defaultmatcher.h" Matcher *matcher = new DefaultMatcher(); -const char *targetModuleName="NA28"; -const char *strongsSourceModuleName="WHNU"; +// hard code your from and to modules here or pass them on the command line with - +SWBuf strongsSourceModuleName = "WHNU"; +SWBuf targetModuleName = "NA28FromImp"; const char *ignoreSeries = "⸆¹⸆²⸆⸇᾿˸¹˸²˸³˸·¹²⟦–ʹ°¹°²⸋¹⸋²⸋⸌¹⸌°*[];⸀¹⸀²⸀³⸁⸀◆⟧ ⸂¹⸂²⸄⸂⸅⸃⸉¹⸈⸉⸊ "; @@ -30,12 +33,14 @@ void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &tTags); SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds); void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &fromWords, vector<int> &fromWordTags); -void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, const vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds); +void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds); // app options bool optionFilterAccents = false; bool optionFilterAppCrit = false; bool optionDebug = false; +vector<SWBuf> optionExceptionFile; +SWConfig *exceptionFile = 0; void usage(const char *progName, const char *error = 0) { if 
(error) fprintf(stderr, "\n%s: %s\n", progName, error); @@ -43,6 +48,10 @@ void usage(const char *progName, const char *error = 0) { fprintf(stderr, "\nusage: %s [options]\n", progName); fprintf(stderr, " -v\t\t\t verbose: print lots of information while processing\n"); fprintf(stderr, " -fa\t\t\t filter accents: remove Greek accents from final text\n"); + fprintf(stderr, " -fc\t\t\t filter critical apparatus markers from final text\n"); + fprintf(stderr, " -ss <moduleName>\t provide the Strong's source module name\n"); + fprintf(stderr, " -t <moduleName>\t provide the target module name\n"); + fprintf(stderr, " -e <exception file>\t provide an ini-style .conf file with overriding tag exceptions.\n"); fprintf(stderr, "\n\n"); exit(-1); } @@ -61,6 +70,24 @@ int main(int argc, char **argv) { else if (!strcmp(argv[i], "-fc")) { optionFilterAppCrit = true; } + else if (!strcmp(argv[i], "-ss")) { + if ((i + 1) < argc) { + strongsSourceModuleName = argv[++i]; + } + else usage(progName, "-ss argument requires a module name."); + } + else if (!strcmp(argv[i], "-t")) { + if ((i + 1) < argc) { + targetModuleName = argv[++i]; + } + else usage(progName, "-t argument requires a module name."); + } + else if (!strcmp(argv[i], "-e")) { + if (i+1 < argc) { + optionExceptionFile.push_back(argv[++i]); + } + else usage(progName, "-e argument requires a file name."); + } else usage(progName, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str()); } @@ -72,16 +99,23 @@ int main(int argc, char **argv) { exit(1); } SWModule &targetMod = *m; - m = lib.getModule(strongsSourceModuleName); + m = lib.getModule(strongsSourceModuleName.c_str()); if (!m) { - cerr << "couldn't find source module: " << strongsSourceModuleName << ".\n"; + cerr << "couldn't find source module: " << strongsSourceModuleName.c_str() << ".\n"; exit(1); } SWModule &fromMod = *m; + for (int i = 0; i < optionExceptionFile.size(); ++i) { + SWBuf fileName = optionExceptionFile[i]; + if (!i) exceptionFile = new SWConfig(fileName); 
+ else (*exceptionFile) += SWConfig(fileName); + } + // we'll do the whole Bible eventually, but let's just get one verse // working well. - targetMod.setKey("mat1.1"); // let's try this verse + ((VerseKey *)targetMod.getKey())->setIntros(true); + targetMod.getKey()->setText("mat0.0"); // let's try this verse int z = 0; for (; //!z && @@ -147,7 +181,7 @@ if (optionDebug) { cout << "\nProcessing Verse: " << targetMod.getKeyText() << endl; cout << "---------------------" << endl; - cout << "\nOur FromMod Verse Markup" << endl; + cout << "\nOur strongsSourceModule Markup" << endl; cout << "---------------------" << endl; cout << fromMod.getRawEntry() << endl; cout << "---------------------" << endl; @@ -157,18 +191,20 @@ if (optionDebug) { // grab our raw, fully marked up TargetMod text for this verse SWBuf orig = targetMod.getRawEntryBuf(); - if (optionFilterAccents) { - UTF8GreekAccents filter; - filter.setOptionValue("off"); - filter.processText(orig); - } + +if (optionDebug) { + cout << "\nOur Original targetModule Markup" << endl; + cout << "---------------------" << endl; + cout << orig << endl; + cout << "---------------------" << endl; +} if (optionFilterAppCrit) { SWBuf o = orig; const unsigned char* from = (unsigned char*)o.c_str(); orig = ""; while (*from) { - __u32 ch = getUniCharFromUTF8(&from, true); + SW_u32 ch = getUniCharFromUTF8(&from, true); // if ch is bad, then convert to replacement char if (!ch) ch = 0xFFFD; SWBuf checkChar; @@ -178,13 +214,6 @@ if (optionDebug) { } } -if (optionDebug) { - cout << "\nOur Original TargetMod Markup" << endl; - cout << "---------------------" << endl; - cout << orig << endl; - cout << "---------------------" << endl; -} - // let's find where just the canonical text is amongst // all our markup // newTargetModMarkup will eventually hold our updated markup with @@ -196,7 +225,7 @@ if (optionDebug) { SWBuf newTargetModMarkup = findCanonicalBibleText(orig, bibMap, wTags); if (optionDebug) { - cout << "\nOur Original 
TargetMod Markup After XMLTag-ifying" << endl; + cout << "\nOur Original targetModule Markup After XMLTag-ifying" << endl; cout << "---------------------" << endl; cout << newTargetModMarkup << endl; cout << "---------------------" << endl; @@ -214,7 +243,7 @@ if (optionDebug) { justTargetModBibleText = buildWordMaps(newTargetModMarkup, bibMap, targetWords, targetWordStarts, targetWordEnds); if (optionDebug) { - cout << "\nJust TargetMod Bible Text" << endl; + cout << "\nJust targetModule Bible Text" << endl; cout << "---------------------" << endl; cout << justTargetModBibleText << endl; cout << "---------------------" << endl; @@ -234,14 +263,13 @@ if (optionDebug) { // matcher->matchWords(targetWordTags, targetWords, fromWords, fromWordTags); - // ok, now that we have our targetWordTags magically populated // let's do the grunt work of inserting the <w> and </w> tags - insertWordTags(newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds); + insertWordTags((VerseKey *)targetMod.getKey(), newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds); if (optionDebug) { - cout << "\nHere's how you mapped things..." << endl; + cout << "\nHere's how we mapped things..." 
<< endl; cout << "---------------------" << endl; cout << "Total wordTags: " << wordTags.size() << endl; cout << "\nTargetMod Words: " << endl; @@ -251,7 +279,7 @@ if (optionDebug) { if (targetWordTags[i] == -1 && !strstr(ignoreSeries, targetWords[i])) { if (!warned) { cerr << "*** Error: didn't match all words: " << targetMod.getKeyText() << endl; - cerr << strongsSourceModuleName << ":"; + cerr << strongsSourceModuleName.c_str() << ":"; for (int j = 0; j < fromWords.size(); ++j) { cerr << " " << fromWords[j]; } @@ -268,7 +296,7 @@ if (optionDebug) { cerr << " " << i << ": " << targetWords[i] << " (" << matcher->sanitizeWord(targetWords[i]) << ")" << endl; } if (optionDebug) { - cout << targetWords[i] << " : " << targetWordTags[i] << " => " << (targetWordTags[i] != -1 ? wordTags[targetWordTags[i]] : "") << endl; + cout << targetWords[i] << " : " << targetWordTags[i] << " => " << (targetWordTags[i] > -1 ? wordTags[targetWordTags[i]] : "") << endl; } } if (warned) { @@ -276,15 +304,22 @@ if (optionDebug) { VerseKey *vk = (VerseKey *)targetMod.getKey(); for (int j = 0; j < targetWords.size(); ++j) { if (!strstr(ignoreSeries, targetWords[j])) { - cerr << targetWords[j] << "\t\t " << vk->getOSISRef() << "." << j << "=" << (targetWordTags[j] != -1 ? wordTags[targetWordTags[j]] : "") << endl; + cerr << targetWords[j] << "\t\t " << vk->getOSISRef() << "." << j << "=" << (targetWordTags[j] > -1 ? (const char *)wordTags[targetWordTags[j]] : (targetWordTags[j] == -2 ? "{Using Exception}" : "")) << endl; } } cerr << "---------------------" << endl; } + + if (optionFilterAccents) { + UTF8GreekAccents filter; + filter.setOptionValue("off"); + filter.processText(newTargetModMarkup); + } + if (optionDebug) { cout << "---------------------" << endl; - cout << "\nAND... Here's your final output" << endl; + cout << "\nAND... 
Here's our final output" << endl; cout << "---------------------" << endl; } cout << newTargetModMarkup << endl; @@ -292,6 +327,9 @@ if (optionDebug) { cout << endl; } } + + delete exceptionFile; + return 0; } @@ -305,6 +343,8 @@ SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) { int tagLevel = 0; int wTag = -1; int inTag = 0; + bool wTagsPresent = orig.indexOf("<w") > -1; + SWBuf lastElementText = ""; for (int i = 0; i < orig.length(); ++i) { if (orig[i] == '<') { inTag = true; @@ -312,30 +352,48 @@ SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) { else if (orig[i] == '>') { inTag = false; XMLTag t = tag.c_str(); + bool skipTag = false; if (!t.isEmpty()) { if (t.isEndTag()) { + // clear out empty w tags + if (t.getName() && !strcmp("w", t.getName())) { + if (!lastElementText.size()) { + out.setSize(wTag); + if (out.endsWith(' ')) { // && i < (orig.length() - 1) && orig[i+1] == ' ') { + out.setSize(out.size() - 1); + bibMap.pop_back(); + wTags.pop_back(); + } + skipTag = true; + } + } tagLevel--; wTag = -1; } else { + lastElementText = ""; tagLevel++; wTag = (t.getName() && !strcmp("w", t.getName())) ? 
out.size() : -1; } } - out += t; + if (!skipTag) out += t; tag = ""; } else if (inTag) { tag += orig[i]; } else { -// for texts without <w> tags -// if (!tagLevel || wTag != -1) { - if (wTag != -1 || orig[i] == ' ') { + if ( + // for texts without <w> tags + (!wTagsPresent && (!tagLevel || wTag != -1)) + // for texts with <w> tags + || ( wTagsPresent && (wTag != -1 || orig[i] == ' ')) + ) { bibMap.push_back(out.size()); wTags.push_back(wTag); } out += orig[i]; + lastElementText += orig[i]; } } return out; @@ -480,13 +538,32 @@ void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> & } -void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, const vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds) { +void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds) { // TODO: this method needs some work, // like putting multiple consecutive words // together in one tag + + ConfigEntMap exceptions; + + if (exceptionFile) { + exceptions = exceptionFile->getSection("exceptions"); + } + for (int i = 0; i < targetWordTags.size(); ++i) { + SWBuf wordTag = ""; if (targetWordTags[i] > -1) { - insert((const char *)wordTags[targetWordTags[i]], markupBuf, targetWordStarts[i], bibMap, wTags); + wordTag = wordTags[targetWordTags[i]]; + } + if (exceptionFile) { + SWBuf key; key.setFormatted("%s.%d", vk->getOSISRef(), i); + ConfigEntMap::const_iterator it = exceptions.find(key); + if (it != exceptions.end()) { + targetWordTags[i] = -2; // note that we are using an exception, not a mapping, not unset (-1) + wordTag = it->second; + } + } + if (wordTag.length()) { + insert((const char *)wordTag, markupBuf, targetWordStarts[i], bibMap, wTags); insert("</w>", markupBuf, targetWordEnds[i], bibMap, wTags, true); } } |