summaryrefslogtreecommitdiffstats
path: root/migratetags/migratetags.cpp
diff options
context:
space:
mode:
authorTroy A. Griffitts <scribe@crosswire.org>2022-02-17 17:42:32 +0000
committerTroy A. Griffitts <scribe@crosswire.org>2022-02-17 17:42:32 +0000
commitbaa326c09201d76f02c28168ab4eaad427596b48 (patch)
tree28152621d237eebed45ca9f1e4fab4e6596c348f /migratetags/migratetags.cpp
parenta05a43ad3ea9aef59466d4bb600e26c4cb777784 (diff)
downloadsword-tools-baa326c09201d76f02c28168ab4eaad427596b48.tar.gz
Committed latest version of migrate tags
Made it work with both <w> modules and modules without <w> tags; added parameter -t to specify target module. git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@540 07627401-56e2-0310-80f4-f8cd0041bdcd
Diffstat (limited to 'migratetags/migratetags.cpp')
-rw-r--r--migratetags/migratetags.cpp147
1 files changed, 112 insertions, 35 deletions
diff --git a/migratetags/migratetags.cpp b/migratetags/migratetags.cpp
index 2b6b92b..ecb1ed0 100644
--- a/migratetags/migratetags.cpp
+++ b/migratetags/migratetags.cpp
@@ -3,6 +3,7 @@
#include <swmgr.h>
#include <utilxml.h>
#include <swbuf.h>
+#include <swconfig.h>
#include <swmodule.h>
#include <stringmgr.h>
#include <iostream>
@@ -14,11 +15,13 @@ using namespace std;
#include "matchers/matcher.h"
// select your matcher here
+//#include "matchers/gntmatcher.h"
#include "matchers/defaultmatcher.h"
Matcher *matcher = new DefaultMatcher();
-const char *targetModuleName="NA28";
-const char *strongsSourceModuleName="WHNU";
+// hard code your from and to modules here or pass them on the command line with -ss and -t
+SWBuf strongsSourceModuleName = "WHNU";
+SWBuf targetModuleName = "NA28FromImp";
const char *ignoreSeries = "⸆¹⸆²⸆⸇᾿˸¹˸²˸³˸·¹²⟦–ʹ°¹°²⸋¹⸋²⸋⸌¹⸌°*[];⸀¹⸀²⸀³⸁⸀◆⟧ ⸂¹⸂²⸄⸂⸅⸃⸉¹⸈⸉⸊ ";
@@ -30,12 +33,14 @@ void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags
SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &tTags);
SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds);
void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &fromWords, vector<int> &fromWordTags);
-void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, const vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds);
+void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds);
// app options
bool optionFilterAccents = false;
bool optionFilterAppCrit = false;
bool optionDebug = false;
+vector<SWBuf> optionExceptionFile;
+SWConfig *exceptionFile = 0;
void usage(const char *progName, const char *error = 0) {
if (error) fprintf(stderr, "\n%s: %s\n", progName, error);
@@ -43,6 +48,10 @@ void usage(const char *progName, const char *error = 0) {
fprintf(stderr, "\nusage: %s [options]\n", progName);
fprintf(stderr, " -v\t\t\t verbose: print lots of information while processing\n");
fprintf(stderr, " -fa\t\t\t filter accents: remove Greek accents from final text\n");
+ fprintf(stderr, " -fc\t\t\t filter critical apparatus markers from final text\n");
+ fprintf(stderr, " -ss <moduleName>\t provide the Strong's source module name\n");
+ fprintf(stderr, " -t <moduleName>\t provide the target module name\n");
+ fprintf(stderr, " -e <exception file>\t provide an ini-style .conf file with overriding tag exceptions.\n");
fprintf(stderr, "\n\n");
exit(-1);
}
@@ -61,6 +70,24 @@ int main(int argc, char **argv) {
else if (!strcmp(argv[i], "-fc")) {
optionFilterAppCrit = true;
}
+ else if (!strcmp(argv[i], "-ss")) {
+ if ((i + 1) < argc) {
+ strongsSourceModuleName = argv[++i];
+ }
+ else usage(progName, "-ss argument requires a module name.");
+ }
+ else if (!strcmp(argv[i], "-t")) {
+ if ((i + 1) < argc) {
+ targetModuleName = argv[++i];
+ }
+ else usage(progName, "-t argument requires a module name.");
+ }
+ else if (!strcmp(argv[i], "-e")) {
+ if (i+1 < argc) {
+ optionExceptionFile.push_back(argv[++i]);
+ }
+ else usage(progName, "-e argument requires a file name.");
+ }
else usage(progName, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
}
@@ -72,16 +99,23 @@ int main(int argc, char **argv) {
exit(1);
}
SWModule &targetMod = *m;
- m = lib.getModule(strongsSourceModuleName);
+ m = lib.getModule(strongsSourceModuleName.c_str());
if (!m) {
- cerr << "couldn't find source module: " << strongsSourceModuleName << ".\n";
+ cerr << "couldn't find source module: " << strongsSourceModuleName.c_str() << ".\n";
exit(1);
}
SWModule &fromMod = *m;
+ for (int i = 0; i < optionExceptionFile.size(); ++i) {
+ SWBuf fileName = optionExceptionFile[i];
+ if (!i) exceptionFile = new SWConfig(fileName);
+ else (*exceptionFile) += SWConfig(fileName);
+ }
+
// we'll do the whole Bible eventually, but let's just get one verse
// working well.
- targetMod.setKey("mat1.1"); // let's try this verse
+ ((VerseKey *)targetMod.getKey())->setIntros(true);
+ targetMod.getKey()->setText("mat0.0"); // let's try this verse
int z = 0;
for (;
//!z &&
@@ -147,7 +181,7 @@ if (optionDebug) {
cout << "\nProcessing Verse: " << targetMod.getKeyText() << endl;
cout << "---------------------" << endl;
- cout << "\nOur FromMod Verse Markup" << endl;
+ cout << "\nOur strongsSourceModule Markup" << endl;
cout << "---------------------" << endl;
cout << fromMod.getRawEntry() << endl;
cout << "---------------------" << endl;
@@ -157,18 +191,20 @@ if (optionDebug) {
// grab our raw, fully marked up TargetMod text for this verse
SWBuf orig = targetMod.getRawEntryBuf();
- if (optionFilterAccents) {
- UTF8GreekAccents filter;
- filter.setOptionValue("off");
- filter.processText(orig);
- }
+
+if (optionDebug) {
+ cout << "\nOur Original targetModule Markup" << endl;
+ cout << "---------------------" << endl;
+ cout << orig << endl;
+ cout << "---------------------" << endl;
+}
if (optionFilterAppCrit) {
SWBuf o = orig;
const unsigned char* from = (unsigned char*)o.c_str();
orig = "";
while (*from) {
- __u32 ch = getUniCharFromUTF8(&from, true);
+ SW_u32 ch = getUniCharFromUTF8(&from, true);
// if ch is bad, then convert to replacement char
if (!ch) ch = 0xFFFD;
SWBuf checkChar;
@@ -178,13 +214,6 @@ if (optionDebug) {
}
}
-if (optionDebug) {
- cout << "\nOur Original TargetMod Markup" << endl;
- cout << "---------------------" << endl;
- cout << orig << endl;
- cout << "---------------------" << endl;
-}
-
// let's find where just the canonical text is amongst
// all our markup
// newTargetModMarkup will eventually hold our updated markup with
@@ -196,7 +225,7 @@ if (optionDebug) {
SWBuf newTargetModMarkup = findCanonicalBibleText(orig, bibMap, wTags);
if (optionDebug) {
- cout << "\nOur Original TargetMod Markup After XMLTag-ifying" << endl;
+ cout << "\nOur Original targetModule Markup After XMLTag-ifying" << endl;
cout << "---------------------" << endl;
cout << newTargetModMarkup << endl;
cout << "---------------------" << endl;
@@ -214,7 +243,7 @@ if (optionDebug) {
justTargetModBibleText = buildWordMaps(newTargetModMarkup, bibMap, targetWords, targetWordStarts, targetWordEnds);
if (optionDebug) {
- cout << "\nJust TargetMod Bible Text" << endl;
+ cout << "\nJust targetModule Bible Text" << endl;
cout << "---------------------" << endl;
cout << justTargetModBibleText << endl;
cout << "---------------------" << endl;
@@ -234,14 +263,13 @@ if (optionDebug) {
//
matcher->matchWords(targetWordTags, targetWords, fromWords, fromWordTags);
-
// ok, now that we have our targetWordTags magically populated
// let's do the grunt work of inserting the <w> and </w> tags
- insertWordTags(newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds);
+ insertWordTags((VerseKey *)targetMod.getKey(), newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds);
if (optionDebug) {
- cout << "\nHere's how you mapped things..." << endl;
+ cout << "\nHere's how we mapped things..." << endl;
cout << "---------------------" << endl;
cout << "Total wordTags: " << wordTags.size() << endl;
cout << "\nTargetMod Words: " << endl;
@@ -251,7 +279,7 @@ if (optionDebug) {
if (targetWordTags[i] == -1 && !strstr(ignoreSeries, targetWords[i])) {
if (!warned) {
cerr << "*** Error: didn't match all words: " << targetMod.getKeyText() << endl;
- cerr << strongsSourceModuleName << ":";
+ cerr << strongsSourceModuleName.c_str() << ":";
for (int j = 0; j < fromWords.size(); ++j) {
cerr << " " << fromWords[j];
}
@@ -268,7 +296,7 @@ if (optionDebug) {
cerr << " " << i << ": " << targetWords[i] << " (" << matcher->sanitizeWord(targetWords[i]) << ")" << endl;
}
if (optionDebug) {
- cout << targetWords[i] << " : " << targetWordTags[i] << " => " << (targetWordTags[i] != -1 ? wordTags[targetWordTags[i]] : "") << endl;
+ cout << targetWords[i] << " : " << targetWordTags[i] << " => " << (targetWordTags[i] > -1 ? wordTags[targetWordTags[i]] : "") << endl;
}
}
if (warned) {
@@ -276,15 +304,22 @@ if (optionDebug) {
VerseKey *vk = (VerseKey *)targetMod.getKey();
for (int j = 0; j < targetWords.size(); ++j) {
if (!strstr(ignoreSeries, targetWords[j])) {
- cerr << targetWords[j] << "\t\t " << vk->getOSISRef() << "." << j << "=" << (targetWordTags[j] != -1 ? wordTags[targetWordTags[j]] : "") << endl;
+ cerr << targetWords[j] << "\t\t " << vk->getOSISRef() << "." << j << "=" << (targetWordTags[j] > -1 ? (const char *)wordTags[targetWordTags[j]] : (targetWordTags[j] == -2 ? "{Using Exception}" : "")) << endl;
}
}
cerr << "---------------------" << endl;
}
+
+ if (optionFilterAccents) {
+ UTF8GreekAccents filter;
+ filter.setOptionValue("off");
+ filter.processText(newTargetModMarkup);
+ }
+
if (optionDebug) {
cout << "---------------------" << endl;
- cout << "\nAND... Here's your final output" << endl;
+ cout << "\nAND... Here's our final output" << endl;
cout << "---------------------" << endl;
}
cout << newTargetModMarkup << endl;
@@ -292,6 +327,9 @@ if (optionDebug) {
cout << endl;
}
}
+
+ delete exceptionFile;
+
return 0;
}
@@ -305,6 +343,8 @@ SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) {
int tagLevel = 0;
int wTag = -1;
int inTag = 0;
+ bool wTagsPresent = orig.indexOf("<w") > -1;
+ SWBuf lastElementText = "";
for (int i = 0; i < orig.length(); ++i) {
if (orig[i] == '<') {
inTag = true;
@@ -312,30 +352,48 @@ SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) {
else if (orig[i] == '>') {
inTag = false;
XMLTag t = tag.c_str();
+ bool skipTag = false;
if (!t.isEmpty()) {
if (t.isEndTag()) {
+ // clear out empty w tags
+ if (t.getName() && !strcmp("w", t.getName())) {
+ if (!lastElementText.size()) {
+ out.setSize(wTag);
+ if (out.endsWith(' ')) { // && i < (orig.length() - 1) && orig[i+1] == ' ') {
+ out.setSize(out.size() - 1);
+ bibMap.pop_back();
+ wTags.pop_back();
+ }
+ skipTag = true;
+ }
+ }
tagLevel--;
wTag = -1;
}
else {
+ lastElementText = "";
tagLevel++;
wTag = (t.getName() && !strcmp("w", t.getName())) ? out.size() : -1;
}
}
- out += t;
+ if (!skipTag) out += t;
tag = "";
}
else if (inTag) {
tag += orig[i];
}
else {
-// for texts without <w> tags
-// if (!tagLevel || wTag != -1) {
- if (wTag != -1 || orig[i] == ' ') {
+ if (
+ // for texts without <w> tags
+ (!wTagsPresent && (!tagLevel || wTag != -1))
+ // for texts with <w> tags
+ || ( wTagsPresent && (wTag != -1 || orig[i] == ' '))
+ ) {
bibMap.push_back(out.size());
wTags.push_back(wTag);
}
out += orig[i];
+ lastElementText += orig[i];
}
}
return out;
@@ -480,13 +538,32 @@ void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &
}
-void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, const vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds) {
+void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds) {
// TODO: this method needs some work,
// like putting multiple consecutive words
// together in one tag
+
+ ConfigEntMap exceptions;
+
+ if (exceptionFile) {
+ exceptions = exceptionFile->getSection("exceptions");
+ }
+
for (int i = 0; i < targetWordTags.size(); ++i) {
+ SWBuf wordTag = "";
if (targetWordTags[i] > -1) {
- insert((const char *)wordTags[targetWordTags[i]], markupBuf, targetWordStarts[i], bibMap, wTags);
+ wordTag = wordTags[targetWordTags[i]];
+ }
+ if (exceptionFile) {
+ SWBuf key; key.setFormatted("%s.%d", vk->getOSISRef(), i);
+ ConfigEntMap::const_iterator it = exceptions.find(key);
+ if (it != exceptions.end()) {
+ targetWordTags[i] = -2; // note that we are using an exception, not a mapping, not unset (-1)
+ wordTag = it->second;
+ }
+ }
+ if (wordTag.length()) {
+ insert((const char *)wordTag, markupBuf, targetWordStarts[i], bibMap, wTags);
insert("</w>", markupBuf, targetWordEnds[i], bibMap, wTags, true);
}
}