From 2add4f87ae8a5c77122e472c53b430a64e931a48 Mon Sep 17 00:00:00 2001 From: "Troy A. Griffitts" Date: Thu, 13 Apr 2023 08:31:02 +0000 Subject: updated migratetags to allow a TEI xml file as input git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@554 07627401-56e2-0310-80f4-f8cd0041bdcd --- migratetags/migratetags.cpp | 195 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 170 insertions(+), 25 deletions(-) diff --git a/migratetags/migratetags.cpp b/migratetags/migratetags.cpp index 2e35a7d..22f73d2 100644 --- a/migratetags/migratetags.cpp +++ b/migratetags/migratetags.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -22,9 +23,11 @@ Matcher *matcher = new DefaultMatcher(); // hard code your from and to modules here or pass them on the command line with - SWBuf strongsSourceModuleName = "WHNU"; SWBuf targetModuleName = "NA28FromImp"; +SWBuf targetTEIFile = ""; const char *ignoreSeries = "⸆¹⸆²⸆⸇᾿˸¹˸²˸³˸·¹²⟦–ʹ°¹°²⸋¹⸋²⸋⸌¹⸌°*[];⸀¹⸀²⸀³⸁⸀◆⟧ ⸂¹⸂²⸄⸂⸅⸃⸉¹⸈⸉⸊ "; +//const char *ignoreSeries = ""; typedef vector BibMap; @@ -48,6 +51,7 @@ void usage(const char *progName, const char *error = 0) { fprintf(stderr, "\nusage: %s [options]\n", progName); fprintf(stderr, " -ss \t provide the Strong's source module name\n"); fprintf(stderr, " -t \t provide the target module name\n"); + fprintf(stderr, " -tei \t provide the target tei filename\n"); fprintf(stderr, " -e \t provide an ini-style .conf file with overriding tag exceptions.\n"); fprintf(stderr, " -fa\t\t\t filter accents: remove Greek accents from final text\n"); fprintf(stderr, " -fc\t\t\t filter critical apparatus markers from final text\n"); @@ -58,6 +62,122 @@ void usage(const char *progName, const char *error = 0) { } +SWModule *targetMod = 0; +bool getNextVerse(VerseKey *targetModKey, SWBuf *targetModText) { + static int z = 0; + static bool finished = false; + if (++z == 1) { + ((VerseKey *)targetMod->getKey())->setIntros(true); + targetMod->getKey()->setText("mat0.0"); + } + + // assert our source is in good condition to give us more data + if (finished) return false; + + // grab our raw, fully marked up TargetMod text for this verse + (*targetModText) = targetMod->getRawEntryBuf(); + (*targetModKey) = (*(targetMod->getKey())); + + // clear any error from retrieving text + targetMod->popError(); + (*targetMod)++; + finished = targetMod->popError(); + + return true; +} + +FileDesc *targetInput = 0; +bool getNextVerseTEI(VerseKey *targetModKey, SWBuf *targetModText) { + static bool finished = false; + static bool fileEnd = false; + static SWBuf line = ""; + + XMLTag lastAB(""); + (*targetModText) = ""; + + while (!fileEnd || line.size()) { + if (!line.size()) { + fileEnd = !FileMgr::getLine(targetInput, line); + if (!fileEnd) line.append("\n"); + } + int offset = line.indexOf(""); + if (offset < 0) offset = endOffset; + else if (endOffset > -1 && endOffset < offset) offset = endOffset; + if (offset > -1) { + targetModText->append(line, offset); + line << offset; + int end = line.indexOf(">"); + if (end > -1) { + SWBuf abText = ""; + abText.append(line, end+1); + XMLTag ab(abText); + targetModText->append(abText); + line << (end+1); + if (ab.isEndTag()) { + break; + } + lastAB = ab; + } + } + else { + targetModText->append(line); + line = ""; + } + } + + // assert our source is in good condition to give us more data + if (fileEnd && !line.size()) return false; + + // grab our raw, fully marked up TargetMod text for this verse + if (lastAB.isEndTag()) { + // we are just returning interverse material so targetModKey is out of bounds + // just set to any error + targetModKey->setError(-99); + } + else { + // + SWBuf id = lastAB.getAttribute("xml:id"); + SWBuf bkv = ""; + SWBuf bookName = ""; + SWBuf bookNum = ""; + SWBuf chapter = ""; + SWBuf verse = ""; + SWBuf segment = id.stripPrefix('-'); + if (!segment.size()) bkv = id; + if (!bkv.size() && !segment.startsWith("B")) { + segment = id.stripPrefix('-'); + } + else if (!bkv.size()) bkv = segment; + if (!bkv.size() && !segment.startsWith("B")) { + segment = id.stripPrefix('-'); + } + else if (!bkv.size()) bkv = segment; + // if we have more segments, find the last segment + // because this is likely the bookName + if (bkv.size() && id.size() && id != bkv) { + id.stripPrefix('-'); + id.stripPrefix('-'); + id.stripPrefix('-'); + id.stripPrefix('-'); + bookName = id; + } + if (bkv.size()) { + bkv << 1; + bookNum = bkv.stripPrefix('K'); + chapter = bkv.stripPrefix('V'); + verse = bkv; + + SWBuf osisID = (bookName.size() ? bookName : bookNum); + osisID.appendFormatted(".%s.%s", chapter.c_str(), verse.c_str()); + (*targetModKey) = osisID; + } + + } + + return true; +} + int main(int argc, char **argv) { const char *progName = argv[0]; @@ -83,6 +203,12 @@ int main(int argc, char **argv) { } else usage(progName, "-t argument requires a module name."); } + else if (!strcmp(argv[i], "-tei")) { + if ((i + 1) < argc) { + targetTEIFile = argv[++i]; + } + else usage(progName, "-tei argument requires a tei filename."); + } else if (!strcmp(argv[i], "-e")) { if (i+1 < argc) { optionExceptionFile.push_back(argv[++i]); @@ -94,13 +220,25 @@ int main(int argc, char **argv) { SWMgr lib; lib.setGlobalOption("Textual Variants", "Secondary Reading"); - SWModule *m = lib.getModule(targetModuleName); - if (!m) { - cerr << "\nERROR: couldn't find target module: " << targetModuleName << ".\n"; - if (argc < 2) usage(progName, "Use -t to supply target module name"); - exit(1); + SWModule *m = 0; + if (targetTEIFile.size()) { + targetInput = FileMgr::getSystemFileMgr()->open(targetTEIFile, FileMgr::RDONLY); + if (!targetInput || targetInput->getFd() < 1) { + cerr << "\nERROR: couldn't open tei file: " << targetTEIFile << ".\n"; + usage(progName, "Use -tei to supply tei filename"); + exit(1); + } + } + else { + m = lib.getModule(targetModuleName); + if (!m) { + cerr << "\nERROR: couldn't find target module: " << targetModuleName << ".\n"; + if (argc < 2) usage(progName, "Use -t to supply target module name"); + exit(1); + } + targetMod = m; } - SWModule &targetMod = *m; + m = lib.getModule(strongsSourceModuleName.c_str()); if (!m) { cerr << "\nERROR: couldn't find Strong's source module: " << strongsSourceModuleName.c_str() << ".\n"; @@ -115,15 +253,17 @@ int main(int argc, char **argv) { else (*exceptionFile) += SWConfig(fileName); } - // we'll do the whole Bible eventually, but let's just get one verse - // working well. - ((VerseKey *)targetMod.getKey())->setIntros(true); - targetMod.getKey()->setText("mat0.0"); // let's try this verse - int z = 0; - for (; -//!z && -!targetMod.popError(); targetMod++) { - z++; + VerseKey *targetModKey = (VerseKey *)(targetInput ? fromMod.createKey() : targetMod->createKey()); + targetModKey->setIntros(true); + SWBuf targetModText; + while ((targetInput ? getNextVerseTEI(targetModKey, &targetModText) : getNextVerse(targetModKey, &targetModText))) { + if (targetModKey->getError()) { + cout << targetModText; + cout << endl; + continue; + } + // we'll do the whole Bible eventually, but let's just get one verse + // working well. // XML word tags which should be placed in this verse (start tag) // eg., @@ -177,11 +317,13 @@ int main(int argc, char **argv) { bibMap.clear(); wTags.clear(); - fromMod.setKey(targetMod.getKey()); - cout << "$$$ " << targetMod.getKeyText() << endl; + fromMod.setKey(targetModKey); + if (!targetTEIFile.size()) { + cout << "$$$ " << targetModKey->getText() << endl; + } if (optionDebug) { - cout << "\nProcessing Verse: " << targetMod.getKeyText() << endl; + cout << "\nProcessing Verse: " << targetModKey->getText() << endl; cout << "---------------------" << endl; cout << "\nOur strongsSourceModule Markup" << endl; @@ -192,7 +334,7 @@ if (optionDebug) { // grab our raw, fully marked up TargetMod text for this verse - SWBuf orig = targetMod.getRawEntryBuf(); + SWBuf orig = targetModText; if (optionDebug) { @@ -268,7 +410,7 @@ if (optionDebug) { // ok, now that we have our targetWordTags magically populated // let's do the grunt work of inserting the and tags - insertWordTags((VerseKey *)targetMod.getKey(), newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds); + insertWordTags((VerseKey *)targetModKey, newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds); if (optionDebug) { @@ -281,13 +423,13 @@ if (optionDebug) { for (int i = 0; i < targetWords.size(); ++i) { if (targetWordTags[i] == -1 && !strstr(ignoreSeries, targetWords[i])) { if (!warned) { - cerr << "*** Error: didn't match all words: " << targetMod.getKeyText() << endl; + cerr << "*** Error: didn't match all words: " << targetModKey->getText() << endl; cerr << strongsSourceModuleName.c_str() << ":"; for (int j = 0; j < fromWords.size(); ++j) { cerr << " " << fromWords[j]; } cerr << endl; - cerr << targetModuleName << ":"; + cerr << (targetTEIFile.size() ? targetTEIFile : targetModuleName) << ":"; for (int j = 0; j < targetWords.size(); ++j) { cerr << " " << targetWords[j]; } @@ -303,8 +445,8 @@ if (optionDebug) { } } if (warned) { - cerr << "\n" << targetModuleName << " Tags:\n"; - VerseKey *vk = (VerseKey *)targetMod.getKey(); + cerr << "\n" << (targetTEIFile.size() ? targetTEIFile : targetModuleName) << " Tags:\n"; + VerseKey *vk = (VerseKey *)targetModKey; for (int j = 0; j < targetWords.size(); ++j) { if (!strstr(ignoreSeries, targetWords[j])) { cerr << targetWords[j] << "\t\t " << vk->getOSISRef() << "." << j << "=" << (targetWordTags[j] > -1 ? (const char *)wordTags[targetWordTags[j]] : (targetWordTags[j] == -2 ? "{Using Exception}" : "")) << endl; @@ -325,7 +467,10 @@ if (optionDebug) { cout << "\nAND... Here's our final output" << endl; cout << "---------------------" << endl; } - cout << newTargetModMarkup << endl; + cout << newTargetModMarkup; + if (!targetTEIFile.size()) { + cout << endl; + } if (optionDebug) { cout << endl; } -- cgit