#include <versekey.h>
#include <utf8greekaccents.h>
#include <swmgr.h>
#include <utilxml.h>
#include <swbuf.h>
#include <swconfig.h>
#include <swmodule.h>
#include <stringmgr.h>
#include <iostream>
#include <vector>
using namespace sword;
using namespace std;
#include "matchers/matcher.h"
// select your matcher here
//#include "matchers/gntmatcher.h"
#include "matchers/defaultmatcher.h"
// The word-matching strategy used to align target-module words with
// Strong's-source-module words.  Select an implementation via the
// #include above; lives for the life of the process (never deleted).
Matcher *matcher = new DefaultMatcher();

// hard code your from and to modules here or pass them on the command line with -
SWBuf strongsSourceModuleName = "WHNU";
SWBuf targetModuleName = "NA28FromImp";

// Characters treated as critical-apparatus / editorial noise.  Words made up
// of these are never reported as unmatched, and with -fc they are stripped
// from the text entirely.  NOTE: these are multi-byte UTF-8 sequences; the
// code below matches them via strstr on single decoded characters.
const char *ignoreSeries = "⸆¹⸆²⸆⸇᾿˸¹˸²˸³˸·¹²⟦–ʹ°¹°²⸋¹⸋²⸋⸌¹⸌°*[];⸀¹⸀²⸀³⸁⸀◆⟧ ⸂¹⸂²⸄⸂⸅⸃⸉¹⸈⸉⸊ ";

// Maps each character of plain Biblical text to an offset in a marked-up
// output buffer (see findCanonicalBibleText / insert below).
typedef vector<unsigned long> BibMap;

// Insert addText into out at Bible-text position bibPos, keeping bibMap/wTags offsets in sync.
void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags, bool after = false);
// Strip non-canonical markup bookkeeping: returns processed markup and fills bibMap/tTags.
SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &tTags);
// Tokenize the canonical text into words with start/end offsets.
SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds);
// Extract <w> tags and their words from the Strong's source module.
void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &fromWords, vector<int> &fromWordTags);
// Write matched <w>...</w> tags back into the target markup.
void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds);

// app options
bool optionFilterAccents = false;
bool optionFilterAppCrit = false;
bool optionDebug = false;
vector<SWBuf> optionExceptionFile;
// Merged view of all -e exception files (section "exceptions"); 0 if none given.
SWConfig *exceptionFile = 0;
// Print usage information to stderr -- optionally preceded by an error
// message -- then terminate the program with a non-zero exit status.
// This function never returns.
void usage(const char *progName, const char *error = 0) {
	if (error) fprintf(stderr, "\n%s: %s\n", progName, error);
	fputs("\n=== migratetags (Revision $Rev$) Migrate word morphology from one module to another.\n", stderr);
	fprintf(stderr, "\nusage: %s [options]\n", progName);
	// fixed help text, one entry per option
	static const char *helpLines[] = {
		" -ss <moduleName>\t provide the Strong's source module name\n",
		" -t <moduleName>\t provide the target module name\n",
		" -e <exception file>\t provide an ini-style .conf file with overriding tag exceptions.\n",
		" -fa\t\t\t filter accents: remove Greek accents from final text\n",
		" -fc\t\t\t filter critical apparatus markers from final text\n",
		" -v\t\t\t verbose: print lots of information while processing\n",
		" --help\t\t this usage message\n",
		"\n\n"
	};
	for (unsigned int i = 0; i < sizeof(helpLines)/sizeof(helpLines[0]); ++i) {
		fputs(helpLines[i], stderr);
	}
	exit(-1);
}
// Entry point: parses options, opens the target and Strong's-source modules,
// then walks every verse of the target module aligning its words with the
// source module's tagged words and emitting re-tagged markup on stdout.
// Diagnostics and unmatched-word reports go to stderr.
int main(int argc, char **argv) {

	const char *progName = argv[0];

	// ---- parse command-line options (see usage() above) ----
	for (int i = 1; i < argc; ++i) {
		if (!strcmp(argv[i], "-v")) {
			optionDebug = true;
		}
		else if (!strcmp(argv[i], "-fa")) {
			optionFilterAccents = true;
		}
		else if (!strcmp(argv[i], "-fc")) {
			optionFilterAppCrit = true;
		}
		else if (!strcmp(argv[i], "-ss")) {
			if ((i + 1) < argc) {
				strongsSourceModuleName = argv[++i];
			}
			else usage(progName, "-ss argument requires a module name.");
		}
		else if (!strcmp(argv[i], "-t")) {
			if ((i + 1) < argc) {
				targetModuleName = argv[++i];
			}
			else usage(progName, "-t argument requires a module name.");
		}
		else if (!strcmp(argv[i], "-e")) {
			if (i+1 < argc) {
				optionExceptionFile.push_back(argv[++i]);
			}
			else usage(progName, "-e argument requires a file name.");
		}
		else usage(progName, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
	}

	// ---- locate our modules in the installed SWORD library ----
	SWMgr lib;
	// include secondary readings so variant text is present in the markup
	lib.setGlobalOption("Textual Variants", "Secondary Reading");
	SWModule *m = lib.getModule(targetModuleName);
	if (!m) {
		cerr << "\nERROR: couldn't find target module: " << targetModuleName << ".\n";
		if (argc < 2) usage(progName, "Use -t to supply target module name");
		exit(1);
	}
	SWModule &targetMod = *m;
	m = lib.getModule(strongsSourceModuleName.c_str());
	if (!m) {
		cerr << "\nERROR: couldn't find Strong's source module: " << strongsSourceModuleName.c_str() << ".\n";
		if (argc < 2) usage(progName, "Use -ss to supply Strong's source module name");
		exit(1);
	}
	SWModule &fromMod = *m;

	// merge any -e exception files into one SWConfig (later files override earlier)
	for (int i = 0; i < optionExceptionFile.size(); ++i) {
		SWBuf fileName = optionExceptionFile[i];
		if (!i) exceptionFile = new SWConfig(fileName);
		else (*exceptionFile) += SWConfig(fileName);
	}

	// we'll do the whole Bible eventually, but let's just get one verse
	// working well.
	((VerseKey *)targetMod.getKey())->setIntros(true);
	targetMod.getKey()->setText("mat0.0");	// let's try this verse
	int z = 0;
	// iterate every verse of the target module until we run off the end
	for (;
			//!z &&
			!targetMod.popError(); targetMod++) {
		z++;

		// XML word tags which should be placed in this verse (start tag)
		// eg., <w lemma=...>
		// pulled from FromMod
		vector<XMLTag> wordTags;

		// Just the raw canonical Bible text of this verse with no tags
		// eg., "In the beginning God created the heavens and the earth."
		SWBuf justTargetModBibleText = "";

		// a mapping for each character in justTargetModBibleText to the real location
		// in our out buffer. This allows us to insert our <w> and </w>
		// tags in the correct place amongst the fully marked up
		// TargetMod out buffer. This work is all done in the insert() method
		// above
		BibMap bibMap;
		BibMap wTags;

		// justTargetModBibleText (above) broken down into separate words
		// ie. all words in the TargetMod from this verse
		// eg. [0] = "In"; [1] = "the"; [2] = "beginning"; ...
		vector<SWBuf> targetWords;

		// where each corresponding targetWords[x] starts in justTargetModBibleText
		// eg. for "In the beginning..."
		// [0] = 0; [1] = 3; [2] = 7; ...
		// Needed to pass to insert method so we know where
		// to insert the <w> start tag
		vector<int> targetWordStarts;

		// same as targetWordStarts, but the end of each word
		// eg. [0] = 1; [1] = 5; [2] = 15
		// Needed to pass to insert method so we know where
		// to insert the </w> end tag
		vector<int> targetWordEnds;

		// This is the doozy. This maps each TargetMod word to the correct
		// wordTags entry.
		vector<int> targetWordTags;

		// Equivalent to targetWords above, but for the FromMod.
		// Useful for helping determine matches to TargetMod words
		vector<SWBuf> fromWords;

		// Equivalent to targetWordTag which we need to produce,
		// but this one is produced for us from the FromMod data
		// If we can match a fromWords[x] entry, then we can assign
		// targetWorkTags[ourMatch] = fromWordTags[x]
		vector<int> fromWordTags;

		bibMap.clear();
		wTags.clear();

		// position the source module at the same verse as the target
		fromMod.setKey(targetMod.getKey());

		cout << "$$$ " << targetMod.getKeyText() << endl;
		if (optionDebug) {
			cout << "\nProcessing Verse: " << targetMod.getKeyText() << endl;
			cout << "---------------------" << endl;
			cout << "\nOur strongsSourceModule Markup" << endl;
			cout << "---------------------" << endl;
			cout << fromMod.getRawEntry() << endl;
			cout << "---------------------" << endl;
		}

		// grab our raw, fully marked up TargetMod text for this verse
		SWBuf orig = targetMod.getRawEntryBuf();

		if (optionDebug) {
			cout << "\nOur Original targetModule Markup" << endl;
			cout << "---------------------" << endl;
			cout << orig << endl;
			cout << "---------------------" << endl;
		}

		// -fc: strip critical apparatus characters (those in ignoreSeries,
		// except the plain space) from the raw markup, one decoded UTF-8
		// character at a time
		if (optionFilterAppCrit) {
			SWBuf o = orig;
			const unsigned char* from = (unsigned char*)o.c_str();
			orig = "";
			while (*from) {
				SW_u32 ch = getUniCharFromUTF8(&from, true);
				// if ch is bad, then convert to replacement char
				if (!ch) ch = 0xFFFD;
				SWBuf checkChar;
				getUTF8FromUniChar(ch, &checkChar);
				if (checkChar != " " && strstr(ignoreSeries, checkChar.c_str())) continue;
				orig.append(checkChar);
			}
		}

		// let's find where just the canonical text is amongst
		// all our markup
		// newTargetModMarkup will eventually hold our updated markup with
		// the new <w> tags, but we'll start here by setting it to
		// the processed original markup.
		// on return, bibMap will be populated with each character
		// and the corresponding location into newTargetModMarkup where
		// the character resides.
		SWBuf newTargetModMarkup = findCanonicalBibleText(orig, bibMap, wTags);

		if (optionDebug) {
			cout << "\nOur Original targetModule Markup After XMLTag-ifying" << endl;
			cout << "---------------------" << endl;
			cout << newTargetModMarkup << endl;
			cout << "---------------------" << endl;
			cout << "\nOur bibMap" << endl;
			cout << "---------------------" << endl;
			for (BibMap::iterator it = bibMap.begin(); it != bibMap.end(); ++it) {
				cout << *it << " ";
			}
			cout << "\n---------------------" << endl;
		}

		// let's populate our TargetMod word data and fill in our
		// justTargetModBibleText buffer
		justTargetModBibleText = buildWordMaps(newTargetModMarkup, bibMap, targetWords, targetWordStarts, targetWordEnds);

		if (optionDebug) {
			cout << "\nJust targetModule Bible Text" << endl;
			cout << "---------------------" << endl;
			cout << justTargetModBibleText << endl;
			cout << "---------------------" << endl;
		}

		// ok, now lets grab out the groovy data from the FromMod module
		pullFromModData(fromMod, wordTags, fromWords, fromWordTags);

		//
		// ok, here's the real work.
		//
		// This method needs to guess which TargetMod words match which FromMod
		// words and then point them to their same original language
		// word tag by populating targetWordTags
		//
		matcher->matchWords(targetWordTags, targetWords, fromWords, fromWordTags);

		// ok, now that we have our targetWordTags magically populated
		// let's do the grunt work of inserting the <w> and </w> tags
		insertWordTags((VerseKey *)targetMod.getKey(), newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds);

		if (optionDebug) {
			cout << "\nHere's how we mapped things..." << endl;
			cout << "---------------------" << endl;
			cout << "Total wordTags: " << wordTags.size() << endl;
			cout << "\nTargetMod Words: " << endl;
		}

		// report any target words the matcher left unassigned (-1), skipping
		// pure apparatus "words"; print a full context dump once per verse
		bool warned = false;
		for (int i = 0; i < targetWords.size(); ++i) {
			if (targetWordTags[i] == -1 && !strstr(ignoreSeries, targetWords[i])) {
				if (!warned) {
					cerr << "*** Error: didn't match all words: " << targetMod.getKeyText() << endl;
					cerr << strongsSourceModuleName.c_str() << ":";
					for (int j = 0; j < fromWords.size(); ++j) {
						cerr << " " << fromWords[j];
					}
					cerr << endl;
					cerr << targetModuleName << ":";
					for (int j = 0; j < targetWords.size(); ++j) {
						cerr << " " << targetWords[j];
					}
					cerr << endl;
					cerr << endl;
					cerr << "Unmatched Words:" << endl;
					warned = true;
				}
				cerr << " " << i << ": " << targetWords[i] << " (" << matcher->sanitizeWord(targetWords[i]) << ")" << endl;
			}
			if (optionDebug) {
				cout << targetWords[i] << " : " << targetWordTags[i] << " => " << (targetWordTags[i] > -1 ? wordTags[targetWordTags[i]] : "") << endl;
			}
		}
		// when words went unmatched, also dump a ready-to-edit exception-file
		// template (word <tab> OSISRef.index=tag); -2 marks exception-sourced tags
		if (warned) {
			cerr << "\n" << targetModuleName << " Tags:\n";
			VerseKey *vk = (VerseKey *)targetMod.getKey();
			for (int j = 0; j < targetWords.size(); ++j) {
				if (!strstr(ignoreSeries, targetWords[j])) {
					cerr << targetWords[j] << "\t\t " << vk->getOSISRef() << "." << j << "=" << (targetWordTags[j] > -1 ? (const char *)wordTags[targetWordTags[j]] : (targetWordTags[j] == -2 ? "{Using Exception}" : "")) << endl;
				}
			}
			cerr << "---------------------" << endl;
		}

		// -fa: strip Greek accents from the final markup
		if (optionFilterAccents) {
			UTF8GreekAccents filter;
			filter.setOptionValue("off");
			filter.processText(newTargetModMarkup);
		}

		if (optionDebug) {
			cout << "---------------------" << endl;
			cout << "\nAND... Here's our final output" << endl;
			cout << "---------------------" << endl;
		}
		// the final, re-tagged verse markup
		cout << newTargetModMarkup << endl;
		if (optionDebug) {
			cout << endl;
		}
	}
	delete exceptionFile;
	return 0;
}
// builds up bibMap to contain only characters of Biblical text
// and each character's corresponding real location in our output
// buffer (returned value)
//
// orig   - raw verse markup
// bibMap - out: for each canonical-text character, its offset in the
//          returned buffer
// wTags  - out: parallel to bibMap; offset of the enclosing <w> start tag
//          in the returned buffer, or -1 when the character is not inside
//          a <w> element
//
// Two modes, chosen by whether the markup already contains <w> tags:
//  - without <w> tags: top-level text (tagLevel 0) counts as Bible text
//  - with <w> tags: only text inside <w>...</w> (plus bare spaces) counts
// Empty <w></w> element pairs are removed from the output entirely.
SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) {
	SWBuf out = "";
	SWBuf tag = "";
	int tagLevel = 0;
	// offset in `out` of the current open <w> start tag; -1 when not in a <w>
	int wTag = -1;
	int inTag = 0;
	bool wTagsPresent = orig.indexOf("<w") > -1;
	// text accumulated since the last start tag; used to detect empty <w> elements
	SWBuf lastElementText = "";
	for (int i = 0; i < orig.length(); ++i) {
		if (orig[i] == '<') {
			inTag = true;
		}
		else if (orig[i] == '>') {
			inTag = false;
			XMLTag t = tag.c_str();
			bool skipTag = false;
			if (!t.isEmpty()) {
				if (t.isEndTag()) {
					// clear out empty w tags
					if (t.getName() && !strcmp("w", t.getName())) {
						if (!lastElementText.size()) {
							// rewind output to just before the empty <w> start tag
							out.setSize(wTag);
							// also drop one trailing space (and its map entries) left behind
							if (out.endsWith(' ')) { // && i < (orig.length() - 1) && orig[i+1] == ' ') {
								out.setSize(out.size() - 1);
								bibMap.pop_back();
								wTags.pop_back();
							}
							skipTag = true;
						}
					}
					tagLevel--;
					wTag = -1;
				}
				else {
					lastElementText = "";
					tagLevel++;
					// remember where this <w> start tag begins in the output
					wTag = (t.getName() && !strcmp("w", t.getName())) ? out.size() : -1;
				}
			}
			if (!skipTag) out += t;
			tag = "";
		}
		else if (inTag) {
			tag += orig[i];
		}
		else {
			// plain text character: record it in the maps if it counts as Bible text
			if (
				// for texts without <w> tags
				(!wTagsPresent && (!tagLevel || wTag != -1))
				// for texts with <w> tags
				|| ( wTagsPresent && (wTag != -1 || orig[i] == ' '))
			) {
				bibMap.push_back(out.size());
				wTags.push_back(wTag);
			}
			out += orig[i];
			lastElementText += orig[i];
		}
	}
	return out;
}
// Inserts addText into out buffer and adjusts Bible character pointers accordingly
//
// addText - tag text to insert (e.g. "<w lemma=...>" or "</w>")
// out     - the full markup buffer being edited
// bibPos  - index into bibMap/wTags of the Bible-text character to insert at
// after   - false: insert before the character (start tag);
//           true:  insert after it (end tag)
//
// When inserting a start tag at a character that already sits inside an
// existing <w> tag (wTags[bibPos] != -1), the new tag's attributes are merged
// into that existing tag instead of nesting a second <w>.
void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags, bool after) {
	int to = 0;
	if (!after && wTags[bibPos] != -1) {
		// splice our attributes into the existing <w ...> tag:
		// insertion point is just past its "<w" (offset + 2)
		to = wTags[bibPos] + 2;
		// NOTE(review): relies on SWBuf operators -- presumably operator--
		// drops the last byte and operator<< drops leading bytes; confirm
		// against the SWORD SWBuf API
		addText--;	// discard the '>'
		addText << 2;	// discard the '<w'
	}
	else {
		to = bibMap[bibPos]+((after)?1:0);
	}
	// skip inserting an end tag when the character is already inside a <w>
	// (the existing element's own </w> closes it)
	if (!after || wTags[bibPos] == -1) {
		out.insert(to, addText);
		// shift every later character's recorded offset past the inserted text
		for (int i = bibPos+((after)?1:0); i < bibMap.size(); ++i) {
			bibMap[i] += addText.length();
			if (wTags[i] != -1) wTags[i] += addText.length();
		}
	}
}
// Walks the canonical-text character map and splits the Biblical text into
// words.
//
// For every position recorded in bibMap, the corresponding character from
// markupBuf is appended to the returned plain-text buffer.  Runs of
// non-delimiter characters (delimiters: space, '.', ';', ',') become words:
// each word's text goes into targetWords, and the offsets of its first and
// last characters within the returned buffer go into targetWordStarts and
// targetWordEnds respectively.
//
// TODO: delimiters are matched byte-wise; multi-byte UTF-8 punctuation
// (cf. ignoreSeries) is not recognized as a word break here.
//
// Returns the canonical Bible text with all markup stripped.
SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds) {
	SWBuf plainText = "";
	SWBuf currentWord = "";
	for (BibMap::const_iterator pos = bibMap.begin(); pos != bibMap.end(); ++pos) {
		char ch = markupBuf[*pos];
		bool isDelimiter = (ch == ' ' || ch == '.' || ch == ';' || ch == ',');
		if (isDelimiter) {
			// a delimiter closes any word in progress
			if (currentWord.length()) {
				targetWordEnds.push_back(plainText.length()-1);
				targetWords.push_back(currentWord);
				currentWord = "";
			}
		}
		else {
			// first character of a new word? remember where it starts
			if (!currentWord.length()) targetWordStarts.push_back(plainText.length());
			currentWord += ch;
		}
		plainText += ch;
	}
	// flush a final word that had no trailing delimiter
	if (currentWord.length()) {
		targetWordEnds.push_back(plainText.length()-1);
		targetWords.push_back(currentWord);
	}
	return plainText;
}
// Extracts word data from the Strong's source module for the current verse.
//
// fromMod      - source module, already positioned at the verse
// wordTags     - out: one freshly built <w> XMLTag per source word, carrying
//                merged lemma/morph attributes
// fromWords    - out: the source words, split on space/./;/, like buildWordMaps
// fromWordTags - out: parallel to fromWords; index into wordTags for each word
//                (several split words can share one tag)
void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &fromWords, vector<int> &fromWordTags) {
	fromMod.renderText();	// be sure FromMod has processed entry attributes
	AttributeList &words = fromMod.getEntryAttributes()["Word"];
	SWBuf fromWord = "";
	SWBuf bibWord = "";
	for (AttributeList::iterator it = words.begin(); it != words.end(); it++) {

		// this is our new <w> XMLTag.
		// attributes will be added below
		XMLTag w("w");

		// this only gives us word count, not if we have multiple entries per word
		// don't use as loop
		int parts = atoi(it->second["PartCount"]);

		// gather all Lemma.N / Morph.N parts (falling back to the unsuffixed
		// "Lemma"/"Morph" keys for part 1), each prefixed with its class
		// (e.g. "strong:G3056"), space-separated when a word has multiple parts
		SWBuf lemma = "";
		SWBuf morph = "";
		bool found = true;
		for (int i = 1; found; ++i) {
			found = false;
			SWBuf key = "";
			key = SWBuf().setFormatted("Lemma.%d", i);
			AttributeValue::iterator li = it->second.find(key);
			if (i == 1 && li == it->second.end()) li = it->second.find("Lemma");
			if (li != it->second.end()) {
				found = true;
				if (i > 1) lemma += " ";
				key = SWBuf().setFormatted("LemmaClass.%d", i);
				AttributeValue::iterator lci = it->second.find(key);
				if (i == 1 && lci == it->second.end()) lci = it->second.find("LemmaClass");
				if (lci != it->second.end()) {
					lemma += lci->second + ":";
				}
				lemma += li->second;
			}
			key = SWBuf().setFormatted("Morph.%d", i);
			li = it->second.find(key);
			if (i == 1 && li == it->second.end()) li = it->second.find("Morph");
			if (li != it->second.end()) {
				found = true;
				if (i > 1) morph += " ";
				key = SWBuf().setFormatted("MorphClass.%d", i);
				AttributeValue::iterator lci = it->second.find(key);
				if (i == 1 && lci == it->second.end()) lci = it->second.find("MorphClass");
				if (lci != it->second.end()) {
					morph += lci->second + ":";
				}
				morph += li->second;
			}
			// TODO: add src tags and maybe other attributes
		}
		if (lemma.length()) w.setAttribute("lemma", lemma);
		if (morph.length()) w.setAttribute("morph", morph);

		// split the source word text on the same delimiters buildWordMaps
		// uses, mapping every resulting piece to this <w> tag's index
		fromWord = it->second["Text"];
		bibWord = "";
		for (int j = 0; j < fromWord.length(); ++j) {
			char c = fromWord[j];
			if (c != ' ' && c != '.' && c != ';' && c != ',') {
				bibWord += c;
			}
			else {
				if (bibWord.length()) {
					fromWords.push_back(bibWord);
					fromWordTags.push_back(wordTags.size());
					bibWord = "";
				}
			}
		}
		if (bibWord.length()) {
			fromWords.push_back(bibWord);
			fromWordTags.push_back(wordTags.size());
		}
		wordTags.push_back(w);
	}
}
// Inserts the matched <w>...</w> tags into the target verse markup.
//
// vk             - verse key, used to build "<osisRef>.<wordIndex>" lookup
//                  keys into the exception file
// markupBuf      - in/out: the target markup being re-tagged
// bibMap, wTags  - offset maps produced by findCanonicalBibleText (kept in
//                  sync by insert())
// targetWordTags - in/out: matcher result per target word; entries are set
//                  to -2 here when an exception-file override is used
// wordTags       - the <w> tags pulled from the source module
// targetWordStarts/Ends - word boundary offsets from buildWordMaps
void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds) {
	// TODO: this method needs some work,
	// like putting multiple consecutive words
	// together in one tag
	ConfigEntMap exceptions;
	if (exceptionFile) {
		exceptions = exceptionFile->getSection("exceptions");
	}
	for (int i = 0; i < targetWordTags.size(); ++i) {
		SWBuf wordTag = "";
		if (targetWordTags[i] > -1) {
			wordTag = wordTags[targetWordTags[i]];
		}
		// an exception-file entry for "<osisRef>.<i>" overrides the matcher
		if (exceptionFile) {
			SWBuf key; key.setFormatted("%s.%d", vk->getOSISRef(), i);
			ConfigEntMap::const_iterator it = exceptions.find(key);
			if (it != exceptions.end()) {
				targetWordTags[i] = -2;	// note that we are using an exception, not a mapping, not unset (-1)
				wordTag = it->second;
			}
		}
		// place the start tag before the word and the end tag after it
		if (wordTag.length()) {
			insert((const char *)wordTag, markupBuf, targetWordStarts[i], bibMap, wTags);
			insert("</w>", markupBuf, targetWordEnds[i], bibMap, wTags, true);
		}
	}
}