path: root/migratetags/migratetags.cpp



#include <versekey.h>
#include <utf8greekaccents.h>
#include <swmgr.h>
#include <utilxml.h>
#include <swbuf.h>
#include <swconfig.h>
#include <swmodule.h>
#include <stringmgr.h>
#include <iostream>
#include <vector>

using namespace sword;
using namespace std;

#include "matchers/matcher.h"

// Select the word matcher here: include exactly one matcher header and
// instantiate its class below.  The matcher decides which target word
// corresponds to which source word (see matcher->matchWords() in main()).
//#include "matchers/gntmatcher.h"
#include "matchers/defaultmatcher.h"
Matcher *matcher = new DefaultMatcher();

// Default module names.  Hard code your from and to modules here, or
// override them on the command line with -ss (Strong's source) and -t (target).
SWBuf strongsSourceModuleName = "WHNU";
SWBuf targetModuleName = "NA28FromImp";


const char *ignoreSeries = "⸆¹⸆²⸆⸇᾿˸¹˸²˸³˸·¹²⟦–ʹ°¹°²⸋¹⸋²⸋⸌¹⸌°*[];⸀¹⸀²⸀³⸁⸀◆⟧ ⸂¹⸂²⸄⸂⸅⸃⸉¹⸈⸉⸊  ";

// A list of byte offsets into a markup buffer, one entry per byte of
// canonical Bible text.  The same type is also used for the parallel
// "wTags" list, whose entries are either the offset of an enclosing <w>
// start tag or -1 (stored as unsigned long) when there is none.
typedef vector<unsigned long> BibMap;

// Inserts addText into out at the position mapped from bibPos (before the
// character, or after it when `after` is true) and shifts the following
// bibMap / wTags offsets by the inserted length.
void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags, bool after = false);

// Returns the re-serialised markup of orig and fills bibMap and the second
// map (named wTags in the definition) with one entry per canonical text byte.
SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &tTags);
// Splits the canonical text addressed by bibMap into words; returns that text
// and fills the word list plus each word's start/end index within it.
SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds);
// Reads the "Word" entry attributes of fromMod's current entry and produces
// one <w> XMLTag per attribute entry, the source words, and for each source
// word the index of its tag in wordTags.
void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &fromWords, vector<int> &fromWordTags);
// Writes the matched (or exception-file supplied) <w> tags into markupBuf.
// Entries of targetWordTags overridden by the exception file are set to -2.
void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds);

// app options
bool optionFilterAccents = false;	// -fa: strip Greek accents from the final text
bool optionFilterAppCrit = false;	// -fc: strip critical apparatus markers (see ignoreSeries)
bool optionDebug         = false;	// -v:  verbose tracing on stdout
vector<SWBuf> optionExceptionFile;	// -e:  exception file names, in command-line order
SWConfig *exceptionFile = 0;		// all -e files merged by main(); 0 when none were given

// Writes the command-line help to stderr, preceded by "<progName>: <error>"
// when an error message is supplied, and then terminates the process with
// exit status -1.  This function does not return.
void usage(const char *progName, const char *error = 0) {
	if (error) {
		fprintf(stderr, "\n%s: %s\n", progName, error);
	}

	// The help text is one concatenated literal; progName is its only
	// substitution.
	fprintf(stderr,
		"\n=== migratetags (Revision $Rev$) Migrate word morphology from one module to another.\n"
		"\nusage: %s [options]\n"
		"  -ss <moduleName>\t provide the Strong's source module name\n"
		"  -t  <moduleName>\t provide the target module name\n"
		"  -e  <exception file>\t provide an ini-style .conf file with overriding tag exceptions.\n"
		"  -fa\t\t\t filter accents: remove Greek accents from final text\n"
		"  -fc\t\t\t filter critical apparatus markers from final text\n"
		"  -v\t\t\t verbose: print lots of information while processing\n"
		"  --help\t\t this usage message\n"
		"\n\n",
		progName);

	exit(-1);
}


// Program entry point.
//
// Parses the command line, opens the target module and the Strong's source
// module, merges any exception files, then walks the target module starting
// at "mat0.0" (intros enabled) until the module reports an error.  For every
// entry the target markup, with <w> tags migrated from the source module, is
// written to stdout; words that could not be matched are reported on stderr.
//
// Returns 0 on success.  Exits with status 1 when a module cannot be found,
// and via usage() (status -1) on a command-line error or --help.
int main(int argc, char **argv) {
	const char *progName = argv[0];
	for (int i = 1; i < argc; ++i) {
		if (!strcmp(argv[i], "--help")) {
			// --help is advertised by usage(); without this branch it was
			// reported as an unknown argument.
			usage(progName);
		}
		else if (!strcmp(argv[i], "-v")) {
			optionDebug = true;
		}
		else if (!strcmp(argv[i], "-fa")) {
			optionFilterAccents = true;
		}
		else if (!strcmp(argv[i], "-fc")) {
			optionFilterAppCrit = true;
		}
		else if (!strcmp(argv[i], "-ss")) {
			if ((i + 1) < argc) {
				strongsSourceModuleName = argv[++i];
			}
			else usage(progName, "-ss argument requires a module name.");
		}
		else if (!strcmp(argv[i], "-t")) {
			if ((i + 1) < argc) {
				targetModuleName = argv[++i];
			}
			else usage(progName, "-t argument requires a module name.");
		}
		else if (!strcmp(argv[i], "-e")) {
			if ((i + 1) < argc) {
				optionExceptionFile.push_back(argv[++i]);
			}
			else usage(progName, "-e argument requires a file name.");
		}
		else usage(progName, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
	}

	SWMgr lib;
	lib.setGlobalOption("Textual Variants", "Secondary Reading");
	SWModule *m = lib.getModule(targetModuleName);
	if (!m) {
		cerr << "\nERROR: couldn't find target module: " << targetModuleName << ".\n";
		if (argc < 2) usage(progName, "Use -t to supply target module name");
		exit(1);
	}
	SWModule &targetMod = *m;
	m = lib.getModule(strongsSourceModuleName.c_str());
	if (!m) {
		cerr << "\nERROR: couldn't find Strong's source module: " << strongsSourceModuleName.c_str() << ".\n";
		if (argc < 2) usage(progName, "Use -ss to supply Strong's source module name");
		exit(1);
	}
	SWModule &fromMod = *m;

	// the first -e file creates the config; every later one is merged into it
	for (size_t i = 0; i < optionExceptionFile.size(); ++i) {
		SWBuf fileName = optionExceptionFile[i];
		if (!i) exceptionFile = new SWConfig(fileName);
		else (*exceptionFile) += SWConfig(fileName);
	}

	// Start at the Matthew intro and process entries one after another
	// until the module signals an error.
	((VerseKey *)targetMod.getKey())->setIntros(true);
	targetMod.getKey()->setText("mat0.0");
	for (; !targetMod.popError(); targetMod++) {

		// XML word tags which should be placed in this verse (start tag)
		// eg., <w lemma=...>
		// pulled from FromMod
		vector<XMLTag> wordTags;

		// a mapping for each character in justTargetModBibleText to the real location
		// in our out buffer.  This allows us to insert our <w> and </w>
		// tags in the correct place amongst the fully marked up
		// TargetMod out buffer.  The offsets are kept up to date by insert().
		BibMap bibMap;
		BibMap wTags;

		// the canonical text of this verse broken down into separate words
		// ie. all words in the TargetMod from this verse
		// eg. [0] = "In"; [1] = "the"; [2] = "beginning"; ...
		vector<SWBuf> targetWords;

		// where each corresponding targetWords[x] starts in justTargetModBibleText
		// eg. for "In the beginning..."
		//         [0] = 0; [1] = 3; [2] = 7; ...
		// Needed to pass to insert() so we know where
		// to insert the <w> start tag
		vector<int> targetWordStarts;

		// same as targetWordStarts, but the end of each word
		// eg. [0] = 1; [1] = 5; [2] = 15
		// Needed to pass to insert() so we know where
		// to insert the </w> end tag
		vector<int> targetWordEnds;

		// This is the doozy.  This maps each TargetMod word to the correct
		// wordTags entry.
		vector<int> targetWordTags;

		// Equivalent to targetWords above, but for the FromMod.
		// Useful for helping determine matches to TargetMod words
		vector<SWBuf> fromWords;

		// Equivalent to targetWordTags which we need to produce,
		// but this one is produced for us from the FromMod data
		// If we can match a fromWords[x] entry, then we can assign
		// targetWordTags[ourMatch] = fromWordTags[x]
		vector<int> fromWordTags;

		fromMod.setKey(targetMod.getKey());
		cout << "$$$ " << targetMod.getKeyText() << endl;

		if (optionDebug) {
			cout << "\nProcessing Verse: " << targetMod.getKeyText() << endl;
			cout << "---------------------" << endl;

			cout << "\nOur strongsSourceModule Markup" << endl;
			cout << "---------------------" << endl;
			cout << fromMod.getRawEntry() << endl;
			cout << "---------------------" << endl;
		}

		// grab our raw, fully marked up TargetMod text for this verse
		SWBuf orig = targetMod.getRawEntryBuf();

		if (optionDebug) {
			cout << "\nOur Original targetModule Markup" << endl;
			cout << "---------------------" << endl;
			cout << orig << endl;
			cout << "---------------------" << endl;
		}

		if (optionFilterAppCrit) {
			// rebuild orig one Unicode character at a time, dropping every
			// character (other than a plain space) listed in ignoreSeries
			SWBuf o = orig;
			const unsigned char* from = (unsigned char*)o.c_str();
			orig = "";
			while (*from) {
				SW_u32 ch = getUniCharFromUTF8(&from, true);
				// if ch is bad, then convert to replacement char
				if (!ch) ch = 0xFFFD;
				SWBuf checkChar;
				getUTF8FromUniChar(ch, &checkChar);
				if (checkChar != " " && strstr(ignoreSeries, checkChar.c_str())) continue;
				orig.append(checkChar);
			}
		}

		// let's find where just the canonical text is amongst
		// all our markup
		// newTargetModMarkup will eventually hold our updated markup with
		// the new <w> tags, but we'll start here by setting it to
		// the processed original markup.
		// on return, bibMap will be populated with each character
		// and the corresponding location into newTargetModMarkup where
		// the character resides.
		SWBuf newTargetModMarkup = findCanonicalBibleText(orig, bibMap, wTags);

		if (optionDebug) {
			cout << "\nOur Original targetModule Markup After XMLTag-ifying" << endl;
			cout << "---------------------" << endl;
			cout << newTargetModMarkup << endl;
			cout << "---------------------" << endl;

			cout << "\nOur bibMap" << endl;
			cout << "---------------------" << endl;
			for (BibMap::iterator it = bibMap.begin(); it != bibMap.end(); ++it) {
				cout << *it << " ";
			}
			cout << "\n---------------------" << endl;
		}

		// let's populate our TargetMod word data and get just the raw
		// canonical Bible text of this verse with no tags
		// eg., "In the beginning God created the heavens and the earth."
		SWBuf justTargetModBibleText = buildWordMaps(newTargetModMarkup, bibMap, targetWords, targetWordStarts, targetWordEnds);

		if (optionDebug) {
			cout << "\nJust targetModule Bible Text" << endl;
			cout << "---------------------" << endl;
			cout << justTargetModBibleText << endl;
			cout << "---------------------" << endl;
		}

		// ok, now lets grab out the groovy data from the FromMod module
		pullFromModData(fromMod, wordTags, fromWords, fromWordTags);

		//
		// ok, here's the real work.
		//
		// This method needs to guess which TargetMod words match which FromMod
		// words and then point them to their same original language
		// word tag by populating targetWordTags
		//
		matcher->matchWords(targetWordTags, targetWords, fromWords, fromWordTags);

		// ok, now that we have our targetWordTags magically populated
		// let's do the grunt work of inserting the <w> and </w> tags
		insertWordTags((VerseKey *)targetMod.getKey(), newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds);

		if (optionDebug) {
			cout << "\nHere's how we mapped things..." << endl;
			cout << "---------------------" << endl;
			cout << "Total wordTags: " << wordTags.size() << endl;
			cout << "\nTargetMod Words: " << endl;
		}

		// report every word that is still unmatched (-1) and is not itself
		// part of ignoreSeries; the verse header is printed only once
		bool warned = false;
		for (size_t i = 0; i < targetWords.size(); ++i) {
			if (targetWordTags[i] == -1 && !strstr(ignoreSeries, targetWords[i])) {
				if (!warned) {
					cerr << "*** Error: didn't match all words: " << targetMod.getKeyText() << endl;
					cerr << strongsSourceModuleName.c_str() << ":";
					for (size_t j = 0; j < fromWords.size(); ++j) {
						cerr << " " << fromWords[j];
					}
					cerr << endl;
					cerr << targetModuleName << ":";
					for (size_t j = 0; j < targetWords.size(); ++j) {
						cerr << " " << targetWords[j];
					}
					cerr << endl;
					cerr << endl;
					cerr << "Unmatched Words:" << endl;
					warned = true;
				}
				cerr << "  " << i << ": " <<  targetWords[i] << " (" << matcher->sanitizeWord(targetWords[i]) << ")" << endl;
			}
			if (optionDebug) {
				// explicit cast keeps both branches of the conditional the
				// same type, as in the tag listing below
				cout << targetWords[i] << " : " << targetWordTags[i] << " => " << (targetWordTags[i] > -1 ? (const char *)wordTags[targetWordTags[i]] : "") << endl;
			}
		}
		if (warned) {
			// dump every word with its "<osisRef>.<index>" key (the key format
			// insertWordTags() looks up in the exception file) and its tag
			cerr << "\n" << targetModuleName << " Tags:\n";
			VerseKey *vk = (VerseKey *)targetMod.getKey();
			for (size_t j = 0; j < targetWords.size(); ++j) {
				if (!strstr(ignoreSeries, targetWords[j])) {
					cerr << targetWords[j] << "\t\t " << vk->getOSISRef() << "." << j << "=" << (targetWordTags[j] > -1 ? (const char *)wordTags[targetWordTags[j]] : (targetWordTags[j] == -2 ? "{Using Exception}" : "")) << endl;
				}
			}
			cerr << "---------------------" << endl;
		}

		if (optionFilterAccents) {
			UTF8GreekAccents filter;
			filter.setOptionValue("off");
			filter.processText(newTargetModMarkup);
		}

		if (optionDebug) {
			cout << "---------------------" << endl;

			cout << "\nAND... Here's our final output" << endl;
			cout << "---------------------" << endl;
		}
		cout << newTargetModMarkup << endl;
		if (optionDebug) {
			cout << endl;
		}
	}

	delete exceptionFile;

	return 0;
}


// Rebuilds the markup of one entry and records where its Bible text sits.
//
// orig is walked byte by byte.  Every tag is re-serialised through XMLTag;
// all other bytes are copied through unchanged.  For each text byte that is
// regarded as Bible text one entry is appended to both maps:
//   bibMap - offset of that byte in the returned buffer
//   wTags  - offset of the '<' of the <w> start tag directly enclosing the
//            byte, or -1 (stored as unsigned long) when there is none
// When orig contains no "<w", text outside of any element or directly inside
// a <w> counts as Bible text; otherwise only text directly inside a <w>, plus
// every space, does.
//
// A <w>...</w> element that ends without any text is removed from the output
// together with a space immediately preceding it.
//
// Returns the rebuilt markup buffer that the two maps index into.
SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) {
	SWBuf out = "";
	SWBuf tag = "";
	int tagLevel = 0;
	int wTag = -1;		// offset in out of the currently open <w> start tag, -1 if none
	bool inTag = false;
	const bool wTagsPresent = orig.indexOf("<w") > -1;
	SWBuf lastElementText = "";	// text seen since the most recent start tag
	const unsigned long origLength = orig.length();
	for (unsigned long i = 0; i < origLength; ++i) {
		const char c = orig[i];
		if (c == '<') {
			inTag = true;
		}
		else if (c == '>') {
			inTag = false;
			XMLTag t = tag.c_str();
			bool skipTag = false;
			if (!t.isEmpty()) {
				if (t.isEndTag()) {
					// clear out empty w tags
					const bool closesW = t.getName() && !strcmp("w", t.getName());
					// wTag is -1 when no <w> start offset is being tracked any
					// more (a stray </w>, or a child element ended just before
					// this </w>).  Truncating to -1 would wrap around to a huge
					// unsigned size, so such a tag is simply left in place.
					if (closesW && !lastElementText.size() && wTag != -1) {
						out.setSize(wTag);
						if (out.endsWith(' ')) {
							out.setSize(out.size() - 1);
							// the dropped space was the last mapped character
							if (!bibMap.empty()) {
								bibMap.pop_back();
								wTags.pop_back();
							}
						}
						skipTag = true;
					}
					tagLevel--;
					wTag = -1;
				}
				else {
					lastElementText = "";
					tagLevel++;
					wTag = (t.getName() && !strcmp("w", t.getName())) ? out.size() : -1;
				}
			}
			if (!skipTag) out += t;
			tag = "";
		}
		else if (inTag) {
			tag += c;
		}
		else {
			if (
				   // for texts without <w> tags
				   (!wTagsPresent && (!tagLevel || wTag != -1))
				   // for texts with <w> tags
				|| ( wTagsPresent && (wTag != -1 || c == ' '))
			) {
				bibMap.push_back(out.size());
				wTags.push_back(wTag);
			}
			out += c;
			lastElementText += c;
		}
	}
	return out;
}


// Inserts addText into the markup buffer `out` at the place belonging to the
// Bible text character bibPos, then moves the recorded offsets along.
//
//   after == false, character not inside a <w>:
//       addText goes directly in front of the character.
//   after == false, character already inside a <w>:
//       the last character and the first two characters of addText (the '>'
//       and the '<w' of a start tag) are dropped and the remainder is placed
//       two bytes past the offset stored in wTags, i.e. inside the existing
//       start tag.
//   after == true, character not inside a <w>:
//       addText goes directly behind the character.
//   after == true, character already inside a <w>:
//       nothing is inserted.
//
// After an insertion every bibMap entry from bibPos (from bibPos + 1 when
// `after` is set) to the end, and every wTags entry in that range that is
// not -1, is increased by the length of the inserted text.
void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags, bool after) {
	const bool insideW = (wTags[bibPos] != -1);

	// no end tag is added behind text that already lives in a <w>
	if (after && insideW) return;

	int offset = 0;
	if (after) {
		offset = bibMap[bibPos] + 1;
	}
	else if (insideW) {
		offset = wTags[bibPos] + 2;
		addText--;	// discard the '>'
		addText << 2;	// discard the '<w'
	}
	else {
		offset = bibMap[bibPos];
	}

	out.insert(offset, addText);

	const int firstShifted = after ? bibPos + 1 : bibPos;
	for (int pos = firstShifted; pos < bibMap.size(); ++pos) {
		bibMap[pos] += addText.length();
		if (wTags[pos] != -1) wTags[pos] += addText.length();
	}
}


// Collects the bytes of markupBuf addressed by bibMap into one plain text
// buffer and splits that text into words.
//
// A word is a maximal run of bytes other than ' ', '.', ';' and ','.  Bytes
// are compared individually.  For every word found:
//   targetWords      receives the word itself
//   targetWordStarts receives the index of its first byte in the returned text
//   targetWordEnds   receives the index of its last byte in the returned text
//
// Returns the collected text (separators included).
//
// Note: an earlier, never completed experiment at this point decoded whole
// UTF-8 characters in order to test them against ignoreSeries; it was
// disabled and is not part of the logic below.
SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds) {
	SWBuf canonicalText = "";
	SWBuf pendingWord = "";

	BibMap::const_iterator pos = bibMap.begin();
	while (pos != bibMap.end()) {
		const char ch = markupBuf[*pos];
		const bool separator = (ch == ' ' || ch == '.' || ch == ';' || ch == ',');

		if (separator) {
			// a separator closes the word in progress, if there is one
			if (pendingWord.length()) {
				targetWordEnds.push_back(canonicalText.length() - 1);
				targetWords.push_back(pendingWord);
				pendingWord = "";
			}
		}
		else {
			// first byte of a new word: remember where it starts
			if (!pendingWord.length()) {
				targetWordStarts.push_back(canonicalText.length());
			}
			pendingWord += ch;
		}

		canonicalText += ch;
		pos++;
	}

	// the text may end in the middle of a word
	if (pendingWord.length()) {
		targetWordEnds.push_back(canonicalText.length() - 1);
		targetWords.push_back(pendingWord);
	}

	return canonicalText;
}


void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &fromWords, vector<int> &fromWordTags) {
	fromMod.renderText();	// be sure FromMod has processed entry attributes
	AttributeList &words = fromMod.getEntryAttributes()["Word"];
	SWBuf fromWord = "";
	SWBuf bibWord = "";
	for (AttributeList::iterator it = words.begin(); it != words.end(); it++) {
		// this is our new <w> XMLTag.
		// attributes will be added below
		XMLTag w("w");
		// this only gives us word count, not if we have multiple entries per word
		// don't use as loop
		int parts = atoi(it->second["PartCount"]);
		SWBuf lemma = "";
		SWBuf morph = "";
		bool found = true;
		for (int i = 1; found; ++i) {
			found = false;
			SWBuf key = "";
			key = SWBuf().setFormatted("Lemma.%d", i);
			AttributeValue::iterator li = it->second.find(key);
			if (i == 1 && li == it->second.end()) li = it->second.find("Lemma");
			if (li != it->second.end()) {
				found = true;
				if (i > 1) lemma += " ";
				key = SWBuf().setFormatted("LemmaClass.%d", i);
				AttributeValue::iterator lci = it->second.find(key);
				if (i == 1 && lci == it->second.end()) lci = it->second.find("LemmaClass");
				if (lci != it->second.end()) {
					lemma += lci->second + ":";
				}
				lemma += li->second;
			}
			key = SWBuf().setFormatted("Morph.%d", i);
			li = it->second.find(key);
			if (i == 1 && li == it->second.end()) li = it->second.find("Morph");
			if (li != it->second.end()) {
				found = true;
				if (i > 1) morph += " ";
				key = SWBuf().setFormatted("MorphClass.%d", i);
				AttributeValue::iterator lci = it->second.find(key);
				if (i == 1 && lci == it->second.end()) lci = it->second.find("MorphClass");
				if (lci != it->second.end()) {
					morph += lci->second + ":";
				}
				morph += li->second;
			}
			// TODO: add src tags and maybe other attributes
		}

		if (lemma.length()) w.setAttribute("lemma", lemma);
		if (morph.length()) w.setAttribute("morph", morph);


		fromWord = it->second["Text"];
		bibWord = "";
		for (int j = 0; j < fromWord.length(); ++j) {
			char c = fromWord[j];
			if (c != ' ' && c != '.' && c != ';' && c != ',') {
				bibWord += c;
			}
			else {
				if (bibWord.length()) {
					fromWords.push_back(bibWord);
					fromWordTags.push_back(wordTags.size());
					bibWord = "";
				}
			}
		}
		if (bibWord.length()) {
			fromWords.push_back(bibWord);
			fromWordTags.push_back(wordTags.size());
		}

		wordTags.push_back(w);
	}
}


// Writes the <w> ... </w> markup for every target word into markupBuf.
//
// For word number n the start tag text is wordTags[targetWordTags[n]] when
// targetWordTags[n] is greater than -1.  When an exception file is loaded and
// its "exceptions" section has an entry keyed "<osisRef of vk>.<n>", that
// entry's value is used instead and targetWordTags[n] is set to -2 to record
// that an exception, not a mapping, supplied the tag.  Words that end up
// without any tag text are left untouched.
//
// The tag text is placed via insert() at targetWordStarts[n] and "</w>"
// behind targetWordEnds[n]; bibMap and wTags are updated by insert().
void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds) {
	// TODO: this method needs some work,
	// like putting multiple consecutive words
	// together in one tag

	ConfigEntMap overrides;
	if (exceptionFile) overrides = exceptionFile->getSection("exceptions");

	for (int word = 0; word < targetWordTags.size(); ++word) {
		SWBuf openTag = "";

		const int tagIndex = targetWordTags[word];
		if (tagIndex > -1) openTag = wordTags[tagIndex];

		if (exceptionFile) {
			SWBuf key;
			key.setFormatted("%s.%d", vk->getOSISRef(), word);
			ConfigEntMap::const_iterator hit = overrides.find(key);
			if (hit != overrides.end()) {
				// -2: using an exception, not a mapping, not unset (-1)
				targetWordTags[word] = -2;
				openTag = hit->second;
			}
		}

		if (!openTag.length()) continue;

		insert(openTag.c_str(), markupBuf, targetWordStarts[word], bibMap, wTags);
		insert("</w>", markupBuf, targetWordEnds[word], bibMap, wTags, true);
	}
}