diff options
author | Troy A. Griffitts <scribe@crosswire.org> | 2013-10-23 14:33:38 +0000 |
---|---|---|
committer | Troy A. Griffitts <scribe@crosswire.org> | 2013-10-23 14:33:38 +0000 |
commit | e9f75761bbd24bb89a1c031d6bf6749e022e2549 (patch) | |
tree | 0f87f2d7c8fb370b98547edf76b68c42d7554eee | |
parent | 52bd2b309a8fe80357e35f018807a24fc8575042 (diff) | |
download | sword-tools-e9f75761bbd24bb89a1c031d6bf6749e022e2549.tar.gz |
committed first cut at tag migration tool to move <w> tags from one module to another
git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@470 07627401-56e2-0310-80f4-f8cd0041bdcd
-rw-r--r-- | migratetags/Makefile | 11 | ||||
-rw-r--r-- | migratetags/esvtag.cpp | 389 |
2 files changed, 400 insertions, 0 deletions
diff --git a/migratetags/Makefile b/migratetags/Makefile new file mode 100644 index 0000000..8958367 --- /dev/null +++ b/migratetags/Makefile @@ -0,0 +1,11 @@ +TARGETS= esvtag + +all: $(TARGETS) + +clean: + rm $(TARGETS) + +.cpp: + g++ -g `pkg-config --cflags sword` $< -o $@ `pkg-config --libs sword` + + diff --git a/migratetags/esvtag.cpp b/migratetags/esvtag.cpp new file mode 100644 index 0000000..3b86f70 --- /dev/null +++ b/migratetags/esvtag.cpp @@ -0,0 +1,389 @@ +#include <versekey.h> +#include <swmgr.h> +#include <utilxml.h> +#include <swbuf.h> +#include <swmodule.h> +#include <iostream> +#include <vector> + +using namespace sword; +using namespace std; + +typedef vector<unsigned long> BibMap; + +void insert(const SWBuf &addText, SWBuf &out, int bibPos, BibMap &bibMap, bool after = false); +int compare(const SWBuf &s1, const SWBuf &s2); + +SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap); +SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &esvWords, vector<int> &esvWordStarts, vector<int> &esvWordEnds); +void pullKJVData(SWModule &kjv, vector<XMLTag>&wordTags, vector<SWBuf> &kjvWords, vector<int> &kjvWordTags); +void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, const vector<int> &esvWordTags, const vector<XMLTag> &wordTags, const vector<int> &esvWordStarts, const vector<int> &esvWordEnds); + + +// +// This is where the magic happens +// +// we must point each esv word to an XMLTag +// +// when the magic is done, and your guess is made +// populate esvWordTags with the integer offset +// into wordTags for which XMLTag you think it should +// be. +// +void matchWords(vector<int> &esvWordTags, const vector<SWBuf> &esvWords, const vector<SWBuf> &kjvWords, const vector<int> &kjvWordTags) { + + // initialize our results to all -1 so we can pop around and set + // words as we find them, and know which ones we haven't yet set + for (int i = 0; i < esvWords.size(); i++) esvWordTags.push_back(-1); + + + // poor effort attempt + int j = 0; + for (int i = 0; i < esvWords.size(); i++) { + while (true) { + int match = compare(esvWords[i], kjvWords[j]); + // if we have a better than 75% match of sequencial characters + // then we'll say we have a match + if (match > 75) { + esvWordTags[i] = kjvWordTags[j++]; + break; + } + // TOTRY: maybe check one word before and after? + // + // be creative! + // + } + } +} + + +int main(int argc, char **argv) { + VerseKey vk; + SWMgr lib; + SWModule &esv = *lib.getModule("ESV"); + SWModule &kjv = *lib.getModule("KJV"); + + // we'll do the whole Bible eventually, but let's just get one verse + // working well. + esv.setKey("gen1.1"); // lets try this verse +// for (esv = TOP; !esv.Error(); esv++) { + + // XML word tags which should be placed in this verse (start tag) + // eg., <w lemma=...> + // pulled from KJV + vector<XMLTag> wordTags; + + // Just the raw canonical Bible text of this verse with no tags + // eg., "In the beginning God created the heavens and the earth." + SWBuf justESVBibleText = ""; + + // a mapping for each character in justESVBibleText to the real location + // in our out buffer. This allows us to insert our <w> and </w> + // tags in the correct place amongst the fully marked up + // ESV out buffer. This work is all done in the insert() method + // above + BibMap bibMap; + + // justESVBibleText (above) broken down into separate words + // ie. all words in the ESV from this verse + // eg. [0] = "In"; [1] = "the"; [2] = "beginning"; ... + vector<SWBuf> esvWords; + + // where each corresponding esvWords[x] starts in justESVBibleText + // eg. for "In the beginning..." + // [0] = 0; [1] = 3; [2] = 7; ... + // Needed to pass to insert method so we know where + // to insert the <w> start tag + vector<int> esvWordStarts; + + // same as esvWordStarts, but the end of each word + // eg. [0] = 1; [1] = 5; [2] = 15 + // Needed to pass to insert method so we know where + // to insert the </w> end tag + vector<int> esvWordEnds; + + // This is the doozy. This maps each ESV word to the correct + // wordTags entry. + vector<int> esvWordTags; + + // Equivalent to esvWords above, but for the KJV. + // Useful for helping determine matches to ESV words + vector<SWBuf> kjvWords; + + // Equivalent to esvWordTag which we need to produce, + // but this one is produced for us from the KJV data + // If we can match a kjvWords[x] entry, then we can assign + // esvWorkTags[ourMatch] = kjvWordTags[x] + vector<int> kjvWordTags; + + bibMap.clear(); + + kjv.setKey(esv.getKey()); + + cout << "\nProcessing Verse: " << esv.getKeyText() << endl; + cout << "---------------------" << endl; + + cout << "\nOur KJV Verse Markup" << endl; + cout << "---------------------" << endl; + cout << kjv.getRawEntry() << endl; + cout << "---------------------" << endl; + + + // grab our raw, fully marked up ESV text for this verse + SWBuf orig = esv.getRawEntryBuf(); + + cout << "\nOur Original ESV Markup" << endl; + cout << "---------------------" << endl; + cout << orig << endl; + cout << "---------------------" << endl; + + // let's find where just the canonical text is amongst + // all our markup + // newESVMarkup will eventually hold our updated markup with + // the new <w> tags, but we'll start here by setting it to + // the processed original markup. + // on return, bibMap will be populated with each character + // and the corresponding location into newESVMarkup where + // the character resides. + SWBuf newESVMarkup = findCanonicalBibleText(orig, bibMap); + + cout << "\nOur Original ESV Markup After XMLTag-ifying" << endl; + cout << "---------------------" << endl; + cout << newESVMarkup << endl; + cout << "---------------------" << endl; + + // let's populate or ESV word data and fill in our + // justESVBibleText buffer + justESVBibleText = buildWordMaps(newESVMarkup, bibMap, esvWords, esvWordStarts, esvWordEnds); + + cout << "\nJust ESV Bible Text" << endl; + cout << "---------------------" << endl; + cout << justESVBibleText << endl; + cout << "---------------------" << endl; + + + // ok, now lets grab out the groovy data from the KJV module + pullKJVData(kjv, wordTags, kjvWords, kjvWordTags); + + + // + // ok, here's the real work. + // + // This method needs to guess which ESV words match which KJV + // words and then point them to their same original language + // word tag by populating esvWordTags + // + matchWords(esvWordTags, esvWords, kjvWords, kjvWordTags); + + // ok, now that we have our esvWordTags magically populated + // let's do the grunt work of inserting the <w> and </w> tags + insertWordTags(newESVMarkup, bibMap, esvWordTags, wordTags, esvWordStarts, esvWordEnds); + + + cout << "\nHere's how you mapped things..." << endl; + cout << "---------------------" << endl; + cout << "Total wordTags: " << wordTags.size() << endl; + cout << "\nESV Words: " << endl; + for (int i = 0; i < esvWords.size(); i++) { + cout << esvWords[i] << " : " << esvWordTags[i] << " => " << wordTags[esvWordTags[i]] << endl; + } + cout << "---------------------" << endl; + + cout << "\nAND... Here's your final output" << endl; + cout << "---------------------" << endl; + cout << newESVMarkup << endl; + cout << endl; +// } + return 0; +} + + +// builds up bibMap to contain only characters of Biblical text +// and each character's corresponding real location in our output +// buffer (returned value) +SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap) { + SWBuf out = ""; + SWBuf tag = ""; + int tagLevel = 0; + int inTag = 0; + for (int i = 0; i < orig.length(); i++) { + if (orig[i] == '<') { + inTag = true; + } + else if (orig[i] == '>') { + inTag = false; + XMLTag t = tag.c_str(); + if (!t.isEmpty()) { + if (t.isEndTag()) { + tagLevel--; + } + else { + tagLevel++; + } + } + out += t; + tag = ""; + } + else if (inTag) { + tag += orig[i]; + } + else { + if (!tagLevel) { + bibMap.push_back(out.size()); + } + out += orig[i]; + } + } + return out; +} + + +// Inserts addText into out buffer and adjusts Bible character pointers accordingly +// +void insert(const SWBuf &addText, SWBuf &out, int bibPos, BibMap &bibMap, bool after) { + out.insert(bibMap[bibPos]+((after)?1:0), addText); + for (int i = bibPos+((after)?1:0); i < bibMap.size(); i++) { + bibMap[i] += addText.length(); + } +} + + +// Compares 2 words and tries to give a percentage assurance of a match +// TODO: could use more smarts here +// +int compare(const SWBuf &s1, const SWBuf &s2) { + int retVal = 0; + SWBuf largest = (s1.length() > s2.length()) ? s1 : s2; + SWBuf smallest = (s1.length() > s2.length()) ? s2 : s1; + int matches = 0; + int j = 0; + for (int i = 0; i < smallest.length() && j < largest.length(); i++) { + while (j < largest.length()) { + if (smallest[i] == largest[j++]) { + matches++; + break; + } + } + } + return (((float)matches) / largest.length()) * 100; +} + + +SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &esvWords, vector<int> &esvWordStarts, vector<int> &esvWordEnds) { + SWBuf bibWord = ""; + SWBuf kjvWord = ""; + SWBuf bibText = ""; + for (BibMap::const_iterator it = bibMap.begin(); it != bibMap.end(); it++) { + char c = markupBuf[*it]; + if ((c >= 'a' && c <='z') || + (c >= 'A' && c <='Z') + ) { + if (!bibWord.length()) esvWordStarts.push_back(bibText.length()); + bibWord += c; + } + else { + if (bibWord.length()) { + esvWordEnds.push_back(bibText.length()-1); + esvWords.push_back(bibWord); + bibWord = ""; + } + } + bibText += c; + } + if (bibWord.length()) { + esvWordEnds.push_back(bibText.length()-1); + esvWords.push_back(bibWord); + } + return bibText; +} + + +void pullKJVData(SWModule &kjv, vector<XMLTag>&wordTags, vector<SWBuf> &kjvWords, vector<int> &kjvWordTags) { + kjv.RenderText(); // be sure KJV has processed entry attributes + AttributeList &words = kjv.getEntryAttributes()["Word"]; + SWBuf kjvWord = ""; + SWBuf bibWord = ""; + for (AttributeList::iterator it = words.begin(); it != words.end(); it++) { + // this is our new <w> XMLTag. + // attributes will be added below + XMLTag w("w"); + int parts = atoi(it->second["PartCount"]); + SWBuf lemma = ""; + SWBuf morph = ""; + for (int i = 1; i <= parts; i++) { + SWBuf key = ""; + key = (parts == 1) ? "Lemma" : SWBuf().setFormatted("Lemma.%d", i); + AttributeValue::iterator li = it->second.find(key); + if (li != it->second.end()) { + if (i > 1) lemma += " "; + key = (parts == 1) ? "LemmaClass" : SWBuf().setFormatted("LemmaClass.%d", i); + AttributeValue::iterator lci = it->second.find(key); + if (lci != it->second.end()) { + lemma += lci->second + ":"; + } + lemma += li->second; + } + key = (parts == 1) ? "Morph" : SWBuf().setFormatted("Morph.%d", i); + li = it->second.find(key); + // silly. sometimes morph counts don't equal lemma counts + if (i == 1 && parts != 1 && li == it->second.end()) { + li = it->second.find("Morph"); + } + if (li != it->second.end()) { + if (i > 1) morph += " "; + key = (parts == 1) ? "MorphClass" : SWBuf().setFormatted("MorphClass.%d", i); + AttributeValue::iterator lci = it->second.find(key); + // silly. sometimes morph counts don't equal lemma counts + if (i == 1 && parts != 1 && lci == it->second.end()) { + lci = it->second.find("MorphClass"); + } + if (lci != it->second.end()) { + morph += lci->second + ":"; + } + morph += li->second; + } + // TODO: add src tags and maybe other attributes + } + + if (lemma.length()) w.setAttribute("lemma", lemma); + if (morph.length()) w.setAttribute("morph", morph); + + + kjvWord = it->second["Text"]; + bibWord = ""; + for (int j = 0; j < kjvWord.length(); j++) { + char c = kjvWord[j]; + if ((c >= 'a' && c <='z') || + (c >= 'A' && c <='Z') + ) { + bibWord += c; + } + else { + if (bibWord.length()) { + kjvWords.push_back(bibWord); + kjvWordTags.push_back(wordTags.size()); + bibWord = ""; + } + } + } + if (bibWord.length()) { + kjvWords.push_back(bibWord); + kjvWordTags.push_back(wordTags.size()); + } + + wordTags.push_back(w); + } +} + + +void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, const vector<int> &esvWordTags, const vector<XMLTag> &wordTags, const vector<int> &esvWordStarts, const vector<int> &esvWordEnds) { + // TODO: this method needs some work, + // like putting multiple consecutive words + // together in one tag + for (int i = 0; i < esvWordTags.size(); i++) { + if (esvWordTags[i] > -1) { + insert((const char *)wordTags[esvWordTags[i]], markupBuf, esvWordStarts[i], bibMap); + insert("</w>", markupBuf, esvWordEnds[i], bibMap, true); + } + } +} |