#include #include #include #include #include #include #include using namespace sword; using namespace std; typedef vector BibMap; void insert(const SWBuf &addText, SWBuf &out, int bibPos, BibMap &bibMap, bool after = false); int compare(const SWBuf &s1, const SWBuf &s2); SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap); SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector &esvWords, vector &esvWordStarts, vector &esvWordEnds); void pullKJVData(SWModule &kjv, vector&wordTags, vector &kjvWords, vector &kjvWordTags); void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, const vector &esvWordTags, const vector &wordTags, const vector &esvWordStarts, const vector &esvWordEnds); // // This is where the magic happens // // we must point each esv word to an XMLTag // // when the magic is done, and your guess is made // populate esvWordTags with the integer offset // into wordTags for which XMLTag you think it should // be. // void matchWords(vector &esvWordTags, const vector &esvWords, const vector &kjvWords, const vector &kjvWordTags) { // initialize our results to all -1 so we can pop around and set // words as we find them, and know which ones we haven't yet set for (int i = 0; i < esvWords.size(); i++) esvWordTags.push_back(-1); // poor effort attempt int j = 0; for (int i = 0; i < esvWords.size(); i++) { while (true) { int match = compare(esvWords[i], kjvWords[j]); // if we have a better than 75% match of sequencial characters // then we'll say we have a match if (match > 75) { esvWordTags[i] = kjvWordTags[j++]; break; } // TOTRY: maybe check one word before and after? // // be creative! // } } } int main(int argc, char **argv) { VerseKey vk; SWMgr lib; SWModule &esv = *lib.getModule("ESV"); SWModule &kjv = *lib.getModule("KJV"); // we'll do the whole Bible eventually, but let's just get one verse // working well. esv.setKey("gen1.1"); // lets try this verse // for (esv = TOP; !esv.Error(); esv++) { // XML word tags which should be placed in this verse (start tag) // eg., // pulled from KJV vector wordTags; // Just the raw canonical Bible text of this verse with no tags // eg., "In the beginning God created the heavens and the earth." SWBuf justESVBibleText = ""; // a mapping for each character in justESVBibleText to the real location // in our out buffer. This allows us to insert our and // tags in the correct place amongst the fully marked up // ESV out buffer. This work is all done in the insert() method // above BibMap bibMap; // justESVBibleText (above) broken down into separate words // ie. all words in the ESV from this verse // eg. [0] = "In"; [1] = "the"; [2] = "beginning"; ... vector esvWords; // where each corresponding esvWords[x] starts in justESVBibleText // eg. for "In the beginning..." // [0] = 0; [1] = 3; [2] = 7; ... // Needed to pass to insert method so we know where // to insert the start tag vector esvWordStarts; // same as esvWordStarts, but the end of each word // eg. [0] = 1; [1] = 5; [2] = 15 // Needed to pass to insert method so we know where // to insert the end tag vector esvWordEnds; // This is the doozy. This maps each ESV word to the correct // wordTags entry. vector esvWordTags; // Equivalent to esvWords above, but for the KJV. // Useful for helping determine matches to ESV words vector kjvWords; // Equivalent to esvWordTag which we need to produce, // but this one is produced for us from the KJV data // If we can match a kjvWords[x] entry, then we can assign // esvWorkTags[ourMatch] = kjvWordTags[x] vector kjvWordTags; bibMap.clear(); kjv.setKey(esv.getKey()); cout << "\nProcessing Verse: " << esv.getKeyText() << endl; cout << "---------------------" << endl; cout << "\nOur KJV Verse Markup" << endl; cout << "---------------------" << endl; cout << kjv.getRawEntry() << endl; cout << "---------------------" << endl; // grab our raw, fully marked up ESV text for this verse SWBuf orig = esv.getRawEntryBuf(); cout << "\nOur Original ESV Markup" << endl; cout << "---------------------" << endl; cout << orig << endl; cout << "---------------------" << endl; // let's find where just the canonical text is amongst // all our markup // newESVMarkup will eventually hold our updated markup with // the new tags, but we'll start here by setting it to // the processed original markup. // on return, bibMap will be populated with each character // and the corresponding location into newESVMarkup where // the character resides. SWBuf newESVMarkup = findCanonicalBibleText(orig, bibMap); cout << "\nOur Original ESV Markup After XMLTag-ifying" << endl; cout << "---------------------" << endl; cout << newESVMarkup << endl; cout << "---------------------" << endl; // let's populate or ESV word data and fill in our // justESVBibleText buffer justESVBibleText = buildWordMaps(newESVMarkup, bibMap, esvWords, esvWordStarts, esvWordEnds); cout << "\nJust ESV Bible Text" << endl; cout << "---------------------" << endl; cout << justESVBibleText << endl; cout << "---------------------" << endl; // ok, now lets grab out the groovy data from the KJV module pullKJVData(kjv, wordTags, kjvWords, kjvWordTags); // // ok, here's the real work. // // This method needs to guess which ESV words match which KJV // words and then point them to their same original language // word tag by populating esvWordTags // matchWords(esvWordTags, esvWords, kjvWords, kjvWordTags); // ok, now that we have our esvWordTags magically populated // let's do the grunt work of inserting the and tags insertWordTags(newESVMarkup, bibMap, esvWordTags, wordTags, esvWordStarts, esvWordEnds); cout << "\nHere's how you mapped things..." << endl; cout << "---------------------" << endl; cout << "Total wordTags: " << wordTags.size() << endl; cout << "\nESV Words: " << endl; for (int i = 0; i < esvWords.size(); i++) { cout << esvWords[i] << " : " << esvWordTags[i] << " => " << wordTags[esvWordTags[i]] << endl; } cout << "---------------------" << endl; cout << "\nAND... Here's your final output" << endl; cout << "---------------------" << endl; cout << newESVMarkup << endl; cout << endl; // } return 0; } // builds up bibMap to contain only characters of Biblical text // and each character's corresponding real location in our output // buffer (returned value) SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap) { SWBuf out = ""; SWBuf tag = ""; int tagLevel = 0; int inTag = 0; for (int i = 0; i < orig.length(); i++) { if (orig[i] == '<') { inTag = true; } else if (orig[i] == '>') { inTag = false; XMLTag t = tag.c_str(); if (!t.isEmpty()) { if (t.isEndTag()) { tagLevel--; } else { tagLevel++; } } out += t; tag = ""; } else if (inTag) { tag += orig[i]; } else { if (!tagLevel) { bibMap.push_back(out.size()); } out += orig[i]; } } return out; } // Inserts addText into out buffer and adjusts Bible character pointers accordingly // void insert(const SWBuf &addText, SWBuf &out, int bibPos, BibMap &bibMap, bool after) { out.insert(bibMap[bibPos]+((after)?1:0), addText); for (int i = bibPos+((after)?1:0); i < bibMap.size(); i++) { bibMap[i] += addText.length(); } } // Compares 2 words and tries to give a percentage assurance of a match // TODO: could use more smarts here // int compare(const SWBuf &s1, const SWBuf &s2) { int retVal = 0; SWBuf largest = (s1.length() > s2.length()) ? s1 : s2; SWBuf smallest = (s1.length() > s2.length()) ? s2 : s1; int matches = 0; int j = 0; for (int i = 0; i < smallest.length() && j < largest.length(); i++) { while (j < largest.length()) { if (smallest[i] == largest[j++]) { matches++; break; } } } return (((float)matches) / largest.length()) * 100; } SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector &esvWords, vector &esvWordStarts, vector &esvWordEnds) { SWBuf bibWord = ""; SWBuf kjvWord = ""; SWBuf bibText = ""; for (BibMap::const_iterator it = bibMap.begin(); it != bibMap.end(); it++) { char c = markupBuf[*it]; if ((c >= 'a' && c <='z') || (c >= 'A' && c <='Z') ) { if (!bibWord.length()) esvWordStarts.push_back(bibText.length()); bibWord += c; } else { if (bibWord.length()) { esvWordEnds.push_back(bibText.length()-1); esvWords.push_back(bibWord); bibWord = ""; } } bibText += c; } if (bibWord.length()) { esvWordEnds.push_back(bibText.length()-1); esvWords.push_back(bibWord); } return bibText; } void pullKJVData(SWModule &kjv, vector&wordTags, vector &kjvWords, vector &kjvWordTags) { kjv.RenderText(); // be sure KJV has processed entry attributes AttributeList &words = kjv.getEntryAttributes()["Word"]; SWBuf kjvWord = ""; SWBuf bibWord = ""; for (AttributeList::iterator it = words.begin(); it != words.end(); it++) { // this is our new XMLTag. // attributes will be added below XMLTag w("w"); int parts = atoi(it->second["PartCount"]); SWBuf lemma = ""; SWBuf morph = ""; for (int i = 1; i <= parts; i++) { SWBuf key = ""; key = (parts == 1) ? "Lemma" : SWBuf().setFormatted("Lemma.%d", i); AttributeValue::iterator li = it->second.find(key); if (li != it->second.end()) { if (i > 1) lemma += " "; key = (parts == 1) ? "LemmaClass" : SWBuf().setFormatted("LemmaClass.%d", i); AttributeValue::iterator lci = it->second.find(key); if (lci != it->second.end()) { lemma += lci->second + ":"; } lemma += li->second; } key = (parts == 1) ? "Morph" : SWBuf().setFormatted("Morph.%d", i); li = it->second.find(key); // silly. sometimes morph counts don't equal lemma counts if (i == 1 && parts != 1 && li == it->second.end()) { li = it->second.find("Morph"); } if (li != it->second.end()) { if (i > 1) morph += " "; key = (parts == 1) ? "MorphClass" : SWBuf().setFormatted("MorphClass.%d", i); AttributeValue::iterator lci = it->second.find(key); // silly. sometimes morph counts don't equal lemma counts if (i == 1 && parts != 1 && lci == it->second.end()) { lci = it->second.find("MorphClass"); } if (lci != it->second.end()) { morph += lci->second + ":"; } morph += li->second; } // TODO: add src tags and maybe other attributes } if (lemma.length()) w.setAttribute("lemma", lemma); if (morph.length()) w.setAttribute("morph", morph); kjvWord = it->second["Text"]; bibWord = ""; for (int j = 0; j < kjvWord.length(); j++) { char c = kjvWord[j]; if ((c >= 'a' && c <='z') || (c >= 'A' && c <='Z') ) { bibWord += c; } else { if (bibWord.length()) { kjvWords.push_back(bibWord); kjvWordTags.push_back(wordTags.size()); bibWord = ""; } } } if (bibWord.length()) { kjvWords.push_back(bibWord); kjvWordTags.push_back(wordTags.size()); } wordTags.push_back(w); } } void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, const vector &esvWordTags, const vector &wordTags, const vector &esvWordStarts, const vector &esvWordEnds) { // TODO: this method needs some work, // like putting multiple consecutive words // together in one tag for (int i = 0; i < esvWordTags.size(); i++) { if (esvWordTags[i] > -1) { insert((const char *)wordTags[esvWordTags[i]], markupBuf, esvWordStarts[i], bibMap); insert("", markupBuf, esvWordEnds[i], bibMap, true); } } }