#include #include #include #include #include #include #include #include #include #include #include using namespace sword; using namespace std; class PhraseCount { public: PhraseCount() : count(0) {} int count; vector with; }; class Word { public: Word() : utf8("") , strong("") , freq(0) , def("") {} SWBuf utf8; SWBuf strong; int freq; // from stongs lex SWBuf def; // computed ourselves map kjvFreq; }; string itoa(int v) { stringstream str; str << v; return str.str(); } bool compareFreq(const Word &w1, const Word &w2) { return w1.freq > w2.freq; } bool compareKJVFreq(const map::const_iterator &i1, const map::const_iterator &i2) { return i1->second.count > i2->second.count; } SWBuf prettyKJVFreq(map in) { SWBuf retVal; vector::const_iterator> sorted; for (map::const_iterator it = in.begin(); it != in.end(); it++) { // combine cap words with lowercase, if exists SWBuf k = it->first; if (k.size() && toupper(k[0]) == k[0] && k != "God" && k != "Lord") { k[0] = tolower(k[0]); if (k != it->first) { map::iterator i = in.find(k); if (i != in.end()) { i->second.count += it->second.count; // don't include us in the list cuz we added our freq to another continue; } } } sorted.push_back(it); } sort(sorted.begin(), sorted.end(), compareKJVFreq); for (vector::const_iterator>::const_iterator it = sorted.begin(); it != sorted.end(); it++) { if (retVal.size()) retVal += "; "; // prepend 'with other strongs' if present if ((*it)->second.with.size()) { retVal += "[+"; for (int i = 0; i < (*it)->second.with.size(); i++) { retVal.appendFormatted(" %s", (*it)->second.with[i].c_str()); } retVal += " ] "; } retVal.appendFormatted("%s (%d)", (*it)->first.c_str(), (*it)->second.count); } return retVal; } SWBuf escapedUTF8(SWBuf inText) { static UTF8UTF16 convert; convert.processText(inText); SWBuf retBuf; for (unsigned short *i = (unsigned short *)inText.getRawData(); *i; i++) { if (*i < 128) { retBuf += (char)*i; } else { retBuf.appendFormatted("\\u%.4x", *i); // change hex alpha values to upper case for (int i = retBuf.size()-1; i > retBuf.size() - 4; i--) { retBuf[i] = toupper(retBuf[i]); } } } return retBuf; } void outputCSV(vector &wordList) { for (vector::iterator it = wordList.begin(); it != wordList.end(); it++) { Word &w = (*it); // cout << w->freq << "|" << escapedUTF8(w->utf8).c_str() << "|" << w->strong << "|" << prettyKJVFreq(w->kjvFreq).c_str() << "\n"; cout << w.freq << "|" << w.utf8.c_str() << "|" << w.strong << "|" << prettyKJVFreq(w.kjvFreq).c_str() << "|" << w.def << "\n"; } std::cout << std::endl; } /** * output our flashcard .flash file format * * wordList - duh * outputDir - directory path where to write files, e.g. "./hebFreq" * kjvFreq - if true, process KJV translation frequencies and use these as * the word answers; otherwise, use short strongs defs. * maxPerLesson - maximum number of words per lesson * */ void outputFlash(const vector &wordList, const char *outputDir = ".", bool kjvFreq = true, int maxPerLesson = 25) { ThMLPlain strip; ofstream ofile; int wordCount = 0; int lessonNumber = 0; int startFreq = 0; int lastFreq = 0; vector::const_iterator it = wordList.begin(); while (it != wordList.end()) { const Word &w = (*it); if (!wordCount) { SWBuf fname = outputDir; fname += "/lesson"; fname.appendFormatted("%d", lessonNumber); fname += ".flash"; ofile.open(fname); startFreq = w.freq; } SWBuf word = w.utf8; word.trim(); SWBuf answers = ""; answers.trim(); // if we want answers as KJV phrases if (kjvFreq) { answers = prettyKJVFreq(w.kjvFreq); if (answers.size() > 200) answers.size(200); } // if we would rather have short strongs else { SWBuf answers = w.def; strip.processText(answers); // remove html tags answers.replaceBytes("\n\r", ' '); // remove newlines } // be sure we have both a word and an answer if (word.size() && answers.size()) { ofile << "word" << wordCount << "=" << escapedUTF8(word) << "\n"; ofile << "answers" << wordCount << "=" << answers << "\n"; lastFreq = w.freq; wordCount++; } it++; if (it == wordList.end() || wordCount >= maxPerLesson) { // close lesson SWBuf lessonTitle = ""; lessonTitle.appendFormatted("lessonTitle=%.3d Freqs. %d-%d\n", lessonNumber, startFreq, lastFreq); ofile << lessonTitle; ofile << "wordCount=" << wordCount << "\n"; ofile.close(); wordCount = 0; lessonNumber++; } } } /** * do the work * * range - the range of verses to process (e.g. "gen-mal") * addAll - if we should add all words in our lexicon for the testaments * included in the range even if they don't exist in the text * */ vector processWords(const char *range, bool addAll = true) { SWMgr manager; SWModule &bible = *manager.getModule("KJV"); map wordList; SWConfig hutf8("hwords.conf"); SWConfig hdefs("hdefs.conf"); SWConfig gutf8("gwords.conf"); SWConfig gdefs("gdefs.conf"); VerseKey parser; ListKey r = parser.ParseVerseList(range, 0, true); r.Persist(true); bible.setKey(r); for (bible = TOP; !bible.Error(); bible++) { bible.RenderText(); // force an entry lookup to resolve key to something in the index AttributeList &words = bible.getEntryAttributes()["Word"]; for (AttributeList::iterator word = words.begin(); word != words.end(); word++) { SWBuf partCount = word->second["PartCount"]; int parts = atoi(partCount.c_str()); if (parts < 1) parts = 1; // build a list of all lemmas for use later in 'with' // i.e. 'translated xxx with Gnnnn1, Gnnnn2' list lemmas; for (int i = 1; i <= parts; i++) { SWBuf lemKey = "Lemma"; if (parts > 1) lemKey.appendFormatted(".%d", i); lemmas.push_back(word->second[lemKey]); } for (int i = 1; i <= parts; i++) { SWBuf lemKey = "Lemma"; if (parts > 1) lemKey.appendFormatted(".%d", i); SWBuf strong = word->second[lemKey]; SWBuf text = word->second["Text"]; if ((parts > 2) && (strong == "G3588")) { text = "[article]"; } else { text.trim(); // trim punctuation from end while (text.size() && (strchr(".;,?-!\"()[]{}':/\t\r\n ", text[text.size()-1]))) text.setSize(text.size()-1); if (!text.size()) text = "[Untranslated]"; } wordList[strong].kjvFreq[text].count++; if (parts > 1) { list withoutMe = lemmas; withoutMe.remove(strong); wordList[strong].kjvFreq[text].with = vector(withoutMe.begin(), withoutMe.end()); } wordList[strong].freq++; } } } if (addAll) { // first use utf8 list to iterate and add utf8 entries.\ // this assures we have an entry for every word, even if it is not // present in the module r = TOP; if (VerseKey(r).Testament() == 1) { for (ConfigEntMap::iterator it = hutf8["words"].begin(); it != hutf8["words"].end(); it++) { wordList[(SWBuf)"H"+it->first].utf8 = it->second; } } r = BOTTOM; if (VerseKey(r).Testament() == 2) { for (ConfigEntMap::iterator it = gutf8["words"].begin(); it != gutf8["words"].end(); it++) { wordList[(SWBuf)"G"+it->first].utf8 = it->second; } } } vector sorted; for (map::iterator it = wordList.begin(); it != wordList.end(); it++) { // pull strongs key from map and populate Word SWBuf s = it->first; it->second.strong = s; // populate lex defs it->second.def = (s[0] == 'G') ? gdefs["defs"][(s << 1).c_str()] : hdefs["defs"][(s << 1).c_str()]; // put only word in sorted container sorted.push_back(it->second); } sort(sorted.begin(), sorted.end(), compareFreq); return sorted; } int main(int argc, char **argv) { outputFlash(processWords("gen-mal"), "hebFreqKJV" , true); outputFlash(processWords("gen-mal"), "hebFreq" , false); outputFlash(processWords("mat-rev"), "greekFreqKJV", true); outputFlash(processWords("mat-rev"), "greekFreq" , false); // outputCSV(processWords("mat-rev")); return 0; }