/****************************************************************************** * flash.cpp - Automation of flashcards generation * * Copyright 2007 CrossWire Bible Society (http://www.crosswire.org) * CrossWire Bible Society * P. O. Box 2528 * Tempe, AZ 85280-2528 * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation version 2. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * Contributors: * Lyndon Drake * Troy A. Griffitts */ #include #include #include #include #include #include #include #include #include #include #include #include using namespace sword; using namespace std; namespace { // used to hold a KJV translation phrase for a greek/hebrew word // and any greek/hebrew words combined to make this KJV phrase // e.g. hO QEOS = QEOS: [+ hO ]: God class Phrase { public: Phrase() : phrase("") {} SWBuf phrase; vector with; inline bool operator ==(const Phrase &other) const { return !compare(other); } inline bool operator !=(const Phrase &other) const { return compare(other); } inline bool operator > (const Phrase &other) const { return compare(other) > 0; } inline bool operator < (const Phrase &other) const { return compare(other) < 0; } inline bool operator <=(const Phrase &other) const { return compare(other) <= 0; } inline bool operator >=(const Phrase &other) const { return compare(other) >= 0; } int compare(const Phrase &right) const { int c = phrase.compare(right.phrase); if (c) return c; vector::const_iterator lit = with.begin(); vector::const_iterator rit = right.with.begin(); while (lit != with.end() && rit != right.with.end()) { c = lit->compare(*rit); if (c) return c; lit++; rit++; } if (lit != with.end()) return 1; if (rit != right.with.end()) return -1; return 0; } }; // KJV phrases and their occurance frequency typedef map KJVPhrases; // primary result class class Word { public: Word() : utf8("") , strong("") , freq(0) , def("") {} // lexical form of this word in utf8 greek/hebrew SWBuf utf8; // strongs number for this word (e.g. G3588) SWBuf strong; // frequency of occurance in the iterated text int freq; // definition pulled from short strongs def SWBuf def; // kjv translation phrases and their frequencies KJVPhrases kjvFreq; }; string itoa(int v) { stringstream str; str << v; return str.str(); } SWConfig greek("greek.conf"); SWConfig hebrew("hebrew.conf"); bool compareFreq(const Word &w1, const Word &w2) { if (w1.freq != w2.freq) return w1.freq > w2.freq; SWBuf s1 = w1.strong; SWBuf s2 = w2.strong; s1 << 1; s2 << 1; return atoi(s2) > atoi(s1); } bool compareKJVFreq(const KJVPhrases::const_iterator &i1, const KJVPhrases::const_iterator &i2) { return i1->second > i2->second; } // sort and pretty up all the KJV phrases for a word into a nice output buffer SWBuf prettyKJVFreq(KJVPhrases in) { SWBuf retVal; vector sorted; for (KJVPhrases::const_iterator it = in.begin(); it != in.end(); it++) { // combine cap words with lowercase, if exists Phrase k = it->first; if (k.phrase.size() && toupper(k.phrase[0]) == k.phrase[0] && k.phrase != "God" && k.phrase != "Lord") { k.phrase[0] = tolower(k.phrase[0]); if (k != it->first) { KJVPhrases::iterator i = in.find(k); if (i != in.end()) { i->second += it->second; // don't include us in the list cuz we added our freq to another continue; } } } sorted.push_back(it); } sort(sorted.begin(), sorted.end(), compareKJVFreq); for (vector::const_iterator it = sorted.begin(); it != sorted.end(); it++) { if (retVal.size()) retVal += "; "; // prepend 'with other strongs' if present if ((*it)->first.with.size()) { retVal += "[+"; for (int i = 0; i < (*it)->first.with.size(); i++) { retVal.appendFormatted(" %s", (*it)->first.with[i].c_str()); } retVal += " ] "; } retVal.appendFormatted("%s (%d)", (*it)->first.phrase.c_str(), (*it)->second); } return retVal; } // take utf8 text and spit out equiv. text substituting escaped codes for multibyte chars // java .properties files wants this format (flashcard .flash lessons use this format) SWBuf escapedUTF8(SWBuf inText) { static UTF8UTF16 convert; convert.processText(inText); SWBuf retBuf; for (unsigned short *i = (unsigned short *)inText.getRawData(); *i; i++) { if (*i < 128) { retBuf += (char)*i; } else { retBuf.appendFormatted("\\u%.4x", *i); // change hex alpha values to upper case for (int i = retBuf.size()-1; i > retBuf.size() - 4; i--) { retBuf[i] = toupper(retBuf[i]); } } } return retBuf; } } // END anonymous namespace // output a simple CSV ('|' separated really) format for importing into OOo or excel void outputCSV(const vector &wordList) { // output header cout << "FreqKJV|" << "Greek|" << "Strongs|" << "TranslationInAV" << "\n"; /* << "FreqKJV|" << "PointedHeb|" << "Meaning|" << "Strongs|" << "Language|" << "TWOT|" << "Form|" << "GkRelated|" << "FullerMeaning|" << "UnpointedHeb|" << "CALUnpointedAscii|" << "TABSUnpointedAscii|" << "Transliteration|" << "Phonetic|" << "Notes|" << "FullMeaning|" << "TranslationInAV" << "\n"; */ for (vector::const_iterator it = wordList.begin(); it != wordList.end(); it++) { const Word &w = (*it); SWBuf s = w.strong; char gh = s[0]; if (gh == 'G' || gh == 'H') { s << 1; } s = itoa(atoi(s.c_str())).c_str(); cout << w.freq << "|" << escapedUTF8(w.utf8).c_str() << "|" << w.strong << "|" << prettyKJVFreq(w.kjvFreq).c_str() /* << w.freq << "|" << hebrew[s]["UTF8"] << "|" << hebrew[s]["Meaning"] << "|" << s << "|" << hebrew[s]["Language"] << "|" << hebrew[s]["TWOT"] << "|" << hebrew[s]["Form"] << "|" << hebrew[s]["GkRelated"] << "|" << hebrew[s]["FullerMeaning"] << "|" << hebrew[s]["UnpointedHeb"] << "|" << hebrew[s]["CALUnpointedAscii"] << "|" << hebrew[s]["TABSUnpointedAscii"] << "|" << hebrew[s]["Transliteration"] << "|" << hebrew[s]["Phonetic"] << "|" << hebrew[s]["Notes"] << "|" << hebrew[s]["FullMeaning"] << "|" << hebrew[s]["TranslationInAV"] */ << "\n"; } std::cout << std::endl; } /** * output our flashcard .flash file format * * wordList - duh * outputDir - directory path where to write files, e.g. "./hebFreq" * kjvFreq - if true, process KJV translation frequencies and use these as * the word answers; otherwise, use short strongs defs. * maxPerLesson - maximum number of words per lesson * */ void outputFlash(const vector &wordList, const char *outputDir = ".", bool kjvFreq = true, int maxPerLesson = 25, const char *fontName=0) { ThMLPlain strip; ofstream ofile; int wordCount = 0; int lessonNumber = 0; int startFreq = 0; int lastFreq = 0; vector::const_iterator it = wordList.begin(); while (it != wordList.end()) { const Word &w = (*it); if (!wordCount) { SWBuf fname = outputDir; fname += "/lesson"; fname.appendFormatted("%.3d", lessonNumber); fname += ".flash"; ofile.open(fname); startFreq = w.freq; } SWBuf word = w.utf8; word.trim(); SWBuf answers = ""; answers.trim(); // if we want answers as KJV phrases if (kjvFreq) { answers = prettyKJVFreq(w.kjvFreq); if (answers.size() > 200) answers.size(200); } // if we would rather have short strongs else { answers = w.def; strip.processText(answers); // remove html tags answers.replaceBytes("\n\r", ' '); // remove newlines } // be sure we have both a word and an answer if (word.size() && answers.size()) { ofile << "word" << wordCount << "=" << escapedUTF8(word) << "\n"; ofile << "answers" << wordCount << "=" << answers << "\n"; lastFreq = w.freq; wordCount++; } it++; if (it == wordList.end() || wordCount >= maxPerLesson) { // close lesson SWBuf lessonTitle = ""; lessonTitle.appendFormatted("lessonTitle=%.3d Freqs. %d-%d\n", lessonNumber, startFreq, lastFreq); ofile << lessonTitle; ofile << "wordCount=" << wordCount << "\n"; if (fontName) { ofile << "lessonFont=" << fontName << "\n"; } ofile.close(); wordCount = 0; lessonNumber++; } } } /** * do the work * * range - the range of verses to process (e.g. "gen-mal") * addAll - if we should add all words in our lexicon for the testaments * included in the range even if they don't exist in the text * (useful for generating complete OT or NT strongs word lists) * */ vector processWords(const char *range, bool addAll = true, SWBuf modName = "KJV") { SWMgr manager; map wordList; SWModule *tmpBible = manager.getModule(modName); if (!tmpBible) { cerr << "Unable to locate KJV module" << endl; exit(1); } SWModule &bible = *tmpBible; VerseKey *parser = (VerseKey *)bible.createKey(); ListKey r = parser->parseVerseList(range, 0, true); for (r = TOP; !r.popError(); r++) { bible.setKey(r); bible.renderText(); // force an entry lookup to resolve key to something in the index AttributeList &words = bible.getEntryAttributes()["Word"]; for (AttributeList::iterator word = words.begin(); word != words.end(); word++) { SWBuf partCount = word->second["PartCount"]; int parts = atoi(partCount.c_str()); if (parts < 1) parts = 1; // build a list of all lemmas for use later in 'with' // i.e. 'translated xxx with Gnnnn1, Gnnnn2' list lemmas; for (int i = 1; i <= parts; i++) { SWBuf lemKey = "Lemma"; if (parts > 1) lemKey.appendFormatted(".%d", i); lemmas.push_back(word->second[lemKey]); } for (int i = 1; i <= parts; i++) { SWBuf lemKey = "Lemma"; if (parts > 1) lemKey.appendFormatted(".%d", i); SWBuf strong = word->second[lemKey]; SWBuf text = word->second["Text"]; if (strong == "G3588") { text = "[article]"; } else { text.trim(); // trim punctuation from end while (text.size() && (strchr(".;,?-!\"()[]{}':/\t\r\n ", text[text.size()-1]))) text.setSize(text.size()-1); if (!text.size()) text = "[Untranslated]"; } Phrase p; p.phrase = text; if ((parts > 1) && (strong != "G3588")) { // lets build our 'with' list excluding ourselves list withoutMe = lemmas; withoutMe.remove(strong); // special handling of article. We don't want [+ 3588] all over the place withoutMe.remove("G3588"); p.with = vector(withoutMe.begin(), withoutMe.end()); } wordList[strong].kjvFreq[p]++; wordList[strong].freq++; } } } if (addAll) { // first use utf8 list to iterate and add utf8 entries.\ // this assures we have an entry for every word, even if it is not // present in the module r = TOP; if (VerseKey(r).getTestament() == 1) { for (SectionMap::iterator it = hebrew.Sections.begin(); it != hebrew.Sections.end(); it++) { wordList[(SWBuf)"H0"+it->first].utf8; // just access to be sure created. We'll add later. } } r = BOTTOM; if (VerseKey(r).getTestament() == 2) { for (SectionMap::iterator it = greek.Sections.begin(); it != greek.Sections.end(); it++) { wordList[(SWBuf)"G"+it->first].utf8; } } } vector sorted; for (map::iterator it = wordList.begin(); it != wordList.end(); it++) { // pull strongs key from map and populate Word SWBuf s = it->first; //cout << s.c_str() << "\n"; it->second.strong = s; char gh = s[0]; if (gh == 'G' || gh == 'H') { s << 1; } s.setFormatted("%d", atoi(s.c_str())); // populate lex defs it->second.def = (gh == 'G') ? greek[s]["Meaning"] : hebrew[s]["Meaning"]; // populate utf-8 it->second.utf8 = (gh == 'G') ? greek[s]["UTF8"] : hebrew[s]["UTF8"]; // put only word in sorted container sorted.push_back(it->second); } sort(sorted.begin(), sorted.end(), compareFreq); delete parser; return sorted; } void usage(const char *app, const char *error = 0) { if (error) fprintf(stderr, "\n%s: %s\n", app, error); fprintf(stderr, "\nusage: %s [OPTIONS]\n", app); fprintf(stderr, " -c\t\t\t generate CSV file instead of flashcards\n"); fprintf(stderr, " -o \t directory to output files\n"); fprintf(stderr, " -w \t number of words per lesson (default 25)\n"); fprintf(stderr, " -d \t\t definition to use (default k):\n"); fprintf(stderr, "\t\t\t\t m - short meaning; k - KJV collation\n"); fprintf(stderr, " -r <\"range\">\t\t verse range\n"); fprintf(stderr, " -m <\"module name\">\t\t module name to use (default \"KJV\")\n"); fprintf(stderr, " -f \t\t include special font name for lesson\n"); fprintf(stderr, " -h\t\t\t display this help message\n\n"); exit(-1); } int main(int argc, char **argv) { const int REQUIRED = 0; // Let's test our command line arguments if (argc < REQUIRED+1) { // this never happens since we don't have any required argument count usage(*argv); } // variables for arguments, holding defaults const char* program = argv[0]; bool csv = false; char def = 'k'; SWBuf range = "Mat-Rev"; SWBuf fontName = ""; SWBuf outDir = "."; SWBuf modName = "KJV"; int count = 25; for (int i = 1; i < argc; i++) { if (!strcmp(argv[i], "-c")) { csv = true; } else if (!strcmp(argv[i], "-w")) { if (i+1 < argc) { count = atoi(argv[++i]); if (count > 0) continue; } usage(*argv, "-d requires one of "); } else if (!strcmp(argv[i], "-d")) { if (i+1 < argc) { def = argv[++i][0]; if ((def == 'k') || (def == 'm')) continue; } usage(*argv, "-d requires one of "); } else if (!strcmp(argv[i], "-r")) { if (i+1 < argc) range = argv[++i]; else usage(*argv, "-r requires <\"range\">"); } else if (!strcmp(argv[i], "-m")) { if (i+1 < argc) modName = argv[++i]; else usage(*argv, "-m requires <\"module name\">"); } else if (!strcmp(argv[i], "-f")) { if (i+1 < argc) fontName = argv[++i]; else usage(*argv, "-f requires <\"fontName\">"); } else if (!strcmp(argv[i], "-o")) { if (i+1 < argc) outDir = argv[++i]; else usage(*argv, "-o requires "); } else if ((!strcmp(argv[i], "-h")) || (!strcmp(argv[i], "-?")) || (!strcmp(argv[i], "--help")) ) { usage(*argv); } else usage(*argv, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str()); } if (csv) { outputCSV(processWords(range, true, modName)); } else { outputFlash(processWords(range, true, modName), outDir, (def == 'k'), count, (fontName.length()?fontName.c_str():0)); } return 0; }