/******************************************************************************
* flash.cpp - Automation of flashcards generation
*
* Copyright 2007 CrossWire Bible Society (http://www.crosswire.org)
* CrossWire Bible Society
* P. O. Box 2528
* Tempe, AZ 85280-2528
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation version 2.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Contributors:
* Lyndon Drake <lyndon at arotau dot com>
* Troy A. Griffitts <scribe at crosswire dot org>
*/
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <list>
#include <map>
#include <sstream>
#include <vector>
#include <swmgr.h>
#include <swbuf.h>
#include <swmodule.h>
#include <utf8utf16.h>
#include <versekey.h>
#include <thmlplain.h>
using namespace sword;
using namespace std;
namespace {
// used to hold a KJV translation phrase for a greek/hebrew word
// and any greek/hebrew words combined to make this KJV phrase
// e.g. hO QEOS = QEOS: [+ hO ]: God
class Phrase {
public:
Phrase()
: phrase("")
{}
SWBuf phrase;
vector<SWBuf> with;
inline bool operator ==(const Phrase &other) const { return !compare(other); }
inline bool operator !=(const Phrase &other) const { return compare(other); }
inline bool operator > (const Phrase &other) const { return compare(other) > 0; }
inline bool operator < (const Phrase &other) const { return compare(other) < 0; }
inline bool operator <=(const Phrase &other) const { return compare(other) <= 0; }
inline bool operator >=(const Phrase &other) const { return compare(other) >= 0; }
int compare(const Phrase &right) const {
int c = phrase.compare(right.phrase);
if (c) return c;
vector<SWBuf>::const_iterator lit = with.begin();
vector<SWBuf>::const_iterator rit = right.with.begin();
while (lit != with.end() && rit != right.with.end()) {
c = lit->compare(*rit);
if (c) return c;
lit++; rit++;
}
if (lit != with.end()) return 1;
if (rit != right.with.end()) return -1;
return 0;
}
};
// KJV phrases and their occurrence frequency
// (key: the phrase plus its 'with' list; value: how many times it was counted)
typedef map<Phrase, int> KJVPhrases;
// primary result class: one strongs-numbered word and everything that was
// collected about it
class Word {
public:
	// lexical form of this word in utf8 greek/hebrew
	SWBuf utf8;
	// strongs number for this word (e.g. G3588)
	SWBuf strong;
	// frequency of occurrence in the iterated text
	int freq;
	// definition pulled from short strongs def
	SWBuf def;
	// kjv translation phrases and their frequencies
	KJVPhrases kjvFreq;

	// a fresh Word has empty text fields and has not been counted yet
	Word() : utf8(""), strong(""), freq(0), def("") {}
};
// format an int as its decimal string representation
std::string itoa(int v) {
	std::ostringstream out;
	out << v;
	return out.str();
}
// Lexicon data, constructed from "greek.conf" and "hebrew.conf".
// Elsewhere in this file they are indexed as lexicon[<number>][<field>],
// where <number> is the strongs number without its G/H prefix or leading
// zeros and <field> is a key such as "UTF8" or "Meaning".
// NOTE(review): the file names carry no directory, so they are presumably
// resolved relative to the current working directory -- confirm.
SWConfig greek("greek.conf");
SWConfig hebrew("hebrew.conf");
bool compareFreq(const Word &w1, const Word &w2) {
if (w1.freq != w2.freq) return w1.freq > w2.freq;
SWBuf s1 = w1.strong;
SWBuf s2 = w2.strong;
s1 << 1;
s2 << 1;
return atoi(s2) > atoi(s1);
}
// sort predicate for KJV phrase entries: the phrase with the higher
// count comes first
bool compareKJVFreq(const KJVPhrases::const_iterator &i1, const KJVPhrases::const_iterator &i2) {
	const int count1 = i1->second;
	const int count2 = i2->second;
	return count2 < count1;
}
// sort and pretty up all the KJV phrases for a word into a nice output buffer
//
// in - phrase -> count map for one word; taken by value because counts are
//      merged in place while building the result
//
// Returns the phrases ordered by descending count, separated by "; ", each
// formatted as "[+ <with> ... ] <phrase> (<count>)" (the bracketed part only
// when the phrase has a 'with' list).
SWBuf prettyKJVFreq(KJVPhrases in) {
	SWBuf retVal;
	vector<KJVPhrases::const_iterator> sorted;

	for (KJVPhrases::const_iterator it = in.begin(); it != in.end(); ++it) {
		// combine cap words with lowercase, if exists
		// ("God" and "Lord" are deliberately kept capitalised)
		Phrase k = it->first;
		// the <cctype> functions require a value representable as unsigned
		// char; a plain (possibly negative) char is undefined behaviour
		if (k.phrase.size()
				&& isupper((unsigned char)k.phrase[0])
				&& k.phrase != "God"
				&& k.phrase != "Lord") {
			k.phrase[0] = (char)tolower((unsigned char)k.phrase[0]);
			if (k != it->first) {
				// NOTE(review): the merged total is only reported if the
				// lower-case entry is visited after this one; presumably
				// SWBuf::compare orders upper case first -- confirm
				KJVPhrases::iterator lower = in.find(k);
				if (lower != in.end()) {
					lower->second += it->second;
					// don't include us in the list cuz we added our freq to another
					continue;
				}
			}
		}
		sorted.push_back(it);
	}

	sort(sorted.begin(), sorted.end(), compareKJVFreq);

	for (vector<KJVPhrases::const_iterator>::const_iterator it = sorted.begin(); it != sorted.end(); ++it) {
		const Phrase &p = (*it)->first;
		if (retVal.size()) retVal += "; ";
		// prepend 'with other strongs' if present
		if (p.with.size()) {
			retVal += "[+";
			for (vector<SWBuf>::size_type i = 0; i < p.with.size(); ++i) {
				retVal.appendFormatted(" %s", p.with[i].c_str());
			}
			retVal += " ] ";
		}
		retVal.appendFormatted("%s (%d)", p.phrase.c_str(), (*it)->second);
	}
	return retVal;
}
// take utf8 text and spit out equiv. text substituting escaped codes for multibyte chars
// java .properties files wants this format (flashcard .flash lessons use this format)
//
// inText - the text to escape; taken by value because it is converted in place
//
// The text is run through the UTF8UTF16 filter and then read as a sequence of
// 16-bit units up to the first zero unit.  Units below 128 are copied as a
// single char; every other unit becomes "\uXXXX" with four upper-case hex
// digits.
SWBuf escapedUTF8(SWBuf inText) {
	static UTF8UTF16 convert;
	convert.processText(inText);

	SWBuf retBuf;
	for (unsigned short *unit = (unsigned short *)inText.getRawData(); *unit; ++unit) {
		if (*unit < 128) {
			retBuf += (char)*unit;
		}
		else {
			// %X emits upper-case hex directly, so all four digits are
			// upper-cased (a manual fix-up loop previously missed the first)
			retBuf.appendFormatted("\\u%.4X", (unsigned int)*unit);
		}
	}
	return retBuf;
}
} // END anonymous namespace
// output a simple CSV ('|' separated really) format for importing into OOo or excel
//
// wordList - the words to list, one output row per word, in the given order
//
// Writes a header row and then one row per word to stdout, followed by a
// final empty line.  Each row holds the word's frequency, its strongs number
// (without G/H prefix or leading zeros) and the fields stored under that
// number in the lexicon matching the word's prefix: greek for 'G' numbers,
// hebrew for everything else (the same selection processWords uses).
void outputCSV(const vector<Word> &wordList) {
	// output header
	cout
		<< "FreqKJV|"
		<< "PointedHeb|"
		<< "Meaning|"
		<< "Strongs|"
		<< "Language|"
		<< "TWOT|"
		<< "Form|"
		<< "GkRelated|"
		<< "FullerMeaning|"
		<< "UnpointedHeb|"
		<< "CALUnpointedAscii|"
		<< "TABSUnpointedAscii|"
		<< "Transliteration|"
		<< "Phonetic|"
		<< "Notes|"
		<< "FullMeaning|"
		<< "TranslationInAV"
		<< "\n";

	for (vector<Word>::const_iterator it = wordList.begin(); it != wordList.end(); ++it) {
		const Word &w = (*it);

		// reduce e.g. "H0430" to the bare number "430" used as lexicon key
		SWBuf s = w.strong;
		const char gh = s[0];
		if (gh == 'G' || gh == 'H') {
			s << 1;
		}
		s = itoa(atoi(s.c_str())).c_str();

		// look the number up in the lexicon it belongs to; a greek number
		// must not be resolved against the hebrew data
		SWConfig &lexicon = (gh == 'G') ? greek : hebrew;

		cout
			<< w.freq << "|"
			<< lexicon[s]["UTF8"] << "|"
			<< lexicon[s]["Meaning"] << "|"
			<< s << "|"
			<< lexicon[s]["Language"] << "|"
			<< lexicon[s]["TWOT"] << "|"
			<< lexicon[s]["Form"] << "|"
			<< lexicon[s]["GkRelated"] << "|"
			<< lexicon[s]["FullerMeaning"] << "|"
			<< lexicon[s]["UnpointedHeb"] << "|"
			<< lexicon[s]["CALUnpointedAscii"] << "|"
			<< lexicon[s]["TABSUnpointedAscii"] << "|"
			<< lexicon[s]["Transliteration"] << "|"
			<< lexicon[s]["Phonetic"] << "|"
			<< lexicon[s]["Notes"] << "|"
			<< lexicon[s]["FullMeaning"] << "|"
			<< lexicon[s]["TranslationInAV"]
			<< "\n";
	}
	std::cout << std::endl;
}
/**
 * output our flashcard .flash file format
 *
 * Writes files named <outputDir>/lessonNNN.flash, numbered from 000.  Each
 * file holds "word<i>=" / "answers<i>=" pairs followed by a lessonTitle
 * line, a wordCount line and, if a font was given, a lessonFont line.
 * Words whose text or answer is empty are left out.
 *
 * wordList     - the words to write, in the order they should appear
 * outputDir    - directory path where to write files, e.g. "./hebFreq"
 * kjvFreq      - if true, process KJV translation frequencies and use these as
 *                the word answers; otherwise, use short strongs defs.
 * maxPerLesson - maximum number of words per lesson
 * fontName     - if not null, written to each lesson as its lessonFont
 *
 * If a lesson file cannot be opened a message is printed to stderr and
 * the function returns without writing any further lessons.
 */
void outputFlash(const vector<Word> &wordList, const char *outputDir = ".", bool kjvFreq = true, int maxPerLesson = 25, const char *fontName=0) {
	ThMLPlain strip;
	ofstream ofile;
	int wordCount = 0;
	int lessonNumber = 0;
	int startFreq = 0;
	int lastFreq = 0;

	vector<Word>::const_iterator it = wordList.begin();
	while (it != wordList.end()) {
		const Word &w = (*it);

		// Start a new lesson file only when none is open.  Keying this on
		// wordCount alone would call open() again on the already open stream
		// whenever the first candidate words of a lesson are skipped; that
		// sets failbit and silently discards the whole lesson.
		if (!ofile.is_open()) {
			SWBuf fname = outputDir;
			fname += "/lesson";
			fname.appendFormatted("%.3d", lessonNumber);
			fname += ".flash";
			ofile.open(fname.c_str());
			if (!ofile.is_open()) {
				cerr << "Unable to open " << fname.c_str() << " for writing" << endl;
				return;
			}
		}
		// until a word has been written, the current word is the candidate
		// for the first (highest) frequency of this lesson
		if (!wordCount) {
			startFreq = w.freq;
		}

		SWBuf word = w.utf8;
		word.trim();

		SWBuf answers;
		// if we want answers as KJV phrases
		if (kjvFreq) {
			answers = prettyKJVFreq(w.kjvFreq);
			// keep the answer line to at most 200 bytes
			if (answers.size() > 200) answers.size(200);
		}
		// if we would rather have short strongs
		else {
			answers = w.def;
			strip.processText(answers);	// remove html tags
			answers.replaceBytes("\n\r", ' ');	// remove newlines
		}

		// be sure we have both a word and an answer
		if (word.size() && answers.size()) {
			ofile << "word" << wordCount << "=" << escapedUTF8(word) << "\n";
			ofile << "answers" << wordCount << "=" << answers << "\n";
			lastFreq = w.freq;
			wordCount++;
		}

		++it;
		if (it == wordList.end() || wordCount >= maxPerLesson) {
			// close lesson
			SWBuf lessonTitle = "";
			lessonTitle.appendFormatted("lessonTitle=%.3d Freqs. %d-%d\n", lessonNumber, startFreq, lastFreq);
			ofile << lessonTitle;
			ofile << "wordCount=" << wordCount << "\n";
			if (fontName) {
				ofile << "lessonFont=" << fontName << "\n";
			}
			ofile.close();
			wordCount = 0;
			lessonNumber++;
		}
	}
}
/**
 * do the work
 *
 * Walks the "KJV" module over the given verse range, counts every lemma
 * (strongs number) found in the "Word" entry attributes together with the
 * KJV text it was translated as, then fills in definition and utf8 form
 * from the greek/hebrew lexicons.
 *
 * range - the range of verses to process (e.g. "gen-mal")
 * addAll - if we should add all words in our lexicon for the testaments
 * included in the range even if they don't exist in the text
 * (useful for generating complete OT or NT strongs word lists)
 *
 * returns the collected words sorted with compareFreq
 *
 * Prints an error and calls exit(1) if the KJV module cannot be located.
 */
vector<Word> processWords(const char *range, bool addAll = true) {
SWMgr manager;
// strongs number (as found in the module, e.g. "G3588") -> collected data
map<SWBuf, Word> wordList;
SWModule *tmpBible = manager.getModule("KJV");
if (!tmpBible) {
cerr << "Unable to locate KJV module" << endl;
exit(1);
}
SWModule &bible = *tmpBible;
// turn the textual range into a list key and iterate the module over it
VerseKey parser;
ListKey r = parser.ParseVerseList(range, 0, true);
r.Persist(true);
bible.setKey(r);
for (bible = TOP; !bible.Error(); bible++) {
bible.RenderText(); // force an entry lookup to resolve key to something in the index
AttributeList &words = bible.getEntryAttributes()["Word"];
for (AttributeList::iterator word = words.begin(); word != words.end(); word++) {
// a word may carry several lemmas; anything below 1 is treated as a single part
SWBuf partCount = word->second["PartCount"];
int parts = atoi(partCount.c_str());
if (parts < 1) parts = 1;
// build a list of all lemmas for use later in 'with'
// i.e. 'translated xxx with Gnnnn1, Gnnnn2'
// (the attribute key is "Lemma" for a single part, "Lemma.<n>" otherwise)
list<SWBuf> lemmas;
for (int i = 1; i <= parts; i++) {
SWBuf lemKey = "Lemma";
if (parts > 1) lemKey.appendFormatted(".%d", i);
lemmas.push_back(word->second[lemKey]);
}
// count this occurrence once for each of its lemmas
for (int i = 1; i <= parts; i++) {
SWBuf lemKey = "Lemma";
if (parts > 1) lemKey.appendFormatted(".%d", i);
SWBuf strong = word->second[lemKey];
SWBuf text = word->second["Text"];
// G3588 is always recorded under the fixed phrase "[article]"
if (strong == "G3588") {
text = "[article]";
}
else {
text.trim();
// trim punctuation from end
while (text.size() && (strchr(".;,?-!\"()[]{}':/\t\r\n ", text[text.size()-1]))) text.setSize(text.size()-1);
if (!text.size()) text = "[Untranslated]";
}
Phrase p;
p.phrase = text;
if ((parts > 1) && (strong != "G3588")) {
// lets build our 'with' list excluding ourselves
list<SWBuf> withoutMe = lemmas;
withoutMe.remove(strong);
// special handling of article. We don't want [+ 3588] all over the place
withoutMe.remove("G3588");
p.with = vector<SWBuf>(withoutMe.begin(), withoutMe.end());
}
wordList[strong].kjvFreq[p]++;
wordList[strong].freq++;
}
}
}
if (addAll) {
// first use utf8 list to iterate and add utf8 entries.
// this assures we have an entry for every word, even if it is not
// present in the module
// hebrew entries are added when the key at TOP reports testament 1
r = TOP;
if (VerseKey(r).Testament() == 1) {
for (SectionMap::iterator it = hebrew.Sections.begin(); it != hebrew.Sections.end(); it++) {
wordList[(SWBuf)"H0"+it->first].utf8; // just access to be sure created. We'll add later.
}
}
// greek entries are added when the key at BOTTOM reports testament 2
r = BOTTOM;
if (VerseKey(r).Testament() == 2) {
for (SectionMap::iterator it = greek.Sections.begin(); it != greek.Sections.end(); it++) {
wordList[(SWBuf)"G"+it->first].utf8;
}
}
}
vector<Word> sorted;
for (map<SWBuf, Word>::iterator it = wordList.begin(); it != wordList.end(); it++) {
// pull strongs key from map and populate Word
SWBuf s = it->first;
//cout << s.c_str() << "\n";
it->second.strong = s;
// reduce the key to the bare number (no G/H prefix, no leading zeros)
// that the lexicon sections are looked up by
char gh = s[0];
if (gh == 'G' || gh == 'H') {
s << 1;
}
s.setFormatted("%d", atoi(s.c_str()));
// populate lex defs
// (greek lexicon for 'G' numbers, hebrew lexicon for everything else)
it->second.def = (gh == 'G')
? greek[s]["Meaning"]
: hebrew[s]["Meaning"];
// populate utf-8
it->second.utf8 = (gh == 'G')
? greek[s]["UTF8"]
: hebrew[s]["UTF8"];
// put only word in sorted container
sorted.push_back(it->second);
}
sort(sorted.begin(), sorted.end(), compareFreq);
return sorted;
}
// print an optional error message and the command line help to stderr,
// then terminate the program with exit(-1)
//
// app   - program name shown in the messages
// error - if not null, printed before the help text
void usage(const char *app, const char *error = 0) {
	// one entry per line of option help, printed verbatim in this order
	static const char *const optionHelp[] = {
		" -c\t\t\t generate CSV file instead of flashcards\n",
		" -o <outputDir>\t directory to output files\n",
		" -w <wordCount>\t number of words per lesson (default 25)\n",
		" -d <m|k>\t\t definition to use (default k):\n",
		"\t\t\t\t m - short meaning; k - KJV collation\n",
		" -r <\"range\">\t\t verse range\n",
		" -f <fontName>\t\t include special font name for lesson\n",
		" -h\t\t\t display this help message\n\n"
	};
	const unsigned int lineCount = sizeof(optionHelp) / sizeof(optionHelp[0]);

	if (error) {
		fprintf(stderr, "\n%s: %s\n", app, error);
	}
	fprintf(stderr, "\nusage: %s [OPTIONS]\n", app);
	for (unsigned int line = 0; line < lineCount; ++line) {
		fprintf(stderr, "%s", optionHelp[line]);
	}
	exit(-1);
}
// Parse the command line, collect the words for the requested verse range
// and write them out either as CSV on stdout (-c) or as flashcard lesson
// files.  Any invalid or unknown argument prints the usage text and exits.
int main(int argc, char **argv) {
	// name used in usage/error output; argv[0] may be missing when argc < 1
	const char *program = (argc > 0 && argv[0]) ? argv[0] : "flash";

	// Let's test our command line arguments
	if (argc < 1) {
		usage(program);
	}

	// variables for arguments, holding defaults
	bool csv = false;
	char def = 'k';
	SWBuf range = "Mat-Rev";
	SWBuf fontName = "";
	SWBuf outDir = ".";
	int count = 25;

	for (int i = 1; i < argc; i++) {
		if (!strcmp(argv[i], "-c")) {
			csv = true;
		}
		else if (!strcmp(argv[i], "-w")) {
			if (i+1 < argc) {
				count = atoi(argv[++i]);
				if (count > 0) continue;
			}
			// missing, non-numeric or non-positive value
			usage(program, "-w requires <wordCount> greater than 0");
		}
		else if (!strcmp(argv[i], "-d")) {
			if (i+1 < argc) {
				def = argv[++i][0];
				if ((def == 'k') || (def == 'm')) continue;
			}
			usage(program, "-d requires one of <m|k>");
		}
		else if (!strcmp(argv[i], "-r")) {
			if (i+1 < argc) range = argv[++i];
			else usage(program, "-r requires <\"range\">");
		}
		else if (!strcmp(argv[i], "-f")) {
			if (i+1 < argc) fontName = argv[++i];
			else usage(program, "-f requires <\"fontName\">");
		}
		else if (!strcmp(argv[i], "-o")) {
			if (i+1 < argc) outDir = argv[++i];
			else usage(program, "-o requires <outputDir>");
		}
		else if ((!strcmp(argv[i], "-h"))
			|| (!strcmp(argv[i], "-?"))
			|| (!strcmp(argv[i], "--help"))
		) {
			usage(program);
		}
		else usage(program, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
	}

	if (csv) {
		outputCSV(processWords(range));
	}
	else {
		// an empty font name means "no lessonFont line"
		outputFlash(processWords(range), outDir, (def == 'k'), count, (fontName.length()?fontName.c_str():0));
	}
	return 0;
}