Started rework of flash tools to work with the combined lemmas of the new KJV module revision.

git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@95 07627401-56e2-0310-80f4-f8cd0041bdcd
author: Troy A. Griffitts <scribe@crosswire.org> 2007-09-02 18:27:12 +0000
committer: Troy A. Griffitts <scribe@crosswire.org> 2007-09-02 18:27:12 +0000
commit: b11ba6b03dfe8ecdcaa8a71747df6da1fcf21112 (patch)
tree: 52c10ba77e33db35cf7beb9bdc1e9d8d2993e268 /flashtools
parent: 50905796dd888f347111ed05da4cce83efe4d315 (diff)
download: sword-tools-b11ba6b03dfe8ecdcaa8a71747df6da1fcf21112.tar.gz
2 files changed, 180 insertions, 81 deletions
diff --git a/flashtools/Makefile b/flashtools/Makefile
index b8683c9..dc41e0a 100644
--- a/flashtools/Makefile
+++ b/flashtools/Makefile
@@ -1,7 +1,9 @@
 TARGETS= flash
 all: $(TARGETS)
+	mkdir -p hebFreq hebFreqKJV greekFreq greekFreqKJV
 
 clean:
+	rm -rf hebFreq hebFreqKJV greekFreq greekFreqKJV
 	rm $(TARGETS)
 
 .cpp:
diff --git a/flashtools/flash.cpp b/flashtools/flash.cpp
index eedc959..b4fd726 100644
--- a/flashtools/flash.cpp
+++ b/flashtools/flash.cpp
@@ -14,46 +14,55 @@
 using namespace sword;
 using namespace std;
 
+class PhraseCount {
+public:
+	PhraseCount()
+		: count(0)
+	{}
+	int count;
+	vector<SWBuf> with;
+};
+
 class Word {
 public:
 	Word()
 		: utf8("")
-		, strong(0)
+		, strong("")
 		, freq(0)
-		, kjvTrans("")
+		, def("")
 	{}
 	SWBuf utf8;
-	int strong;
+	SWBuf strong;
 	int freq;
 	// from stongs lex
-	SWBuf kjvTrans;
+	SWBuf def;
 	// computed ourselves
-	map<SWBuf, int> kjvFreq;
+	map<SWBuf, PhraseCount> kjvFreq;
 };
 
 string itoa(int v) { stringstream str; str << v; return str.str(); }
 
-bool compareFreq(const Word *w1, const Word *w2) {
-	return w1->freq > w2->freq;
+bool compareFreq(const Word &w1, const Word &w2) {
+	return w1.freq > w2.freq;
 }
 
-bool compareKJVFreq(const map<SWBuf, int>::const_iterator &i1, const map<SWBuf, int>::const_iterator &i2) {
-	return i1->second > i2->second;
+bool compareKJVFreq(const map<SWBuf, PhraseCount>::const_iterator &i1, const map<SWBuf, PhraseCount>::const_iterator &i2) {
+	return i1->second.count > i2->second.count;
 }
 
-SWBuf prettyKJVFreq(map<SWBuf, int> &in) {
+SWBuf prettyKJVFreq(map<SWBuf, PhraseCount> in) {
 	SWBuf retVal;
-	vector<map<SWBuf, int>::const_iterator> sorted;
-	for (map<SWBuf, int>::const_iterator it = in.begin(); it != in.end(); it++) {
+	vector<map<SWBuf, PhraseCount>::const_iterator> sorted;
+	for (map<SWBuf, PhraseCount>::const_iterator it = in.begin(); it != in.end(); it++) {
 		// combine cap words with lowercase, if exists
-		if (toupper(it->first[0]) == it->first[0] && it->first != "God" && it->first != "Lord") {
-			SWBuf key = it->first;
-			key[0] = tolower(key[0]);
-			if (key != it->first) {
-				map<SWBuf, int>::iterator i = in.find(key);
+		SWBuf k = it->first;
+		if (k.size() && toupper(k[0]) == k[0] && k != "God" && k != "Lord") {
+			k[0] = tolower(k[0]);
+			if (k != it->first) {
+				map<SWBuf, PhraseCount>::iterator i = in.find(k);
 				if (i != in.end()) {
-					i->second += it->second;
-					// don't include us in the list cuz we added out freq to another
+					i->second.count += it->second.count;
+					// don't include us in the list cuz we added our freq to another
 					continue;
 				}
 			}
@@ -61,9 +70,17 @@ SWBuf prettyKJVFreq(map<SWBuf, int> &in) {
 		sorted.push_back(it);
 	}
 	sort(sorted.begin(), sorted.end(), compareKJVFreq);
-	for (vector<map<SWBuf, int>::const_iterator>::const_iterator it = sorted.begin(); it != sorted.end(); it++) {
+	for (vector<map<SWBuf, PhraseCount>::const_iterator>::const_iterator it = sorted.begin(); it != sorted.end(); it++) {
 		if (retVal.size()) retVal += "; ";
-		retVal.appendFormatted("%s (%d)", (*it)->first.c_str(), (*it)->second);
+		// prepend 'with other strongs' if present
+		if ((*it)->second.with.size()) {
+			retVal += "[+";
+			for (int i = 0; i < (*it)->second.with.size(); i++) {
+				retVal.appendFormatted(" %s", (*it)->second.with[i].c_str());
+			}
+			retVal += " ] ";
+		}
+		retVal.appendFormatted("%s (%d)", (*it)->first.c_str(), (*it)->second.count);
 	}
 	return retVal;
 }
@@ -88,49 +105,67 @@ SWBuf escapedUTF8(SWBuf inText) {
 }
 
 
-void outputCSV(vector<Word *> &wordList) {
-	for (vector<Word *>::iterator it = wordList.begin(); it != wordList.end(); it++) {
-		Word *w = (*it);
+void outputCSV(vector<Word> &wordList) {
+	for (vector<Word>::iterator it = wordList.begin(); it != wordList.end(); it++) {
+		Word &w = (*it);
 //		cout << w->freq << "|" << escapedUTF8(w->utf8).c_str() << "|" << w->strong << "|" << prettyKJVFreq(w->kjvFreq).c_str() << "\n";
-		cout << w->freq << "|" << w->utf8.c_str() << "|" << w->strong << "|" << prettyKJVFreq(w->kjvFreq).c_str() << "|" << w->kjvTrans << "\n";
+		cout << w.freq << "|" << w.utf8.c_str() << "|" << w.strong << "|" << prettyKJVFreq(w.kjvFreq).c_str() << "|" << w.def << "\n";
 	}
 	std::cout << std::endl;
 }
 
 
-void outputFlash(vector<Word *> &wordList, int maxPerLesson) {
+/**
+ * output our flashcard .flash file format
+ *
+ * wordList - duh
+ * outputDir - directory path where to write files, e.g. "./hebFreq"
+ * kjvFreq - if true, process KJV translation frequencies and use these as
+ *		the word answers; otherwise, use short strongs defs.
+ * maxPerLesson - maximum number of words per lesson
+ *
+ */
+void outputFlash(const vector<Word> &wordList, const char *outputDir = ".", bool kjvFreq = true, int maxPerLesson = 25) {
 	ThMLPlain strip;
 	ofstream ofile;
-	int wordCount = 0;
+	int wordCount    = 0;
 	int lessonNumber = 0;
-	int startFreq = 0;
-	int lastFreq = 0;
+	int startFreq    = 0;
+	int lastFreq     = 0;
 
-	vector<Word *>::iterator it = wordList.begin();
+	vector<Word>::const_iterator it = wordList.begin();
 	while (it != wordList.end()) {
-		Word *w = (*it);
+		const Word &w = (*it);
 		if (!wordCount) {
-			SWBuf fname = "lesson";
+			SWBuf fname = outputDir;
+			fname += "/lesson";
 			fname.appendFormatted("%d", lessonNumber);
 			fname += ".flash";
 			ofile.open(fname);
-			startFreq = w->freq;
+			startFreq = w.freq;
 		}
 
-		// use if you want answers as KJV phrases
-		SWBuf answers = prettyKJVFreq(w->kjvFreq);
-		if (answers.size() > 200) answers.size(200);
-
-		// use if you would rather have short strongs
-//		SWBuf answers = w->kjvTrans;
-//		strip.processText(answers);	// remove html tags
-//		answers.replaceBytes("\n\r", ' ');	// remove newlines
+		SWBuf word = w.utf8;
+		word.trim();
+		SWBuf answers = "";
+		answers.trim();
+		// if we want answers as KJV phrases
+		if (kjvFreq) {
+			answers = prettyKJVFreq(w.kjvFreq);
+			if (answers.size() > 200) answers.size(200);
+		}
+		// if we would rather have short strongs
+		else {
+			SWBuf answers = w.def;
+			strip.processText(answers);	// remove html tags
+			answers.replaceBytes("\n\r", ' ');	// remove newlines
+		}
 
 		// be sure we have both a word and an answer
-		if (w->utf8.trim().size() && answers.trim().size()) {
-			ofile << "word" << wordCount << "=" << escapedUTF8(w->utf8) << "\n";
+		if (word.size() && answers.size()) {
+			ofile << "word" << wordCount << "=" << escapedUTF8(word) << "\n";
 			ofile << "answers" << wordCount << "=" << answers << "\n";
-			lastFreq = w->freq;
+			lastFreq = w.freq;
 			wordCount++;
 		}
 
@@ -149,53 +184,115 @@ void outputFlash(vector<Word *> &wordList, int maxPerLesson) {
 	} 
 }
 
-
-int main(int argc, char **argv)
-{
-	
+/**
+ * do the work
+ *
+ * range - the range of verses to process (e.g. "gen-mal")
+ * addAll - if we should add all words in our lexicon for the testaments
+ *		included in the range even if they don't exist in the text
+ *
+ */
+vector<Word> processWords(const char *range, bool addAll = true) {
 	SWMgr manager;
-	SWModule *bible = manager.getModule("KJV");
-	map<int, Word> wordList;
+	SWModule &bible = *manager.getModule("KJV");
+	map<SWBuf, Word> wordList;
 
-	SWConfig utf8("hwords.conf");
-	SWConfig defs("hdefs.conf");
-//	SWConfig utf8("gwords.conf");
-//	SWConfig defs("gdefs.conf");
+	SWConfig hutf8("hwords.conf");
+	SWConfig hdefs("hdefs.conf");
+	SWConfig gutf8("gwords.conf");
+	SWConfig gdefs("gdefs.conf");
 
-	for (bible->setKey("gen.1.1"); ((VerseKey*)bible->getKey())->Testament() == 1; (*bible)++) {
-//	for (bible->setKey("mat.1.1"); !bible->Error(); (*bible)++) {
-		bible->RenderText();		// force an entry lookup to resolve key to something in the index
+	VerseKey parser;
+	ListKey r = parser.ParseVerseList(range, 0, true);
+	r.Persist(true);
+	bible.setKey(r);
+	for (bible = TOP; !bible.Error(); bible++) {
+		bible.RenderText();		// force an entry lookup to resolve key to something in the index
 
-		AttributeList &words = bible->getEntryAttributes()["Word"];
+		AttributeList &words = bible.getEntryAttributes()["Word"];
 		for (AttributeList::iterator word = words.begin(); word != words.end(); word++) {
-			SWBuf strong = word->second["Lemma"];
-			SWBuf text = word->second["Text"];
-			text.trim();
-			// trim punctuation from end
-			while (text.size() && (strchr(".;,?-!\"()[]{}':/\t\r\n ", text[text.size()-1]))) text.setSize(text.size()-1);
-			if (!text.size()) text = "[Untranslated]";
-			strong << 1;
-			wordList[atoi(strong.c_str())].freq++;
-			wordList[atoi(strong.c_str())].kjvFreq[text]++;
-//			cout << strong << "\n";
+			SWBuf partCount = word->second["PartCount"];
+			int parts = atoi(partCount.c_str());
+			if (parts < 1) parts = 1;
+
+			// build a list of all lemmas for use later in 'with'
+			// i.e. 'translated xxx with Gnnnn1, Gnnnn2'
+			list<SWBuf> lemmas;
+			for (int i = 1; i <= parts; i++) {
+				SWBuf lemKey = "Lemma";
+				if (parts > 1) lemKey.appendFormatted(".%d", i);
+				lemmas.push_back(word->second[lemKey]);
+			}
+
+			for (int i = 1; i <= parts; i++) {
+				SWBuf lemKey = "Lemma";
+				if (parts > 1) lemKey.appendFormatted(".%d", i);
+				SWBuf strong = word->second[lemKey];
+				SWBuf text = word->second["Text"];
+				if ((parts > 2) && (strong == "G3588")) {
+					text = "[article]";
+				}
+				else {
+					text.trim();
+					// trim punctuation from end
+					while (text.size() && (strchr(".;,?-!\"()[]{}':/\t\r\n ", text[text.size()-1]))) text.setSize(text.size()-1);
+					if (!text.size()) text = "[Untranslated]";
+				}
+				wordList[strong].kjvFreq[text].count++;
+				if (parts > 1) {
+					list<SWBuf> withoutMe = lemmas;
+					withoutMe.remove(strong);
+					wordList[strong].kjvFreq[text].with = vector<SWBuf>(withoutMe.begin(), withoutMe.end());
+				}
+				wordList[strong].freq++;
+			}
 		}
 	}
-	// first use utf8 list to iterate and add utf8 entries.\
-	// this assures we have an entry for every word, even it it is not
-	// present in the module
-	for (ConfigEntMap::iterator it = utf8["words"].begin(); it != utf8["words"].end(); it++) {
-		wordList[atoi(it->first)].utf8 = it->second;
+	
+	if (addAll) {
+		// first use utf8 list to iterate and add utf8 entries.\
+		// this assures we have an entry for every word, even if it is not
+		// present in the module
+		r = TOP;
+		if (VerseKey(r).Testament() == 1) {
+			for (ConfigEntMap::iterator it = hutf8["words"].begin(); it != hutf8["words"].end(); it++) {
+				wordList[(SWBuf)"H"+it->first].utf8 = it->second;
+			}
+		}
+		r = BOTTOM;
+		if (VerseKey(r).Testament() == 2) {
+			for (ConfigEntMap::iterator it = gutf8["words"].begin(); it != gutf8["words"].end(); it++) {
+				wordList[(SWBuf)"G"+it->first].utf8 = it->second;
+			}
+		}
 	}
-	vector<Word *> sorted;
-	for (map<int, Word>::iterator it = wordList.begin(); it != wordList.end(); it++) {
-		it->second.strong = it->first;
-		it->second.kjvTrans = defs["defs"][itoa(it->first).c_str()];
-		sorted.push_back(&it->second);
+
+	vector<Word> sorted;
+	for (map<SWBuf, Word>::iterator it = wordList.begin(); it != wordList.end(); it++) {
+		// pull strongs key from map and populate Word
+		SWBuf s = it->first;
+		it->second.strong = s;
+		// populate lex defs
+		it->second.def = (s[0] == 'G') ?
+			gdefs["defs"][(s << 1).c_str()] :
+			hdefs["defs"][(s << 1).c_str()];
+		// put only word in sorted container
+		sorted.push_back(it->second);
 	}
-	
 	sort(sorted.begin(), sorted.end(), compareFreq);
-//	outputCSV(sorted);
-	outputFlash(sorted, 25);
+
+	return sorted;
+}
+
+
+int main(int argc, char **argv)
+{
+	outputFlash(processWords("gen-mal"), "hebFreqKJV"  , true);
+	outputFlash(processWords("gen-mal"), "hebFreq"     , false);
+	outputFlash(processWords("mat-rev"), "greekFreqKJV", true);
+	outputFlash(processWords("mat-rev"), "greekFreq"   , false);
+//	outputCSV(processWords("mat-rev"));
+
 	return 0;
 }
author	Troy A. Griffitts <scribe@crosswire.org>	2007-09-02 18:27:12 +0000
committer	Troy A. Griffitts <scribe@crosswire.org>	2007-09-02 18:27:12 +0000
commit	b11ba6b03dfe8ecdcaa8a71747df6da1fcf21112 (patch)
tree	52c10ba77e33db35cf7beb9bdc1e9d8d2993e268 /flashtools
parent	50905796dd888f347111ed05da4cce83efe4d315 (diff)
download	sword-tools-b11ba6b03dfe8ecdcaa8a71747df6da1fcf21112.tar.gz