summaryrefslogblamecommitdiffstats
path: root/flashtools/flash.cpp
blob: 012496d10a37499e4d21a180c939f14509679640 (plain) (tree)





















                                                                               



                   
                  




                      
                     
                      



                      



                                                                
       

                            
          
                     
                           




















                                                                                          

  



                                            

            

                          
                            
                         
                         
          

                                                         
                   

                                                    
                     

                                                      
                 

                                                   
                  


                                                        

  
 

                                                                    
 

                                                  

 


                                                                                                 

 


                                                                              
                     

                                                                                
                                                              


                                                                                                                        
                                             
                                                                    
                                                    
                                                                
                                                                                                        






                                                           
                                                                                                                
                                                  
                                                          
                                               
                                       

                                                                                            


                                        
                                                                                              



                      


                                                                                         



















                                                                                   
                                                                                    


                                                                                        
                                                                                                                                               
                                                                                                                                              




                               










                                                                                                                         

                        
                             
                             

                             
 
                                                           
                                      
                                      
                                 

                                                


                                                                  
                                           

                 










                                                                    
                                        


                                                                                  

                                                            

                                                                                         
                                                                                  
                                          

















                                                                                                                          
 





                                                                          
                                                                            


                                                                  
                      
                                  
 



                                      






                                                              
 





                                                                                                                 
 
                                                                          
                                                                                                 


























                                                                                                                                                     

                                                
                                                
                                                                                         

                                                                       
                                                                                                   
                                 
                                                              

                                                        

                 
















                                                                                                                    
         











                                                                                            
         
                                                        












                                                                    

                 
 
/******************************************************************************
 *  flash.cpp - Automation of flashcards generation 
 *
 * Copyright 2007 CrossWire Bible Society (http://www.crosswire.org)
 *	CrossWire Bible Society
 *	P. O. Box 2528
 *	Tempe, AZ  85280-2528
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contributors:
 *	Lyndon Drake <lyndon at arotau dot com>
 *	Troy A. Griffitts <scribe at crosswire dot org>
 */

#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <list>
#include <map>
#include <sstream>
#include <vector>

#include <swmgr.h>
#include <swbuf.h>
#include <swmodule.h>
#include <utf8utf16.h>
#include <versekey.h>
#include <thmlplain.h>

using namespace sword;
using namespace std;

// used to hold a KJV translation phrase for a greek/hebrew word
// and any greek/hebrew words combined to make this KJV phrase
// e.g. hO QEOS = QEOS: [+ hO ]: God
class Phrase {
public:
	Phrase()
		: phrase("")
	{}
	SWBuf phrase;
	vector<SWBuf> with;
	inline bool operator ==(const Phrase &other) const { return !compare(other); }
	inline bool operator !=(const Phrase &other) const { return compare(other); }
	inline bool operator > (const Phrase &other) const { return compare(other) > 0; }
	inline bool operator < (const Phrase &other) const { return compare(other) < 0; }
	inline bool operator <=(const Phrase &other) const { return compare(other) <= 0; }
	inline bool operator >=(const Phrase &other) const { return compare(other) >= 0; }

	int compare(const Phrase &right) const {
		int c = phrase.compare(right.phrase);
		if (c) return c;
		vector<SWBuf>::const_iterator lit = with.begin();
		vector<SWBuf>::const_iterator rit = right.with.begin();
		while (lit != with.end() && rit != right.with.end()) {
			c = lit->compare(*rit);
			if (c) return c;
			lit++; rit++;
		}
		if (lit !=       with.end()) return  1;
		if (rit != right.with.end()) return -1;
		return 0;
	}
};

// KJV phrases mapped to their occurrence frequency
typedef map<Phrase, int> KJVPhrases;

// The primary result record: everything collected for one strongs entry
class Word {
public:
	// lexical form of this word in utf8 greek/hebrew
	SWBuf utf8;

	// strongs number for this word (e.g. G3588)
	SWBuf strong;

	// frequency of occurrence in the iterated text
	int freq;

	// definition pulled from short strongs def
	SWBuf def;

	// kjv translation phrases and their frequencies
	KJVPhrases kjvFreq;

	// a fresh Word has empty strings, no phrases and a count of zero
	Word() : utf8(""), strong(""), freq(0), def("") {}
};


// Decimal text form of v, produced by stream insertion.
std::string itoa(int v) {
	std::ostringstream out;
	out << v;
	return out.str();
}


// Sort predicate: orders words from most frequent to least frequent.
bool compareFreq(const Word &w1, const Word &w2) {
	return w2.freq < w1.freq;
}


// Sort predicate over map iterators: the entry with the larger count
// (the mapped value) comes first.
bool compareKJVFreq(const KJVPhrases::const_iterator &i1, const KJVPhrases::const_iterator &i2) {
	return i2->second < i1->second;
}


// sort and pretty up all the KJV phrases for a word into a nice output buffer
//
// in - phrase -> count map for one word.  Taken by value on purpose: the
//	counts of capitalised phrases are folded into their lowercase twins
//	inside this private copy.
//
// returns the entries joined with "; ", highest count first, each one
//	formatted as "[+ W1 W2 ] phrase (count)" (the bracketed part only
//	when the phrase has a 'with' list)
SWBuf prettyKJVFreq(KJVPhrases in) {
	SWBuf retVal;
	vector<KJVPhrases::const_iterator> sorted;
	for (KJVPhrases::const_iterator it = in.begin(); it != in.end(); ++it) {
		// combine cap words with lowercase, if exists
		Phrase k = it->first;
		if (k.phrase.size() && k.phrase != "God" && k.phrase != "Lord") {
			// the <cctype> functions are undefined for negative values, so
			// go through unsigned char in case the text holds non-ASCII bytes
			const unsigned char first = static_cast<unsigned char>(k.phrase[0]);
			if (toupper(first) == first) {
				k.phrase[0] = static_cast<char>(tolower(first));
				// k only differs when the first byte really was an uppercase letter
				if (k != it->first) {
					KJVPhrases::iterator twin = in.find(k);
					if (twin != in.end()) {
						twin->second += it->second;
						// don't include us in the list cuz we added our freq to another
						continue;
					}
				}
			}
		}
		sorted.push_back(it);
	}
	sort(sorted.begin(), sorted.end(), compareKJVFreq);
	for (vector<KJVPhrases::const_iterator>::const_iterator it = sorted.begin(); it != sorted.end(); ++it) {
		const Phrase &p = (*it)->first;
		if (retVal.size()) retVal += "; ";
		// prepend 'with other strongs' if present
		if (!p.with.empty()) {
			retVal += "[+";
			for (vector<SWBuf>::size_type i = 0; i < p.with.size(); ++i) {
				retVal.appendFormatted(" %s", p.with[i].c_str());
			}
			retVal += " ] ";
		}
		retVal.appendFormatted("%s (%d)", p.phrase.c_str(), (*it)->second);
	}
	return retVal;
}


// take utf8 text and spit out equiv. text substituting escaped codes for multibyte chars
// java .properties files wants this format (flashcard .flash lessons use this format)
//
// inText - the text to escape; taken by value because the converter rewrites
//	the buffer in place
//
// returns the text with every 16-bit unit below 128 copied through as a
//	single char and every other unit written as \uXXXX (four uppercase
//	hex digits)
SWBuf escapedUTF8(SWBuf inText) {
	static UTF8UTF16 convert;
	// NOTE(review): relies on the filter leaving 16-bit units followed by a
	// 16-bit zero terminator in the buffer -- confirm against UTF8UTF16
	convert.processText(inText);
	SWBuf retBuf;
	for (const unsigned short *unit = (const unsigned short *)inText.getRawData(); *unit; ++unit) {
		if (*unit < 128) {
			retBuf += (char)*unit;
		}
		else {
			// %X upper-cases all four digits; the former fix-up loop after
			// "%.4x" skipped the first digit (e.g. U+FB1D came out as \ufB1D)
			retBuf.appendFormatted("\\u%.4X", (unsigned int)*unit);
		}
	}
	return retBuf;
}


// output a simple CSV ('|' separated really) format for importing into OOo or excel
void outputCSV(vector<Word> &wordList) {
	for (vector<Word>::iterator it = wordList.begin(); it != wordList.end(); it++) {
		Word &w = (*it);
//		cout << w->freq << "|" << escapedUTF8(w->utf8).c_str() << "|" << w->strong << "|" << prettyKJVFreq(w->kjvFreq).c_str() << "\n";
		cout << w.freq << "|" << w.utf8.c_str() << "|" << w.strong << "|" << prettyKJVFreq(w.kjvFreq).c_str() << "|" << w.def << "\n";
	}
	std::cout << std::endl;
}


/**
 * output our flashcard .flash file format
 *
 * wordList - duh
 * outputDir - directory path where to write files, e.g. "./hebFreq"
 * kjvFreq - if true, process KJV translation frequencies and use these as
 *		the word answers; otherwise, use short strongs defs.
 * maxPerLesson - maximum number of words per lesson
 *
 */
void outputFlash(const vector<Word> &wordList, const char *outputDir = ".", bool kjvFreq = true, int maxPerLesson = 25) {
	ThMLPlain strip;
	ofstream ofile;
	int wordCount    = 0;
	int lessonNumber = 0;
	int startFreq    = 0;
	int lastFreq     = 0;

	vector<Word>::const_iterator it = wordList.begin();
	while (it != wordList.end()) {
		const Word &w = (*it);
		if (!wordCount) {
			SWBuf fname = outputDir;
			fname += "/lesson";
			fname.appendFormatted("%d", lessonNumber);
			fname += ".flash";
			ofile.open(fname);
			startFreq = w.freq;
		}

		SWBuf word = w.utf8;
		word.trim();
		SWBuf answers = "";
		answers.trim();
		// if we want answers as KJV phrases
		if (kjvFreq) {
			answers = prettyKJVFreq(w.kjvFreq);
			if (answers.size() > 200) answers.size(200);
		}
		// if we would rather have short strongs
		else {
			answers = w.def;
			strip.processText(answers);	// remove html tags
			answers.replaceBytes("\n\r", ' ');	// remove newlines
		}

		// be sure we have both a word and an answer
		if (word.size() && answers.size()) {
			ofile << "word" << wordCount << "=" << escapedUTF8(word) << "\n";
			ofile << "answers" << wordCount << "=" << answers << "\n";
			lastFreq = w.freq;
			wordCount++;
		}

		it++;

		if (it == wordList.end() || wordCount >= maxPerLesson) {
			// close lesson
			SWBuf lessonTitle = "";
			lessonTitle.appendFormatted("lessonTitle=%.3d Freqs. %d-%d\n", lessonNumber, startFreq, lastFreq);
			ofile << lessonTitle;
			ofile << "wordCount=" << wordCount << "\n";
			ofile.close();
			wordCount = 0;
			lessonNumber++;
		}
	} 
}


/**
 * do the work
 *
 * Walks the KJV module over the given verse range, reads the "Word" entry
 * attributes of every entry and counts, per lemma value, how often it
 * occurs and which KJV phrases it is rendered with.  Definitions (and,
 * with addAll, the utf8 lexical forms) are then filled in from the
 * hwords/hdefs/gwords/gdefs .conf files.
 *
 * If the KJV module cannot be found a message is written to stderr and the
 * process exits with status 1.
 *
 * range - the range of verses to process (e.g. "gen-mal")
 * addAll - if we should add all words in our lexicon for the testaments
 *		included in the range even if they don't exist in the text
 *		(useful for generating complete OT or NT strongs word lists).
 *		Note that this branch is also the only place Word::utf8 gets
 *		filled in.
 *
 * returns one Word per distinct lemma value, sorted by descending freq
 *
 */
vector<Word> processWords(const char *range, bool addAll = true) {
	SWMgr manager;
	// keyed by the lemma value as found in the module (e.g. "G3588")
	map<SWBuf, Word> wordList;

	// lexical forms ("words" section) and definitions ("defs" section) for
	// hebrew (h*) and greek (g*); the file names are handed to SWConfig as is
	SWConfig hutf8("hwords.conf");
	SWConfig hdefs("hdefs.conf");
	SWConfig gutf8("gwords.conf");
	SWConfig gdefs("gdefs.conf");
	
	SWModule *tmpBible = manager.getModule("KJV");
	if (!tmpBible) {
		cerr << "Unable to locate KJV module" << endl;
		exit(1);
	}
	SWModule &bible = *tmpBible;

	VerseKey parser;
	ListKey r = parser.ParseVerseList(range, 0, true);
	// NOTE(review): presumably makes the module work on r itself instead of
	// a private copy -- confirm against the SWKey documentation
	r.Persist(true);
	bible.setKey(r);
	// visit every entry until the module reports an error
	for (bible = TOP; !bible.Error(); bible++) {
		bible.RenderText();		// force an entry lookup to resolve key to something in the index

		AttributeList &words = bible.getEntryAttributes()["Word"];
		for (AttributeList::iterator word = words.begin(); word != words.end(); word++) {
			// a missing or non-numeric PartCount makes atoi return 0, which
			// is treated as a single part
			SWBuf partCount = word->second["PartCount"];
			int parts = atoi(partCount.c_str());
			if (parts < 1) parts = 1;

			// build a list of all lemmas for use later in 'with'
			// i.e. 'translated xxx with Gnnnn1, Gnnnn2'
			// (attribute name is "Lemma" for one part, "Lemma.1", "Lemma.2", ...
			// for several)
			list<SWBuf> lemmas;
			for (int i = 1; i <= parts; i++) {
				SWBuf lemKey = "Lemma";
				if (parts > 1) lemKey.appendFormatted(".%d", i);
				lemmas.push_back(word->second[lemKey]);
			}

			// second pass: credit the shared "Text" to each part's lemma
			for (int i = 1; i <= parts; i++) {
				SWBuf lemKey = "Lemma";
				if (parts > 1) lemKey.appendFormatted(".%d", i);
				SWBuf strong = word->second[lemKey];
				SWBuf text = word->second["Text"];
				// G3588 inside a group of three or more parts is recorded as
				// "[article]" rather than with the group's text
				if ((parts > 2) && (strong == "G3588")) {
					text = "[article]";
				}
				else {
					text.trim();
					// trim punctuation from end
					while (text.size() && (strchr(".;,?-!\"()[]{}':/\t\r\n ", text[text.size()-1]))) text.setSize(text.size()-1);
					if (!text.size()) text = "[Untranslated]";
				}
				Phrase p;
				p.phrase = text;
				if (parts > 1) {
					// lets build our 'with' list excluding ourselves
					// (list::remove drops every element equal to strong)
					list<SWBuf> withoutMe = lemmas;
					withoutMe.remove(strong);
					p.with = vector<SWBuf>(withoutMe.begin(), withoutMe.end());
				}
				// operator[] creates the Word / the phrase count on first sight
				wordList[strong].kjvFreq[p]++;
				wordList[strong].freq++;
			}
		}
	}
	
	if (addAll) {
		// first use utf8 list to iterate and add utf8 entries.
		// this assures we have an entry for every word, even if it is not
		// present in the module
		// (entries created here keep the freq of 0 set by Word's constructor)
		// NOTE(review): r = TOP / r = BOTTOM presumably move the list key to
		// its first / last element, so the two checks ask whether the range
		// starts in testament 1 and ends in testament 2 -- confirm
		r = TOP;
		if (VerseKey(r).Testament() == 1) {
			for (ConfigEntMap::iterator it = hutf8["words"].begin(); it != hutf8["words"].end(); it++) {
				wordList[(SWBuf)"H"+it->first].utf8 = it->second;
			}
		}
		r = BOTTOM;
		if (VerseKey(r).Testament() == 2) {
			for (ConfigEntMap::iterator it = gutf8["words"].begin(); it != gutf8["words"].end(); it++) {
				wordList[(SWBuf)"G"+it->first].utf8 = it->second;
			}
		}
	}

	vector<Word> sorted;
	for (map<SWBuf, Word>::iterator it = wordList.begin(); it != wordList.end(); it++) {
		// pull strongs key from map and populate Word
		SWBuf s = it->first;
		it->second.strong = s;
		// populate lex defs
		// keys starting with 'G' are looked up in gdefs, everything else in
		// hdefs.  NOTE(review): (s << 1) presumably shifts the leading letter
		// off the local copy so the bare number is the lookup key -- confirm
		it->second.def = (s[0] == 'G') ?
			gdefs["defs"][(s << 1).c_str()] :
			hdefs["defs"][(s << 1).c_str()];
		// put only word in sorted container
		sorted.push_back(it->second);
	}
	// most frequent first (see compareFreq)
	sort(sorted.begin(), sorted.end(), compareFreq);

	return sorted;
}


int main(int argc, char **argv)
{
	outputFlash(processWords("gen-mal"), "hebFreqKJV"  , true);
	outputFlash(processWords("gen-mal"), "hebFreq"     , false);
	outputFlash(processWords("mat-rev"), "greekFreqKJV", true);
	outputFlash(processWords("mat-rev"), "greekFreq"   , false);
//	outputCSV(processWords("mat-rev"));

	return 0;
}