diff options
author | Troy A. Griffitts <scribe@crosswire.org> | 2007-05-22 18:24:38 +0000 |
---|---|---|
committer | Troy A. Griffitts <scribe@crosswire.org> | 2007-05-22 18:24:38 +0000 |
commit | cc0ac51d3a7e4dd8ea61a0279ec31d429436da38 (patch) | |
tree | 9de56099ce641eec45e535c28225453869bebd32 /flashtools/flash.cpp | |
parent | ae3edd5f218a2fa6be1d104965c221076f750f48 (diff) | |
download | sword-tools-cc0ac51d3a7e4dd8ea61a0279ec31d429436da38.tar.gz |
Added facility to collate words from sword module
git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@85 07627401-56e2-0310-80f4-f8cd0041bdcd
Diffstat (limited to 'flashtools/flash.cpp')
-rw-r--r-- | flashtools/flash.cpp | 131 |
1 files changed, 131 insertions, 0 deletions
diff --git a/flashtools/flash.cpp b/flashtools/flash.cpp new file mode 100644 index 0000000..49468bc --- /dev/null +++ b/flashtools/flash.cpp @@ -0,0 +1,131 @@ +#include <map> +#include <vector> +#include <iostream> +#include <sstream> + +#include <swmgr.h> +#include <swbuf.h> +#include <swmodule.h> +#include <utf8utf16.h> + +using namespace sword; +using namespace std; + +class Word { +public: +Word() + : utf8("") + , strong(0) + , freq(0) + , kjvTrans("") +{} +SWBuf utf8; +int strong; +int freq; +// from stongs lex +SWBuf kjvTrans; +// computed ourselves +map<SWBuf, int> kjvFreq; +}; + +string itoa(int v) { stringstream str; str << v; return str.str(); } + +bool compareFreq(const Word *w1, const Word *w2) { + return w1->freq > w2->freq; +} + +bool compareKJVFreq(const map<SWBuf, int>::const_iterator &i1, const map<SWBuf, int>::const_iterator &i2) { + return i1->second > i2->second; +} + +SWBuf prettyKJVFreq(map<SWBuf, int> &in) { + SWBuf retVal; + vector<map<SWBuf, int>::const_iterator> sorted; + for (map<SWBuf, int>::const_iterator it = in.begin(); it != in.end(); it++) { + // combine cap words with lowercase, if exists + if (toupper(it->first[0]) == it->first[0]) { + SWBuf key = it->first; + key[0] = tolower(key[0]); + if (key != it->first) { + map<SWBuf, int>::iterator i = in.find(key); + if (i != in.end()) { + i->second += it->second; + // don't include us in the list cuz we added out freq to another + continue; + } + } + } + sorted.push_back(it); + } + sort(sorted.begin(), sorted.end(), compareKJVFreq); + for (vector<map<SWBuf, int>::const_iterator>::const_iterator it = sorted.begin(); it != sorted.end(); it++) { + if (retVal.size()) retVal += "; "; + retVal.appendFormatted("%s (%d)", (*it)->first.c_str(), (*it)->second); + } + return retVal; +} + +SWBuf escapedUTF8(SWBuf inText) { + static UTF8UTF16 convert; + convert.processText(inText); + SWBuf retBuf; + for (unsigned short *i = (unsigned short *)inText.getRawData(); *i; i++) { + if (*i < 128) { + retBuf += (char)*i; + } + else { + retBuf.appendFormatted("\\u%.4x", *i); + // change hex alpha values to upper case + for (int i = retBuf.size()-1; i > retBuf.size() - 4; i--) { + retBuf[i] = toupper(retBuf[i]); + } + } + } + return retBuf; +} + + +int main(int argc, char **argv) +{ + + SWMgr manager; + SWModule *bible; + SWConfig utf8("gwords.conf"); + SWConfig defs("gdefs.conf"); + map<int, Word> wordList; + + bible = manager.getModule("KJV"); + + for (bible->setKey("matt.1.1"); !bible->Error(); (*bible)++) { + bible->RenderText(); // force an entry lookup to resolve key to something in the index + + AttributeList &words = bible->getEntryAttributes()["Word"]; + for (AttributeList::iterator word = words.begin(); word != words.end(); word++) { + SWBuf strong = word->second["Lemma"]; + SWBuf text = word->second["Text"]; + text.trim(); + if (!text.size()) text = "[Untranslated]"; + strong << 1; + wordList[atoi(strong.c_str())].freq++; + wordList[atoi(strong.c_str())].kjvFreq[text]++; +// cout << strong << "\n"; + } + } + vector<Word *> sorted; + for (map<int, Word>::iterator it = wordList.begin(); it != wordList.end(); it++) { + it->second.strong = it->first; + it->second.kjvTrans = defs["defs"][itoa(it->first).c_str()]; + it->second.utf8 = utf8["words"][itoa(it->first).c_str()]; + sorted.push_back(&it->second); + } + + sort(sorted.begin(), sorted.end(), compareFreq); + + for (vector<Word *>::iterator it = sorted.begin(); it != sorted.end(); it++) { + Word *w = (*it); +// cout << w->freq << "|" << escapedUTF8(w->utf8).c_str() << "|" << w->strong << "|" << prettyKJVFreq(w->kjvFreq).c_str() << "\n"; + cout << w->freq << "|" << w->utf8.c_str() << "|" << w->strong << "|" << prettyKJVFreq(w->kjvFreq).c_str() << "\n"; + } + std::cout << std::endl; + return 0; +} |