diff options
author | Troy A. Griffitts <scribe@crosswire.org> | 2007-09-07 22:44:35 +0000 |
---|---|---|
committer | Troy A. Griffitts <scribe@crosswire.org> | 2007-09-07 22:44:35 +0000 |
commit | 46847b3cb05501d3d891f356dd50c5fff36b2843 (patch) | |
tree | 0e21d38d48f95221390770651e3ef97c624b858c /textsstats/stats.cpp | |
parent | c249f2401aec8638a0ae1487cc1e91ea196c63d0 (diff) | |
download | sword-tools-46847b3cb05501d3d891f356dd50c5fff36b2843.tar.gz |
Added some basic greek text statistical analysis
git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@98 07627401-56e2-0310-80f4-f8cd0041bdcd
Diffstat (limited to 'textsstats/stats.cpp')
-rw-r--r-- | textsstats/stats.cpp | 358 |
1 files changed, 358 insertions, 0 deletions
diff --git a/textsstats/stats.cpp b/textsstats/stats.cpp new file mode 100644 index 0000000..c2b7576 --- /dev/null +++ b/textsstats/stats.cpp @@ -0,0 +1,358 @@ +/****************************************************************************** + * flash.cpp - Automation of flashcards generation + * + * Copyright 2007 CrossWire Bible Society (http://www.crosswire.org) + * CrossWire Bible Society + * P. O. Box 2528 + * Tempe, AZ 85280-2528 + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contributors: + * Lyndon Drake <lyndon at arotau dot com> + * Troy A. Griffitts <scribe at crosswire dot org> + */ + +#include <map> +#include <vector> +#include <iostream> +#include <sstream> +#include <fstream> + +#include <swmgr.h> +#include <swbuf.h> +#include <swmodule.h> +#include <utf8utf16.h> +#include <utf16utf8.h> +#include <versekey.h> +#include <thmlplain.h> + +using namespace sword; +using namespace std; + +namespace { + const int GREEK_START = 0x370; + const int GREEK_END = 0x3FF; +}; + +// used to hold a KJV translation phrase for a greek/hebrew word +// and any greek/hebrew words combined to make this KJV phrase +// e.g. hO QEOS = QEOS: [+ hO ]: God +class Phrase { +public: + Phrase() + : phrase("") + {} + SWBuf phrase; + vector<SWBuf> with; + inline bool operator ==(const Phrase &other) const { return !compare(other); } + inline bool operator !=(const Phrase &other) const { return compare(other); } + inline bool operator > (const Phrase &other) const { return compare(other) > 0; } + inline bool operator < (const Phrase &other) const { return compare(other) < 0; } + inline bool operator <=(const Phrase &other) const { return compare(other) <= 0; } + inline bool operator >=(const Phrase &other) const { return compare(other) >= 0; } + + int compare(const Phrase &right) const { + int c = phrase.compare(right.phrase); + if (c) return c; + vector<SWBuf>::const_iterator lit = with.begin(); + vector<SWBuf>::const_iterator rit = right.with.begin(); + while (lit != with.end() && rit != right.with.end()) { + c = lit->compare(*rit); + if (c) return c; + lit++; rit++; + } + if (lit != with.end()) return 1; + if (rit != right.with.end()) return -1; + return 0; + } +}; + +// KJV phrases and their occurance frequency +typedef map<Phrase, int> KJVPhrases; + +// primary result class +class Word { +public: + Word() + : utf8("") + , strong("") + , freq(0) + , def("") + {} + + // lexical form of this word in utf8 greek/hebrew + SWBuf utf8; + vector<unsigned short> utf16; + + // strongs number for this word (e.g. G3588) + SWBuf strong; + + // frequency of occurance in the iterated text + int freq; + + // definition pulled from short strongs def + SWBuf def; + + // kjv translation phrases and their frequencies + KJVPhrases kjvFreq; +}; + + +string itoa(int v) { stringstream str; str << v; return str.str(); } + + +bool compareFreq(const Word &w1, const Word &w2) { + return w1.freq > w2.freq; +} + + +bool compareKJVFreq(const KJVPhrases::const_iterator &i1, const KJVPhrases::const_iterator &i2) { + return i1->second > i2->second; +} + + +// sort and pretty up all the KJV phrases for a word into a nice output buffer +SWBuf prettyKJVFreq(KJVPhrases in) { + SWBuf retVal; + vector<KJVPhrases::const_iterator> sorted; + for (KJVPhrases::const_iterator it = in.begin(); it != in.end(); it++) { + // combine cap words with lowercase, if exists + Phrase k = it->first; + if (k.phrase.size() && toupper(k.phrase[0]) == k.phrase[0] && k.phrase != "God" && k.phrase != "Lord") { + k.phrase[0] = tolower(k.phrase[0]); + if (k != it->first) { + KJVPhrases::iterator i = in.find(k); + if (i != in.end()) { + i->second += it->second; + // don't include us in the list cuz we added our freq to another + continue; + } + } + } + sorted.push_back(it); + } + sort(sorted.begin(), sorted.end(), compareKJVFreq); + for (vector<KJVPhrases::const_iterator>::const_iterator it = sorted.begin(); it != sorted.end(); it++) { + if (retVal.size()) retVal += "; "; + // prepend 'with other strongs' if present + if ((*it)->first.with.size()) { + retVal += "[+"; + for (int i = 0; i < (*it)->first.with.size(); i++) { + retVal.appendFormatted(" %s", (*it)->first.with[i].c_str()); + } + retVal += " ] "; + } + retVal.appendFormatted("%s (%d)", (*it)->first.phrase.c_str(), (*it)->second); + } + return retVal; +} + + +// take utf8 text and spit out equiv. text substituting escaped codes for multibyte chars +// java .properties files wants this format (flashcard .flash lessons use this format) +SWBuf escapedUTF8(SWBuf inText) { + static UTF8UTF16 convert; + convert.processText(inText); + SWBuf retBuf; + for (unsigned short *i = (unsigned short *)inText.getRawData(); *i; i++) { + if (*i < 128) { + retBuf += (char)*i; + } + else { + retBuf.appendFormatted("\\u%.4x", *i); + // change hex alpha values to upper case + for (int i = retBuf.size()-1; i > retBuf.size() - 4; i--) { + retBuf[i] = toupper(retBuf[i]); + } + } + } + return retBuf; +} + +SWBuf toUTF8(const vector<unsigned short> &utf16) { + static UTF16UTF8 convert; + SWBuf retVal; + retVal.size((utf16.size()+1)*2); + unsigned short *i = (unsigned short *)retVal.getRawData(); + int j; + for (j = 0; j < utf16.size(); j++) { + i[j] = utf16[j]; + } + i[j] = 0; + convert.processText(retVal); + return retVal; +} + +// output a simple CSV ('|' separated really) format for importing into OOo or excel +void outputCSV(const vector<Word> &seqList) { + for (vector<Word>::const_iterator it = seqList.begin(); it != seqList.end(); it++) { + const Word &w = (*it); +// cout << w->freq << "|" << escapedUTF8(w->utf8).c_str() << "|" << w->strong << "|" << prettyKJVFreq(w->kjvFreq).c_str() << "\n"; + cout << w.freq << "|" << toUTF8(w.utf16).c_str() << "|" << w.utf16.size() << "\n"; + } + std::cout << std::endl; +} + + +/** + * output our flashcard .flash file format + * + * seqList - duh + * outputDir - directory path where to write files, e.g. "./hebFreq" + * kjvFreq - if true, process KJV translation frequencies and use these as + * the word answers; otherwise, use short strongs defs. + * maxPerLesson - maximum number of words per lesson + * + */ +void outputFlash(const vector<Word> &seqList, const char *outputDir = ".", bool kjvFreq = true, int maxPerLesson = 25) { + ThMLPlain strip; + ofstream ofile; + int wordCount = 0; + int lessonNumber = 0; + int startFreq = 0; + int lastFreq = 0; + + vector<Word>::const_iterator it = seqList.begin(); + while (it != seqList.end()) { + const Word &w = (*it); + if (!wordCount) { + SWBuf fname = outputDir; + fname += "/lesson"; + fname.appendFormatted("%d", lessonNumber); + fname += ".flash"; + ofile.open(fname); + startFreq = w.freq; + } + + SWBuf word = w.utf8; + word.trim(); + SWBuf answers = ""; + answers.trim(); + // if we want answers as KJV phrases + if (kjvFreq) { + answers = prettyKJVFreq(w.kjvFreq); + if (answers.size() > 200) answers.size(200); + } + // if we would rather have short strongs + else { + answers = w.def; + strip.processText(answers); // remove html tags + answers.replaceBytes("\n\r", ' '); // remove newlines + } + + // be sure we have both a word and an answer + if (word.size() && answers.size()) { + ofile << "word" << wordCount << "=" << escapedUTF8(word) << "\n"; + ofile << "answers" << wordCount << "=" << answers << "\n"; + lastFreq = w.freq; + wordCount++; + } + + it++; + + if (it == seqList.end() || wordCount >= maxPerLesson) { + // close lesson + SWBuf lessonTitle = ""; + lessonTitle.appendFormatted("lessonTitle=%.3d Freqs. %d-%d\n", lessonNumber, startFreq, lastFreq); + ofile << lessonTitle; + ofile << "wordCount=" << wordCount << "\n"; + ofile.close(); + wordCount = 0; + lessonNumber++; + } + } +} + + +/** + * do the work + * + * range - the range of verses to process (e.g. "gen-mal") + * addAll - if we should add all words in our lexicon for the testaments + * included in the range even if they don't exist in the text + * (useful for generating complete OT or NT strongs word lists) + * + */ +vector<Word> processSequences(const char *range, int seqLength) { + SWMgr manager; + manager.setGlobalOption("Greek Accents", "Off"); + UTF8UTF16 toUTF16; + + map<vector<unsigned short>, Word> seqList; + + SWModule *tmpBible = manager.getModule("WHNU"); + if (!tmpBible) { + cerr << "Unable to locate WHNU module" << endl; + exit(1); + } + SWModule &bible = *tmpBible; + + VerseKey parser; + ListKey r = parser.ParseVerseList(range, 0, true); + r.Persist(true); + bible.setKey(r); + for (bible = TOP; !bible.Error(); bible++) { + bible.RenderText(); // force an entry lookup to resolve key to something in the index + SWBuf text = bible.StripText(); + toUTF16.processText(text); + for (unsigned short *i = (unsigned short *)text.getRawData(); *i; i++) { + vector<unsigned short> seq; + int j; + for (j = 0; ((j < seqLength) && (i[j] >= GREEK_START) && (i[j] <= GREEK_END)); j++) { + seq.push_back(i[j]); + } + if (seq.size() == seqLength) { + seqList[seq].freq++; + } + else { + if (!i[j]) { + // we don't need to process the rest of this text as all remaining seq lengths will fail + break; + } + } + } + } + + vector<Word> sorted; + for (map<vector<unsigned short>, Word>::iterator it = seqList.begin(); it != seqList.end(); it++) { + // pull utf16 key from map and populate Word + it->second.utf16 = it->first; + // put only word in sorted container + sorted.push_back(it->second); + } + sort(sorted.begin(), sorted.end(), compareFreq); + + return sorted; +} + + +int main(int argc, char **argv) +{ + int minLength = 1; + int maxLength = 3; + char *range = "mat-rev"; + + if (argc > 1) minLength = atoi(argv[1]); + if (argc > 2) maxLength = atoi(argv[2]); + if (argc > 3) range = argv[3]; + + vector<Word> results; + for (int i = minLength; i <= maxLength; i++) { + vector<Word> pass = processSequences(range, i); + results.insert(results.end(), pass.begin(), pass.end()); + } + + sort(results.begin(), results.end(), compareFreq); + outputCSV(results); + + return 0; +} + |