1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
|
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>
#include <swmgr.h>
#include <swbuf.h>
#include <swmodule.h>
#include <utf8utf16.h>
#include <versekey.h>
using namespace sword;
using namespace std;
/**
 * One lexicon entry, keyed elsewhere in this file by its Strong's number.
 * Collects how often the word occurs and which English renderings it
 * received.
 */
class Word {
public:
	// the word itself, as read from the "words" section of hwords.conf
	SWBuf utf8;

	// Strong's number of this entry
	int strong;

	// total number of occurrences counted in the module
	int freq;

	// from Strong's lexicon (read from the "defs" section of hdefs.conf)
	SWBuf kjvTrans;

	// computed ourselves: KJV rendering -> number of times it was used
	map<SWBuf, int> kjvFreq;

	// Start with empty strings and zeroed counters.
	Word() : utf8(""), strong(0), freq(0), kjvTrans("") {
	}
};
/**
 * Format an int as its decimal string representation.
 *
 * @param v value to format
 * @return the text produced by streaming v into a string stream
 */
string itoa(int v) {
	ostringstream formatter;
	formatter << v;
	return formatter.str();
}
/**
 * Sort predicate that orders words by descending frequency.
 *
 * @return true when w1 occurs strictly more often than w2
 */
bool compareFreq(const Word *w1, const Word *w2) {
	const int leftCount = w1->freq;
	const int rightCount = w2->freq;
	return rightCount < leftCount;
}
/**
 * Sort predicate over iterators into a rendering -> count map; orders the
 * entries by descending count.
 *
 * @return true when the entry at i1 has a strictly larger count than the
 *         entry at i2
 */
bool compareKJVFreq(const map<SWBuf, int>::const_iterator &i1, const map<SWBuf, int>::const_iterator &i2) {
	const int leftCount = i1->second;
	const int rightCount = i2->second;
	return rightCount < leftCount;
}
/**
 * Render a rendering -> count map as "text (count); text (count); ...",
 * ordered by descending count.
 *
 * A rendering whose first letter is upper case is folded into its
 * lower-case twin when that twin is also present in the map (e.g. "The"
 * is counted under "the"); otherwise it is listed as is.
 *
 * The folding is done on a private copy, so the caller's map is left
 * untouched and calling this function repeatedly yields the same result.
 *
 * @param in rendering -> count map to summarize (not modified)
 * @return the formatted list; empty when the map is empty
 */
SWBuf prettyKJVFreq(map<SWBuf, int> &in) {
	typedef map<SWBuf, int> FreqMap;

	FreqMap merged;
	for (FreqMap::const_iterator it = in.begin(); it != in.end(); ++it) {
		SWBuf key = it->first;
		if (key.size()) {
			const char first = key[0];
			// <cctype> functions require an unsigned char value; a plain
			// (possibly negative) char is undefined behavior for bytes >= 0x80
			const char lowered = (char)tolower((unsigned char)first);
			if (lowered != first) {
				SWBuf lowerKey = key;
				lowerKey[0] = lowered;
				// only fold when the lower-case spelling really occurs
				if (in.find(lowerKey) != in.end()) key = lowerKey;
			}
		}
		merged[key] += it->second;
	}

	vector<FreqMap::const_iterator> sorted;
	sorted.reserve(merged.size());
	for (FreqMap::const_iterator it = merged.begin(); it != merged.end(); ++it) {
		sorted.push_back(it);
	}
	// stable: renderings with equal counts keep the map's key order, so the
	// output is reproducible from run to run
	stable_sort(sorted.begin(), sorted.end(), compareKJVFreq);

	SWBuf retVal;
	for (vector<FreqMap::const_iterator>::const_iterator it = sorted.begin(); it != sorted.end(); ++it) {
		if (retVal.size()) retVal += "; ";
		retVal.appendFormatted("%s (%d)", (*it)->first.c_str(), (*it)->second);
	}
	return retVal;
}
/**
 * Convert UTF-8 text to an ASCII-only form: code units below 128 are
 * copied through, every other 16-bit unit is written as a backslash, the
 * letter 'u' and four upper-case hex digits.
 *
 * @param inText UTF-8 text; taken by value because it is converted in place
 * @return the escaped text
 */
SWBuf escapedUTF8(SWBuf inText) {
	static UTF8UTF16 convert;
	convert.processText(inText);

	SWBuf retBuf;
	// NOTE(review): relies on the filter leaving inText as 16-bit units that
	// end with a zero unit -- confirm against UTF8UTF16::processText
	const unsigned short *unit = reinterpret_cast<const unsigned short *>(inText.getRawData());
	for (; *unit; ++unit) {
		if (*unit < 128) {
			retBuf += (char)*unit;
		}
		else {
			// %X emits upper-case digits directly, so all four digits are
			// upper case without a separate fix-up pass
			retBuf.appendFormatted("\\u%.4X", (unsigned int)*unit);
		}
	}
	return retBuf;
}
int main(int argc, char **argv)
{
SWMgr manager;
SWModule *bible;
SWConfig utf8("hwords.conf");
SWConfig defs("hdefs.conf");
map<int, Word> wordList;
bible = manager.getModule("KJV");
for (bible->setKey("gen.1.1"); ((VerseKey*)bible->getKey())->Testament() == 1; (*bible)++) {
// for (bible->setKey("mat.1.1"); !bible->Error(); (*bible)++) {
bible->RenderText(); // force an entry lookup to resolve key to something in the index
AttributeList &words = bible->getEntryAttributes()["Word"];
for (AttributeList::iterator word = words.begin(); word != words.end(); word++) {
SWBuf strong = word->second["Lemma"];
SWBuf text = word->second["Text"];
text.trim();
// trim punctuation from end
while (text.size() && (strchr(".;,?-!\"()[]{}':/\t\r\n ", text[text.size()-1]))) text.setSize(text.size()-1);
if (!text.size()) text = "[Untranslated]";
strong << 1;
wordList[atoi(strong.c_str())].freq++;
wordList[atoi(strong.c_str())].kjvFreq[text]++;
// cout << strong << "\n";
}
}
// first use utf8 list to iterate and add utf8 entries.\
// this assures we have an entry for every word, even it it is not
// present in the module
for (ConfigEntMap::iterator it = utf8["words"].begin(); it != utf8["words"].end(); it++) {
wordList[atoi(it->first)].utf8 = it->second;
}
vector<Word *> sorted;
for (map<int, Word>::iterator it = wordList.begin(); it != wordList.end(); it++) {
it->second.strong = it->first;
it->second.kjvTrans = defs["defs"][itoa(it->first).c_str()];
sorted.push_back(&it->second);
}
sort(sorted.begin(), sorted.end(), compareFreq);
return 0;
}
void outputCSV(Vector<Word *>wordList) {
for (vector<Word *>::iterator it = wordList.begin(); it != wordList.end(); it++) {
Word *w = (*it);
// cout << w->freq << "|" << escapedUTF8(w->utf8).c_str() << "|" << w->strong << "|" << prettyKJVFreq(w->kjvFreq).c_str() << "\n";
cout << w->freq << "|" << w->utf8.c_str() << "|" << w->strong << "|" << prettyKJVFreq(w->kjvFreq).c_str() << "|" << w->kjvTrans << "\n";
}
std::cout << std::endl;
}
|