Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members  

rawtext.cpp

00001 /******************************************************************************
00002  *  rawtext.cpp - code for class 'RawText'- a module that reads raw text files:
00003  *                ot and nt using indexes ??.bks ??.cps ??.vss
00004  */
00005 
00006 
00007 #include <stdio.h>
00008 #include <fcntl.h>
00009 
00010 #ifndef __GNUC__
00011 #include <io.h>
00012 #else
00013 #include <unistd.h>
00014 #endif
00015 
00016 #include <string.h>
00017 #include <utilfuns.h>
00018 #include <rawverse.h>
00019 #include <rawtext.h>
00020 
00021 #include <map>
00022 #include <list>
00023 #include <algorithm>
00024 #include <regex.h>      // GNU
00025 
00026 #ifndef O_BINARY
00027 #define O_BINARY 0
00028 #endif
00029 
00030 /******************************************************************************
00031  * RawText Constructor - Initializes data for instance of RawText
00032  * ENT: ipath - handed to the RawVerse base; presumably the module's data directory (TODO confirm)
00033  *      iname - Internal name for module
00034  *      idesc - Name to display to user for module
00035  *      idisp - Display object to use for displaying
00036  *      enc, dir, mark, ilang - forwarded unchanged to the SWText constructor
00037  */
00038 RawText::RawText(const char *ipath, const char *iname, const char *idesc, SWDisplay *idisp, SWTextEncoding enc, SWTextDirection dir, SWTextMarkup mark, const char* ilang)
00039                 : SWText(iname, idesc, idisp, enc, dir, mark, ilang),
00040           RawVerse(ipath) {
00041         // build "<path>/" so the word-index file names can simply be appended
00042         string fname;
00043         fname = path;
00044         char ch = fname.empty() ? '\0' : fname[fname.length() - 1];     // an empty path has no last character to inspect
00045         if ((ch != '/') && (ch != '\\'))
00046                 fname += "/";
00047         // [0] = Old Testament word index, [1] = New Testament; stays null unless both .dat and .idx are readable
00048         for (int loop = 0; loop < 2; loop++) {
00049         fastSearch[loop] = 0;
00050                 string fastidxname =(fname + ((loop)?"ntwords.dat":"otwords.dat"));
00051                 if (!access(fastidxname.c_str(), 04)) {       // mode 04: test for read permission
00052                         fastidxname = (fname + ((loop)?"ntwords.idx":"otwords.idx"));
00053                         if (!access(fastidxname.c_str(), 04))
00054                                 fastSearch[loop] = new RawStr((fname + ((loop)?"ntwords":"otwords")).c_str());
00055                 }
00056         }
00057 }
00058 
00059 
00060 /******************************************************************************
00061  * RawText Destructor - Releases the per-testament fast-search indexes
00062  */
00063 
00064 RawText::~RawText()
00065 {
00066         // [0] is the OT index, [1] the NT index; either may be null, and
00067         // deleting a null pointer is a no-op, so no explicit guard is needed
00068         for (int testament = 0; testament < 2; testament++) {
00069                 delete fastSearch[testament];
00070         }
00071 }
00072 
00073 
00074 /******************************************************************************
00075  * RawText::getRawEntry - Loads the raw text of the entry at the module's
00076  *                              current key into the module's entry buffer
00077  *
00078  * RET: entrybuf, holding the text after rawFilter (and preptext if not Unicode)
00079  */
00080 
00081 char *RawText::getRawEntry() {
00082         long  offset = 0;
00083         unsigned short length = 0;
00084         VerseKey *vkey = 0;
00085 
00086         // use the module key directly when it is a VerseKey or descendant
00087 #ifndef _WIN32_WCE
00088         try {
00089 #endif
00090                 vkey = SWDYNAMIC_CAST(VerseKey, this->key);
00091 #ifndef _WIN32_WCE
00092         }
00093         catch ( ... ) { }
00094 #endif
00095         // otherwise build a temporary VerseKey from it (freed before returning)
00096         const bool ownsKey = !vkey;
00097         if (ownsKey) vkey = new VerseKey(this->key);
00098 
00099         findoffset(vkey->Testament(), vkey->Index(), &offset, &length);
00100         entrySize = length;        // support getEntrySize call
00101 
00102         unsigned long needed = (length + 2) * FILTERPAD;
00103         if (needed > entrybufallocsize) {
00104                 // grow the buffer; delete [] on a null pointer is harmless
00105                 delete [] entrybuf;
00106                 entrybuf = new char [ needed ];
00107                 entrybufallocsize = needed;
00108         }
00109         *entrybuf = 0;
00110 
00111         gettext(vkey->Testament(), offset, (length + 2), entrybuf);
00112 
00113         rawFilter(entrybuf, length, vkey);
00114 
00115         if (!isUnicode())
00116                 preptext(entrybuf);
00117 
00118         if (ownsKey) // release the temporary VerseKey
00119                 delete vkey;
00120 
00121         return entrybuf;
00122 }
00123 
00124 // createSearchFramework - writes <path>/{ot,nt}words.dat/.idx, the word indexes the fast multiword Search uses; RET: 0 ok, -1 if a file can't be opened
00125 signed char RawText::createSearchFramework() {
00126         SWKey *savekey = 0;
00127         SWKey *searchkey = 0;
00128         char *word = 0;
00129         char *wordBuf = 0;
00130 
00131 
00132         // dictionary holds words associated with a list
00133         // containing every module position that contains
00134         // the word.  [0] Old Testament; [1] NT
00135         map < string, list<long> > dictionary[2];
00136 
00137 
00138         // save key information so as not to disrupt original
00139         // module position
00140         if (!key->Persist()) {
00141                 savekey = CreateKey();
00142                 *savekey = *key;
00143         }
00144         else    savekey = key;
00145 
00146         searchkey = (key->Persist())?key->clone():0;
00147         if (searchkey) {
00148                 searchkey->Persist(1);
00149                 SetKey(*searchkey);
00150         }
00151 
00152         // position module at the beginning
00153         *this = TOP;
00154 
00155         VerseKey *lkey = (VerseKey *)key;
00156 
00157         // iterate thru each entry in module
00158         while (!Error()) {
00159                 long index = lkey->Index();
00160                 const char *text = StripText();         // render the entry once and reuse the result
00161                 wordBuf = strcpy((char *)calloc(sizeof(char), strlen(text) + 1), text);
00162 
00163                 // grab each word from the text
00164                 word = strtok(wordBuf, " !.,?;:()-=+/\\|{}[]\"<>");
00165                 while (word) {
00166 
00167                         // make word upper case (single pass, no strlen per character)
00168                         for (char *c = word; *c; c++)
00169                                 *c = SW_toupper(*c);
00170 
00171                         // lookup word in dictionary (or make entry in dictionary
00172                         // for this word) and add this module position (index) to
00173                         // the word's associated list of module positions
00174                         dictionary[lkey->Testament()-1][word].push_back(index);         // assumes Testament() is 1 or 2 - TODO confirm
00175                         word = strtok(NULL, " !.,?;:()-=+/\\|{}[]\"<>");
00176                 }
00177                 free(wordBuf);
00178                 (*this)++;
00179         }
00180 
00181         // reposition module back to where it was before we were called
00182         SetKey(*savekey);
00183 
00184         if (!savekey->Persist())
00185                 delete savekey;
00186 
00187         if (searchkey)
00188                 delete searchkey;
00189 
00190         
00191         // --------- Let's output an index from our dictionary -----------
00192         int datfd;
00193         int idxfd;
00194         map < string, list<long> >::iterator it;
00195         list<long>::iterator it2;
00196         unsigned long offset, entryoff;
00197         unsigned short size;
00198 
00199         string fname;
00200         fname = path;
00201         char ch = fname.empty() ? '\0' : fname[fname.length() - 1];     // an empty path has no last character to inspect
00202         if ((ch != '/') && (ch != '\\'))
00203                 fname += "/";
00204 
00205         // for old and new testament do... (O_TRUNC: a rebuilt index must not keep stale bytes of a longer old file)
00206         for (int loop = 0; loop < 2; loop++) {
00207                 if ((datfd = open((fname + ((loop)?"ntwords.dat":"otwords.dat")).c_str(), O_CREAT|O_WRONLY|O_TRUNC|O_BINARY, 00644 )) == -1)
00208                         return -1;
00209                 if ((idxfd = open((fname + ((loop)?"ntwords.idx":"otwords.idx")).c_str(), O_CREAT|O_WRONLY|O_TRUNC|O_BINARY, 00644 )) == -1) {
00210                         close(datfd);
00211                         return -1;
00212                 }
00213 
00214                 // iterate thru each word in the dictionary
00215                 for (it = dictionary[loop].begin(); it != dictionary[loop].end(); it++) {
00216                         printf("%s: ", it->first.c_str());
00217 
00218                         // get our current offset in our word.dat file and write this as the start
00219                         // of the next entry in our database
00220                         offset = lseek(datfd, 0, SEEK_CUR);
00221                         write(idxfd, &offset, 4);       // NOTE(review): first 4 bytes of an unsigned long, in host byte order
00222 
00223                         // write our word out to the word.dat file, delineating with a \n
00224                         write(datfd, it->first.c_str(), strlen(it->first.c_str()));
00225                         write(datfd, "\n", 1);
00226 
00227                         // force our mod position list for this word to be unique (remove
00228                         // duplicates that may exist if the word was found more than once
00229                         // in the verse); unique() only drops adjacent repeats
00230                         it->second.unique();
00231 
00232                         // iterate thru each mod position for this word and output it to
00233                         // our database
00234                         unsigned short count = 0;
00235                         for (it2 = it->second.begin(); it2 != it->second.end(); it2++) {
00236                                 entryoff= *it2;
00237                                 write(datfd, &entryoff, 4);
00238                                 count++;
00239                         }
00240                         
00241                         // now see what our new position is in our word.dat file and
00242                         // determine the size of this database entry
00243                         size = lseek(datfd, 0, SEEK_CUR) - offset;
00244 
00245                         // store the size of this database entry
00246                         write(idxfd, &size, 2);
00247                         printf("%d entries (size: %d)\n", count, size);
00248                 }
00249                 close(datfd);
00250                 close(idxfd);
00251         }
00252         return 0;
00253 }
00254 
00255 
00256 /******************************************************************************
00257  * RawText::Search      - Searches a module for a string
00258  *
00259  * ENT: istr            - string for which to search
00260  *      searchType      - type of search to perform
00261  *                              >=0 - regex
00262  *                              -1  - phrase
00263  *                              -2  - multiword (the only type answered from the word index here)
00264  *      flags           - options flags for search; the indexed path requires REG_ICASE
00265  *      scope           - optional key bounding the search; percent/percentUserData - progress callback and its data
00266  *      justCheckIfSupported    - if non-null, don't search, only store whether this override handles the request
00267  *
00268  * RET: listkey set to verses that contain istr
00269  */
00270 
00271 ListKey &RawText::Search(const char *istr, int searchType, int flags, SWKey *scope, bool *justCheckIfSupported, void (*percent)(char, void *), void *percentUserData)
00272 {
00273         listkey.ClearList();
00274 
00275         if ((fastSearch[0]) && (fastSearch[1])) {       // indexed path needs both testament indexes loaded
00276 
00277                 switch (searchType) {
00278                 case -2: {
00279 
00280                         if ((flags & REG_ICASE) != REG_ICASE)   // if haven't chosen to
00281                                                                                         // ignore case
00282                                 break; // can't handle fast case sensitive searches
00283 
00284                         // test to see if our scope for this search is bounded by a
00285                         // VerseKey
00286                         VerseKey *testKeyType = 0;
00287 #ifndef _WIN32_WCE
00288                         try {
00289 #endif
00290                                 testKeyType = SWDYNAMIC_CAST(VerseKey, ((scope)?scope:key));
00291 #ifndef _WIN32_WCE
00292                         }
00293                         catch ( ... ) {}
00294 #endif
00295                         // if we don't have a VerseKey * descendant we can't handle
00296                         // because of scope.
00297                         // In the future, add bool SWKey::isValid(const char *tryString);
00298                         if (!testKeyType)
00299                                 break;
00300 
00301 
00302                         // check if we just want to see if search is supported.
00303                         // If we've gotten this far, then it is supported.
00304                         if (justCheckIfSupported) {
00305                                 *justCheckIfSupported = true;
00306                                 return listkey;
00307                         }
00308 
00309                         SWKey saveKey = *testKeyType; // save current place
00310 
00311                         char error = 0;
00312                         char **words = 0;
00313                         char *wordBuf = 0;
00314                         int wordCount = 0;
00315                         long start;
00316                         unsigned short size;
00317                         char *idxbuf = 0;
00318                         char *datbuf = 0;
00319                         list <long> indexes;
00320                         list <long> indexes2;
00321                         VerseKey vk;
00322                         vk = TOP;
00323 
00324                         (*percent)(10, percentUserData);        // NOTE(review): percent is dereferenced unchecked; it must be non-null here
00325 
00326                         // toupper our copy of search string
00327                         stdstr(&wordBuf, istr);
00328                         for (unsigned int i = 0; i < strlen(wordBuf); i++)
00329                                 wordBuf[i] = SW_toupper(wordBuf[i]);
00330 
00331                         // get list of individual words (pointers into wordBuf; array grows 10 slots at a time)
00332                         words = (char **)calloc(sizeof(char *), 10);
00333                         int allocWords = 10;
00334                         words[wordCount] = strtok(wordBuf, " ");
00335                         while (words[wordCount]) {
00336                                 wordCount++;
00337                                 if (wordCount == allocWords) {
00338                                         allocWords+=10;
00339                                         words = (char **)realloc(words, sizeof(char *)*allocWords);
00340                                 }
00341                                 words[wordCount] = strtok(NULL, " ");
00342                         }
00343 
00344                         (*percent)(20, percentUserData);
00345 
00346                         // clear our result set
00347                         indexes.erase(indexes.begin(), indexes.end());
00348 
00349                         // search both old and new testament indexes
00350                         for (int j = 0; j < 2; j++) {
00351                                 // iterate thru each word the user passed to us.
00352                                 for (int i = 0; i < wordCount; i++) {
00353 
00354                                         // clear this word's result set
00355                                         indexes2.erase(indexes2.begin(), indexes2.end());
00356                                         error = 0;
00357 
00358                                         // iterate thru every word in the database that starts
00359                                         // with our search word
00360                                         for (int away = 0; !error; away++) {
00361                                                 idxbuf = 0;
00362                                                 
00363                                                 // find our word in the database and jump ahead _away_
00364                                                 error = fastSearch[j]->findoffset(words[i], &start, &size, away);
00365 
00366                                                 // get the word from the database (NOTE(review): runs even when findoffset just reported an error)
00367                                                 fastSearch[j]->getidxbufdat(start, &idxbuf);
00368 
00369                                                 // check to see if it starts with our target word
00370                                                 if (strlen(idxbuf) > strlen(words[i]))
00371                                                         idxbuf[strlen(words[i])] = 0;
00372 //                                              else    words[i][strlen(idxbuf)] = 0;
00373                                                 if (!strcmp(idxbuf, words[i])) {
00374 
00375                                                         // get data for this word from database
00376                                                         free(idxbuf);
00377                                                         idxbuf = (char *)calloc(size+2, 1);
00378                                                         datbuf = (char *)calloc(size+2, 1);
00379                                                         fastSearch[j]->gettext(start, size + 2, idxbuf, datbuf);
00380 
00381                                                         // we know that the data consists of sizeof(long)
00382                                                         // records each a valid module position that contains
00383                                                         // this word
00384                                                         //
00385                                                         // iterate thru each of these module positions
00386                                                         long *keyindex = (long *)datbuf;        // NOTE(review): createSearchFramework writes 4-byte positions; this assumes sizeof(long) == 4
00387                                                         while (keyindex < (long *)(datbuf + size - (strlen(idxbuf) + 1))) {
00388                                                                 if (i) {        // if we're not on our first word
00389 
00390                                                                         // check to see if this word is already in the result set.
00391                                                                         // This is our AND functionality
00392                                                                         if (find(indexes.begin(), indexes.end(), *keyindex) != indexes.end())
00393                                                                                 // add to new result set
00394                                                                                 indexes2.push_back(*keyindex);
00395                                                                 }
00396                                                                 else    indexes2.push_back(*keyindex);
00397                                                                 keyindex++;
00398                                                         }
00399                                                         free(datbuf);
00400                                                 }
00401                                                 else error = 1; // no more matches
00402                                                 free(idxbuf);
00403                                         }
00404 
00405                                         // make new result set final result set
00406                                         indexes = indexes2;
00407 
00408                                         percent((char)(20 + (float)((j*wordCount)+i)/(wordCount * 2) * 78), percentUserData);
00409                                 }
00410 
00411                                 // indexes contains our good verses, lets return them in a listkey
00412                                 indexes.sort();
00413 
00414                                 // iterate thru each good module position that meets the search
00415                                 for (list <long>::iterator it = indexes.begin(); it != indexes.end(); it++) {
00416 
00417                                         // set a temporary verse key to this module position
00418                                         vk.Testament(j+1);
00419                                         vk.Error();
00420                                         vk.Index(*it);
00421 
00422                                         // check scope
00423                                         // Try to set our scope key to this verse key
00424                                         if (scope) {
00425                                                 *testKeyType = vk;
00426 
00427                                                 // check to see if it set ok and if so, add to our return list
00428                                                 if (*testKeyType == vk)
00429                                                         listkey << (const char *) vk;
00430                                         }
00431                                         else listkey << (const char*) vk;
00432                                 }
00433                         }
00434                         (*percent)(98, percentUserData);
00435 
00436                         free(words);
00437                         free(wordBuf);
00438 
00439                         *testKeyType = saveKey; // set current place back to original
00440 
00441                         listkey = TOP;
00442                         (*percent)(100, percentUserData);
00443                         return listkey;
00444                 }
00445 
00446                 default:
00447                         break;
00448                 }
00449         }
00450 
00451         // check if we just want to see if search is supported
00452         if (justCheckIfSupported) {
00453                 *justCheckIfSupported = false;
00454                 return listkey;
00455         }
00456 
00457         // if we don't support this search, fall back to base class
00458         return SWModule::Search(istr, searchType, flags, scope, justCheckIfSupported, percent, percentUserData);
00459 }
00460 // Moves the module to position p (e.g. TOP) by delegating to SWModule::operator =; returns *this
00461 #ifdef _MSC_VER
00462 SWModule &RawText::operator =(SW_POSITION p) {    // MSVC builds use the base-class return type
00463 #else
00464 RawText &RawText::operator =(SW_POSITION p) {
00465 #endif
00466         SWModule::operator =(p);        // the positioning itself is done by the base class
00467         return *this;
00468 }
00469 // Stores inbuf as the text of the entry at the module's current key (len is forwarded to settext); returns *this
00470 SWModule &RawText::setentry(const char *inbuf, long len) {
00471         VerseKey *vkey = 0;
00472         // use the module key directly when it is a VerseKey or descendant
00473 #ifndef _WIN32_WCE
00474         try {
00475 #endif
00476                 vkey = SWDYNAMIC_CAST(VerseKey, this->key);
00477 #ifndef _WIN32_WCE
00478         }
00479         catch ( ... ) {}
00480 #endif
00481         // otherwise work on a temporary VerseKey built from it
00482         const bool ownsKey = (vkey == 0);
00483         if (ownsKey) vkey = new VerseKey(this->key);
00484 
00485         settext(vkey->Testament(), vkey->Index(), inbuf, len);
00486 
00487         if (ownsKey) // release the temporary VerseKey
00488                 delete vkey;
00489 
00490         return *this;
00491 }
00492 // Stream-style shorthand for setentry(inbuf, 0); returns whatever setentry returns
00493 SWModule &RawText::operator <<(const char *inbuf) {
00494         return setentry(inbuf, 0);      // len = 0: presumably "measure inbuf yourself" - confirm in settext
00495 }
00496 
00497 // Links the entry at the module's current key to the entry at inkey (via linkentry); returns *this
00498 SWModule &RawText::operator <<(const SWKey *inkey) {
00499         VerseKey *dest = 0;
00500         const VerseKey *src = 0;
00501         // destination: the module's own key when it already is a VerseKey
00502 #ifndef _WIN32_WCE
00503         try {
00504 #endif
00505                 dest = SWDYNAMIC_CAST(VerseKey, this->key);
00506 #ifndef _WIN32_WCE
00507         }
00508         catch ( ... ) {}
00509 #endif
00510         // ... otherwise a temporary VerseKey built from it
00511         const bool ownsDest = !dest;
00512         if (ownsDest) dest = new VerseKey(this->key);
00513 
00514         // source: same treatment for the key we are linking to
00515 #ifndef _WIN32_WCE
00516         try {
00517 #endif
00518                 src = SWDYNAMIC_CAST(VerseKey, inkey);
00519 #ifndef _WIN32_WCE
00520         }
00521         catch ( ... ) {}
00522 #endif
00523         const bool ownsSrc = !src;
00524         if (ownsSrc)
00525                 src = new VerseKey(inkey);
00526 
00527         linkentry(dest->Testament(), dest->Index(), src->Index());
00528 
00529         if (ownsDest) // temporary created above
00530                 delete dest;
00531 
00532         if (ownsSrc) // temporary created above
00533                 delete src;
00534 
00535         return *this;
00536 }
00537 
00538 
00539 /******************************************************************************
00540  * RawText::deleteEntry - blanks the entry at the module's current key by
00541  *                              storing empty text for it
00542  * RET: nothing
00543  */
00544 
00545 void RawText::deleteEntry() {
00546 
00547         VerseKey *vkey = 0;
00548 
00549 #ifndef _WIN32_WCE
00550         try {
00551 #endif
00552                 vkey = SWDYNAMIC_CAST(VerseKey, this->key);
00553 #ifndef _WIN32_WCE
00554         }
00555         catch ( ... ) {}
00556 #endif
00557         const bool ownsKey = !vkey;     // module key is not a VerseKey: use a temporary
00558         if (ownsKey) vkey = new VerseKey(this->key);
00559 
00560         settext(vkey->Testament(), vkey->Index(), "");
00561 
00562         if (ownsKey)
00563                 delete vkey;
00564 }
00565 
00566 /******************************************************************************
00567  * RawText::operator += - Increments module key a number of entries
00568  *
00569  * ENT: increment       - Number of entries to jump forward (a negative value steps backward)
00570  *
00571  * RET: *this
00572  */
00573 
00574 SWModule &RawText::operator +=(int increment)
00575 {
00576         long  start;
00577         unsigned short size;
00578         VerseKey *tmpkey = 0;
00579 
00580 #ifndef _WIN32_WCE
00581         try {
00582 #endif
00583                 tmpkey = SWDYNAMIC_CAST(VerseKey, key);
00584 #ifndef _WIN32_WCE
00585         }
00586         catch ( ... ) {}
00587 #endif
00588         if (!tmpkey)
00589                 tmpkey = new VerseKey(key);     // module key is not a VerseKey: use a temporary copy
00590 
00591         findoffset(tmpkey->Testament(), tmpkey->Index(), &start, &size);        // start/size of the entry we are leaving
00592 
00593         SWKey lastgood = *tmpkey;       // last position that counted as a step; restored if we run out of bounds
00594         while (increment) {
00595                 long laststart = start;
00596                 unsigned short lastsize = size;
00597                 SWKey lasttry = *tmpkey;        // NOTE(review): never read after this point
00598                 (increment > 0) ? (*key)++ : (*key)--;  // move the module key one position in the requested direction
00599                 if (tmpkey != key)
00600                         delete tmpkey;
00601                 tmpkey = 0;
00602 #ifndef _WIN32_WCE
00603                 try {
00604 #endif
00605                         tmpkey = SWDYNAMIC_CAST(VerseKey, key);
00606 #ifndef _WIN32_WCE
00607                 }
00608                 catch ( ... ) {}
00609 #endif
00610                 if (!tmpkey)
00611                         tmpkey = new VerseKey(key);
00612 
00613                 if ((error = key->Error())) {   // the key reported an error: go back to the last counted position and stop
00614                         *key = lastgood;
00615                         break;
00616                 }
00617                 long index = tmpkey->Index();
00618                 findoffset(tmpkey->Testament(), index, &start, &size);
00619                 if ((((laststart != start) || (lastsize != size))||(!skipConsecutiveLinks)) && (start >= 0) && (size)) {        // count only non-empty entries; with skipConsecutiveLinks set, also require a different start/size than the previous position
00620                         increment += (increment < 0) ? 1 : -1;  // one step consumed: move increment toward zero
00621                         lastgood = *tmpkey;
00622                 }
00623         }
00624         error = (error) ? KEYERR_OUTOFBOUNDS : 0;       // any non-zero error is reported as KEYERR_OUTOFBOUNDS
00625 
00626         if (tmpkey != key)
00627                 delete tmpkey;
00628 
00629         return *this;
00630 }

Generated on Thu Jun 20 22:13:00 2002 for The Sword Project by doxygen1.2.15