thmlgbf.cpp Source File

00001 /***************************************************************************
00002                      thmlgbf.cpp  -  ThML to GBF filter
00003                              -------------------
00004     begin                : 1999-10-28
00005     copyright            : 2001 by CrossWire Bible Society
00006  ***************************************************************************/
00007 
00008 /***************************************************************************
00009  *                                                                         *
00010  *   This program is free software; you can redistribute it and/or modify  *
00011  *   it under the terms of the GNU General Public License as published by  *
00012  *   the Free Software Foundation; either version 2 of the License, or     *
00013  *   (at your option) any later version.                                   *
00014  *                                                                         *
00015  ***************************************************************************/
00016 
00017 #include <stdlib.h>
00018 #include <string.h>
00019 #include <thmlgbf.h>
00020 
00021 
00022 ThMLGBF::ThMLGBF()
00023 {
00024 }
00025 
00026 
00027 char ThMLGBF::ProcessText(char *text, int maxlen)
00028 {
00029         char *to, *from, token[2048];
00030         int tokpos = 0;
00031         bool intoken    = false;
00032         int len;
00033         bool ampersand = false;
00034         bool sechead = false;
00035         bool title = false;  
00036 
00037         len = strlen(text) + 1;                                         // shift string to right of buffer
00038         if (len < maxlen) {
00039                 memmove(&text[maxlen - len], text, len);
00040                 from = &text[maxlen - len];
00041         }
00042         else    from = text;                                                    // -------------------------------
00043         for (to = text; *from; from++) {
00044                 if (*from == '<') {
00045                         intoken = true;
00046                         tokpos = 0;
00047                         token[0] = 0;
00048                         token[1] = 0;
00049                         token[2] = 0;
00050                         ampersand = false;
00051                         continue;
00052                 }
00053                 else if (*from == '&') {
00054                         intoken = true;
00055                         tokpos = 0;
00056                         memset(token, 0, 2048);
00057                         ampersand = true;
00058                         continue;
00059                 }
00060                 if (*from == ';' && ampersand) {
00061                         intoken = false;
00062         
00063                         if (!strncmp("nbsp", token, 4)) *to++ = ' ';
00064                         else if (!strncmp("quot", token, 4)) *to++ = '"';
00065                         else if (!strncmp("amp", token, 3)) *to++ = '&';
00066                         else if (!strncmp("lt", token, 2)) *to++ = '<';
00067                         else if (!strncmp("gt", token, 2)) *to++ = '>';
00068                         else if (!strncmp("brvbar", token, 6)) *to++ = '|';
00069                         else if (!strncmp("sect", token, 4)) *to++ = '§';
00070                         else if (!strncmp("copy", token, 4)) *to++ = '©';
00071                         else if (!strncmp("laquo", token, 5)) *to++ = '«';
00072                         else if (!strncmp("reg", token, 3)) *to++ = '®';
00073                         else if (!strncmp("acute", token, 5)) *to++ = '´';
00074                         else if (!strncmp("para", token, 4)) *to++ = '¶';
00075                         else if (!strncmp("raquo", token, 5)) *to++ = '»';
00076                         
00077                         else if (!strncmp("Aacute", token, 6)) *to++ = 'Á';
00078                         else if (!strncmp("Agrave", token, 6)) *to++ = 'À';
00079                         else if (!strncmp("Acirc", token, 5)) *to++ = 'Â';
00080                         else if (!strncmp("Auml", token, 4)) *to++ = 'Ä';
00081                         else if (!strncmp("Atilde", token, 6)) *to++ = 'Ã';
00082                         else if (!strncmp("Aring", token, 5)) *to++ = 'Å';
00083                         else if (!strncmp("aacute", token, 6)) *to++ = 'á';
00084                         else if (!strncmp("agrave", token, 6)) *to++ = 'à';
00085                         else if (!strncmp("acirc", token, 5)) *to++ = 'â';
00086                         else if (!strncmp("auml", token, 4)) *to++ = 'ä';
00087                         else if (!strncmp("atilde", token, 6)) *to++ = 'ã';
00088                         else if (!strncmp("aring", token, 5)) *to++ = 'å';
00089                         else if (!strncmp("Eacute", token, 6)) *to++ = 'É';
00090                         else if (!strncmp("Egrave", token, 6)) *to++ = 'È';
00091                         else if (!strncmp("Ecirc", token, 5)) *to++ = 'Ê';
00092                         else if (!strncmp("Euml", token, 4)) *to++ = 'Ë';
00093                         else if (!strncmp("eacute", token, 6)) *to++ = 'é';
00094                         else if (!strncmp("egrave", token, 6)) *to++ = 'è';
00095                         else if (!strncmp("ecirc", token, 5)) *to++ = 'ê';
00096                         else if (!strncmp("euml", token, 4)) *to++ = 'ë';
00097                         else if (!strncmp("Iacute", token, 6)) *to++ = 'Í';
00098                         else if (!strncmp("Igrave", token, 6)) *to++ = 'Ì';
00099                         else if (!strncmp("Icirc", token, 5)) *to++ = 'Î';
00100                         else if (!strncmp("Iuml", token, 4)) *to++ = 'Ï';
00101                         else if (!strncmp("iacute", token, 6)) *to++ = 'í';
00102                         else if (!strncmp("igrave", token, 6)) *to++ = 'ì';
00103                         else if (!strncmp("icirc", token, 5)) *to++ = 'î';
00104                         else if (!strncmp("iuml", token, 4)) *to++ = 'ï';
00105                         else if (!strncmp("Oacute", token, 6)) *to++ = 'Ó';
00106                         else if (!strncmp("Ograve", token, 6)) *to++ = 'Ò';
00107                         else if (!strncmp("Ocirc", token, 5)) *to++ = 'Ô';
00108                         else if (!strncmp("Ouml", token, 4)) *to++ = 'Ö';
00109                         else if (!strncmp("Otilde", token, 6)) *to++ = 'Õ';
00110                         else if (!strncmp("oacute", token, 6)) *to++ = 'ó';
00111                         else if (!strncmp("ograve", token, 6)) *to++ = 'ò';
00112                         else if (!strncmp("ocirc", token, 5)) *to++ = 'ô';
00113                         else if (!strncmp("ouml", token, 4)) *to++ = 'ö';
00114                         else if (!strncmp("otilde", token, 6)) *to++ = 'õ';
00115                         else if (!strncmp("Uacute", token, 6)) *to++ = 'Ú';
00116                         else if (!strncmp("Ugrave", token, 6)) *to++ = 'Ù';
00117                         else if (!strncmp("Ucirc", token, 5)) *to++ = 'Û';
00118                         else if (!strncmp("Uuml", token, 4)) *to++ = 'Ü';
00119                         else if (!strncmp("uacute", token, 6)) *to++ = 'ú';
00120                         else if (!strncmp("ugrave", token, 6)) *to++ = 'ù';
00121                         else if (!strncmp("ucirc", token, 5)) *to++ = 'û';
00122                         else if (!strncmp("uuml", token, 4)) *to++ = 'ü';
00123                         else if (!strncmp("Yacute", token, 6)) *to++ = 'Ý';
00124                         else if (!strncmp("yacute", token, 6)) *to++ = 'ý';
00125                         else if (!strncmp("yuml", token, 4)) *to++ = 'ÿ';
00126                         
00127                         else if (!strncmp("deg", token, 3)) *to++ = '°';
00128                         else if (!strncmp("plusmn", token, 6)) *to++ = '±';
00129                         else if (!strncmp("sup2", token, 4)) *to++ = '²';
00130                         else if (!strncmp("sup3", token, 4)) *to++ = '³';
00131                         else if (!strncmp("sup1", token, 4)) *to++ = '¹';
00132                         else if (!strncmp("nbsp", token, 4)) *to++ = 'º';
00133                         else if (!strncmp("pound", token, 5)) *to++ = '£';
00134                         else if (!strncmp("cent", token, 4)) *to++ = '¢';
00135                         else if (!strncmp("frac14", token, 6)) *to++ = '¼';
00136                         else if (!strncmp("frac12", token, 6)) *to++ = '½';
00137                         else if (!strncmp("frac34", token, 6)) *to++ = '¾';
00138                         else if (!strncmp("iquest", token, 6)) *to++ = '¿';
00139                         else if (!strncmp("iexcl", token, 5)) *to++ = '¡';
00140                         else if (!strncmp("ETH", token, 3)) *to++ = 'Ð';
00141                         else if (!strncmp("eth", token, 3)) *to++ = 'ð';
00142                         else if (!strncmp("THORN", token, 5)) *to++ = 'Þ';
00143                         else if (!strncmp("thorn", token, 5)) *to++ = 'þ';
00144                         else if (!strncmp("AElig", token, 5)) *to++ = 'Æ';
00145                         else if (!strncmp("aelig", token, 5)) *to++ = 'æ';
00146                         else if (!strncmp("Oslash", token, 6)) *to++ = 'Ø';
00147                         else if (!strncmp("curren", token, 6)) *to++ = '¤';
00148                         else if (!strncmp("Ccedil", token, 6)) *to++ = 'Ç';
00149                         else if (!strncmp("ccedil", token, 6)) *to++ = 'ç';
00150                         else if (!strncmp("szlig", token, 5)) *to++ = 'ß';
00151                         else if (!strncmp("Ntilde", token, 6)) *to++ = 'Ñ';
00152                         else if (!strncmp("ntilde", token, 6)) *to++ = 'ñ';
00153                         else if (!strncmp("yen", token, 3)) *to++ = '¥';
00154                         else if (!strncmp("not", token, 3)) *to++ = '¬';
00155                         else if (!strncmp("ordf", token, 4)) *to++ = 'ª';
00156                         else if (!strncmp("uml", token, 3)) *to++ = '¨';
00157                         else if (!strncmp("shy", token, 3)) *to++ = '';
00158                         else if (!strncmp("macr", token, 4)) *to++ = '¯';
00159                         continue;
00160                 
00161                 }
00162                 else if (*from == '>' && !ampersand) {
00163                         intoken = false;
00164                         // process desired tokens
00165                         if (!strncmp(token, "sync type=\"Strongs\" value=\"", 27)) {
00166                                 *to++ = '<';
00167                                 *to++ = 'W';
00168                                 for (unsigned int i = 27; token[i] != '\"'; i++)
00169                                         *to++ = token[i];
00170                                 *to++ = '>';
00171                                 continue;
00172                         }
00173                         if (!strncmp(token, "sync type=\"morph\" value=\"", 25)) {
00174                                 *to++ = '<';
00175                                 *to++ = 'W';
00176                                 *to++ = 'T';
00177                                 for (unsigned int i = 25; token[i] != '\"'; i++)
00178                                         *to++ = token[i];
00179                                 *to++ = '>';
00180                                 continue;
00181                         }
00182                         else if (!strncmp(token, "scripRef", 8)) {
00183                                 *to++ = '<';
00184                                 *to++ = 'R';
00185                                 *to++ = 'X';
00186                                 *to++ = '>';
00187                                 continue;
00188                         }
00189                         else if (!strncmp(token, "/scripRef", 9)) {
00190                                 *to++ = '<';
00191                                 *to++ = 'R';
00192                                 *to++ = 'x';
00193                                 *to++ = '>';
00194                                 continue;
00195                         }
00196                         else if (!strncmp(token, "note", 4)) {
00197                                 *to++ = '<';
00198                                 *to++ = 'R';
00199                                 *to++ = 'F';
00200                                 *to++ = '>';
00201                                 continue;
00202                         }
00203                         else if (!strncmp(token, "/note", 5)) {
00204                                 *to++ = '<';
00205                                 *to++ = 'R';
00206                                 *to++ = 'f';
00207                                 *to++ = '>';
00208                                 continue;
00209                         }
00210                         else if (!strncmp(token, "sup", 3)) {
00211                                 *to++ = '<';
00212                                 *to++ = 'F';
00213                                 *to++ = 'S';
00214                                 *to++ = '>';
00215                         }
00216                         else if (!strncmp(token, "/sup", 4)) {
00217                                 *to++ = '<';
00218                                 *to++ = 'F';
00219                                 *to++ = 's';
00220                                 *to++ = '>';
00221                         }
00222                         else if (!strnicmp(token, "font color=#ff0000", 18)) {
00223                                 *to++ = '<';
00224                                 *to++ = 'F';
00225                                 *to++ = 'R';
00226                                 *to++ = '>';
00227                                 continue;
00228                         }
00229                         else if (!strnicmp(token, "/font", 5)) {
00230                                 *to++ = '<';
00231                                 *to++ = 'F';
00232                                 *to++ = 'r';
00233                                 *to++ = '>';
00234                                 continue;
00235                         }
00236                         else if (!strncmp(token, "div class=\"sechead\"", 19)) {
00237                                 *to++ = '<';
00238                                 *to++ = 'T';
00239                                 *to++ = 'S';
00240                                 *to++ = '>';
00241                                 sechead = true;
00242                                 continue;
00243                         }
00244                         else if (sechead && !strncmp(token, "/div", 19)) {
00245                                 *to++ = '<';
00246                                 *to++ = 'T';
00247                                 *to++ = 's';
00248                                 *to++ = '>';
00249                                 sechead = false;
00250                                 continue;
00251                         }
00252                         else if (!strncmp(token, "div class=\"title\"", 19)) {
00253                                 *to++ = '<';
00254                                 *to++ = 'T';
00255                                 *to++ = 'T';
00256                                 *to++ = '>';
00257                                 title = true;
00258                                 continue;
00259                         }
00260                         else if (title && !strncmp(token, "/div", 19)) {
00261                                 *to++ = '<';
00262                                 *to++ = 'T';
00263                                 *to++ = 't';
00264                                 *to++ = '>';
00265                                 title = false;
00266                                 continue;
00267                         }
00268                         else if (!strnicmp(token, "br", 2)) {
00269                                 *to++ = '<';
00270                                 *to++ = 'C';
00271                                 *to++ = 'L';
00272                                 *to++ = '>';
00273                                 continue;
00274                         }
00275                         else switch(*token) {
00276                         case 'I':                       // font tags
00277                         case 'i':
00278                                 *to++ = '<';
00279                                 *to++ = 'F';
00280                                 *to++ = 'I';
00281                                 *to++ = '>';
00282                                 continue;
00283                         case 'B':               // bold start
00284                         case 'b':
00285                                 *to++ = '<';
00286                                 *to++ = 'F';
00287                                 *to++ = 'B';
00288                                 *to++ = '>';
00289                                 continue;
00290                         case '/':
00291                                 switch(token[1]) {
00292                                 case 'P':
00293                                 case 'p':
00294                                         *to++ = '<';
00295                                         *to++ = 'C';
00296                                         *to++ = 'M';
00297                                         *to++ = '>';
00298                                         continue;
00299                                 case 'I':
00300                                 case 'i':               // italic end
00301                                         *to++ = '<';
00302                                         *to++ = 'F';
00303                                         *to++ = 'i';
00304                                         *to++ = '>';
00305                                         continue;
00306                                 case 'B':               // bold start
00307                                 case 'b':
00308                                         *to++ = '<';
00309                                         *to++ = 'F';
00310                                         *to++ = 'b';
00311                                         *to++ = '>';
00312                                         continue;
00313                                 }
00314                         }
00315                         continue;
00316                 }
00317                 if (intoken) {
00318                         if (tokpos < 2045)
00319                                 token[tokpos++] = *from;
00320                                 token[tokpos+2] = 0;
00321                 }
00322                 else    *to++ = *from;
00323         }
00324         *to++ = 0;
00325         *to = 0;          
00326         return 0;
00327 }
00328 
00329 
00330