Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members  

utf8transliterator.cpp

00001 /******************************************************************************
00002 *
00003 * utf8transliterators - SWFilter descendant to transliterate between
00004 *                        ICU-supported scripts.
00005 */
00006 
00007 #ifdef _ICU_
00008 
00009 #include <stdlib.h>
00010 #include <string.h>
00011 
00012 #ifdef __GNUC__
00013 #include <unixstr.h>
00014 #endif
00015 
00016 #include <utf8transliterator.h>
00017 
00018 const char UTF8Transliterator::optionstring[NUMTARGETSCRIPTS][16] = {
00019         "Off",
00020         "Latin",
00021         "Basic Latin",
00022         "Beta",
00023         "BGreek",
00024 /*
00025         "Greek",
00026         "Hebrew",
00027         "Cyrillic",
00028         "Arabic",
00029         "Syriac",
00030         "Katakana",
00031         "Hiragana",
00032         "Jamo",
00033         "Hangul",
00034         "Devanagari",
00035         "Tamil",
00036         "Bengali",
00037         "Gurmukhi",
00038         "Gujarati",
00039         "Oriya",
00040         "Telugu",
00041         "Kannada",
00042         "Malayalam",
00043         "Thai",
00044         "Georgian",
00045         "Armenian",
00046         "Ethiopic",
00047         "Gothic",
00048         "Ugaritic",
00049         "Coptic"
00050         */
00051 };
00052 
00053 const char UTF8Transliterator::optName[] = "Transliteration";
00054 const char UTF8Transliterator::optTip[] = "Transliterates between scripts";
00055 
00056 UTF8Transliterator::UTF8Transliterator() {
00057         option = 0;
00058         unsigned long i;
00059         for (i = 0; i < NUMTARGETSCRIPTS; i++) {
00060                 options.push_back(optionstring[i]);
00061         }
00062 }
00063 
00064 void UTF8Transliterator::setOptionValue(const char *ival)
00065 {
00066         unsigned char i = option = NUMTARGETSCRIPTS;
00067         while (i && stricmp(ival, optionstring[i])) {
00068                 i--;
00069                 option = i;
00070         }
00071 }
00072 
00073 const char *UTF8Transliterator::getOptionValue()
00074 {
00075         return (NUMTARGETSCRIPTS > option) ? optionstring[option] : 0;
00076 }
00077 
00078 char UTF8Transliterator::ProcessText(char *text, int maxlen, const SWKey *key, const SWModule *module)
00079 {
00080         if (option) {   // if we want transliteration
00081                 unsigned long i, j;
00082                 UErrorCode err = U_ZERO_ERROR;
00083                 UConverter * conv = NULL;
00084                 conv = ucnv_open("UTF-8", &err);
00085 
00086                 bool compat = false;
00087                 bool noNFC = false;
00088 
00089                 if (option == SE_JAMO) {
00090                         noNFC = true;
00091                 }
00092 
00093                 // Convert UTF-8 string to UTF-16 (UChars)
00094                 j = strlen(text);
00095                 int32_t len = (j * 2) + 1;
00096                 UChar *source = new UChar[len];
00097                 err = U_ZERO_ERROR;
00098                 len = ucnv_toUChars(conv, source, len, text, j, &err);
00099                 source[len] = 0;
00100 
00101                 // Figure out which scripts are used in the string
00102                 unsigned char scripts[NUMSCRIPTS];
00103 
00104                 for (i = 0; i < NUMSCRIPTS; i++) {
00105                         scripts[i] = false;
00106                 }
00107 
00108                 for (i = 0; i < len; i++) {
00109                         j = ublock_getCode(source[i]);
00110                         switch (j) {
00111                         case UBLOCK_BASIC_LATIN: scripts[SE_LATIN] = true; break;
00112                         case UBLOCK_GREEK: scripts[SE_GREEK] = true; break;
00113                         case UBLOCK_HEBREW: scripts[SE_HEBREW] = true; break;
00114                         case UBLOCK_CYRILLIC: scripts[SE_CYRILLIC] = true; break;
00115                         case UBLOCK_ARABIC: scripts[SE_ARABIC] = true; break;
00116                         case UBLOCK_SYRIAC: scripts[SE_SYRIAC] = true; break;
00117                         case UBLOCK_KATAKANA: scripts[SE_KATAKANA] = true; break;
00118                         case UBLOCK_HIRAGANA: scripts[SE_HIRAGANA] = true; break;
00119                         case UBLOCK_HANGUL_SYLLABLES: scripts[SE_HANGUL] = true; break;
00120                         case UBLOCK_HANGUL_JAMO: scripts[SE_JAMO] = true; break;
00121                         case UBLOCK_DEVANAGARI: scripts[SE_DEVANAGARI] = true; break;
00122                         case UBLOCK_TAMIL: scripts[SE_TAMIL] = true; break;
00123                         case UBLOCK_BENGALI: scripts[SE_BENGALI] = true; break;
00124                         case UBLOCK_GURMUKHI: scripts[SE_GURMUKHI] = true; break;
00125                         case UBLOCK_GUJARATI: scripts[SE_GUJARATI] = true; break;
00126                         case UBLOCK_ORIYA: scripts[SE_ORIYA] = true; break;
00127                         case UBLOCK_TELUGU: scripts[SE_TELUGU] = true; break;
00128                         case UBLOCK_KANNADA: scripts[SE_KANNADA] = true; break;
00129                         case UBLOCK_MALAYALAM: scripts[SE_MALAYALAM] = true; break;
00130                         case UBLOCK_THAI: scripts[SE_THAI] = true; break;
00131                         case UBLOCK_GEORGIAN: scripts[SE_GEORGIAN] = true; break;
00132                         case UBLOCK_ARMENIAN: scripts[SE_ARMENIAN] = true; break;
00133                         case UBLOCK_ETHIOPIC: scripts[SE_ETHIOPIC] = true; break;
00134                         case UBLOCK_GOTHIC: scripts[SE_GOTHIC] = true; break;
00135                         // needs Unicode 3.2? or 4.0? support from ICU
00136                         //case UBLOCK_UGARITIC: scripts[SE_UGARITIC] = true; break;
00137                         case UBLOCK_CJK_RADICALS_SUPPLEMENT:
00138                         case UBLOCK_KANGXI_RADICALS:
00139                         case UBLOCK_IDEOGRAPHIC_DESCRIPTION_CHARACTERS:
00140                         case UBLOCK_CJK_SYMBOLS_AND_PUNCTUATION:
00141                         case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A:
00142                         case UBLOCK_CJK_UNIFIED_IDEOGRAPHS:
00143                                 scripts[SE_HAN] = true;
00144                                 break;
00145                         case UBLOCK_CJK_COMPATIBILITY:
00146                         case UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS:
00147                         case UBLOCK_CJK_COMPATIBILITY_FORMS:
00148                                 scripts[SE_HAN] = true;
00149                                 compat = true;
00150                                 break;
00151                         case UBLOCK_HANGUL_COMPATIBILITY_JAMO:
00152                                 scripts[SE_HANGUL] = true;
00153                                 compat = true;
00154                                 break;
00155 
00156                         default: scripts[SE_LATIN] = true;
00157                         }
00158                 } 
00159                 scripts[option] = false; //turn off the reflexive transliteration
00160                 
00161                 //return if we have no transliteration to do for this text
00162                 j = 0;
00163                 for (i = 0; !j && i < NUMSCRIPTS; i++) {
00164                         if (scripts[i]) j++;
00165                 }
00166                 if (!j) {
00167                         ucnv_close(conv);
00168                         return 0;
00169                 }
00170 
00171                 UnicodeString id;
00172                 if (compat) {
00173                         id = UnicodeString("NFKD");
00174                 }
00175                 else {
00176                         id = UnicodeString("NFD");
00177                 }
00178 
00179                 //Simple X to Latin transliterators
00180                 if (scripts[SE_GREEK]) {
00181                         if (option == SE_BETA)
00182                                 id += UnicodeString(";Greek-Beta");
00183                         else if (option == SE_BGREEK)
00184                                 id += UnicodeString(";Greek-BGreek");
00185                         else {
00186                     if (!strnicmp (((SWModule*)module)->Lang(), "cop", 3)) {
00187                                         id += UnicodeString(";Coptic-Latin");
00188                 }
00189                 else {
00190                                                 id += UnicodeString(";Greek-Latin");
00191                 }
00192                                 scripts[SE_LATIN] = true;
00193                         }
00194                 }
00195                 if (scripts[SE_HEBREW]) {
00196                         if (option == SE_BETA)
00197                                 id += UnicodeString(";Hebrew-CCAT");
00198                         else if (option == SE_SYRIAC)
00199                                 id += UnicodeString(";Hebrew-Syriac");
00200                         else {
00201                                 id += UnicodeString(";Hebrew-Latin");
00202                                 scripts[SE_LATIN] = true;
00203                         }
00204                 }
00205                 if (scripts[SE_CYRILLIC]) {
00206                         id += UnicodeString(";Cyrillic-Latin");
00207                         scripts[SE_LATIN] = true;
00208                 }
00209                 if (scripts[SE_ARABIC]) {
00210                         id += UnicodeString(";Arabic-Latin");
00211                         scripts[SE_LATIN] = true;
00212                 }
00213                 if (scripts[SE_SYRIAC]) {
00214                         if (option == SE_BETA)
00215                                 id += UnicodeString(";Syriac-CCAT");
00216                         else if (option == SE_HEBREW)
00217                                 id += UnicodeString(";Syriac-Hebrew");
00218                         else {
00219                                 id += UnicodeString(";Syriac-Latin");
00220                                 scripts[SE_LATIN] = true;
00221                         }
00222                 }
00223                 if (scripts[SE_THAI]) {
00224                         id += UnicodeString(";Thai-Latin");
00225                         scripts[SE_LATIN] = true;
00226                 }
00227                 if (scripts[SE_GEORGIAN]) {
00228                         id += UnicodeString(";Georgian-Latin");
00229                         scripts[SE_LATIN] = true;
00230                 }
00231                 if (scripts[SE_ARMENIAN]) {
00232                         id += UnicodeString(";Armenian-Latin");
00233                         scripts[SE_LATIN] = true;
00234                 }                
00235                 if (scripts[SE_ETHIOPIC]) {
00236                         id += UnicodeString(";Ethiopic-Latin");
00237                         scripts[SE_LATIN] = true;
00238                 }
00239                 if (scripts[SE_GOTHIC]) {
00240                         id += UnicodeString(";Gothic-Latin");
00241                         scripts[SE_LATIN] = true;
00242                 }
00243                 if (scripts[SE_UGARITIC]) {
00244                         id += UnicodeString(";Ugaritic-Latin");
00245                         scripts[SE_LATIN] = true;
00246                 }
00247         if (scripts[SE_HAN]) {
00248                 if (!strnicmp (((SWModule*)module)->Lang(), "ja", 2)) {
00249                         id += UnicodeString(";Kanji-OnRomaji");
00250             }
00251             else {
00252                         id += UnicodeString(";Han-Pinyin");
00253             }
00254                         scripts[SE_LATIN] = true;
00255                 }
00256 
00257                 // Inter-Kana and Kana to Latin transliterators
00258                 if (option == SE_HIRAGANA && scripts[SE_KATAKANA]) {
00259                         id += UnicodeString(";Katakana-Hiragana");
00260                         scripts[SE_HIRAGANA] = true;
00261                 }
00262                 else if (option == SE_KATAKANA && scripts[SE_HIRAGANA]) {
00263                         id += UnicodeString(";Hiragana-Katakana");
00264                         scripts[SE_KATAKANA] = true;
00265                 }
00266                 else {
00267                         if (scripts[SE_KATAKANA]) {
00268                                 id += UnicodeString(";Katakana-Latin");
00269                                 scripts[SE_LATIN] = true;
00270                         }
00271                         if (scripts[SE_HIRAGANA]) {
00272                                 id += UnicodeString(";Hiragana-Latin");
00273                                 scripts[SE_LATIN] = true;
00274                         }
00275                 }
00276 
00277                 // Inter-Korean and Korean to Latin transliterators
00278                 if (option == SE_HANGUL && scripts[SE_JAMO]) {
00279                         noNFC = false;
00280                         scripts[SE_HANGUL] = true;
00281                 }
00282                 else if (option == SE_JAMO && scripts[SE_HANGUL]) {
00283                         noNFC = true;
00284                         scripts[SE_JAMO] = true;
00285                 }
00286                 else {
00287                         if (scripts[SE_HANGUL]) {
00288                                 id += UnicodeString(";Hangul-Latin");
00289                                 scripts[SE_LATIN] = true;
00290                         }
00291                         if (scripts[SE_JAMO]) {
00292                                 id += UnicodeString(";Jamo-Latin");
00293                                 scripts[SE_LATIN] = true;
00294                         }
00295                 }
00296 
00297                 // Indic-Latin
00298                 if (option < SE_DEVANAGARI || option > SE_MALAYALAM) {
00299                         // Indic to Latin
00300                         if (scripts[SE_TAMIL]) {
00301                                 id += UnicodeString(";Tamil-Latin");
00302                                 scripts[SE_LATIN] = true;
00303                         }
00304                         if (scripts[SE_BENGALI]) {
00305                                 id += UnicodeString(";Bengali-Latin");
00306                                 scripts[SE_LATIN] = true;
00307                         }
00308                         if (scripts[SE_GURMUKHI]) {
00309                                 id += UnicodeString(";Gurmukhi-Latin");
00310                                 scripts[SE_LATIN] = true;
00311                         }
00312                         if (scripts[SE_GUJARATI]) {
00313                                 id += UnicodeString(";Gujarati-Latin");
00314                                 scripts[SE_LATIN] = true;
00315                         }
00316                         if (scripts[SE_ORIYA]) {
00317                                 id += UnicodeString(";Oriya-Latin");
00318                                 scripts[SE_LATIN] = true;
00319                         }
00320                         if (scripts[SE_TELUGU]) {
00321                                 id += UnicodeString(";Telugu-Latin");
00322                                 scripts[SE_LATIN] = true;
00323                         }
00324                         if (scripts[SE_KANNADA]) {
00325                                 id += UnicodeString(";Kannada-Latin");
00326                                 scripts[SE_LATIN] = true;
00327                         }
00328                         if (scripts[SE_MALAYALAM]) {
00329                                 id += UnicodeString(";Malayalam-Latin");
00330                                 scripts[SE_LATIN] = true;
00331                         }
00332                 }
00333                 else {
00334                         if (scripts[SE_LATIN]) {
00335                                 id += UnicodeString(";Latin-InterIndic");
00336                         }
00337                         if (scripts[SE_DEVANAGARI]) {
00338                                 id += UnicodeString(";Devanagari-InterIndic");
00339                         }
00340                         if (scripts[SE_TAMIL]) {
00341                                 id += UnicodeString(";Tamil-InterIndic");
00342                         }
00343                         if (scripts[SE_BENGALI]) {
00344                                 id += UnicodeString(";Bengali-InterIndic");
00345                         }
00346                         if (scripts[SE_GURMUKHI]) {
00347                                 id += UnicodeString(";Gurmurkhi-InterIndic");
00348                         }
00349                         if (scripts[SE_GUJARATI]) {
00350                                 id += UnicodeString(";Gujarati-InterIndic");
00351                         }
00352                         if (scripts[SE_ORIYA]) {
00353                                 id += UnicodeString(";Oriya-InterIndic");
00354                         }
00355                         if (scripts[SE_TELUGU]) {
00356                                 id += UnicodeString(";Telugu-InterIndic");
00357                         }
00358                         if (scripts[SE_KANNADA]) {
00359                                 id += UnicodeString(";Kannada-InterIndic");
00360                         }
00361                         if (scripts[SE_MALAYALAM]) {
00362                                 id += UnicodeString(";Malayalam-InterIndic");
00363                         }
00364 
00365                         switch(option) {
00366                         case SE_DEVANAGARI:
00367                                 id += UnicodeString(";InterIndic-Devanagari");
00368                                 break;
00369                         case SE_TAMIL:
00370                                 id += UnicodeString(";InterIndic-Tamil");
00371                                 break;
00372                         case SE_BENGALI:
00373                                 id += UnicodeString(";InterIndic-Bengali");
00374                                 break;
00375                         case SE_GURMUKHI:
00376                                 id += UnicodeString(";InterIndic-Gurmukhi");
00377                                 break;
00378                         case SE_GUJARATI:
00379                                 id += UnicodeString(";InterIndic-Gujarati");
00380                                 break;
00381                         case SE_ORIYA:
00382                                 id += UnicodeString(";InterIndic-Oriya");
00383                                 break;
00384                         case SE_TELUGU:
00385                                 id += UnicodeString(";InterIndic-Telugu");
00386                                 break;
00387                         case SE_KANNADA:
00388                                 id += UnicodeString(";InterIndic-Kannada");
00389                                 break;
00390                         case SE_MALAYALAM:
00391                                 id += UnicodeString(";InterIndic-Malayalam");
00392                                 break;
00393                         default:
00394                                 id += UnicodeString(";InterIndic-Latin");
00395                                 scripts[SE_LATIN] = true;
00396                                 break;
00397                         }
00398                 }
00399 
00400                 if (scripts[SE_LATIN]) {
00401                 switch (option) {
00402                         case SE_GREEK:
00403                                 id += UnicodeString(";Latin-Greek");
00404                                 break;
00405                         case SE_HEBREW:
00406                                 id += UnicodeString(";Latin-Hebrew");
00407                                 break;
00408                         case SE_CYRILLIC:
00409                                 id += UnicodeString(";Latin-Cyrillic");
00410                                 break;
00411                         case SE_ARABIC:
00412                                 id += UnicodeString(";Latin-Arabic");
00413                                 break;
00414                         case SE_SYRIAC:
00415                                 id += UnicodeString(";Latin-Syriac");
00416                                 break;
00417                         case SE_THAI:
00418                                 id += UnicodeString(";Latin-Thai");
00419                                 break;
00420                         case SE_GEORGIAN:
00421                                 id += UnicodeString(";Latin-Georgian");
00422                                 break;
00423                         case SE_ARMENIAN:
00424                                 id += UnicodeString(";Latin-Armenian");
00425                                 break;
00426                         case SE_ETHIOPIC:
00427                                 id += UnicodeString(";Latin-Ethiopic");
00428                                 break;
00429                         case SE_GOTHIC:
00430                                 id += UnicodeString(";Latin-Gothic");
00431                                 break;
00432                         case SE_UGARITIC:
00433                                 id += UnicodeString(";Latin-Ugaritic");
00434                                 break;
00435                         case SE_COPTIC:
00436                                 id += UnicodeString(";Latin-Coptic");
00437                                 break;
00438                         case SE_KATAKANA:
00439                                 id += UnicodeString(";Latin-Katakana");
00440                                 break;
00441                         case SE_HIRAGANA:
00442                                 id += UnicodeString(";Latin-Hiragana");
00443                                 break;
00444                         case SE_JAMO:
00445                                 id += UnicodeString(";Latin-Jamo");
00446                                 break;
00447                         case SE_HANGUL:
00448                                 id += UnicodeString(";Latin-Hangul");
00449                                 break;
00450                         }
00451                 }
00452 
00453                 if (option == SE_BASICLATIN) {
00454                         id += UnicodeString(";Any-Latin1");
00455                 }
00456                                 
00457                 if (noNFC) {
00458                         id += UnicodeString(";NFD");
00459                 } else {
00460                         id += UnicodeString(";NFC");
00461                 }
00462 
00463                 UParseError perr;
00464 
00465                 err = U_ZERO_ERROR;
00466                 Transliterator * trans = Transliterator::createInstance(id, UTRANS_FORWARD, perr, err);
00467                 if (trans) {
00468                         UnicodeString target = UnicodeString(source);
00469                         trans->transliterate(target);
00470                         len = ucnv_fromUChars(conv, text, maxlen, target.getBuffer(), target.length(), &err);
00471                         if (len < maxlen) *(text + len) = 0;
00472                         else *(text + maxlen) = 0;
00473                         delete trans;
00474                 }
00475                 ucnv_close(conv);
00476         }
00477         return 0;
00478 }
00479 #endif

Generated on Thu Jun 20 22:13:01 2002 for The Sword Project by doxygen1.2.15