00001
00002
00003
00004
00005
00006
00007 #ifdef _ICU_
00008
00009 #include <stdlib.h>
00010 #include <string.h>
00011
00012 #ifdef __GNUC__
00013 #include <unixstr.h>
00014 #endif
00015
00016 #include <utf8transliterator.h>
00017
00018 const char UTF8Transliterator::optionstring[NUMTARGETSCRIPTS][16] = {
00019 "Off",
00020 "Latin",
00021 "Basic Latin",
00022 "Beta",
00023 "BGreek",
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051 };
00052
00053 const char UTF8Transliterator::optName[] = "Transliteration";
00054 const char UTF8Transliterator::optTip[] = "Transliterates between scripts";
00055
00056 UTF8Transliterator::UTF8Transliterator() {
00057 option = 0;
00058 unsigned long i;
00059 for (i = 0; i < NUMTARGETSCRIPTS; i++) {
00060 options.push_back(optionstring[i]);
00061 }
00062 }
00063
00064 void UTF8Transliterator::setOptionValue(const char *ival)
00065 {
00066 unsigned char i = option = NUMTARGETSCRIPTS;
00067 while (i && stricmp(ival, optionstring[i])) {
00068 i--;
00069 option = i;
00070 }
00071 }
00072
00073 const char *UTF8Transliterator::getOptionValue()
00074 {
00075 return (NUMTARGETSCRIPTS > option) ? optionstring[option] : 0;
00076 }
00077
00078 char UTF8Transliterator::ProcessText(char *text, int maxlen, const SWKey *key, const SWModule *module)
00079 {
00080 if (option) {
00081 unsigned long i, j;
00082 UErrorCode err = U_ZERO_ERROR;
00083 UConverter * conv = NULL;
00084 conv = ucnv_open("UTF-8", &err);
00085
00086 bool compat = false;
00087 bool noNFC = false;
00088
00089 if (option == SE_JAMO) {
00090 noNFC = true;
00091 }
00092
00093
00094 j = strlen(text);
00095 int32_t len = (j * 2) + 1;
00096 UChar *source = new UChar[len];
00097 err = U_ZERO_ERROR;
00098 len = ucnv_toUChars(conv, source, len, text, j, &err);
00099 source[len] = 0;
00100
00101
00102 unsigned char scripts[NUMSCRIPTS];
00103
00104 for (i = 0; i < NUMSCRIPTS; i++) {
00105 scripts[i] = false;
00106 }
00107
00108 for (i = 0; i < len; i++) {
00109 j = ublock_getCode(source[i]);
00110 switch (j) {
00111 case UBLOCK_BASIC_LATIN: scripts[SE_LATIN] = true; break;
00112 case UBLOCK_GREEK: scripts[SE_GREEK] = true; break;
00113 case UBLOCK_HEBREW: scripts[SE_HEBREW] = true; break;
00114 case UBLOCK_CYRILLIC: scripts[SE_CYRILLIC] = true; break;
00115 case UBLOCK_ARABIC: scripts[SE_ARABIC] = true; break;
00116 case UBLOCK_SYRIAC: scripts[SE_SYRIAC] = true; break;
00117 case UBLOCK_KATAKANA: scripts[SE_KATAKANA] = true; break;
00118 case UBLOCK_HIRAGANA: scripts[SE_HIRAGANA] = true; break;
00119 case UBLOCK_HANGUL_SYLLABLES: scripts[SE_HANGUL] = true; break;
00120 case UBLOCK_HANGUL_JAMO: scripts[SE_JAMO] = true; break;
00121 case UBLOCK_DEVANAGARI: scripts[SE_DEVANAGARI] = true; break;
00122 case UBLOCK_TAMIL: scripts[SE_TAMIL] = true; break;
00123 case UBLOCK_BENGALI: scripts[SE_BENGALI] = true; break;
00124 case UBLOCK_GURMUKHI: scripts[SE_GURMUKHI] = true; break;
00125 case UBLOCK_GUJARATI: scripts[SE_GUJARATI] = true; break;
00126 case UBLOCK_ORIYA: scripts[SE_ORIYA] = true; break;
00127 case UBLOCK_TELUGU: scripts[SE_TELUGU] = true; break;
00128 case UBLOCK_KANNADA: scripts[SE_KANNADA] = true; break;
00129 case UBLOCK_MALAYALAM: scripts[SE_MALAYALAM] = true; break;
00130 case UBLOCK_THAI: scripts[SE_THAI] = true; break;
00131 case UBLOCK_GEORGIAN: scripts[SE_GEORGIAN] = true; break;
00132 case UBLOCK_ARMENIAN: scripts[SE_ARMENIAN] = true; break;
00133 case UBLOCK_ETHIOPIC: scripts[SE_ETHIOPIC] = true; break;
00134 case UBLOCK_GOTHIC: scripts[SE_GOTHIC] = true; break;
00135
00136
00137 case UBLOCK_CJK_RADICALS_SUPPLEMENT:
00138 case UBLOCK_KANGXI_RADICALS:
00139 case UBLOCK_IDEOGRAPHIC_DESCRIPTION_CHARACTERS:
00140 case UBLOCK_CJK_SYMBOLS_AND_PUNCTUATION:
00141 case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A:
00142 case UBLOCK_CJK_UNIFIED_IDEOGRAPHS:
00143 scripts[SE_HAN] = true;
00144 break;
00145 case UBLOCK_CJK_COMPATIBILITY:
00146 case UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS:
00147 case UBLOCK_CJK_COMPATIBILITY_FORMS:
00148 scripts[SE_HAN] = true;
00149 compat = true;
00150 break;
00151 case UBLOCK_HANGUL_COMPATIBILITY_JAMO:
00152 scripts[SE_HANGUL] = true;
00153 compat = true;
00154 break;
00155
00156 default: scripts[SE_LATIN] = true;
00157 }
00158 }
00159 scripts[option] = false;
00160
00161
00162 j = 0;
00163 for (i = 0; !j && i < NUMSCRIPTS; i++) {
00164 if (scripts[i]) j++;
00165 }
00166 if (!j) {
00167 ucnv_close(conv);
00168 return 0;
00169 }
00170
00171 UnicodeString id;
00172 if (compat) {
00173 id = UnicodeString("NFKD");
00174 }
00175 else {
00176 id = UnicodeString("NFD");
00177 }
00178
00179
00180 if (scripts[SE_GREEK]) {
00181 if (option == SE_BETA)
00182 id += UnicodeString(";Greek-Beta");
00183 else if (option == SE_BGREEK)
00184 id += UnicodeString(";Greek-BGreek");
00185 else {
00186 if (!strnicmp (((SWModule*)module)->Lang(), "cop", 3)) {
00187 id += UnicodeString(";Coptic-Latin");
00188 }
00189 else {
00190 id += UnicodeString(";Greek-Latin");
00191 }
00192 scripts[SE_LATIN] = true;
00193 }
00194 }
00195 if (scripts[SE_HEBREW]) {
00196 if (option == SE_BETA)
00197 id += UnicodeString(";Hebrew-CCAT");
00198 else if (option == SE_SYRIAC)
00199 id += UnicodeString(";Hebrew-Syriac");
00200 else {
00201 id += UnicodeString(";Hebrew-Latin");
00202 scripts[SE_LATIN] = true;
00203 }
00204 }
00205 if (scripts[SE_CYRILLIC]) {
00206 id += UnicodeString(";Cyrillic-Latin");
00207 scripts[SE_LATIN] = true;
00208 }
00209 if (scripts[SE_ARABIC]) {
00210 id += UnicodeString(";Arabic-Latin");
00211 scripts[SE_LATIN] = true;
00212 }
00213 if (scripts[SE_SYRIAC]) {
00214 if (option == SE_BETA)
00215 id += UnicodeString(";Syriac-CCAT");
00216 else if (option == SE_HEBREW)
00217 id += UnicodeString(";Syriac-Hebrew");
00218 else {
00219 id += UnicodeString(";Syriac-Latin");
00220 scripts[SE_LATIN] = true;
00221 }
00222 }
00223 if (scripts[SE_THAI]) {
00224 id += UnicodeString(";Thai-Latin");
00225 scripts[SE_LATIN] = true;
00226 }
00227 if (scripts[SE_GEORGIAN]) {
00228 id += UnicodeString(";Georgian-Latin");
00229 scripts[SE_LATIN] = true;
00230 }
00231 if (scripts[SE_ARMENIAN]) {
00232 id += UnicodeString(";Armenian-Latin");
00233 scripts[SE_LATIN] = true;
00234 }
00235 if (scripts[SE_ETHIOPIC]) {
00236 id += UnicodeString(";Ethiopic-Latin");
00237 scripts[SE_LATIN] = true;
00238 }
00239 if (scripts[SE_GOTHIC]) {
00240 id += UnicodeString(";Gothic-Latin");
00241 scripts[SE_LATIN] = true;
00242 }
00243 if (scripts[SE_UGARITIC]) {
00244 id += UnicodeString(";Ugaritic-Latin");
00245 scripts[SE_LATIN] = true;
00246 }
00247 if (scripts[SE_HAN]) {
00248 if (!strnicmp (((SWModule*)module)->Lang(), "ja", 2)) {
00249 id += UnicodeString(";Kanji-OnRomaji");
00250 }
00251 else {
00252 id += UnicodeString(";Han-Pinyin");
00253 }
00254 scripts[SE_LATIN] = true;
00255 }
00256
00257
00258 if (option == SE_HIRAGANA && scripts[SE_KATAKANA]) {
00259 id += UnicodeString(";Katakana-Hiragana");
00260 scripts[SE_HIRAGANA] = true;
00261 }
00262 else if (option == SE_KATAKANA && scripts[SE_HIRAGANA]) {
00263 id += UnicodeString(";Hiragana-Katakana");
00264 scripts[SE_KATAKANA] = true;
00265 }
00266 else {
00267 if (scripts[SE_KATAKANA]) {
00268 id += UnicodeString(";Katakana-Latin");
00269 scripts[SE_LATIN] = true;
00270 }
00271 if (scripts[SE_HIRAGANA]) {
00272 id += UnicodeString(";Hiragana-Latin");
00273 scripts[SE_LATIN] = true;
00274 }
00275 }
00276
00277
00278 if (option == SE_HANGUL && scripts[SE_JAMO]) {
00279 noNFC = false;
00280 scripts[SE_HANGUL] = true;
00281 }
00282 else if (option == SE_JAMO && scripts[SE_HANGUL]) {
00283 noNFC = true;
00284 scripts[SE_JAMO] = true;
00285 }
00286 else {
00287 if (scripts[SE_HANGUL]) {
00288 id += UnicodeString(";Hangul-Latin");
00289 scripts[SE_LATIN] = true;
00290 }
00291 if (scripts[SE_JAMO]) {
00292 id += UnicodeString(";Jamo-Latin");
00293 scripts[SE_LATIN] = true;
00294 }
00295 }
00296
00297
00298 if (option < SE_DEVANAGARI || option > SE_MALAYALAM) {
00299
00300 if (scripts[SE_TAMIL]) {
00301 id += UnicodeString(";Tamil-Latin");
00302 scripts[SE_LATIN] = true;
00303 }
00304 if (scripts[SE_BENGALI]) {
00305 id += UnicodeString(";Bengali-Latin");
00306 scripts[SE_LATIN] = true;
00307 }
00308 if (scripts[SE_GURMUKHI]) {
00309 id += UnicodeString(";Gurmukhi-Latin");
00310 scripts[SE_LATIN] = true;
00311 }
00312 if (scripts[SE_GUJARATI]) {
00313 id += UnicodeString(";Gujarati-Latin");
00314 scripts[SE_LATIN] = true;
00315 }
00316 if (scripts[SE_ORIYA]) {
00317 id += UnicodeString(";Oriya-Latin");
00318 scripts[SE_LATIN] = true;
00319 }
00320 if (scripts[SE_TELUGU]) {
00321 id += UnicodeString(";Telugu-Latin");
00322 scripts[SE_LATIN] = true;
00323 }
00324 if (scripts[SE_KANNADA]) {
00325 id += UnicodeString(";Kannada-Latin");
00326 scripts[SE_LATIN] = true;
00327 }
00328 if (scripts[SE_MALAYALAM]) {
00329 id += UnicodeString(";Malayalam-Latin");
00330 scripts[SE_LATIN] = true;
00331 }
00332 }
00333 else {
00334 if (scripts[SE_LATIN]) {
00335 id += UnicodeString(";Latin-InterIndic");
00336 }
00337 if (scripts[SE_DEVANAGARI]) {
00338 id += UnicodeString(";Devanagari-InterIndic");
00339 }
00340 if (scripts[SE_TAMIL]) {
00341 id += UnicodeString(";Tamil-InterIndic");
00342 }
00343 if (scripts[SE_BENGALI]) {
00344 id += UnicodeString(";Bengali-InterIndic");
00345 }
00346 if (scripts[SE_GURMUKHI]) {
00347 id += UnicodeString(";Gurmurkhi-InterIndic");
00348 }
00349 if (scripts[SE_GUJARATI]) {
00350 id += UnicodeString(";Gujarati-InterIndic");
00351 }
00352 if (scripts[SE_ORIYA]) {
00353 id += UnicodeString(";Oriya-InterIndic");
00354 }
00355 if (scripts[SE_TELUGU]) {
00356 id += UnicodeString(";Telugu-InterIndic");
00357 }
00358 if (scripts[SE_KANNADA]) {
00359 id += UnicodeString(";Kannada-InterIndic");
00360 }
00361 if (scripts[SE_MALAYALAM]) {
00362 id += UnicodeString(";Malayalam-InterIndic");
00363 }
00364
00365 switch(option) {
00366 case SE_DEVANAGARI:
00367 id += UnicodeString(";InterIndic-Devanagari");
00368 break;
00369 case SE_TAMIL:
00370 id += UnicodeString(";InterIndic-Tamil");
00371 break;
00372 case SE_BENGALI:
00373 id += UnicodeString(";InterIndic-Bengali");
00374 break;
00375 case SE_GURMUKHI:
00376 id += UnicodeString(";InterIndic-Gurmukhi");
00377 break;
00378 case SE_GUJARATI:
00379 id += UnicodeString(";InterIndic-Gujarati");
00380 break;
00381 case SE_ORIYA:
00382 id += UnicodeString(";InterIndic-Oriya");
00383 break;
00384 case SE_TELUGU:
00385 id += UnicodeString(";InterIndic-Telugu");
00386 break;
00387 case SE_KANNADA:
00388 id += UnicodeString(";InterIndic-Kannada");
00389 break;
00390 case SE_MALAYALAM:
00391 id += UnicodeString(";InterIndic-Malayalam");
00392 break;
00393 default:
00394 id += UnicodeString(";InterIndic-Latin");
00395 scripts[SE_LATIN] = true;
00396 break;
00397 }
00398 }
00399
00400 if (scripts[SE_LATIN]) {
00401 switch (option) {
00402 case SE_GREEK:
00403 id += UnicodeString(";Latin-Greek");
00404 break;
00405 case SE_HEBREW:
00406 id += UnicodeString(";Latin-Hebrew");
00407 break;
00408 case SE_CYRILLIC:
00409 id += UnicodeString(";Latin-Cyrillic");
00410 break;
00411 case SE_ARABIC:
00412 id += UnicodeString(";Latin-Arabic");
00413 break;
00414 case SE_SYRIAC:
00415 id += UnicodeString(";Latin-Syriac");
00416 break;
00417 case SE_THAI:
00418 id += UnicodeString(";Latin-Thai");
00419 break;
00420 case SE_GEORGIAN:
00421 id += UnicodeString(";Latin-Georgian");
00422 break;
00423 case SE_ARMENIAN:
00424 id += UnicodeString(";Latin-Armenian");
00425 break;
00426 case SE_ETHIOPIC:
00427 id += UnicodeString(";Latin-Ethiopic");
00428 break;
00429 case SE_GOTHIC:
00430 id += UnicodeString(";Latin-Gothic");
00431 break;
00432 case SE_UGARITIC:
00433 id += UnicodeString(";Latin-Ugaritic");
00434 break;
00435 case SE_COPTIC:
00436 id += UnicodeString(";Latin-Coptic");
00437 break;
00438 case SE_KATAKANA:
00439 id += UnicodeString(";Latin-Katakana");
00440 break;
00441 case SE_HIRAGANA:
00442 id += UnicodeString(";Latin-Hiragana");
00443 break;
00444 case SE_JAMO:
00445 id += UnicodeString(";Latin-Jamo");
00446 break;
00447 case SE_HANGUL:
00448 id += UnicodeString(";Latin-Hangul");
00449 break;
00450 }
00451 }
00452
00453 if (option == SE_BASICLATIN) {
00454 id += UnicodeString(";Any-Latin1");
00455 }
00456
00457 if (noNFC) {
00458 id += UnicodeString(";NFD");
00459 } else {
00460 id += UnicodeString(";NFC");
00461 }
00462
00463 UParseError perr;
00464
00465 err = U_ZERO_ERROR;
00466 Transliterator * trans = Transliterator::createInstance(id, UTRANS_FORWARD, perr, err);
00467 if (trans) {
00468 UnicodeString target = UnicodeString(source);
00469 trans->transliterate(target);
00470 len = ucnv_fromUChars(conv, text, maxlen, target.getBuffer(), target.length(), &err);
00471 if (len < maxlen) *(text + len) = 0;
00472 else *(text + maxlen) = 0;
00473 delete trans;
00474 }
00475 ucnv_close(conv);
00476 }
00477 return 0;
00478 }
00479 #endif