Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members  

scsuutf8.cpp

00001 /******************************************************************************
00002  *
00003  * SCSUUTF8 -   SWFilter descendant to convert SCSU-compressed text to UTF-8
00004  *
00005  */
00006 
00007 
00008 /* This class is based on:
00009  * http://czyborra.com/scsu/scsu.c written by Roman Czyborra@dds.nl
00010  * on Andrea's balcony in North Amsterdam on 1998-08-04
00011  * Thanks to Richard Verhoeven <rcb5@win.tue.nl> for his suggestion
00012  * to correct the haphazard "if" after UQU to "else if" on 1998-10-01
00013  * 
00014  * This is a deflator to UTF-8 output for input compressed in SCSU,
00015  * the (Reuters) Standard Compression Scheme for Unicode as described
00016  * in http://www.unicode.org/unicode/reports/tr6.html
00017  */
00018 
00019 #include <stdlib.h>
00020 #include <stdio.h>
00021 #include <swmodule.h>
00022 
00023 #include <scsuutf8.h>
00024 
00025 SCSUUTF8::SCSUUTF8() {
00026 }
00027 
00028 
00029 unsigned char* SCSUUTF8::UTF8Output(unsigned long uchar, unsigned char* text)
00030 {
00031   /* join UTF-16 surrogates without any pairing sanity checks */
00032 
00033   static int d;
00034   
00035   if (uchar >= 0xd800 && uchar <= 0xdbff) { d = uchar & 0x3f; return text;  }
00036   if (uchar >= 0xdc00 && uchar <= 0xdfff) { uchar = uchar + 0x2400 + d * 0x400; }
00037   
00038   /* output one character as UTF-8 multibyte sequence */
00039   
00040   if (uchar < 0x80) {
00041     *text++ = c;
00042   }
00043   else if (uchar < 0x800) { 
00044     *text++ = 0xc0 | uchar >> 6; 
00045     *text++ = 0x80 | uchar & 0x3f;
00046   }
00047   else if (uchar < 0x10000) {
00048     *text++ = 0xe0 | uchar >> 12; 
00049     *text++ = 0x80 | uchar >> 6 & 0x3f;
00050     *text++ = 0x80 | uchar & 0x3f;
00051   }
00052   else if (uchar < 0x200000) {
00053     *text++ = 0xf0 | uchar >> 18;
00054     *text++ = 0x80 | uchar >> 12 & 0x3f; 
00055     *text++ = 0x80 | uchar >> 6 & 0x3f; 
00056     *text++ = 0x80 | uchar & 0x3f;
00057   }  
00058   
00059   return text;
00060 }
00061 
00062 char SCSUUTF8::ProcessText(char *text, int len, const SWKey *key, const SWModule *module)
00063 {
00064   unsigned char *to, *from;
00065   unsigned long buflen = len * FILTERPAD;
00066   char active = 0, mode = 0;
00067 
00068   static unsigned short start[8] = {0x0000,0x0080,0x0100,0x0300,0x2000,0x2080,0x2100,0x3000};
00069   static unsigned short slide[8] = {0x0080,0x00C0,0x0400,0x0600,0x0900,0x3040,0x30A0,0xFF00};
00070   static unsigned short win[256]   = {
00071     0x0000, 0x0080, 0x0100, 0x0180, 0x0200, 0x0280, 0x0300, 0x0380,
00072     0x0400, 0x0480, 0x0500, 0x0580, 0x0600, 0x0680, 0x0700, 0x0780,
00073     0x0800, 0x0880, 0x0900, 0x0980, 0x0A00, 0x0A80, 0x0B00, 0x0B80,
00074     0x0C00, 0x0C80, 0x0D00, 0x0D80, 0x0E00, 0x0E80, 0x0F00, 0x0F80,
00075     0x1000, 0x1080, 0x1100, 0x1180, 0x1200, 0x1280, 0x1300, 0x1380,
00076     0x1400, 0x1480, 0x1500, 0x1580, 0x1600, 0x1680, 0x1700, 0x1780,
00077     0x1800, 0x1880, 0x1900, 0x1980, 0x1A00, 0x1A80, 0x1B00, 0x1B80,
00078     0x1C00, 0x1C80, 0x1D00, 0x1D80, 0x1E00, 0x1E80, 0x1F00, 0x1F80,
00079     0x2000, 0x2080, 0x2100, 0x2180, 0x2200, 0x2280, 0x2300, 0x2380,
00080     0x2400, 0x2480, 0x2500, 0x2580, 0x2600, 0x2680, 0x2700, 0x2780,
00081     0x2800, 0x2880, 0x2900, 0x2980, 0x2A00, 0x2A80, 0x2B00, 0x2B80,
00082     0x2C00, 0x2C80, 0x2D00, 0x2D80, 0x2E00, 0x2E80, 0x2F00, 0x2F80,
00083     0x3000, 0x3080, 0x3100, 0x3180, 0x3200, 0x3280, 0x3300, 0x3800,
00084     0xE000, 0xE080, 0xE100, 0xE180, 0xE200, 0xE280, 0xE300, 0xE380,
00085     0xE400, 0xE480, 0xE500, 0xE580, 0xE600, 0xE680, 0xE700, 0xE780,
00086     0xE800, 0xE880, 0xE900, 0xE980, 0xEA00, 0xEA80, 0xEB00, 0xEB80,
00087     0xEC00, 0xEC80, 0xED00, 0xED80, 0xEE00, 0xEE80, 0xEF00, 0xEF80,
00088     0xF000, 0xF080, 0xF100, 0xF180, 0xF200, 0xF280, 0xF300, 0xF380,
00089     0xF400, 0xF480, 0xF500, 0xF580, 0xF600, 0xF680, 0xF700, 0xF780,
00090     0xF800, 0xF880, 0xF900, 0xF980, 0xFA00, 0xFA80, 0xFB00, 0xFB80,
00091     0xFC00, 0xFC80, 0xFD00, 0xFD80, 0xFE00, 0xFE80, 0xFF00, 0xFF80,
00092     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
00093     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
00094     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
00095     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
00096     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
00097     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
00098     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
00099     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
00100     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
00101     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
00102     0x0000, 0x00C0, 0x0250, 0x0370, 0x0530, 0x3040, 0x30A0, 0xFF60
00103   };
00104 
00105   if (!len)
00106         return 0;
00107 
00108   memmove(&text[buflen - len], text, len);
00109   from = (unsigned char*)&text[buflen - len];
00110   to = (unsigned char *)text;
00111 
00112   // -------------------------------
00113 
00114   for (int i = 0; i < len;) {
00115 
00116 
00117       if (i >= len) break;
00118       c = from[i++];
00119 
00120       if (c >= 0x80)
00121         {
00122           to = UTF8Output (c - 0x80 + slide[active], to);
00123         }
00124       else if (c >= 0x20 && c <= 0x7F)
00125         {
00126           to = UTF8Output (c, to);
00127         }
00128       else if (c == 0x0 || c == 0x9 || c == 0xA || c == 0xC || c == 0xD)
00129         {
00130           to = UTF8Output (c, to);
00131         }
00132       else if (c >= 0x1 && c <= 0x8) /* SQn */
00133         {
00134           if (i >= len) break;
00135           /* single quote */ d = from[i++];
00136 
00137           to = UTF8Output (d < 0x80 ? d + start [c - 0x1] :
00138                   d - 0x80 + slide [c - 0x1], to);
00139         }
00140       else if (c >= 0x10 && c <= 0x17) /* SCn */
00141         {
00142           /* change window */ active = c - 0x10;
00143         }
00144       else if (c >= 0x18 && c <= 0x1F) /* SDn */
00145         {
00146           /* define window */ active = c - 0x18;
00147           if (i >= len) break;
00148           slide [active] = win [from[i++]];
00149         }
00150       else if (c == 0xB) /* SDX */
00151         {
00152           if (i >= len) break;
00153           c = from[i++];
00154 
00155           if (i >= len) break;
00156           d = from[i++];
00157 
00158           slide [active = c>>5] = 0x10000 + (((c & 0x1F) << 8 | d) << 7);
00159         }
00160       else if (c == 0xE) /* SQU */
00161         {
00162           if (i >= len) break;
00163           /* SQU */ c = from[i++];
00164 
00165           if (i >= len) break;
00166           to = UTF8Output (c << 8 | from[i++], to);
00167         }
00168       else if (c == 0xF) /* SCU */
00169         {
00170           /* change to Unicode mode */ mode = 1;
00171 
00172           while (mode)
00173             {
00174               if (i >= len) break;
00175               c = from[i++];
00176 
00177               if (c <= 0xDF || c >= 0xF3)
00178                 {
00179                   if (i >= len) break;
00180                   to = UTF8Output (c << 8 | from[i++], to);
00181                 }
00182               else if (c == 0xF0) /* UQU */
00183                 {
00184                   if (i >= len) break;
00185                   c = from[i++];
00186 
00187                   if (i >= len) break;
00188                   to = UTF8Output (c << 8 | from[i++], to);
00189                 }
00190               else if (c >= 0xE0 && c <= 0xE7) /* UCn */
00191                 {
00192                   active = c - 0xE0; mode = 0;
00193                 }
00194               else if (c >= 0xE8 && c <= 0xEF) /* UDn */
00195                 {
00196                   if (i >= len) break;
00197                   slide [active=c-0xE8] = win [from[i++]]; mode = 0;
00198                 }
00199               else if (c == 0xF1) /* UDX */
00200                 {
00201                   if (i >= len) break;
00202                   c = from[i++];
00203 
00204                   if (i >= len) break;
00205                   d = from[i++];
00206 
00207                   slide [active = c>>5] =
00208                     0x10000 + (((c & 0x1F) << 8 | d) << 7); mode = 0;
00209                 }
00210             }
00211         }
00212 
00213 
00214   }
00215 
00216   *to++ = 0;
00217   *to = 0;
00218   return 0;
00219 }
00220 

Generated on Thu Jun 20 22:13:00 2002 for The Sword Project by doxygen1.2.15