hebrewmcim.cpp Source File

00001 
00011 #include <hebrewmcim.h>
00012 
00013 HebrewMCIM::HebrewMCIM()
00014                 :SWInputMethod() {
00015 
00016    init();
00017 }
00018 
00019 
00020 int *HebrewMCIM::translate(char in) {
00021         int retVal = 0;
00022         static int retString[5];
00023         int retStringIndex = 0;
00024 
00025         memset(retString, 0, 5);
00026 
00027         if (getState() > 1) {
00028                 if (getState() >= 12) { // serious issue with internal structure
00029                         setState(0);
00030                         retString[retStringIndex++] = in;
00031                         return retString;
00032                 }
00033                 map<int, int>::iterator find = subst2[getState()].find(in);
00034                 if (find != subst2[getState()].end())
00035                         retVal = find->second;
00036                 else retVal = in;
00037 
00038                 setState(0);
00039                 retString[retStringIndex++] = retVal;
00040                 return retString;
00041         }
00042         else {
00043                 retVal = subst[in];
00044 
00045                 if (retVal == 0) {
00046                         setState(0);
00047                         retString[retStringIndex++] = in;
00048                         return retString;
00049                 }
00050                 if (retVal > 100) {
00051                         setState(1);
00052                         retString[retStringIndex++] = retVal;
00053                         return retString;
00054                 }
00055                 if (retVal == 50) {  // multiChar
00056                         setState(1);
00057                         int *chars = multiChars[in];
00058                         if (chars != 0) {
00059                                 retString[retStringIndex++] = chars[0];
00060                                 retString[retStringIndex++] = chars[1];
00061                                 return retString;
00062                         }
00063                 }
00064         }
00065         setState(retVal);
00066         return 0;
00067 }
00068 
00069 
00070 void HebrewMCIM::init() {
00071         memset(subst, 0, 255);
00072 
00073         subst[')'] = 1488;
00074         subst['B'] = 1489;
00075         subst['G'] = 1490;
00076         subst['D'] = 1491;
00077         subst['H'] = 1492;
00078         subst['W'] = 1493;
00079         subst['Z'] = 1494;
00080         subst['X'] = 1495;
00081         subst['+'] = 1496;
00082         subst['Y'] = 1497;
00083 
00084         subst['k'] = 1498;  // finals
00085         subst['m'] = 1501;
00086         subst['n'] = 1503;
00087         subst['c'] = 1509;
00088 
00089         subst['P'] = 1508;
00090         subst['K'] = 1499;
00091         subst['L'] = 1500;
00092         subst['M'] = 1502;
00093         subst['N'] = 1504;
00094         subst['S'] = 1505;
00095         subst['('] = 1506;
00096         subst['p'] = 1507;
00097         subst['C'] = 1510;
00098         subst['Q'] = 1511;
00099         subst['R'] = 1512;
00100         subst['#'] = 1513;
00101 
00102         // special multiChars
00103         subst['&'] = 50;
00104         subst['$'] = 50;
00105 
00106         static int x[] = {1513, 1474};
00107         multiChars['&'] = x;
00108         static int y[] = {1513, 1473};
00109         multiChars['$'] = y;
00110 
00111         subst['T'] = 1514;
00112 
00113         // VOWELS
00114         subst['A'] = 1463;
00115         subst['F'] = 1464;
00116         subst['E'] = 1462;
00117         subst['"'] = 1461;
00118         subst['I'] = 1460;
00119         subst['O'] = 1465;
00120         subst['U'] = 1467;
00121 
00122 
00123 
00124         // OTHER DIACRITICS
00125         subst['.'] = 1468;
00126         subst['-'] = 1470;
00127         subst[','] = 1471;
00128 
00129         // Compound input
00130 
00131         // CANTILLATION
00132 
00133         subst[':'] = 2;
00134         subst2[2]['A'] = 1458;
00135         subst2[2]['E'] = 1457;
00136         subst2[2]['F'] = 1459;
00137 
00138 
00139         /* Telisha qetana is postpositive as in '04' above. However, Michigan
00140 # code '24' is for a medial telisha. Graphically, there is no
00141 # difference.
00142         */
00143         subst['2'] = 5;
00144         subst2[5]['4'] = 1449;
00145 
00146 
00147         /* Note Michigan encoding distinguishes between medial metheg '35' (occuring
00148 # on the left of the vowel), and the ordinary meteg '95' (occuring on the
00149 # right of the vowel). It is also used for silluq.
00150         */
00151         subst['3'] = 6;
00152         subst2[6]['3'] = 1433;
00153         subst2[6]['5'] = 1469;
00154 
00155 
00156         /* The Michigan code of telisha gedola in medial position. Graphically,
00157 # there is no difference.
00158         */
00159         subst['4'] = 7;
00160         subst2[7]['4'] = 1440;
00161 
00162         subst['6'] = 8;
00163         subst2[8]['0'] = 1451;
00164         subst2[8]['1'] = 1436;
00165 
00166         subst['1'] = 4;
00167         subst2[4]['0'] = 1434;
00168 
00169         /* In the poetic books, prepositive dehi occurs; it's unclear whether
00170 # tipeha also occurs in the poetic books. Otherwise, we could simply
00171 # check for what book in the Tanach we are in. Michigan uses the same
00172 # code for each.
00173         */
00174 
00175         subst2[4]['3'] = 1430;
00176 
00177         /* This is the poetic accent mugrash, which also includes rebia, but is
00178 # encoded separately as '81' in the Michigan text.
00179         */
00180         subst2[4]['1'] = 1437;
00181         subst2[4]['4'] = 1440;
00182 
00183 
00184         subst['0'] = 3;
00185         subst2[3]['0'] = 1475;
00186         subst2[3]['1'] = 1426;
00187 
00188         /* According to BHS, zarqa and sinnor are both postpositive. However,
00189 # the Michigan encoding uses one code for both. The Unicode zarqa
00190 # (0x0598) is definitely NOT postpositive. And further, the shape of
00191 # the symbol is different in BHS and Uniocde. This needs further
00192 # research to determine what's going on here. For now, we follow BHS
00193 # and use the postpositive Unicode zinor or both accents.
00194         */
00195 
00196         subst2[3]['2'] = 1454;
00197 
00198         /* Pashta is postpositive, and the Unicode equivalent reflects
00199 # this. However, there is a poetic equivalent -- azla legarmeh --
00200 # which is not postpositive, but no equivalent code point exists in
00201 # Unicode. The Michigan encoding does not distinguish between the two,
00202 # although it could be algorithmically determined.
00203         */
00204 
00205         subst2[3]['3'] = 1433;
00206         subst2[3]['4'] = 1449;
00207         subst2[3]['5'] = 1472;
00208 
00209 
00210         /* This is the Unicode Hebrew *accent*; there is also another Hebrew
00211 # *punctuation* called GERSHAYIM 0x05F4. I'm using the more
00212 # traditional rounded marks, rather than the alternate straight
00213 # marks.
00214         */
00215 
00216         subst2[8]['2'] = 1438;
00217 
00218         // Also known as azla
00219         subst2[8]['3'] = 1448;
00220         subst2[8]['4'] = 1452;
00221         subst2[8]['5'] = 1427;
00222 
00223 
00224         subst['8'] = 9;
00225         subst2[9]['0'] = 1428;
00226         subst2[9]['1'] = 1431;
00227 
00228         /* Note, this accent is actually sinnorit, but it does not exist as a
00229 # separate glyph in the Unicode standard. The 'ZINOR' Unicode accent
00230 # is postpositive, while sinnorit is not. ZARQA is as close as I can
00231 # get to this.
00232         */
00233         subst2[9]['2'] = 1432;
00234 
00235         /* The Unicode form does not match the form used by BHS, but the names
00236 # are the same.
00237         */
00238         subst2[9]['3'] = 1441;
00239         subst2[9]['4'] = 1439;
00240         subst2[9]['5'] = 1429;
00241 
00242         subst['7'] = 10;
00243         subst2[10]['0'] = 1444;
00244         subst2[10]['1'] = 1445;
00245         subst2[10]['2'] = 1446;
00246         subst2[10]['3'] = 1430;  // also '13', '73' also is used for majela
00247         subst2[10]['4'] = 1443;
00248         subst2[10]['5'] = 1469;  // this is silluq; should appear to the left of the vowel
00249 
00250         subst['9'] = 11;
00251         subst2[11]['1'] = 1435;
00252         subst2[11]['2'] = 1425;
00253         subst2[11]['3'] = 1450;
00254         subst2[11]['4'] = 1447;
00255         subst2[11]['5'] = 1469;  // should appear to the right of the vowel
00256 
00257 }
00258 
00259         /*
00260 
00261 
00262 # CANTILLATION MARKS
00263 
00264         my  $ETNAHTA =           '&#1425;';
00265 # officially the Unicode name for this symbol was "SEGOL." However, that is
00266 # not a unique name, conflicting with the vowel of the same name. Further,
00267 # the position of the symbol is different. I have changed the name of the
00268 # accent to "SEGOLTA," the traditional name for this accent.
00269         my  $SEGOLTA =           '&#1426;';
00270         my  $SHALSHELET =        '&#1427;';
00271         my  $ZAQEF_QATAN =       '&#1428;';
00272         my  $ZAQEF_GADOL =       '&#1429;';
00273         my  $TIPEHA =            '&#1430;';
00274         my  $REVIA =             '&#1431;';
00275         my  $ZARQA =             '&#1432;';
00276         my  $PASHTA =            '&#1433;';
00277         my  $YETIV =             '&#1434;';
00278         my  $TEVIR =             '&#1435;';
00279         my  $GERESH =            '&#1436;';
00280         my  $GERESH_MUQDAM =     '&#1437;';
00281         my  $GERSHAYIM =         '&#1438;';
00282         my  $QARNEY_PARA =       '&#1439;';
00283         my  $TELISHA_GEDOLA =    '&#1440;';
00284         my  $PAZER =             '&#1441;';
00285         my  $MUNAH =             '&#1443;';
00286         my  $MAHAPAKH =          '&#1444;';
00287         my  $MERKHA =            '&#1445;';
00288         my  $MERKHA_KEFULA =     '&#1446;';
00289         my  $DARGA =             '&#1447;';
00290         my  $QADMA =             '&#1448;';
00291         my  $TELISHA_QETANA =    '&#1449;';
00292         my  $YERAH_BEN_YOMO =    '&#1450;';
00293         my  $OLE =               '&#1451;';
00294         my  $ILUY =              '&#1452;';
00295         my  $DEHI =              '&#1453;';
00296         my  $ZINOR =             '&#1454;';
00297 # HEBREW MARK
00298         my  $MASORA_CIRCLE =     '&#1455;';
00299 # HEBREW EXTENDED-A  points and punctuation
00300         my  $SHEVA =             '&#1456;';
00301         my  $HATAF_SEGOL =       '&#1457;';
00302         my  $HATAF_PATAH =       '&#1458;';
00303         my  $HATAF_QAMATS =      '&#1459;';
00304         my  $HIRIQ =             '&#1460;';
00305         my  $TSERE =             '&#1461;';
00306         my  $SEGOL =             '&#1462;';
00307 # furtive Patah is not a distinct character
00308         my  $PATAH =             '&#1463;';
00309         my  $QAMATS =            '&#1464;';
00310         my  $HOLAM =             '&#1465;';
00311         my  $QUBUTS =            '&#1467;';
00312 # also used as shuruq
00313 # falls within the base letter
00314         my  $DAGESH_OR_MAPIQ =   '&#1468;';
00315 # also used as siluq
00316         my  $METAG =             '&#1469;';
00317         my  $MAQAF =             '&#1470;';
00318         my  $RAFE =              '&#1471;';
00319 # Also used for legarmeh
00320 #   may be treated as spacing punctuation, not as a point
00321         my  $PASEQ =             '&#1472;';
00322         my  $SHIN_DOT =          '&#1473;';
00323         my  $SIN_DOT =           '&#1474;';
00324         my  $SOF_PASUQ =         '&#1475;';
00325 # HEBREW MARK
00326         my  $UPPER_DOT =         '&#1476;';
00327 # HEBREW LETTERS based on ISO 8859-8
00328 # aleph
00329 #  x (alef symbol - 2135)
00330         my  $ALEF =              '&#1488;';
00331 #  x (bet symbol - 2136)
00332         my  $BET =               '&#1489;';
00333 #  x (gimel symbol - 2137)
00334         my  $GIMEL =             '&#1490;';
00335 #  x (dalet symbol - 2138)
00336         my  $DALET =             '&#1491;';
00337         my  $HE =                '&#1492;';
00338         my  $VAV =               '&#1493;';
00339         my  $ZAYIN =             '&#1494;';
00340         my  $HET =               '&#1495;';
00341         my  $TET =               '&#1496;';
00342         my  $YOD =               '&#1497;';
00343         my  $FINAL_KAF =         '&#1498;';
00344         my  $KAF =               '&#1499;';
00345         my  $LAMED =             '&#1500;';
00346         my  $FINAL_MEM =         '&#1501;';
00347         my  $MEM =               '&#1502;';
00348         my  $FINAL_NUN =         '&#1503;';
00349         my  $NUN =               '&#1504;';
00350         my  $SAMEKH =            '&#1505;';
00351         my  $AYIN =              '&#1506;';
00352         my  $FINAL_PE =          '&#1507;';
00353         my  $PE =                '&#1508;';
00354         my  $FINAL_TSADI =       '&#1509;';
00355 # also known as zade
00356         my  $TSADI =             '&#1510;';
00357         my  $QOF =               '&#1511;';
00358         my  $RESH =              '&#1512;';
00359         my  $SHIN =              '&#1513;';
00360         my  $TAV =               '&#1514;';
00361 # Yiddish digraphs
00362 #   Hebrew Ligature
00363 # tsvey vovn
00364         my  $DOUBLE_VAV =        '&#1520;';
00365         my  $VAV_YOD =           '&#1521;';
00366 # tsvey yudn
00367         my  $DOUBLE_YOD =        '&#1522;';
00368 
00369 # Additional punctuation
00370         my  $PUNCT_GERESH =      '&#1523;';
00371         my  $PUNCT_GERSHAYIM =   '&#1524;';
00372 # Reserved: 0x05F5"
00373 # x (hebrew point judeo-spanish varika - FB1E)
00374 #my  $JUDEO_SPANISH_VARIKA = pack("U",0xFB1E); # UTF-8 OxFB1E
00375 
00376 #############################
00377 # End of Unicode 2.0 Hebrew #
00378 #############################
00379 
00380 # A hash whose key is a Michigan code, and whose value is a Unicode
00381 # equivalent
00382 
00383         char subst[] = new char [255];
00384         subst[')'] = 1488;
00385         'B'  => $BET,
00386         'G'  => $GIMEL,
00387         'D'  => $DALET,
00388         'H'  => $HE,
00389         'W'  => $VAV,
00390         'Z'  => $ZAYIN,
00391         'X'  => $HET,
00392         '+'  => $TET,
00393         'Y'  => $YOD,
00394         'K'  => $KAF,
00395         'L'  => $LAMED,
00396         'M'  => $MEM,
00397         'N'  => $NUN,
00398         'S'  => $SAMEKH,
00399         '('  => $AYIN,
00400         'P'  => $PE,
00401         'C'  => $TSADI,
00402         'Q'  => $QOF,
00403         'R'  => $RESH,
00404         '#'  => $SHIN, # the letter shin without a point
00405         '&'  => ($SHIN . $SIN_DOT),
00406         '$'  => ($SHIN . $SHIN_DOT), # '
00407         'T'  => $TAV,
00408 # VOWELS
00409         'A'  => $PATAH,
00410         'F'  => $QAMATS,
00411         'E'  => $SEGOL,
00412         '"'  => $TSERE,
00413         'I'  => $HIRIQ,
00414         'O'  => $HOLAM,
00415         'U'  => $QUBUTS,
00416         ':'  => $SHEVA,
00417         ':A' => $HATAF_PATAH,
00418         ':E' => $HATAF_SEGOL,
00419         ':F' => $HATAF_QAMATS,
00420 # OTHER DIACRITICS
00421         '.'  => $DAGESH_OR_MAPIQ,
00422         '-'  => $MAQAF,
00423         ','  => $RAFE,
00424 # CANTILLATION
00425         '00' => $SOF_PASUQ,
00426         '01' => $SEGOLTA,
00427 # According to BHS, zarqa and sinnor are both postpositive. However,
00428 # the Michigan encoding uses one code for both. The Unicode zarqa
00429 # (0x0598) is definitely NOT postpositive. And further, the shape of
00430 # the symbol is different in BHS and Unicode. This needs further
00431 # research to determine what's going on here. For now, we follow BHS
00432 # and use the postpositive Unicode zinor for both accents.
00433         '02' => $ZINOR,
00434 # Pashta is postpositive, and the Unicode equivalent reflects
00435 # this. However, there is a poetic equivalent -- azla legarmeh --
00436 # which is not postpositive, but no equivalent code point exists in
00437 # Unicode. The Michigan encoding does not distinguish between the two,
00438 # although it could be algorithmically determined.
00439         '03' => $PASHTA,
00440         '04' => $TELISHA_QETANA,
00441         '05' => $PASEQ,
00442         '10' => $YETIV,
00443 # In the poetic books, prepositive dehi occurs; it's unclear whether
00444 # tipeha also occurs in the poetic books. Otherwise, we could simply
00445 # check for what book in the Tanach we are in. Michigan uses the same
00446 # code for each.
00447         '13' => $TIPEHA, # also $DEHI
00448 # This is the poetic accent mugrash, which also includes rebia, but is
00449 # encoded separately as '81' in the Michigan text.
00450         '11' => $GERESH_MUQDAM,
00451         '14' => $TELISHA_GEDOLA,
00452 # Telisha qetana is postpositive as in '04' above. However, Michigan
00453 # code '24' is for a medial telisha. Graphically, there is no
00454 # difference.
00455         '24' => $TELISHA_QETANA,
00456         '33' => $PASHTA,
00457 # The Michigan code of telisha gedola in medial position. Graphically,
00458 # there is no difference.
00459         '44' => $TELISHA_GEDOLA,
00460         '60' => $OLE,
00461         '61' => $GERESH,
00462 # This is the Unicode Hebrew *accent*; there is also another Hebrew
00463 # *punctuation* called GERSHAYIM 0x05F4. I'm using the more
00464 # traditional rounded marks, rather than the alternate straight
00465 # marks.
00466         '62' => $GERSHAYIM,
00467 # Also known as azla
00468         '63' => $QADMA,
00469         '64' => $ILUY,
00470         '65' => $SHALSHELET,
00471         '80' => $ZAQEF_QATAN,
00472         '81' => $REVIA,
00473 # Note, this accent is actually sinnorit, but it does not exist as a
00474 # separate glyph in the Unicode standard. The 'ZINOR' Unicode accent
00475 # is postpositive, while sinnorit is not. ZARQA is as close as I can
00476 # get to this.
00477         '82' => $ZARQA,
00478 # The Unicode form does not match the form used by BHS, but the names
00479 # are the same.
00480         '83' => $PAZER,
00481         '84' => $QARNEY_PARA,
00482         '85' => $ZAQEF_GADOL,
00483 # Note Michigan encoding distinguishes between medial metheg '35' (occurring
00484 # on the left of the vowel), and the ordinary meteg '95' (occurring on the
00485 # right of the vowel). It is also used for silluq.
00486         '35' => $METAG,
00487         '70' => $MAHAPAKH,
00488         '71' => $MERKHA,
00489         '72' => $MERKHA_KEFULA,
00490         '73' => $TIPEHA, # also '13', '73' also is used for majela
00491         '74' => $MUNAH,
00492         '75' => $METAG, # this is silluq; should appear to the left of the vowel
00493         '91' => $TEVIR,
00494         '92' => $ETNAHTA,
00495         '93' => $YERAH_BEN_YOMO,
00496         '94' => $DARGA,
00497         '95' => $METAG, # should appear to the right of the vowel
00498 
00499 # Not used by the Michigan Encoding
00500 # $UPPER_DOT = '05C4';
00501         );
00502 
00503 # declare other variables
00504         my (@bhsLines,
00505         @bhsVerse,
00506         @entity_line) = ();
00507 
00508         my ($i,
00509         $verse,
00510         $word,
00511         $character) = 0;
00512 
00513         my ($element,
00514         $saveGuttural) = "";
00515 
00516 # read in a line
00517         while (<>) {
00518 # Process one verse
00519 # iterate over every character and change to XML decimal entity
00520         CHAR: for ( $i = 0; ($i < scalar(@bhsVerse)); $i++) {
00521          # find and convert final kaf, mem, nun, pe, tsade
00522          ( # if final form
00523           $bhsVerse[$i] =~ /[KMNPC]/
00524          )
00525            &&
00526                 (
00527                  ( # whitespace or
00528                   $bhsVerse[$i+1] =~ /[ \-?]/
00529                  )
00530                  ||
00531                  ( # EOL or
00532                   $i == ( scalar(@bhsVerse) - 1 )
00533                  )
00534                  ||
00535                  ( # sof pasuq or
00536                   ( $bhsVerse[$i+1] =~ /0/ ) &&
00537                   ( $bhsVerse[$i+2] =~ /0/ )
00538                  )
00539                  ||
00540                  ( # one accent followed by white, eol or
00541                   (
00542                    ( $bhsVerse[$i+1] =~ /\d/ ) &&
00543                    ( $bhsVerse[$i+2] =~ /\d/ )
00544                   ) &&
00545                   (
00546                    ( $bhsVerse[$i+3] =~ /[ \-?]/ ) ||
00547                    ( $i == ( scalar(@bhsVerse) - 1 ) )
00548                   )
00549                  )
00550                  ||
00551                  ( # two accents followed by white, eol
00552                   (
00553                    ( $bhsVerse[$i+1] =~ /\d/ ) &&
00554                    ( $bhsVerse[$i+2] =~ /\d/ ) &&
00555                    ( $bhsVerse[$i+3] =~ /\d/ ) &&
00556                    ( $bhsVerse[$i+4] =~ /\d/ )
00557                   ) &&
00558                   (
00559                    ( $bhsVerse[$i+5] =~ /[ \-?]/ ) ||
00560                    ( $i == ( scalar(@bhsVerse) - 1 ) )
00561                   )
00562                  )
00563                  ||
00564                  ( # followed by a vowel and white, eol, sof pasuq
00565                   ( $bhsVerse[$i+1] =~ /[:F]/ ) &&
00566                   ( # followed by
00567                    ( $bhsVerse[$i+2] =~ /[ \-?]/ ) || # whitespace or
00568                    ( $i == ( scalar(@bhsVerse) - 1 ) ) || # eol or
00569                    ( # sof pasuq
00570                     ( $bhsVerse[$i+2] =~ /0/ ) &&
00571                     ( $bhsVerse[$i+3] =~ /0/ )
00572                    )
00573                   )
00574                  )
00575                 ) # end of what follows after final letter
00576                   &&
00577                     do {
00578                          $bhsVerse[$i] =~ /K/ && eval { push @entity_line,$FINAL_KAF; }
00579                            && next CHAR;
00580                          $bhsVerse[$i] =~ /M/ && eval { push @entity_line,$FINAL_MEM; }
00581                            && next CHAR;
00582                          $bhsVerse[$i] =~ /N/ && eval { push @entity_line,$FINAL_NUN; }
00583                            && next CHAR;
00584                          $bhsVerse[$i] =~ /P/ && eval { push @entity_line,$FINAL_PE; }
00585                            && next CHAR;
00586                          $bhsVerse[$i] =~ /C/ && eval { push @entity_line,$FINAL_TSADI; }
00587                            && next CHAR;
00588                     };
00589          # find and convert "furtive patach"
00590          ( $bhsVerse[$i] =~ /A/ ) &&             # If the letter is a patach
00591            ( $bhsVerse[$i-1] =~ /[)HX(]/ ) &&    #  and is preceded by a guttural
00592            ( ( $bhsVerse[$i-2] =~ /[AEFOU]/ ) || #  and is preceded by a vowel
00593                 ( ( $bhsVerse[$i-2] =~ /\./ ) &&    #  or by suruq
00594                   ( $bhsVerse[$i-3] =~ /W/ ) ) ||    #
00595                 ( ( $bhsVerse[$i-2] =~ /W/ ) &&      #  or by holem (written plene)
00596                   ( $bhsVerse[$i-3] =~ /O/ ) ) ||    #
00597                 ( ( $bhsVerse[$i-2] =~ /Y/ ) &&      #  or by hiriq-yod
00598                   ( $bhsVerse[$i-3] =~ /I/ ) ) ) &&
00599                   do {
00600                          $saveGuttural = pop @entity_line; # snip off the gutteral
00601                          push @entity_line,$PATAH;         # push on the patach
00602                          push @entity_line,$saveGuttural;  # push back on the gutteral
00603                          next CHAR;
00604                   };
00605          # convert cantillation
00606          #   since we have previously dealt with all other cases of
00607          #   numbers, two digit patterns are all we have to search for
00608          $bhsVerse[$i] =~ /\d/ && $bhsVerse[$i+1] =~ /\d/ && do {
00609                 push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]$bhsVerse[$i+1]"};
00610                 $i++; # accents are two digits long, so advance past the 2nd digit
00611                 next CHAR;
00612          };
00613          # convert katef vowels, which are two characters long
00614          $bhsVerse[$i] =~ /:/ && $bhsVerse[$i+1] =~ /[AEF]/ && do {
00615                 push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]$bhsVerse[$i+1]"};
00616                 $i++;
00617                 next CHAR;
00618          };
00619          # convert everything else
00620          push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]"};
00621         } # end CHAR
00622 # print the line to standard output with XML character-level encoding
00623 # each character has the following format:
00624 # <c id="1kg1.verse#.word#.character#">&#1234;</c>
00625 
00626 # set up the verse element
00627         $word = 1;
00628         $character = 1;
00629         print "<verse>\n<word>\n";
00630 # print each character element
00631 # if there is a space, then close the word entity, open a new word
00632 # entity, increment the word number, reset the character number to
00633 # zero.
00634         foreach $element (@entity_line) {
00635          if ( $element =~ " " ) {
00636            $word++;
00637            $character = 1;
00638            print "</word>\n<word>\n";
00639            next;
00640          }
00641          print "<c id=\"1kg1.$verse.$word.$character\">$element</c>\n";
00642          $character++;
00643         }
00644 # close the verse element
00645         print "</word></verse>\n";
00646 # reinitialize variables
00647         @bhsVerse = ();
00648         @entity_line = ();
00649         @bhsLines = ();
00650         } # end while
00651 # close the XML document
00652         print "</body>\n";
00653         */