00001 00011 #include <hebrewmcim.h> 00012 00013 HebrewMCIM::HebrewMCIM() 00014 :SWInputMethod() { 00015 00016 init(); 00017 } 00018 00019 00020 int *HebrewMCIM::translate(char in) { 00021 int retVal = 0; 00022 static int retString[5]; 00023 int retStringIndex = 0; 00024 00025 memset(retString, 0, 5); 00026 00027 if (getState() > 1) { 00028 if (getState() >= 12) { // serious issue with internal structure 00029 setState(0); 00030 retString[retStringIndex++] = in; 00031 return retString; 00032 } 00033 map<int, int>::iterator find = subst2[getState()].find(in); 00034 if (find != subst2[getState()].end()) 00035 retVal = find->second; 00036 else retVal = in; 00037 00038 setState(0); 00039 retString[retStringIndex++] = retVal; 00040 return retString; 00041 } 00042 else { 00043 retVal = subst[in]; 00044 00045 if (retVal == 0) { 00046 setState(0); 00047 retString[retStringIndex++] = in; 00048 return retString; 00049 } 00050 if (retVal > 100) { 00051 setState(1); 00052 retString[retStringIndex++] = retVal; 00053 return retString; 00054 } 00055 if (retVal == 50) { // multiChar 00056 setState(1); 00057 int *chars = multiChars[in]; 00058 if (chars != 0) { 00059 retString[retStringIndex++] = chars[0]; 00060 retString[retStringIndex++] = chars[1]; 00061 return retString; 00062 } 00063 } 00064 } 00065 setState(retVal); 00066 return 0; 00067 } 00068 00069 00070 void HebrewMCIM::init() { 00071 memset(subst, 0, 255); 00072 00073 subst[')'] = 1488; 00074 subst['B'] = 1489; 00075 subst['G'] = 1490; 00076 subst['D'] = 1491; 00077 subst['H'] = 1492; 00078 subst['W'] = 1493; 00079 subst['Z'] = 1494; 00080 subst['X'] = 1495; 00081 subst['+'] = 1496; 00082 subst['Y'] = 1497; 00083 00084 subst['k'] = 1498; // finals 00085 subst['m'] = 1501; 00086 subst['n'] = 1503; 00087 subst['c'] = 1509; 00088 00089 subst['P'] = 1508; 00090 subst['K'] = 1499; 00091 subst['L'] = 1500; 00092 subst['M'] = 1502; 00093 subst['N'] = 1504; 00094 subst['S'] = 1505; 00095 subst['('] = 1506; 00096 subst['p'] = 1507; 
00097 subst['C'] = 1510; 00098 subst['Q'] = 1511; 00099 subst['R'] = 1512; 00100 subst['#'] = 1513; 00101 00102 // special multiChars 00103 subst['&'] = 50; 00104 subst['$'] = 50; 00105 00106 static int x[] = {1513, 1474}; 00107 multiChars['&'] = x; 00108 static int y[] = {1513, 1473}; 00109 multiChars['$'] = y; 00110 00111 subst['T'] = 1514; 00112 00113 // VOWELS 00114 subst['A'] = 1463; 00115 subst['F'] = 1464; 00116 subst['E'] = 1462; 00117 subst['"'] = 1461; 00118 subst['I'] = 1460; 00119 subst['O'] = 1465; 00120 subst['U'] = 1467; 00121 00122 00123 00124 // OTHER DIACRITICS 00125 subst['.'] = 1468; 00126 subst['-'] = 1470; 00127 subst[','] = 1471; 00128 00129 // Compound input 00130 00131 // CANTILLATION 00132 00133 subst[':'] = 2; 00134 subst2[2]['A'] = 1458; 00135 subst2[2]['E'] = 1457; 00136 subst2[2]['F'] = 1459; 00137 00138 00139 /* Telisha qetana is postpositive as in '04' above. However, Michigan 00140 # code '24' is for a medial telisha. Graphically, there is no 00141 # difference. 00142 */ 00143 subst['2'] = 5; 00144 subst2[5]['4'] = 1449; 00145 00146 00147 /* Note Michigan encoding distinguishes between medial metheg '35' (occuring 00148 # on the left of the vowel), and the ordinary meteg '95' (occuring on the 00149 # right of the vowel). It is also used for silluq. 00150 */ 00151 subst['3'] = 6; 00152 subst2[6]['3'] = 1433; 00153 subst2[6]['5'] = 1469; 00154 00155 00156 /* The Michigan code of telisha gedola in medial position. Graphically, 00157 # there is no difference. 00158 */ 00159 subst['4'] = 7; 00160 subst2[7]['4'] = 1440; 00161 00162 subst['6'] = 8; 00163 subst2[8]['0'] = 1451; 00164 subst2[8]['1'] = 1436; 00165 00166 subst['1'] = 4; 00167 subst2[4]['0'] = 1434; 00168 00169 /* In the poetic books, prepositive dehi occurs; it's unclear whether 00170 # tipeha also occurs in the poetic books. Otherwise, we could simply 00171 # check for what book in the Tanach we are in. Michigan uses the same 00172 # code for each. 
00173 */ 00174 00175 subst2[4]['3'] = 1430; 00176 00177 /* This is the poetic accent mugrash, which also includes rebia, but is 00178 # encoded separately as '81' in the Michigan text. 00179 */ 00180 subst2[4]['1'] = 1437; 00181 subst2[4]['4'] = 1440; 00182 00183 00184 subst['0'] = 3; 00185 subst2[3]['0'] = 1475; 00186 subst2[3]['1'] = 1426; 00187 00188 /* According to BHS, zarqa and sinnor are both postpositive. However, 00189 # the Michigan encoding uses one code for both. The Unicode zarqa 00190 # (0x0598) is definitely NOT postpositive. And further, the shape of 00191 # the symbol is different in BHS and Uniocde. This needs further 00192 # research to determine what's going on here. For now, we follow BHS 00193 # and use the postpositive Unicode zinor or both accents. 00194 */ 00195 00196 subst2[3]['2'] = 1454; 00197 00198 /* Pashta is postpositive, and the Unicode equivalent reflects 00199 # this. However, there is a poetic equivalent -- azla legarmeh -- 00200 # which is not postpositive, but no equivalent code point exists in 00201 # Unicode. The Michigan encoding does not distinguish between the two, 00202 # although it could be algorithmically determined. 00203 */ 00204 00205 subst2[3]['3'] = 1433; 00206 subst2[3]['4'] = 1449; 00207 subst2[3]['5'] = 1472; 00208 00209 00210 /* This is the Unicode Hebrew *accent*; there is also another Hebrew 00211 # *punctuation* called GERSHAYIM 0x05F4. I'm using the more 00212 # traditional rounded marks, rather than the alternate straight 00213 # marks. 00214 */ 00215 00216 subst2[8]['2'] = 1438; 00217 00218 // Also known as azla 00219 subst2[8]['3'] = 1448; 00220 subst2[8]['4'] = 1452; 00221 subst2[8]['5'] = 1427; 00222 00223 00224 subst['8'] = 9; 00225 subst2[9]['0'] = 1428; 00226 subst2[9]['1'] = 1431; 00227 00228 /* Note, this accent is actually sinnorit, but it does not exist as a 00229 # separate glyph in the Unicode standard. The 'ZINOR' Unicode accent 00230 # is postpositive, while sinnorit is not. 
ZARQA is as close as I can 00231 # get to this. 00232 */ 00233 subst2[9]['2'] = 1432; 00234 00235 /* The Unicode form does not match the form used by BHS, but the names 00236 # are the same. 00237 */ 00238 subst2[9]['3'] = 1441; 00239 subst2[9]['4'] = 1439; 00240 subst2[9]['5'] = 1429; 00241 00242 subst['7'] = 10; 00243 subst2[10]['0'] = 1444; 00244 subst2[10]['1'] = 1445; 00245 subst2[10]['2'] = 1446; 00246 subst2[10]['3'] = 1430; // also '13', '73' also is used for majela 00247 subst2[10]['4'] = 1443; 00248 subst2[10]['5'] = 1469; // this is silluq; should appear to the left of the vowel 00249 00250 subst['9'] = 11; 00251 subst2[11]['1'] = 1435; 00252 subst2[11]['2'] = 1425; 00253 subst2[11]['3'] = 1450; 00254 subst2[11]['4'] = 1447; 00255 subst2[11]['5'] = 1469; // should appear to the right of the vowel 00256 00257 } 00258 00259 /* 00260 00261 00262 # CANTILLION MARKS 00263 00264 my $ETNAHTA = '֑'; 00265 # officially the Unicode name for this symbol was "SEGOL." However, that is 00266 # not a unique name, conflicting with the vowel of the same name. Further, 00267 # the position of the symbol is different. I have changed the name of the 00268 # accent to "SEGOLTA," the traditional name for this accent. 
00269 my $SEGOLTA = '֒'; 00270 my $SHALSHELET = '֓'; 00271 my $ZAQEF_QATAN = '֔'; 00272 my $ZAQEF_GADOL = '֕'; 00273 my $TIPEHA = '֖'; 00274 my $REVIA = '֗'; 00275 my $ZARQA = '֘'; 00276 my $PASHTA = '֙'; 00277 my $YETIV = '֚'; 00278 my $TEVIR = '֛'; 00279 my $GERESH = '֜'; 00280 my $GERESH_MUQDAM = '֝'; 00281 my $GERSHAYIM = '֞'; 00282 my $QARNEY_PARA = '֟'; 00283 my $TELISHA_GEDOLA = '֠'; 00284 my $PAZER = '֡'; 00285 my $MUNAH = '֣'; 00286 my $MAHAPAKH = '֤'; 00287 my $MERKHA = '֥'; 00288 my $MERKHA_KEFULA = '֦'; 00289 my $DARGA = '֧'; 00290 my $QADMA = '֨'; 00291 my $TELISHA_QETANA = '֩'; 00292 my $YERAH_BEN_YOMO = '֪'; 00293 my $OLE = '֫'; 00294 my $ILUY = '֬'; 00295 my $DEHI = '֭'; 00296 my $ZINOR = '֮'; 00297 # HEBREW MARK 00298 my $MASORA_CIRCLE = '֯'; 00299 # HEBREW EXTENDED-A points and punctuation 00300 my $SHEVA = 'ְ'; 00301 my $HATAF_SEGOL = 'ֱ'; 00302 my $HATAF_PATAH = 'ֲ'; 00303 my $HATAF_QAMATS = 'ֳ'; 00304 my $HIRIQ = 'ִ'; 00305 my $TSERE = 'ֵ'; 00306 my $SEGOL = 'ֶ'; 00307 # furtive Patah is not a distinct character 00308 my $PATAH = 'ַ'; 00309 my $QAMATS = 'ָ'; 00310 my $HOLAM = 'ֹ'; 00311 my $QUBUTS = 'ֻ'; 00312 # also used as shuruq 00313 # falls within the base letter 00314 my $DAGESH_OR_MAPIQ = 'ּ'; 00315 # also used as siluq 00316 my $METAG = 'ֽ'; 00317 my $MAQAF = '־'; 00318 my $RAFE = 'ֿ'; 00319 # Also used for legarmeh 00320 # may be treated as spacing punctuation, not as a point 00321 my $PASEQ = '׀'; 00322 my $SHIN_DOT = 'ׁ'; 00323 my $SIN_DOT = 'ׂ'; 00324 my $SOF_PASUQ = '׃'; 00325 # HEBREW MARK 00326 my $UPPER_DOT = 'ׄ'; 00327 # HEBREW LETTERS based on ISO 8859-8 00328 # aleph 00329 # x (alef symbol - 2135) 00330 my $ALEF = 'א'; 00331 # x (bet symbol - 2136) 00332 my $BET = 'ב'; 00333 # x (gimel symbol - 2137) 00334 my $GIMEL = 'ג'; 00335 # x (dalet symbol - 2138) 00336 my $DALET = 'ד'; 00337 my $HE = 'ה'; 00338 my $VAV = 'ו'; 00339 my $ZAYIN = 'ז'; 00340 my $HET = 'ח'; 00341 my $TET = 'ט'; 00342 my $YOD = 'י'; 00343 my $FINAL_KAF = 
'ך'; 00344 my $KAF = 'כ'; 00345 my $LAMED = 'ל'; 00346 my $FINAL_MEM = 'ם'; 00347 my $MEM = 'מ'; 00348 my $FINAL_NUN = 'ן'; 00349 my $NUN = 'נ'; 00350 my $SAMEKH = 'ס'; 00351 my $AYIN = 'ע'; 00352 my $FINAL_PE = 'ף'; 00353 my $PE = 'פ'; 00354 my $FINAL_TSADI = 'ץ'; 00355 # also known as zade 00356 my $TSADI = 'צ'; 00357 my $QOF = 'ק'; 00358 my $RESH = 'ר'; 00359 my $SHIN = 'ש'; 00360 my $TAV = 'ת'; 00361 # Yiddish digraphs 00362 # Hebrew Ligature 00363 # tsvey vovn 00364 my $DOUBLE_VAV = 'װ'; 00365 my $VAV_YOD = 'ױ'; 00366 # tsvey yudn 00367 my $DOUBLE_YOD = 'ײ'; 00368 00369 # Additional punctuation 00370 my $PUNCT_GERESH = '׳'; 00371 my $PUNCT_GERSHAYIM = '״'; 00372 # Reserved: 0x05F5" 00373 # x (hebrew point judeo-spanish varika - FB1E) 00374 #my $JUDEO_SPANISH_VARIKA = pack("U",0xFB1E); # UTF-8 OxFB1E 00375 00376 ############################# 00377 # End of Unicode 2.0 Hebrew # 00378 ############################# 00379 00380 # A hash whose key is a Michagan code, and whose value is a Unicode 00381 # equvalent 00382 00383 char subst[] = new char [255]; 00384 subst[')'] = 1488; 00385 'B' => $BET, 00386 'G' => $GIMEL, 00387 'D' => $DALET, 00388 'H' => $HE, 00389 'W' => $VAV, 00390 'Z' => $ZAYIN, 00391 'X' => $HET, 00392 '+' => $TET, 00393 'Y' => $YOD, 00394 'K' => $KAF, 00395 'L' => $LAMED, 00396 'M' => $MEM, 00397 'N' => $NUN, 00398 'S' => $SAMEKH, 00399 '(' => $AYIN, 00400 'P' => $PE, 00401 'C' => $TSADI, 00402 'Q' => $QOF, 00403 'R' => $RESH, 00404 '#' => $SHIN, # the letter shin without a point 00405 '&' => ($SHIN . $SIN_DOT), 00406 '$' => ($SHIN . $SHIN_DOT), # ' 00407 'T' => $TAV, 00408 # VOWELS 00409 'A' => $PATAH, 00410 'F' => $QAMATS, 00411 'E' => $SEGOL, 00412 '"' => $TSERE, 00413 'I' => $HIRIQ, 00414 'O' => $HOLAM, 00415 'U' => $QUBUTS, 00416 ':' => $SHEVA, 00417 ':A' => $HATAF_PATAH, 00418 ':E' => $HATAF_SEGOL, 00419 ':F' => $HATAF_QAMATS, 00420 # OTHER DIACRITICS 00421 '.' 
=> $DAGESH_OR_MAPIQ, 00422 '-' => $MAQAF, 00423 ',' => $RAFE, 00424 # CANTILLATION 00425 '00' => $SOF_PASUQ, 00426 '01' => $SEGOLTA, 00427 # According to BHS, zarqa and sinnor are both postpositive. However, 00428 # the Michigan encoding uses one code for both. The Unicode zarqa 00429 # (0x0598) is definitely NOT postpositive. And further, the shape of 00430 # the symbol is different in BHS and Unicode. This needs further 00431 # research to determine what's going on here. For now, we follow BHS 00432 # and use the postpositive Unicode zinor for both accents. 00433 '02' => $ZINOR, 00434 # Pashta is postpositive, and the Unicode equivalent reflects 00435 # this. However, there is a poetic equivalent -- azla legarmeh -- 00436 # which is not postpositive, but no equivalent code point exists in 00437 # Unicode. The Michigan encoding does not distinguish between the two, 00438 # although it could be algorithmically determined. 00439 '03' => $PASHTA, 00440 '04' => $TELISHA_QETANA, 00441 '05' => $PASEQ, 00442 '10' => $YETIV, 00443 # In the poetic books, prepositive dehi occurs; it's unclear whether 00444 # tipeha also occurs in the poetic books. Otherwise, we could simply 00445 # check for what book in the Tanach we are in. Michigan uses the same 00446 # code for each. 00447 '13' => $TIPEHA, # also $DEHI 00448 # This is the poetic accent mugrash, which also includes rebia, but is 00449 # encoded separately as '81' in the Michigan text. 00450 '11' => $GERESH_MUQDAM, 00451 '14' => $TELISHA_GEDOLA, 00452 # Telisha qetana is postpositive as in '04' above. However, Michigan 00453 # code '24' is for a medial telisha. Graphically, there is no 00454 # difference. 00455 '24' => $TELISHA_QETANA, 00456 '33' => $PASHTA, 00457 # The Michigan code of telisha gedola in medial position. Graphically, 00458 # there is no difference. 
00459 '44' => $TELISHA_GEDOLA, 00460 '60' => $OLE, 00461 '61' => $GERESH, 00462 # This is the Unicode Hebrew *accent*; there is also another Hebrew 00463 # *punctuation* called GERSHAYIM 0x05F4. I'm using the more 00464 # traditional rounded marks, rather than the alternate straight 00465 # marks. 00466 '62' => $GERSHAYIM, 00467 # Also known as azla 00468 '63' => $QADMA, 00469 '64' => $ILUY, 00470 '65' => $SHALSHELET, 00471 '80' => $ZAQEF_QATAN, 00472 '81' => $REVIA, 00473 # Note, this accent is actually sinnorit, but it does not exist as a 00474 # separate glyph in the Unicode standard. The 'ZINOR' Unicode accent 00475 # is postpositive, while sinnorit is not. ZARQA is as close as I can 00476 # get to this. 00477 '82' => $ZARQA, 00478 # The Unicode form does not match the form used by BHS, but the names 00479 # are the same. 00480 '83' => $PAZER, 00481 '84' => $QARNEY_PARA, 00482 '85' => $ZAQEF_GADOL, 00483 # Note Michigan encoding distinguishes between medial metheg '35' (occurring 00484 # on the left of the vowel), and the ordinary meteg '95' (occurring on the 00485 # right of the vowel). It is also used for silluq. 
00486 '35' => $METAG, 00487 '70' => $MAHAPAKH, 00488 '71' => $MERKHA, 00489 '72' => $MERKHA_KEFULA, 00490 '73' => $TIPEHA, # also '13', '73' also is used for majela 00491 '74' => $MUNAH, 00492 '75' => $METAG, # this is silluq; should appear to the left of the vowel 00493 '91' => $TEVIR, 00494 '92' => $ETNAHTA, 00495 '93' => $YERAH_BEN_YOMO, 00496 '94' => $DARGA, 00497 '95' => $METAG, # should appear to the right of the vowel 00498 00499 # Not used by the Michigan Encoding 00500 # $UPPER_DOT = '05C4'; 00501 ); 00502 00503 # declare other variables 00504 my (@bhsLines, 00505 @bhsVerse, 00506 @entity_line) = (); 00507 00508 my ($i, 00509 $verse, 00510 $word, 00511 $character) = 0; 00512 00513 my ($element, 00514 $saveGuttural) = ""; 00515 00516 # read in a line 00517 while (<>) { 00518 # Process one verse 00519 # iterate over every character and change to XML decimal entity 00520 CHAR: for ( $i = 0; ($i < scalar(@bhsVerse)); $i++) { 00521 # find and convert final kaf, mem, nun, pe, tsade 00522 ( # if final form 00523 $bhsVerse[$i] =~ /[KMNPC]/ 00524 ) 00525 && 00526 ( 00527 ( # whitespace or 00528 $bhsVerse[$i+1] =~ /[ \-?]/ 00529 ) 00530 || 00531 ( # EOL or 00532 $i == ( scalar(@bhsVerse) - 1 ) 00533 ) 00534 || 00535 ( # sof pasuq or 00536 ( $bhsVerse[$i+1] =~ /0/ ) && 00537 ( $bhsVerse[$i+2] =~ /0/ ) 00538 ) 00539 || 00540 ( # one accent followed by white, eol or 00541 ( 00542 ( $bhsVerse[$i+1] =~ /\d/ ) && 00543 ( $bhsVerse[$i+2] =~ /\d/ ) 00544 ) && 00545 ( 00546 ( $bhsVerse[$i+3] =~ /[ \-?]/ ) || 00547 ( $i == ( scalar(@bhsVerse) - 1 ) ) 00548 ) 00549 ) 00550 || 00551 ( # two accents followed by white, eol 00552 ( 00553 ( $bhsVerse[$i+1] =~ /\d/ ) && 00554 ( $bhsVerse[$i+2] =~ /\d/ ) && 00555 ( $bhsVerse[$i+3] =~ /\d/ ) && 00556 ( $bhsVerse[$i+4] =~ /\d/ ) 00557 ) && 00558 ( 00559 ( $bhsVerse[$i+5] =~ /[ \-?]/ ) || 00560 ( $i == ( scalar(@bhsVerse) - 1 ) ) 00561 ) 00562 ) 00563 || 00564 ( # followed by a vowel and white, eol, sof pasuq 00565 ( $bhsVerse[$i+1] =~ 
/[:F]/ ) && 00566 ( # followed by 00567 ( $bhsVerse[$i+2] =~ /[ \-?]/ ) || # whitespace or 00568 ( $i == ( scalar(@bhsVerse) - 1 ) ) || # eol or 00569 ( # sof pasuq 00570 ( $bhsVerse[$i+2] =~ /0/ ) && 00571 ( $bhsVerse[$i+3] =~ /0/ ) 00572 ) 00573 ) 00574 ) 00575 ) # end of what follows after final letter 00576 && 00577 do { 00578 $bhsVerse[$i] =~ /K/ && eval { push @entity_line,$FINAL_KAF; } 00579 && next CHAR; 00580 $bhsVerse[$i] =~ /M/ && eval { push @entity_line,$FINAL_MEM; } 00581 && next CHAR; 00582 $bhsVerse[$i] =~ /N/ && eval { push @entity_line,$FINAL_NUN; } 00583 && next CHAR; 00584 $bhsVerse[$i] =~ /P/ && eval { push @entity_line,$FINAL_PE; } 00585 && next CHAR; 00586 $bhsVerse[$i] =~ /C/ && eval { push @entity_line,$FINAL_TSADI; } 00587 && next CHAR; 00588 }; 00589 # find and convert "furtive patach" 00590 ( $bhsVerse[$i] =~ /A/ ) && # If the letter is a patach 00591 ( $bhsVerse[$i-1] =~ /[)HX(]/ ) && # and is preceeded by a guttural 00592 ( ( $bhsVerse[$i-2] =~ /[AEFOU]/ ) || # and is preceeded by a vowel 00593 ( ( $bhsVerse[$i-2] =~ /\./ ) && # or by suruq 00594 ( $bhsVerse[$i-3] =~ /W/ ) ) || # 00595 ( ( $bhsVerse[$i-2] =~ /W/ ) && # or by holem (written plene) 00596 ( $bhsVerse[$i-3] =~ /O/ ) ) || # 00597 ( ( $bhsVerse[$i-2] =~ /Y/ ) && # or by hiriq-yod 00598 ( $bhsVerse[$i-3] =~ /I/ ) ) ) && 00599 do { 00600 $saveGuttural = pop @entity_line; # snip off the gutteral 00601 push @entity_line,$PATAH; # push on the patach 00602 push @entity_line,$saveGuttural; # push back on the gutteral 00603 next CHAR; 00604 }; 00605 # convert cantillation 00606 # since we have previously dealt with all other cases of 00607 # numbers, two digit patterns are all we have to search for 00608 $bhsVerse[$i] =~ /\d/ && $bhsVerse[$i+1] =~ /\d/ && do { 00609 push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]$bhsVerse[$i+1]"}; 00610 $i++; # accents are two digits long, so advance past the 2nd digit 00611 next CHAR; 00612 }; 00613 # convert katef vowels, which are two 
characters long 00614 $bhsVerse[$i] =~ /:/ && $bhsVerse[$i+1] =~ /[AEF]/ && do { 00615 push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]$bhsVerse[$i+1]"}; 00616 $i++; 00617 next CHAR; 00618 }; 00619 # convert everything else 00620 push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]"}; 00621 } # end CHAR 00622 # print the line to standard output with XML character-level encoding 00623 # each character has the following format: 00624 # <c id="1kg1.verse#.word#.character#">Ӓ</c> 00625 00626 # set up the verse element 00627 $word = 1; 00628 $character = 1; 00629 print "<verse>\n<word>\n"; 00630 # print each character element 00631 # if there is a space, then close the word entity, open a new word 00632 # entity, increment the word number, reset the character number to 00633 # zero. 00634 foreach $element (@entity_line) { 00635 if ( $element =~ " " ) { 00636 $word++; 00637 $character = 1; 00638 print "</word>\n<word>\n"; 00639 next; 00640 } 00641 print "<c id=\"1kg1.$verse.$word.$character\">$element</c>\n"; 00642 $character++; 00643 } 00644 # close the verse element 00645 print "</word></verse>\n"; 00646 # reinitialize variables 00647 @bhsVerse = (); 00648 @entity_line = (); 00649 @bhsLines = (); 00650 } # end while 00651 # close the XML document 00652 print "</body>\n"; 00653 */