<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<html><head><meta http-equiv="Content-Type" content="text/html;charset=iso-8859-1">
<title>hebrewmcim.cpp Source File</title>
<link href="doxygen.css" rel="stylesheet" type="text/css">
</head><body>
<!-- Generated by Doxygen 1.2.15 -->
<center>
<a class="qindex" href="index.html">Main Page</a> <a class="qindex" href="namespaces.html">Namespace List</a> <a class="qindex" href="hierarchy.html">Class Hierarchy</a> <a class="qindex" href="classes.html">Alphabetical List</a> <a class="qindex" href="annotated.html">Compound List</a> <a class="qindex" href="files.html">File List</a> <a class="qindex" href="functions.html">Compound Members</a> </center>
<hr><h1>hebrewmcim.cpp</h1><div class="fragment"><pre>00001
00011 <font class="preprocessor">#include &lt;hebrewmcim.h&gt;</font>
00012
00013 HebrewMCIM::HebrewMCIM()
00014 :<a class="code" href="class_s_w_input_method.html">SWInputMethod</a>() {
00015
00016 init();
00017 }
00018
00019
00020 <font class="keywordtype">int</font> *HebrewMCIM::translate(<font class="keywordtype">char</font> in) {
00021 <font class="keywordtype">int</font> retVal = 0;
00022 <font class="keyword">static</font> <font class="keywordtype">int</font> retString[5];
00023 <font class="keywordtype">int</font> retStringIndex = 0;
00024
00025 memset(retString, 0, 5);
00026
00027 <font class="keywordflow">if</font> (getState() > 1) {
00028 <font class="keywordflow">if</font> (getState() >= 12) { <font class="comment">// serious issue with internal structure</font>
00029 setState(0);
00030 retString[retStringIndex++] = in;
00031 <font class="keywordflow">return</font> retString;
00032 }
00033 map&lt;int, int&gt;::iterator find = subst2[getState()].find(in);
00034 <font class="keywordflow">if</font> (find != subst2[getState()].end())
00035 retVal = find->second;
00036 <font class="keywordflow">else</font> retVal = in;
00037
00038 setState(0);
00039 retString[retStringIndex++] = retVal;
00040 <font class="keywordflow">return</font> retString;
00041 }
00042 <font class="keywordflow">else</font> {
00043 retVal = subst[in];
00044
00045 <font class="keywordflow">if</font> (retVal == 0) {
00046 setState(0);
00047 retString[retStringIndex++] = in;
00048 <font class="keywordflow">return</font> retString;
00049 }
00050 <font class="keywordflow">if</font> (retVal > 100) {
00051 setState(1);
00052 retString[retStringIndex++] = retVal;
00053 <font class="keywordflow">return</font> retString;
00054 }
00055 <font class="keywordflow">if</font> (retVal == 50) { <font class="comment">// multiChar</font>
00056 setState(1);
00057 <font class="keywordtype">int</font> *chars = multiChars[in];
00058 <font class="keywordflow">if</font> (chars != 0) {
00059 retString[retStringIndex++] = chars[0];
00060 retString[retStringIndex++] = chars[1];
00061 <font class="keywordflow">return</font> retString;
00062 }
00063 }
00064 }
00065 setState(retVal);
00066 <font class="keywordflow">return</font> 0;
00067 }
00068
00069
00070 <font class="keywordtype">void</font> HebrewMCIM::init() {
00071 memset(subst, 0, 255);
00072
00073 subst[<font class="charliteral">')'</font>] = 1488;
00074 subst[<font class="charliteral">'B'</font>] = 1489;
00075 subst[<font class="charliteral">'G'</font>] = 1490;
00076 subst[<font class="charliteral">'D'</font>] = 1491;
00077 subst[<font class="charliteral">'H'</font>] = 1492;
00078 subst[<font class="charliteral">'W'</font>] = 1493;
00079 subst[<font class="charliteral">'Z'</font>] = 1494;
00080 subst[<font class="charliteral">'X'</font>] = 1495;
00081 subst[<font class="charliteral">'+'</font>] = 1496;
00082 subst[<font class="charliteral">'Y'</font>] = 1497;
00083
00084 subst[<font class="charliteral">'k'</font>] = 1498; <font class="comment">// finals</font>
00085 subst[<font class="charliteral">'m'</font>] = 1501;
00086 subst[<font class="charliteral">'n'</font>] = 1503;
00087 subst[<font class="charliteral">'c'</font>] = 1509;
00088
00089 subst[<font class="charliteral">'P'</font>] = 1508;
00090 subst[<font class="charliteral">'K'</font>] = 1499;
00091 subst[<font class="charliteral">'L'</font>] = 1500;
00092 subst[<font class="charliteral">'M'</font>] = 1502;
00093 subst[<font class="charliteral">'N'</font>] = 1504;
00094 subst[<font class="charliteral">'S'</font>] = 1505;
00095 subst[<font class="charliteral">'('</font>] = 1506;
00096 subst[<font class="charliteral">'p'</font>] = 1507;
00097 subst[<font class="charliteral">'C'</font>] = 1510;
00098 subst[<font class="charliteral">'Q'</font>] = 1511;
00099 subst[<font class="charliteral">'R'</font>] = 1512;
00100 subst[<font class="charliteral">'#'</font>] = 1513;
00101
00102 <font class="comment">// special multiChars</font>
00103 subst[<font class="charliteral">'&'</font>] = 50;
00104 subst[<font class="charliteral">'$'</font>] = 50;
00105
00106 <font class="keyword">static</font> <font class="keywordtype">int</font> x[] = {1513, 1474};
00107 multiChars[<font class="charliteral">'&'</font>] = x;
00108 <font class="keyword">static</font> <font class="keywordtype">int</font> y[] = {1513, 1473};
00109 multiChars[<font class="charliteral">'$'</font>] = y;
00110
00111 subst[<font class="charliteral">'T'</font>] = 1514;
00112
00113 <font class="comment">// VOWELS</font>
00114 subst[<font class="charliteral">'A'</font>] = 1463;
00115 subst[<font class="charliteral">'F'</font>] = 1464;
00116 subst[<font class="charliteral">'E'</font>] = 1462;
00117 subst[<font class="charliteral">'"'</font>] = 1461;
00118 subst[<font class="charliteral">'I'</font>] = 1460;
00119 subst[<font class="charliteral">'O'</font>] = 1465;
00120 subst[<font class="charliteral">'U'</font>] = 1467;
00121
00122
00123
00124 <font class="comment">// OTHER DIACRITICS</font>
00125 subst[<font class="charliteral">'.'</font>] = 1468;
00126 subst[<font class="charliteral">'-'</font>] = 1470;
00127 subst[<font class="charliteral">','</font>] = 1471;
00128
00129 <font class="comment">// Compound input</font>
00130
00131 <font class="comment">// CANTILLATION</font>
00132
00133 subst[<font class="charliteral">':'</font>] = 2;
00134 subst2[2][<font class="charliteral">'A'</font>] = 1458;
00135 subst2[2][<font class="charliteral">'E'</font>] = 1457;
00136 subst2[2][<font class="charliteral">'F'</font>] = 1459;
00137
00138
00139 <font class="comment">/* Telisha qetana is postpositive as in '04' above. However, Michigan</font>
00140 <font class="comment"># code '24' is for a medial telisha. Graphically, there is no</font>
00141 <font class="comment"># difference.</font>
00142 <font class="comment"> */</font>
00143 subst[<font class="charliteral">'2'</font>] = 5;
00144 subst2[5][<font class="charliteral">'4'</font>] = 1449;
00145
00146
00147 <font class="comment">/* Note Michigan encoding distinguishes between medial metheg '35' (occurring</font>
00148 <font class="comment"># on the left of the vowel), and the ordinary meteg '95' (occurring on the</font>
00149 <font class="comment"># right of the vowel). It is also used for silluq.</font>
00150 <font class="comment"> */</font>
00151 subst[<font class="charliteral">'3'</font>] = 6;
00152 subst2[6][<font class="charliteral">'3'</font>] = 1433;
00153 subst2[6][<font class="charliteral">'5'</font>] = 1469;
00154
00155
00156 <font class="comment">/* The Michigan code of telisha gedola in medial position. Graphically,</font>
00157 <font class="comment"># there is no difference.</font>
00158 <font class="comment"> */</font>
00159 subst[<font class="charliteral">'4'</font>] = 7;
00160 subst2[7][<font class="charliteral">'4'</font>] = 1440;
00161
00162 subst[<font class="charliteral">'6'</font>] = 8;
00163 subst2[8][<font class="charliteral">'0'</font>] = 1451;
00164 subst2[8][<font class="charliteral">'1'</font>] = 1436;
00165
00166 subst[<font class="charliteral">'1'</font>] = 4;
00167 subst2[4][<font class="charliteral">'0'</font>] = 1434;
00168
00169 <font class="comment">/* In the poetic books, prepositive dehi occurs; it's unclear whether</font>
00170 <font class="comment"># tipeha also occurs in the poetic books. Otherwise, we could simply</font>
00171 <font class="comment"># check for what book in the Tanach we are in. Michigan uses the same</font>
00172 <font class="comment"># code for each.</font>
00173 <font class="comment"> */</font>
00174
00175 subst2[4][<font class="charliteral">'3'</font>] = 1430;
00176
00177 <font class="comment">/* This is the poetic accent mugrash, which also includes rebia, but is</font>
00178 <font class="comment"># encoded separately as '81' in the Michigan text.</font>
00179 <font class="comment"> */</font>
00180 subst2[4][<font class="charliteral">'1'</font>] = 1437;
00181 subst2[4][<font class="charliteral">'4'</font>] = 1440;
00182
00183
00184 subst[<font class="charliteral">'0'</font>] = 3;
00185 subst2[3][<font class="charliteral">'0'</font>] = 1475;
00186 subst2[3][<font class="charliteral">'1'</font>] = 1426;
00187
00188 <font class="comment">/* According to BHS, zarqa and sinnor are both postpositive. However,</font>
00189 <font class="comment"># the Michigan encoding uses one code for both. The Unicode zarqa</font>
00190 <font class="comment"># (0x0598) is definitely NOT postpositive. And further, the shape of</font>
00191 <font class="comment"># the symbol is different in BHS and Unicode. This needs further</font>
00192 <font class="comment"># research to determine what's going on here. For now, we follow BHS</font>
00193 <font class="comment"># and use the postpositive Unicode zinor for both accents.</font>
00194 <font class="comment"> */</font>
00195
00196 subst2[3][<font class="charliteral">'2'</font>] = 1454;
00197
00198 <font class="comment">/* Pashta is postpositive, and the Unicode equivalent reflects</font>
00199 <font class="comment"># this. However, there is a poetic equivalent -- azla legarmeh --</font>
00200 <font class="comment"># which is not postpositive, but no equivalent code point exists in</font>
00201 <font class="comment"># Unicode. The Michigan encoding does not distinguish between the two,</font>
00202 <font class="comment"># although it could be algorithmically determined.</font>
00203 <font class="comment"> */</font>
00204
00205 subst2[3][<font class="charliteral">'3'</font>] = 1433;
00206 subst2[3][<font class="charliteral">'4'</font>] = 1449;
00207 subst2[3][<font class="charliteral">'5'</font>] = 1472;
00208
00209
00210 <font class="comment">/* This is the Unicode Hebrew *accent*; there is also another Hebrew</font>
00211 <font class="comment"># *punctuation* called GERSHAYIM 0x05F4. I'm using the more</font>
00212 <font class="comment"># traditional rounded marks, rather than the alternate straight</font>
00213 <font class="comment"># marks.</font>
00214 <font class="comment"> */</font>
00215
00216 subst2[8][<font class="charliteral">'2'</font>] = 1438;
00217
00218 <font class="comment">// Also known as azla</font>
00219 subst2[8][<font class="charliteral">'3'</font>] = 1448;
00220 subst2[8][<font class="charliteral">'4'</font>] = 1452;
00221 subst2[8][<font class="charliteral">'5'</font>] = 1427;
00222
00223
00224 subst[<font class="charliteral">'8'</font>] = 9;
00225 subst2[9][<font class="charliteral">'0'</font>] = 1428;
00226 subst2[9][<font class="charliteral">'1'</font>] = 1431;
00227
00228 <font class="comment">/* Note, this accent is actually sinnorit, but it does not exist as a</font>
00229 <font class="comment"># separate glyph in the Unicode standard. The 'ZINOR' Unicode accent</font>
00230 <font class="comment"># is postpositive, while sinnorit is not. ZARQA is as close as I can</font>
00231 <font class="comment"># get to this.</font>
00232 <font class="comment"> */</font>
00233 subst2[9][<font class="charliteral">'2'</font>] = 1432;
00234
00235 <font class="comment">/* The Unicode form does not match the form used by BHS, but the names</font>
00236 <font class="comment"># are the same.</font>
00237 <font class="comment"> */</font>
00238 subst2[9][<font class="charliteral">'3'</font>] = 1441;
00239 subst2[9][<font class="charliteral">'4'</font>] = 1439;
00240 subst2[9][<font class="charliteral">'5'</font>] = 1429;
00241
00242 subst[<font class="charliteral">'7'</font>] = 10;
00243 subst2[10][<font class="charliteral">'0'</font>] = 1444;
00244 subst2[10][<font class="charliteral">'1'</font>] = 1445;
00245 subst2[10][<font class="charliteral">'2'</font>] = 1446;
00246 subst2[10][<font class="charliteral">'3'</font>] = 1430; <font class="comment">// also '13', '73' also is used for majela</font>
00247 subst2[10][<font class="charliteral">'4'</font>] = 1443;
00248 subst2[10][<font class="charliteral">'5'</font>] = 1469; <font class="comment">// this is silluq; should appear to the left of the vowel</font>
00249
00250 subst[<font class="charliteral">'9'</font>] = 11;
00251 subst2[11][<font class="charliteral">'1'</font>] = 1435;
00252 subst2[11][<font class="charliteral">'2'</font>] = 1425;
00253 subst2[11][<font class="charliteral">'3'</font>] = 1450;
00254 subst2[11][<font class="charliteral">'4'</font>] = 1447;
00255 subst2[11][<font class="charliteral">'5'</font>] = 1469; <font class="comment">// should appear to the right of the vowel</font>
00256
00257 }
00258
00259 <font class="comment">/*</font>
00260 <font class="comment"></font>
00261 <font class="comment"></font>
00262 <font class="comment"># CANTILLATION MARKS</font>
00263 <font class="comment"></font>
00264 <font class="comment"> my $ETNAHTA = '&amp;#1425;';</font>
00265 <font class="comment"># officially the Unicode name for this symbol was "SEGOL." However, that is</font>
00266 <font class="comment"># not a unique name, conflicting with the vowel of the same name. Further,</font>
00267 <font class="comment"># the position of the symbol is different. I have changed the name of the</font>
00268 <font class="comment"># accent to "SEGOLTA," the traditional name for this accent.</font>
00269 <font class="comment"> my $SEGOLTA = '&amp;#1426;';</font>
00270 <font class="comment"> my $SHALSHELET = '&amp;#1427;';</font>
00271 <font class="comment"> my $ZAQEF_QATAN = '&amp;#1428;';</font>
00272 <font class="comment"> my $ZAQEF_GADOL = '&amp;#1429;';</font>
00273 <font class="comment"> my $TIPEHA = '&amp;#1430;';</font>
00274 <font class="comment"> my $REVIA = '&amp;#1431;';</font>
00275 <font class="comment"> my $ZARQA = '&amp;#1432;';</font>
00276 <font class="comment"> my $PASHTA = '&amp;#1433;';</font>
00277 <font class="comment"> my $YETIV = '&amp;#1434;';</font>
00278 <font class="comment"> my $TEVIR = '&amp;#1435;';</font>
00279 <font class="comment"> my $GERESH = '&amp;#1436;';</font>
00280 <font class="comment"> my $GERESH_MUQDAM = '&amp;#1437;';</font>
00281 <font class="comment"> my $GERSHAYIM = '&amp;#1438;';</font>
00282 <font class="comment"> my $QARNEY_PARA = '&amp;#1439;';</font>
00283 <font class="comment"> my $TELISHA_GEDOLA = '&amp;#1440;';</font>
00284 <font class="comment"> my $PAZER = '&amp;#1441;';</font>
00285 <font class="comment"> my $MUNAH = '&amp;#1443;';</font>
00286 <font class="comment"> my $MAHAPAKH = '&amp;#1444;';</font>
00287 <font class="comment"> my $MERKHA = '&amp;#1445;';</font>
00288 <font class="comment"> my $MERKHA_KEFULA = '&amp;#1446;';</font>
00289 <font class="comment"> my $DARGA = '&amp;#1447;';</font>
00290 <font class="comment"> my $QADMA = '&amp;#1448;';</font>
00291 <font class="comment"> my $TELISHA_QETANA = '&amp;#1449;';</font>
00292 <font class="comment"> my $YERAH_BEN_YOMO = '&amp;#1450;';</font>
00293 <font class="comment"> my $OLE = '&amp;#1451;';</font>
00294 <font class="comment"> my $ILUY = '&amp;#1452;';</font>
00295 <font class="comment"> my $DEHI = '&amp;#1453;';</font>
00296 <font class="comment"> my $ZINOR = '&amp;#1454;';</font>
00297 <font class="comment"># HEBREW MARK</font>
00298 <font class="comment"> my $MASORA_CIRCLE = '&amp;#1455;';</font>
00299 <font class="comment"># HEBREW EXTENDED-A points and punctuation</font>
00300 <font class="comment"> my $SHEVA = '&amp;#1456;';</font>
00301 <font class="comment"> my $HATAF_SEGOL = '&amp;#1457;';</font>
00302 <font class="comment"> my $HATAF_PATAH = '&amp;#1458;';</font>
00303 <font class="comment"> my $HATAF_QAMATS = '&amp;#1459;';</font>
00304 <font class="comment"> my $HIRIQ = '&amp;#1460;';</font>
00305 <font class="comment"> my $TSERE = '&amp;#1461;';</font>
00306 <font class="comment"> my $SEGOL = '&amp;#1462;';</font>
00307 <font class="comment"># furtive Patah is not a distinct character</font>
00308 <font class="comment"> my $PATAH = '&amp;#1463;';</font>
00309 <font class="comment"> my $QAMATS = '&amp;#1464;';</font>
00310 <font class="comment"> my $HOLAM = '&amp;#1465;';</font>
00311 <font class="comment"> my $QUBUTS = '&amp;#1467;';</font>
00312 <font class="comment"># also used as shuruq</font>
00313 <font class="comment"># falls within the base letter</font>
00314 <font class="comment"> my $DAGESH_OR_MAPIQ = '&amp;#1468;';</font>
00315 <font class="comment"># also used as siluq</font>
00316 <font class="comment"> my $METAG = '&amp;#1469;';</font>
00317 <font class="comment"> my $MAQAF = '&amp;#1470;';</font>
00318 <font class="comment"> my $RAFE = '&amp;#1471;';</font>
00319 <font class="comment"># Also used for legarmeh</font>
00320 <font class="comment"># may be treated as spacing punctuation, not as a point</font>
00321 <font class="comment"> my $PASEQ = '&amp;#1472;';</font>
00322 <font class="comment"> my $SHIN_DOT = '&amp;#1473;';</font>
00323 <font class="comment"> my $SIN_DOT = '&amp;#1474;';</font>
00324 <font class="comment"> my $SOF_PASUQ = '&amp;#1475;';</font>
00325 <font class="comment"># HEBREW MARK</font>
00326 <font class="comment"> my $UPPER_DOT = '&amp;#1476;';</font>
00327 <font class="comment"># HEBREW LETTERS based on ISO 8859-8</font>
00328 <font class="comment"># aleph</font>
00329 <font class="comment"># x (alef symbol - 2135)</font>
00330 <font class="comment"> my $ALEF = '&amp;#1488;';</font>
00331 <font class="comment"># x (bet symbol - 2136)</font>
00332 <font class="comment"> my $BET = '&amp;#1489;';</font>
00333 <font class="comment"># x (gimel symbol - 2137)</font>
00334 <font class="comment"> my $GIMEL = '&amp;#1490;';</font>
00335 <font class="comment"># x (dalet symbol - 2138)</font>
00336 <font class="comment"> my $DALET = '&amp;#1491;';</font>
00337 <font class="comment"> my $HE = '&amp;#1492;';</font>
00338 <font class="comment"> my $VAV = '&amp;#1493;';</font>
00339 <font class="comment"> my $ZAYIN = '&amp;#1494;';</font>
00340 <font class="comment"> my $HET = '&amp;#1495;';</font>
00341 <font class="comment"> my $TET = '&amp;#1496;';</font>
00342 <font class="comment"> my $YOD = '&amp;#1497;';</font>
00343 <font class="comment"> my $FINAL_KAF = '&amp;#1498;';</font>
00344 <font class="comment"> my $KAF = '&amp;#1499;';</font>
00345 <font class="comment"> my $LAMED = '&amp;#1500;';</font>
00346 <font class="comment"> my $FINAL_MEM = '&amp;#1501;';</font>
00347 <font class="comment"> my $MEM = '&amp;#1502;';</font>
00348 <font class="comment"> my $FINAL_NUN = '&amp;#1503;';</font>
00349 <font class="comment"> my $NUN = '&amp;#1504;';</font>
00350 <font class="comment"> my $SAMEKH = '&amp;#1505;';</font>
00351 <font class="comment"> my $AYIN = '&amp;#1506;';</font>
00352 <font class="comment"> my $FINAL_PE = '&amp;#1507;';</font>
00353 <font class="comment"> my $PE = '&amp;#1508;';</font>
00354 <font class="comment"> my $FINAL_TSADI = '&amp;#1509;';</font>
00355 <font class="comment"># also known as zade</font>
00356 <font class="comment"> my $TSADI = '&amp;#1510;';</font>
00357 <font class="comment"> my $QOF = '&amp;#1511;';</font>
00358 <font class="comment"> my $RESH = '&amp;#1512;';</font>
00359 <font class="comment"> my $SHIN = '&amp;#1513;';</font>
00360 <font class="comment"> my $TAV = '&amp;#1514;';</font>
00361 <font class="comment"># Yiddish digraphs</font>
00362 <font class="comment"># Hebrew Ligature</font>
00363 <font class="comment"># tsvey vovn</font>
00364 <font class="comment"> my $DOUBLE_VAV = '&amp;#1520;';</font>
00365 <font class="comment"> my $VAV_YOD = '&amp;#1521;';</font>
00366 <font class="comment"># tsvey yudn</font>
00367 <font class="comment"> my $DOUBLE_YOD = '&amp;#1522;';</font>
00368 <font class="comment"></font>
00369 <font class="comment"># Additional punctuation</font>
00370 <font class="comment"> my $PUNCT_GERESH = '&amp;#1523;';</font>
00371 <font class="comment"> my $PUNCT_GERSHAYIM = '&amp;#1524;';</font>
00372 <font class="comment"># Reserved: 0x05F5"</font>
00373 <font class="comment"># x (hebrew point judeo-spanish varika - FB1E)</font>
00374 <font class="comment">#my $JUDEO_SPANISH_VARIKA = pack("U",0xFB1E); # UTF-8 0xFB1E</font>
00375 <font class="comment"></font>
00376 <font class="comment">#############################</font>
00377 <font class="comment"># End of Unicode 2.0 Hebrew #</font>
00378 <font class="comment">#############################</font>
00379 <font class="comment"></font>
00380 <font class="comment"># A hash whose key is a Michigan code, and whose value is a Unicode</font>
00381 <font class="comment"># equivalent</font>
00382 <font class="comment"></font>
00383 <font class="comment"> char subst[] = new char [255];</font>
00384 <font class="comment"> subst[')'] = 1488;</font>
00385 <font class="comment"> 'B' => $BET,</font>
00386 <font class="comment"> 'G' => $GIMEL,</font>
00387 <font class="comment"> 'D' => $DALET,</font>
00388 <font class="comment"> 'H' => $HE,</font>
00389 <font class="comment"> 'W' => $VAV,</font>
00390 <font class="comment"> 'Z' => $ZAYIN,</font>
00391 <font class="comment"> 'X' => $HET,</font>
00392 <font class="comment"> '+' => $TET,</font>
00393 <font class="comment"> 'Y' => $YOD,</font>
00394 <font class="comment"> 'K' => $KAF,</font>
00395 <font class="comment"> 'L' => $LAMED,</font>
00396 <font class="comment"> 'M' => $MEM,</font>
00397 <font class="comment"> 'N' => $NUN,</font>
00398 <font class="comment"> 'S' => $SAMEKH,</font>
00399 <font class="comment"> '(' => $AYIN,</font>
00400 <font class="comment"> 'P' => $PE,</font>
00401 <font class="comment"> 'C' => $TSADI,</font>
00402 <font class="comment"> 'Q' => $QOF,</font>
00403 <font class="comment"> 'R' => $RESH,</font>
00404 <font class="comment"> '#' => $SHIN, # the letter shin without a point</font>
00405 <font class="comment"> '&' => ($SHIN . $SIN_DOT),</font>
00406 <font class="comment"> '$' => ($SHIN . $SHIN_DOT), # '</font>
00407 <font class="comment"> 'T' => $TAV,</font>
00408 <font class="comment"># VOWELS</font>
00409 <font class="comment"> 'A' => $PATAH,</font>
00410 <font class="comment"> 'F' => $QAMATS,</font>
00411 <font class="comment"> 'E' => $SEGOL,</font>
00412 <font class="comment"> '"' => $TSERE,</font>
00413 <font class="comment"> 'I' => $HIRIQ,</font>
00414 <font class="comment"> 'O' => $HOLAM,</font>
00415 <font class="comment"> 'U' => $QUBUTS,</font>
00416 <font class="comment"> ':' => $SHEVA,</font>
00417 <font class="comment"> ':A' => $HATAF_PATAH,</font>
00418 <font class="comment"> ':E' => $HATAF_SEGOL,</font>
00419 <font class="comment"> ':F' => $HATAF_QAMATS,</font>
00420 <font class="comment"># OTHER DIACRITICS</font>
00421 <font class="comment"> '.' => $DAGESH_OR_MAPIQ,</font>
00422 <font class="comment"> '-' => $MAQAF,</font>
00423 <font class="comment"> ',' => $RAFE,</font>
00424 <font class="comment"># CANTILLATION</font>
00425 <font class="comment"> '00' => $SOF_PASUQ,</font>
00426 <font class="comment"> '01' => $SEGOLTA,</font>
00427 <font class="comment"># According to BHS, zarqa and sinnor are both postpositive. However,</font>
00428 <font class="comment"># the Michigan encoding uses one code for both. The Unicode zarqa</font>
00429 <font class="comment"># (0x0598) is definitely NOT postpositive. And further, the shape of</font>
00430 <font class="comment"># the symbol is different in BHS and Unicode. This needs further</font>
00431 <font class="comment"># research to determine what's going on here. For now, we follow BHS</font>
00432 <font class="comment"># and use the postpositive Unicode zinor for both accents.</font>
00433 <font class="comment"> '02' => $ZINOR,</font>
00434 <font class="comment"># Pashta is postpositive, and the Unicode equivalent reflects</font>
00435 <font class="comment"># this. However, there is a poetic equivalent -- azla legarmeh --</font>
00436 <font class="comment"># which is not postpositive, but no equivalent code point exists in</font>
00437 <font class="comment"># Unicode. The Michigan encoding does not distinguish between the two,</font>
00438 <font class="comment"># although it could be algorithmically determined.</font>
00439 <font class="comment"> '03' => $PASHTA,</font>
00440 <font class="comment"> '04' => $TELISHA_QETANA,</font>
00441 <font class="comment"> '05' => $PASEQ,</font>
00442 <font class="comment"> '10' => $YETIV,</font>
00443 <font class="comment"># In the poetic books, prepositive dehi occurs; it's unclear whether</font>
00444 <font class="comment"># tipeha also occurs in the poetic books. Otherwise, we could simply</font>
00445 <font class="comment"># check for what book in the Tanach we are in. Michigan uses the same</font>
00446 <font class="comment"># code for each.</font>
00447 <font class="comment"> '13' => $TIPEHA, # also $DEHI</font>
00448 <font class="comment"># This is the poetic accent mugrash, which also includes rebia, but is</font>
00449 <font class="comment"># encoded separately as '81' in the Michigan text.</font>
00450 <font class="comment"> '11' => $GERESH_MUQDAM,</font>
00451 <font class="comment"> '14' => $TELISHA_GEDOLA,</font>
00452 <font class="comment"># Telisha qetana is postpositive as in '04' above. However, Michigan</font>
00453 <font class="comment"># code '24' is for a medial telisha. Graphically, there is no</font>
00454 <font class="comment"># difference.</font>
00455 <font class="comment"> '24' => $TELISHA_QETANA,</font>
00456 <font class="comment"> '33' => $PASHTA,</font>
00457 <font class="comment"># The Michigan code of telisha gedola in medial position. Graphically,</font>
00458 <font class="comment"># there is no difference.</font>
00459 <font class="comment"> '44' => $TELISHA_GEDOLA,</font>
00460 <font class="comment"> '60' => $OLE,</font>
00461 <font class="comment"> '61' => $GERESH,</font>
00462 <font class="comment"># This is the Unicode Hebrew *accent*; there is also another Hebrew</font>
00463 <font class="comment"># *punctuation* called GERSHAYIM 0x05F4. I'm using the more</font>
00464 <font class="comment"># traditional rounded marks, rather than the alternate straight</font>
00465 <font class="comment"># marks.</font>
00466 <font class="comment"> '62' => $GERSHAYIM,</font>
00467 <font class="comment"># Also known as azla</font>
00468 <font class="comment"> '63' => $QADMA,</font>
00469 <font class="comment"> '64' => $ILUY,</font>
00470 <font class="comment"> '65' => $SHALSHELET,</font>
00471 <font class="comment"> '80' => $ZAQEF_QATAN,</font>
00472 <font class="comment"> '81' => $REVIA,</font>
00473 <font class="comment"># Note, this accent is actually sinnorit, but it does not exist as a</font>
00474 <font class="comment"># separate glyph in the Unicode standard. The 'ZINOR' Unicode accent</font>
00475 <font class="comment"># is postpositive, while sinnorit is not. ZARQA is as close as I can</font>
00476 <font class="comment"># get to this.</font>
00477 <font class="comment"> '82' => $ZARQA,</font>
00478 <font class="comment"># The Unicode form does not match the form used by BHS, but the names</font>
00479 <font class="comment"># are the same.</font>
00480 <font class="comment"> '83' => $PAZER,</font>
00481 <font class="comment"> '84' => $QARNEY_PARA,</font>
00482 <font class="comment"> '85' => $ZAQEF_GADOL,</font>
00483 <font class="comment"># Note Michigan encoding distinguishes between medial metheg '35' (occurring</font>
00484 <font class="comment"># on the left of the vowel), and the ordinary meteg '95' (occurring on the</font>
00485 <font class="comment"># right of the vowel). It is also used for silluq.</font>
00486 <font class="comment"> '35' => $METAG,</font>
00487 <font class="comment"> '70' => $MAHAPAKH,</font>
00488 <font class="comment"> '71' => $MERKHA,</font>
00489 <font class="comment"> '72' => $MERKHA_KEFULA,</font>
00490 <font class="comment"> '73' => $TIPEHA, # also '13', '73' also is used for majela</font>
00491 <font class="comment"> '74' => $MUNAH,</font>
00492 <font class="comment"> '75' => $METAG, # this is silluq; should appear to the left of the vowel</font>
00493 <font class="comment"> '91' => $TEVIR,</font>
00494 <font class="comment"> '92' => $ETNAHTA,</font>
00495 <font class="comment"> '93' => $YERAH_BEN_YOMO,</font>
00496 <font class="comment"> '94' => $DARGA,</font>
00497 <font class="comment"> '95' => $METAG, # should appear to the right of the vowel</font>
00498 <font class="comment"></font>
00499 <font class="comment"># Not used by the Michigan Encoding</font>
00500 <font class="comment"># $UPPER_DOT = '05C4';</font>
00501 <font class="comment"> );</font>
00502 <font class="comment"></font>
00503 <font class="comment"># declare other variables</font>
00504 <font class="comment"> my (@bhsLines,</font>
00505 <font class="comment"> @bhsVerse,</font>
00506 <font class="comment"> @entity_line) = ();</font>
00507 <font class="comment"></font>
00508 <font class="comment"> my ($i,</font>
00509 <font class="comment"> $verse,</font>
00510 <font class="comment"> $word,</font>
00511 <font class="comment"> $character) = 0;</font>
00512 <font class="comment"></font>
00513 <font class="comment"> my ($element,</font>
00514 <font class="comment"> $saveGuttural) = "";</font>
00515 <font class="comment"></font>
00516 <font class="comment"># read in a line</font>
00517 <font class="comment"> while (&lt;&gt;) {</font>
00518 <font class="comment"># Process one verse</font>
00519 <font class="comment"># iterate over every character and change to XML decimal entity</font>
00520 <font class="comment"> CHAR: for ( $i = 0; ($i < scalar(@bhsVerse)); $i++) {</font>
00521 <font class="comment"> # find and convert final kaf, mem, nun, pe, tsade</font>
00522 <font class="comment"> ( # if final form</font>
00523 <font class="comment"> $bhsVerse[$i] =~ /[KMNPC]/</font>
00524 <font class="comment"> )</font>
00525 <font class="comment"> &&</font>
00526 <font class="comment"> (</font>
00527 <font class="comment"> ( # whitespace or</font>
00528 <font class="comment"> $bhsVerse[$i+1] =~ /[ \-?]/</font>
00529 <font class="comment"> )</font>
00530 <font class="comment"> ||</font>
00531 <font class="comment"> ( # EOL or</font>
00532 <font class="comment"> $i == ( scalar(@bhsVerse) - 1 )</font>
00533 <font class="comment"> )</font>
00534 <font class="comment"> ||</font>
00535 <font class="comment"> ( # sof pasuq or</font>
00536 <font class="comment"> ( $bhsVerse[$i+1] =~ /0/ ) &&</font>
00537 <font class="comment"> ( $bhsVerse[$i+2] =~ /0/ )</font>
00538 <font class="comment"> )</font>
00539 <font class="comment"> ||</font>
00540 <font class="comment"> ( # one accent followed by white, eol or</font>
00541 <font class="comment"> (</font>
00542 <font class="comment"> ( $bhsVerse[$i+1] =~ /\d/ ) &&</font>
00543 <font class="comment"> ( $bhsVerse[$i+2] =~ /\d/ )</font>
00544 <font class="comment"> ) &&</font>
00545 <font class="comment"> (</font>
00546 <font class="comment"> ( $bhsVerse[$i+3] =~ /[ \-?]/ ) ||</font>
00547 <font class="comment"> ( $i == ( scalar(@bhsVerse) - 1 ) )</font>
00548 <font class="comment"> )</font>
00549 <font class="comment"> )</font>
00550 <font class="comment"> ||</font>
00551 <font class="comment"> ( # two accents followed by white, eol</font>
00552 <font class="comment"> (</font>
00553 <font class="comment"> ( $bhsVerse[$i+1] =~ /\d/ ) &&</font>
00554 <font class="comment"> ( $bhsVerse[$i+2] =~ /\d/ ) &&</font>
00555 <font class="comment"> ( $bhsVerse[$i+3] =~ /\d/ ) &&</font>
00556 <font class="comment"> ( $bhsVerse[$i+4] =~ /\d/ )</font>
00557 <font class="comment"> ) &&</font>
00558 <font class="comment"> (</font>
00559 <font class="comment"> ( $bhsVerse[$i+5] =~ /[ \-?]/ ) ||</font>
00560 <font class="comment"> ( $i == ( scalar(@bhsVerse) - 1 ) )</font>
00561 <font class="comment"> )</font>
00562 <font class="comment"> )</font>
00563 <font class="comment"> ||</font>
00564 <font class="comment"> ( # followed by a vowel and white, eol, sof pasuq</font>
00565 <font class="comment"> ( $bhsVerse[$i+1] =~ /[:F]/ ) &&</font>
00566 <font class="comment"> ( # followed by</font>
00567 <font class="comment"> ( $bhsVerse[$i+2] =~ /[ \-?]/ ) || # whitespace or</font>
00568 <font class="comment"> ( $i == ( scalar(@bhsVerse) - 1 ) ) || # eol or</font>
00569 <font class="comment"> ( # sof pasuq</font>
00570 <font class="comment"> ( $bhsVerse[$i+2] =~ /0/ ) &&</font>
00571 <font class="comment"> ( $bhsVerse[$i+3] =~ /0/ )</font>
00572 <font class="comment"> )</font>
00573 <font class="comment"> )</font>
00574 <font class="comment"> )</font>
00575 <font class="comment"> ) # end of what follows after final letter</font>
00576 <font class="comment"> &&</font>
00577 <font class="comment"> do {</font>
00578 <font class="comment"> $bhsVerse[$i] =~ /K/ && eval { push @entity_line,$FINAL_KAF; }</font>
00579 <font class="comment"> && next CHAR;</font>
00580 <font class="comment"> $bhsVerse[$i] =~ /M/ && eval { push @entity_line,$FINAL_MEM; }</font>
00581 <font class="comment"> && next CHAR;</font>
00582 <font class="comment"> $bhsVerse[$i] =~ /N/ && eval { push @entity_line,$FINAL_NUN; }</font>
00583 <font class="comment"> && next CHAR;</font>
00584 <font class="comment"> $bhsVerse[$i] =~ /P/ && eval { push @entity_line,$FINAL_PE; }</font>
00585 <font class="comment"> && next CHAR;</font>
00586 <font class="comment"> $bhsVerse[$i] =~ /C/ && eval { push @entity_line,$FINAL_TSADI; }</font>
00587 <font class="comment"> && next CHAR;</font>
00588 <font class="comment"> };</font>
00589 <font class="comment"> # find and convert "furtive patach"</font>
00590 <font class="comment"> ( $bhsVerse[$i] =~ /A/ ) && # If the letter is a patach</font>
00591 <font class="comment"> ( $bhsVerse[$i-1] =~ /[)HX(]/ ) && # and is preceded by a guttural</font>
00592 <font class="comment"> ( ( $bhsVerse[$i-2] =~ /[AEFOU]/ ) || # and is preceded by a vowel</font>
00593 <font class="comment"> ( ( $bhsVerse[$i-2] =~ /\./ ) && # or by suruq</font>
00594 <font class="comment"> ( $bhsVerse[$i-3] =~ /W/ ) ) || #</font>
00595 <font class="comment"> ( ( $bhsVerse[$i-2] =~ /W/ ) && # or by holem (written plene)</font>
00596 <font class="comment"> ( $bhsVerse[$i-3] =~ /O/ ) ) || #</font>
00597 <font class="comment"> ( ( $bhsVerse[$i-2] =~ /Y/ ) && # or by hiriq-yod</font>
00598 <font class="comment"> ( $bhsVerse[$i-3] =~ /I/ ) ) ) &&</font>
00599 <font class="comment"> do {</font>
00600 <font class="comment"> $saveGuttural = pop @entity_line; # snip off the guttural</font>
00601 <font class="comment"> push @entity_line,$PATAH; # push on the patach</font>
00602 <font class="comment"> push @entity_line,$saveGuttural; # push back on the guttural</font>
00603 <font class="comment"> next CHAR;</font>
00604 <font class="comment"> };</font>
00605 <font class="comment"> # convert cantillation</font>
00606 <font class="comment"> # since we have previously dealt with all other cases of</font>
00607 <font class="comment"> # numbers, two digit patterns are all we have to search for</font>
00608 <font class="comment"> $bhsVerse[$i] =~ /\d/ && $bhsVerse[$i+1] =~ /\d/ && do {</font>
00609 <font class="comment"> push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]$bhsVerse[$i+1]"};</font>
00610 <font class="comment"> $i++; # accents are two digits long, so advance past the 2nd digit</font>
00611 <font class="comment"> next CHAR;</font>
00612 <font class="comment"> };</font>
00613 <font class="comment"> # convert katef vowels, which are two characters long</font>
00614 <font class="comment"> $bhsVerse[$i] =~ /:/ && $bhsVerse[$i+1] =~ /[AEF]/ && do {</font>
00615 <font class="comment"> push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]$bhsVerse[$i+1]"};</font>
00616 <font class="comment"> $i++;</font>
00617 <font class="comment"> next CHAR;</font>
00618 <font class="comment"> };</font>
00619 <font class="comment"> # convert everything else</font>
00620 <font class="comment"> push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]"};</font>
00621 <font class="comment"> } # end CHAR</font>
00622 <font class="comment"># print the line to standard output with XML character-level encoding</font>
00623 <font class="comment"># each character has the following format:</font>
00624 <font class="comment"># &lt;c id="1kg1.verse#.word#.character#"&gt;&amp;#1234;&lt;/c&gt;</font>
00625 <font class="comment"></font>
00626 <font class="comment"># set up the verse element</font>
00627 <font class="comment"> $word = 1;</font>
00628 <font class="comment"> $character = 1;</font>
00629 <font class="comment"> print "&lt;verse&gt;\n&lt;word&gt;\n";</font>
00630 <font class="comment"># print each character element</font>
00631 <font class="comment"># if there is a space, then close the word entity, open a new word</font>
00632 <font class="comment"># entity, increment the word number, reset the character number to</font>
00633 <font class="comment"># one.</font>
00634 <font class="comment"> foreach $element (@entity_line) {</font>
00635 <font class="comment"> if ( $element =~ " " ) {</font>
00636 <font class="comment"> $word++;</font>
00637 <font class="comment"> $character = 1;</font>
00638 <font class="comment"> print "&lt;/word&gt;\n&lt;word&gt;\n";</font>
00639 <font class="comment"> next;</font>
00640 <font class="comment"> }</font>
00641 <font class="comment"> print "&lt;c id=\"1kg1.$verse.$word.$character\"&gt;$element&lt;/c&gt;\n";</font>
00642 <font class="comment"> $character++;</font>
00643 <font class="comment"> }</font>
00644 <font class="comment"># close the verse element</font>
00645 <font class="comment"> print "&lt;/word&gt;&lt;/verse&gt;\n";</font>
00646 <font class="comment"># reinitialize variables</font>
00647 <font class="comment"> @bhsVerse = ();</font>
00648 <font class="comment"> @entity_line = ();</font>
00649 <font class="comment"> @bhsLines = ();</font>
00650 <font class="comment"> } # end while</font>
00651 <font class="comment"># close the XML document</font>
00652 <font class="comment"> print "&lt;/body&gt;\n";</font>
00653 <font class="comment"> */</font>
</pre></div><hr><address align="right"><small>Generated on Thu Jun 20 22:12:59 2002 for The Sword Project by
<a href="http://www.doxygen.org/index.html">
<img src="doxygen.png" alt="doxygen" align="middle" border="0"
width="110" height="53"></a>1.2.15 </small></address>
</body>
</html>