path: root/doc/api-documentation/html/hebrewmcim_8cpp-source.html



<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<html><head><meta http-equiv="Content-Type" content="text/html;charset=iso-8859-1">
<title>hebrewmcim.cpp Source File</title>
<link href="doxygen.css" rel="stylesheet" type="text/css">
</head><body>
<!-- Generated by Doxygen 1.2.15 -->
<center>
<a class="qindex" href="index.html">Main Page</a> &nbsp; <a class="qindex" href="namespaces.html">Namespace List</a> &nbsp; <a class="qindex" href="hierarchy.html">Class Hierarchy</a> &nbsp; <a class="qindex" href="classes.html">Alphabetical List</a> &nbsp; <a class="qindex" href="annotated.html">Compound List</a> &nbsp; <a class="qindex" href="files.html">File List</a> &nbsp; <a class="qindex" href="functions.html">Compound Members</a> &nbsp; </center>
<hr><h1>hebrewmcim.cpp</h1><div class="fragment"><pre>00001 
00011 <font class="preprocessor">#include &lt;hebrewmcim.h&gt;</font>
00012 
00013 HebrewMCIM::HebrewMCIM()
00014                 :<a class="code" href="class_s_w_input_method.html">SWInputMethod</a>() {
00015 
00016    init();
00017 }
00018 
00019 
00020 <font class="keywordtype">int</font> *HebrewMCIM::translate(<font class="keywordtype">char</font> in) {
00021         <font class="keywordtype">int</font> retVal = 0;
00022         <font class="keyword">static</font> <font class="keywordtype">int</font> retString[5];
00023         <font class="keywordtype">int</font> retStringIndex = 0;
00024 
00025         memset(retString, 0, 5);
00026 
00027         <font class="keywordflow">if</font> (getState() &gt; 1) {
00028                 <font class="keywordflow">if</font> (getState() &gt;= 12) { <font class="comment">// serious issue with internal structure</font>
00029                         setState(0);
00030                         retString[retStringIndex++] = in;
00031                         <font class="keywordflow">return</font> retString;
00032                 }
00033                 map&lt;int, int&gt;::iterator find = subst2[getState()].find(in);
00034                 <font class="keywordflow">if</font> (find != subst2[getState()].end())
00035                         retVal = find-&gt;second;
00036                 <font class="keywordflow">else</font> retVal = in;
00037 
00038                 setState(0);
00039                 retString[retStringIndex++] = retVal;
00040                 <font class="keywordflow">return</font> retString;
00041         }
00042         <font class="keywordflow">else</font> {
00043                 retVal = subst[in];
00044 
00045                 <font class="keywordflow">if</font> (retVal == 0) {
00046                         setState(0);
00047                         retString[retStringIndex++] = in;
00048                         <font class="keywordflow">return</font> retString;
00049                 }
00050                 <font class="keywordflow">if</font> (retVal &gt; 100) {
00051                         setState(1);
00052                         retString[retStringIndex++] = retVal;
00053                         <font class="keywordflow">return</font> retString;
00054                 }
00055                 <font class="keywordflow">if</font> (retVal == 50) {  <font class="comment">// multiChar</font>
00056                         setState(1);
00057                         <font class="keywordtype">int</font> *chars = multiChars[in];
00058                         <font class="keywordflow">if</font> (chars != 0) {
00059                                 retString[retStringIndex++] = chars[0];
00060                                 retString[retStringIndex++] = chars[1];
00061                                 <font class="keywordflow">return</font> retString;
00062                         }
00063                 }
00064         }
00065         setState(retVal);
00066         <font class="keywordflow">return</font> 0;
00067 }
00068 
00069 
00070 <font class="keywordtype">void</font> HebrewMCIM::init() {
00071         memset(subst, 0, 255);
00072 
00073         subst[<font class="charliteral">')'</font>] = 1488;
00074         subst[<font class="charliteral">'B'</font>] = 1489;
00075         subst[<font class="charliteral">'G'</font>] = 1490;
00076         subst[<font class="charliteral">'D'</font>] = 1491;
00077         subst[<font class="charliteral">'H'</font>] = 1492;
00078         subst[<font class="charliteral">'W'</font>] = 1493;
00079         subst[<font class="charliteral">'Z'</font>] = 1494;
00080         subst[<font class="charliteral">'X'</font>] = 1495;
00081         subst[<font class="charliteral">'+'</font>] = 1496;
00082         subst[<font class="charliteral">'Y'</font>] = 1497;
00083 
00084         subst[<font class="charliteral">'k'</font>] = 1498;  <font class="comment">// finals</font>
00085         subst[<font class="charliteral">'m'</font>] = 1501;
00086         subst[<font class="charliteral">'n'</font>] = 1503;
00087         subst[<font class="charliteral">'c'</font>] = 1509;
00088 
00089         subst[<font class="charliteral">'P'</font>] = 1508;
00090         subst[<font class="charliteral">'K'</font>] = 1499;
00091         subst[<font class="charliteral">'L'</font>] = 1500;
00092         subst[<font class="charliteral">'M'</font>] = 1502;
00093         subst[<font class="charliteral">'N'</font>] = 1504;
00094         subst[<font class="charliteral">'S'</font>] = 1505;
00095         subst[<font class="charliteral">'('</font>] = 1506;
00096         subst[<font class="charliteral">'p'</font>] = 1507;
00097         subst[<font class="charliteral">'C'</font>] = 1510;
00098         subst[<font class="charliteral">'Q'</font>] = 1511;
00099         subst[<font class="charliteral">'R'</font>] = 1512;
00100         subst[<font class="charliteral">'#'</font>] = 1513;
00101 
00102         <font class="comment">// special multiChars</font>
00103         subst[<font class="charliteral">'&amp;'</font>] = 50;
00104         subst[<font class="charliteral">'$'</font>] = 50;
00105 
00106         <font class="keyword">static</font> <font class="keywordtype">int</font> x[] = {1513, 1474};
00107         multiChars[<font class="charliteral">'&amp;'</font>] = x;
00108         <font class="keyword">static</font> <font class="keywordtype">int</font> y[] = {1513, 1473};
00109         multiChars[<font class="charliteral">'$'</font>] = y;
00110 
00111         subst[<font class="charliteral">'T'</font>] = 1514;
00112 
00113         <font class="comment">// VOWELS</font>
00114         subst[<font class="charliteral">'A'</font>] = 1463;
00115         subst[<font class="charliteral">'F'</font>] = 1464;
00116         subst[<font class="charliteral">'E'</font>] = 1462;
00117         subst[<font class="charliteral">'"'</font>] = 1461;
00118         subst[<font class="charliteral">'I'</font>] = 1460;
00119         subst[<font class="charliteral">'O'</font>] = 1465;
00120         subst[<font class="charliteral">'U'</font>] = 1467;
00121 
00122 
00123 
00124         <font class="comment">// OTHER DIACRITICS</font>
00125         subst[<font class="charliteral">'.'</font>] = 1468;
00126         subst[<font class="charliteral">'-'</font>] = 1470;
00127         subst[<font class="charliteral">','</font>] = 1471;
00128 
00129         <font class="comment">// Compound input</font>
00130 
00131         <font class="comment">// CANTILLATION</font>
00132 
00133         subst[<font class="charliteral">':'</font>] = 2;
00134         subst2[2][<font class="charliteral">'A'</font>] = 1458;
00135         subst2[2][<font class="charliteral">'E'</font>] = 1457;
00136         subst2[2][<font class="charliteral">'F'</font>] = 1459;
00137 
00138 
00139         <font class="comment">/* Telisha qetana is postpositive as in '04' above. However, Michigan</font>
00140 <font class="comment"># code '24' is for a medial telisha. Graphically, there is no</font>
00141 <font class="comment"># difference.</font>
00142 <font class="comment">        */</font>
00143         subst[<font class="charliteral">'2'</font>] = 5;
00144         subst2[5][<font class="charliteral">'4'</font>] = 1449;
00145 
00146 
00147         <font class="comment">/* Note Michigan encoding distinguishes between medial metheg '35' (occurring</font>
00148 <font class="comment"># on the left of the vowel), and the ordinary meteg '95' (occurring on the</font>
00149 <font class="comment"># right of the vowel). It is also used for silluq.</font>
00150 <font class="comment">        */</font>
00151         subst[<font class="charliteral">'3'</font>] = 6;
00152         subst2[6][<font class="charliteral">'3'</font>] = 1433;
00153         subst2[6][<font class="charliteral">'5'</font>] = 1469;
00154 
00155 
00156         <font class="comment">/* The Michigan code of telisha gedola in medial position. Graphically,</font>
00157 <font class="comment"># there is no difference.</font>
00158 <font class="comment">        */</font>
00159         subst[<font class="charliteral">'4'</font>] = 7;
00160         subst2[7][<font class="charliteral">'4'</font>] = 1440;
00161 
00162         subst[<font class="charliteral">'6'</font>] = 8;
00163         subst2[8][<font class="charliteral">'0'</font>] = 1451;
00164         subst2[8][<font class="charliteral">'1'</font>] = 1436;
00165 
00166         subst[<font class="charliteral">'1'</font>] = 4;
00167         subst2[4][<font class="charliteral">'0'</font>] = 1434;
00168 
00169         <font class="comment">/* In the poetic books, prepositive dehi occurs; it's unclear whether</font>
00170 <font class="comment"># tipeha also occurs in the poetic books. Otherwise, we could simply</font>
00171 <font class="comment"># check for what book in the Tanach we are in. Michigan uses the same</font>
00172 <font class="comment"># code for each.</font>
00173 <font class="comment">        */</font>
00174 
00175         subst2[4][<font class="charliteral">'3'</font>] = 1430;
00176 
00177         <font class="comment">/* This is the poetic accent mugrash, which also includes rebia, but is</font>
00178 <font class="comment"># encoded separately as '81' in the Michigan text.</font>
00179 <font class="comment">        */</font>
00180         subst2[4][<font class="charliteral">'1'</font>] = 1437;
00181         subst2[4][<font class="charliteral">'4'</font>] = 1440;
00182 
00183 
00184         subst[<font class="charliteral">'0'</font>] = 3;
00185         subst2[3][<font class="charliteral">'0'</font>] = 1475;
00186         subst2[3][<font class="charliteral">'1'</font>] = 1426;
00187 
00188         <font class="comment">/* According to BHS, zarqa and sinnor are both postpositive. However,</font>
00189 <font class="comment"># the Michigan encoding uses one code for both. The Unicode zarqa</font>
00190 <font class="comment"># (0x0598) is definitely NOT postpositive. And further, the shape of</font>
00191 <font class="comment"># the symbol is different in BHS and Unicode. This needs further</font>
00192 <font class="comment"># research to determine what's going on here. For now, we follow BHS</font>
00193 <font class="comment"># and use the postpositive Unicode zinor for both accents.</font>
00194 <font class="comment">        */</font>
00195 
00196         subst2[3][<font class="charliteral">'2'</font>] = 1454;
00197 
00198         <font class="comment">/* Pashta is postpositive, and the Unicode equivalent reflects</font>
00199 <font class="comment"># this. However, there is a poetic equivalent -- azla legarmeh --</font>
00200 <font class="comment"># which is not postpositive, but no equivalent code point exists in</font>
00201 <font class="comment"># Unicode. The Michigan encoding does not distinguish between the two,</font>
00202 <font class="comment"># although it could be algorithmically determined.</font>
00203 <font class="comment">        */</font>
00204 
00205         subst2[3][<font class="charliteral">'3'</font>] = 1433;
00206         subst2[3][<font class="charliteral">'4'</font>] = 1449;
00207         subst2[3][<font class="charliteral">'5'</font>] = 1472;
00208 
00209 
00210         <font class="comment">/* This is the Unicode Hebrew *accent*; there is also another Hebrew</font>
00211 <font class="comment"># *punctuation* called GERSHAYIM 0x05F4. I'm using the more</font>
00212 <font class="comment"># traditional rounded marks, rather than the alternate straight</font>
00213 <font class="comment"># marks.</font>
00214 <font class="comment">        */</font>
00215 
00216         subst2[8][<font class="charliteral">'2'</font>] = 1438;
00217 
00218         <font class="comment">// Also known as azla</font>
00219         subst2[8][<font class="charliteral">'3'</font>] = 1448;
00220         subst2[8][<font class="charliteral">'4'</font>] = 1452;
00221         subst2[8][<font class="charliteral">'5'</font>] = 1427;
00222 
00223 
00224         subst[<font class="charliteral">'8'</font>] = 9;
00225         subst2[9][<font class="charliteral">'0'</font>] = 1428;
00226         subst2[9][<font class="charliteral">'1'</font>] = 1431;
00227 
00228         <font class="comment">/* Note, this accent is actually sinnorit, but it does not exist as a</font>
00229 <font class="comment"># separate glyph in the Unicode standard. The 'ZINOR' Unicode accent</font>
00230 <font class="comment"># is postpositive, while sinnorit is not. ZARQA is as close as I can</font>
00231 <font class="comment"># get to this.</font>
00232 <font class="comment">        */</font>
00233         subst2[9][<font class="charliteral">'2'</font>] = 1432;
00234 
00235         <font class="comment">/* The Unicode form does not match the form used by BHS, but the names</font>
00236 <font class="comment"># are the same.</font>
00237 <font class="comment">        */</font>
00238         subst2[9][<font class="charliteral">'3'</font>] = 1441;
00239         subst2[9][<font class="charliteral">'4'</font>] = 1439;
00240         subst2[9][<font class="charliteral">'5'</font>] = 1429;
00241 
00242         subst[<font class="charliteral">'7'</font>] = 10;
00243         subst2[10][<font class="charliteral">'0'</font>] = 1444;
00244         subst2[10][<font class="charliteral">'1'</font>] = 1445;
00245         subst2[10][<font class="charliteral">'2'</font>] = 1446;
00246         subst2[10][<font class="charliteral">'3'</font>] = 1430;  <font class="comment">// also '13', '73' also is used for majela</font>
00247         subst2[10][<font class="charliteral">'4'</font>] = 1443;
00248         subst2[10][<font class="charliteral">'5'</font>] = 1469;  <font class="comment">// this is silluq; should appear to the left of the vowel</font>
00249 
00250         subst[<font class="charliteral">'9'</font>] = 11;
00251         subst2[11][<font class="charliteral">'1'</font>] = 1435;
00252         subst2[11][<font class="charliteral">'2'</font>] = 1425;
00253         subst2[11][<font class="charliteral">'3'</font>] = 1450;
00254         subst2[11][<font class="charliteral">'4'</font>] = 1447;
00255         subst2[11][<font class="charliteral">'5'</font>] = 1469;  <font class="comment">// should appear to the right of the vowel</font>
00256 
00257 }
00258 
00259         <font class="comment">/*</font>
00260 <font class="comment"></font>
00261 <font class="comment"></font>
00262 <font class="comment"># CANTILLATION MARKS</font>
00263 <font class="comment"></font>
00264 <font class="comment">        my  $ETNAHTA =           '&amp;#1425;';</font>
00265 <font class="comment"># officially the Unicode name for this symbol was "SEGOL." However, that is</font>
00266 <font class="comment"># not a unique name, conflicting with the vowel of the same name. Further,</font>
00267 <font class="comment"># the position of the symbol is different. I have changed the name of the</font>
00268 <font class="comment"># accent to "SEGOLTA," the traditional name for this accent.</font>
00269 <font class="comment">        my  $SEGOLTA =           '&amp;#1426;';</font>
00270 <font class="comment">        my  $SHALSHELET =        '&amp;#1427;';</font>
00271 <font class="comment">        my  $ZAQEF_QATAN =       '&amp;#1428;';</font>
00272 <font class="comment">        my  $ZAQEF_GADOL =       '&amp;#1429;';</font>
00273 <font class="comment">        my  $TIPEHA =            '&amp;#1430;';</font>
00274 <font class="comment">        my  $REVIA =             '&amp;#1431;';</font>
00275 <font class="comment">        my  $ZARQA =             '&amp;#1432;';</font>
00276 <font class="comment">        my  $PASHTA =            '&amp;#1433;';</font>
00277 <font class="comment">        my  $YETIV =             '&amp;#1434;';</font>
00278 <font class="comment">        my  $TEVIR =             '&amp;#1435;';</font>
00279 <font class="comment">        my  $GERESH =            '&amp;#1436;';</font>
00280 <font class="comment">        my  $GERESH_MUQDAM =     '&amp;#1437;';</font>
00281 <font class="comment">        my  $GERSHAYIM =         '&amp;#1438;';</font>
00282 <font class="comment">        my  $QARNEY_PARA =       '&amp;#1439;';</font>
00283 <font class="comment">        my  $TELISHA_GEDOLA =    '&amp;#1440;';</font>
00284 <font class="comment">        my  $PAZER =             '&amp;#1441;';</font>
00285 <font class="comment">        my  $MUNAH =             '&amp;#1443;';</font>
00286 <font class="comment">        my  $MAHAPAKH =          '&amp;#1444;';</font>
00287 <font class="comment">        my  $MERKHA =            '&amp;#1445;';</font>
00288 <font class="comment">        my  $MERKHA_KEFULA =     '&amp;#1446;';</font>
00289 <font class="comment">        my  $DARGA =             '&amp;#1447;';</font>
00290 <font class="comment">        my  $QADMA =             '&amp;#1448;';</font>
00291 <font class="comment">        my  $TELISHA_QETANA =    '&amp;#1449;';</font>
00292 <font class="comment">        my  $YERAH_BEN_YOMO =    '&amp;#1450;';</font>
00293 <font class="comment">        my  $OLE =               '&amp;#1451;';</font>
00294 <font class="comment">        my  $ILUY =              '&amp;#1452;';</font>
00295 <font class="comment">        my  $DEHI =              '&amp;#1453;';</font>
00296 <font class="comment">        my  $ZINOR =             '&amp;#1454;';</font>
00297 <font class="comment"># HEBREW MARK</font>
00298 <font class="comment">        my  $MASORA_CIRCLE =     '&amp;#1455;';</font>
00299 <font class="comment"># HEBREW EXTENDED-A  points and punctuation</font>
00300 <font class="comment">        my  $SHEVA =             '&amp;#1456;';</font>
00301 <font class="comment">        my  $HATAF_SEGOL =       '&amp;#1457;';</font>
00302 <font class="comment">        my  $HATAF_PATAH =       '&amp;#1458;';</font>
00303 <font class="comment">        my  $HATAF_QAMATS =      '&amp;#1459;';</font>
00304 <font class="comment">        my  $HIRIQ =             '&amp;#1460;';</font>
00305 <font class="comment">        my  $TSERE =             '&amp;#1461;';</font>
00306 <font class="comment">        my  $SEGOL =             '&amp;#1462;';</font>
00307 <font class="comment"># furtive Patah is not a distinct character</font>
00308 <font class="comment">        my  $PATAH =             '&amp;#1463;';</font>
00309 <font class="comment">        my  $QAMATS =            '&amp;#1464;';</font>
00310 <font class="comment">        my  $HOLAM =             '&amp;#1465;';</font>
00311 <font class="comment">        my  $QUBUTS =            '&amp;#1467;';</font>
00312 <font class="comment"># also used as shuruq</font>
00313 <font class="comment"># falls within the base letter</font>
00314 <font class="comment">        my  $DAGESH_OR_MAPIQ =   '&amp;#1468;';</font>
00315 <font class="comment"># also used as siluq</font>
00316 <font class="comment">        my  $METAG =             '&amp;#1469;';</font>
00317 <font class="comment">        my  $MAQAF =             '&amp;#1470;';</font>
00318 <font class="comment">        my  $RAFE =              '&amp;#1471;';</font>
00319 <font class="comment"># Also used for legarmeh</font>
00320 <font class="comment">#   may be treated as spacing punctuation, not as a point</font>
00321 <font class="comment">        my  $PASEQ =             '&amp;#1472;';</font>
00322 <font class="comment">        my  $SHIN_DOT =          '&amp;#1473;';</font>
00323 <font class="comment">        my  $SIN_DOT =           '&amp;#1474;';</font>
00324 <font class="comment">        my  $SOF_PASUQ =         '&amp;#1475;';</font>
00325 <font class="comment"># HEBREW MARK</font>
00326 <font class="comment">        my  $UPPER_DOT =         '&amp;#1476;';</font>
00327 <font class="comment"># HEBREW LETTERS based on ISO 8859-8</font>
00328 <font class="comment"># aleph</font>
00329 <font class="comment">#  x (alef symbol - 2135)</font>
00330 <font class="comment">        my  $ALEF =              '&amp;#1488;';</font>
00331 <font class="comment">#  x (bet symbol - 2136)</font>
00332 <font class="comment">        my  $BET =               '&amp;#1489;';</font>
00333 <font class="comment">#  x (gimel symbol - 2137)</font>
00334 <font class="comment">        my  $GIMEL =             '&amp;#1490;';</font>
00335 <font class="comment">#  x (dalet symbol - 2138)</font>
00336 <font class="comment">        my  $DALET =             '&amp;#1491;';</font>
00337 <font class="comment">        my  $HE =                '&amp;#1492;';</font>
00338 <font class="comment">        my  $VAV =               '&amp;#1493;';</font>
00339 <font class="comment">        my  $ZAYIN =             '&amp;#1494;';</font>
00340 <font class="comment">        my  $HET =               '&amp;#1495;';</font>
00341 <font class="comment">        my  $TET =               '&amp;#1496;';</font>
00342 <font class="comment">        my  $YOD =               '&amp;#1497;';</font>
00343 <font class="comment">        my  $FINAL_KAF =         '&amp;#1498;';</font>
00344 <font class="comment">        my  $KAF =               '&amp;#1499;';</font>
00345 <font class="comment">        my  $LAMED =             '&amp;#1500;';</font>
00346 <font class="comment">        my  $FINAL_MEM =         '&amp;#1501;';</font>
00347 <font class="comment">        my  $MEM =               '&amp;#1502;';</font>
00348 <font class="comment">        my  $FINAL_NUN =         '&amp;#1503;';</font>
00349 <font class="comment">        my  $NUN =               '&amp;#1504;';</font>
00350 <font class="comment">        my  $SAMEKH =            '&amp;#1505;';</font>
00351 <font class="comment">        my  $AYIN =              '&amp;#1506;';</font>
00352 <font class="comment">        my  $FINAL_PE =          '&amp;#1507;';</font>
00353 <font class="comment">        my  $PE =                '&amp;#1508;';</font>
00354 <font class="comment">        my  $FINAL_TSADI =       '&amp;#1509;';</font>
00355 <font class="comment"># also known as zade</font>
00356 <font class="comment">        my  $TSADI =             '&amp;#1510;';</font>
00357 <font class="comment">        my  $QOF =               '&amp;#1511;';</font>
00358 <font class="comment">        my  $RESH =              '&amp;#1512;';</font>
00359 <font class="comment">        my  $SHIN =              '&amp;#1513;';</font>
00360 <font class="comment">        my  $TAV =               '&amp;#1514;';</font>
00361 <font class="comment"># Yiddish digraphs</font>
00362 <font class="comment">#   Hebrew Ligature</font>
00363 <font class="comment"># tsvey vovn</font>
00364 <font class="comment">        my  $DOUBLE_VAV =        '&amp;#1520;';</font>
00365 <font class="comment">        my  $VAV_YOD =           '&amp;#1521;';</font>
00366 <font class="comment"># tsvey yudn</font>
00367 <font class="comment">        my  $DOUBLE_YOD =        '&amp;#1522;';</font>
00368 <font class="comment"></font>
00369 <font class="comment"># Additional punctuation</font>
00370 <font class="comment">        my  $PUNCT_GERESH =      '&amp;#1523;';</font>
00371 <font class="comment">        my  $PUNCT_GERSHAYIM =   '&amp;#1524;';</font>
00372 <font class="comment"># Reserved: 0x05F5"</font>
00373 <font class="comment"># x (hebrew point judeo-spanish varika - FB1E)</font>
00374 <font class="comment">#my  $JUDEO_SPANISH_VARIKA = pack("U",0xFB1E); # UTF-8 OxFB1E</font>
00375 <font class="comment"></font>
00376 <font class="comment">#############################</font>
00377 <font class="comment"># End of Unicode 2.0 Hebrew #</font>
00378 <font class="comment">#############################</font>
00379 <font class="comment"></font>
00380 <font class="comment"># A hash whose key is a Michigan code, and whose value is a Unicode</font>
00381 <font class="comment"># equivalent</font>
00382 <font class="comment"></font>
00383 <font class="comment">        char subst[] = new char [255];</font>
00384 <font class="comment">        subst[')'] = 1488;</font>
00385 <font class="comment">        'B'  =&gt; $BET,</font>
00386 <font class="comment">        'G'  =&gt; $GIMEL,</font>
00387 <font class="comment">        'D'  =&gt; $DALET,</font>
00388 <font class="comment">        'H'  =&gt; $HE,</font>
00389 <font class="comment">        'W'  =&gt; $VAV,</font>
00390 <font class="comment">        'Z'  =&gt; $ZAYIN,</font>
00391 <font class="comment">        'X'  =&gt; $HET,</font>
00392 <font class="comment">        '+'  =&gt; $TET,</font>
00393 <font class="comment">        'Y'  =&gt; $YOD,</font>
00394 <font class="comment">        'K'  =&gt; $KAF,</font>
00395 <font class="comment">        'L'  =&gt; $LAMED,</font>
00396 <font class="comment">        'M'  =&gt; $MEM,</font>
00397 <font class="comment">        'N'  =&gt; $NUN,</font>
00398 <font class="comment">        'S'  =&gt; $SAMEKH,</font>
00399 <font class="comment">        '('  =&gt; $AYIN,</font>
00400 <font class="comment">        'P'  =&gt; $PE,</font>
00401 <font class="comment">        'C'  =&gt; $TSADI,</font>
00402 <font class="comment">        'Q'  =&gt; $QOF,</font>
00403 <font class="comment">        'R'  =&gt; $RESH,</font>
00404 <font class="comment">        '#'  =&gt; $SHIN, # the letter shin without a point</font>
00405 <font class="comment">        '&amp;'  =&gt; ($SHIN . $SIN_DOT),</font>
00406 <font class="comment">        '$'  =&gt; ($SHIN . $SHIN_DOT), # '</font>
00407 <font class="comment">        'T'  =&gt; $TAV,</font>
00408 <font class="comment"># VOWELS</font>
00409 <font class="comment">        'A'  =&gt; $PATAH,</font>
00410 <font class="comment">        'F'  =&gt; $QAMATS,</font>
00411 <font class="comment">        'E'  =&gt; $SEGOL,</font>
00412 <font class="comment">        '"'  =&gt; $TSERE,</font>
00413 <font class="comment">        'I'  =&gt; $HIRIQ,</font>
00414 <font class="comment">        'O'  =&gt; $HOLAM,</font>
00415 <font class="comment">        'U'  =&gt; $QUBUTS,</font>
00416 <font class="comment">        ':'  =&gt; $SHEVA,</font>
00417 <font class="comment">        ':A' =&gt; $HATAF_PATAH,</font>
00418 <font class="comment">        ':E' =&gt; $HATAF_SEGOL,</font>
00419 <font class="comment">        ':F' =&gt; $HATAF_QAMATS,</font>
00420 <font class="comment"># OTHER DIACRITICS</font>
00421 <font class="comment">        '.'  =&gt; $DAGESH_OR_MAPIQ,</font>
00422 <font class="comment">        '-'  =&gt; $MAQAF,</font>
00423 <font class="comment">        ','  =&gt; $RAFE,</font>
00424 <font class="comment"># CANTILLATION</font>
00425 <font class="comment">        '00' =&gt; $SOF_PASUQ,</font>
00426 <font class="comment">        '01' =&gt; $SEGOLTA,</font>
00427 <font class="comment"># According to BHS, zarqa and sinnor are both postpositive. However,</font>
00428 <font class="comment"># the Michigan encoding uses one code for both. The Unicode zarqa</font>
00429 <font class="comment"># (0x0598) is definitely NOT postpositive. And further, the shape of</font>
00430 <font class="comment"># the symbol is different in BHS and Unicode. This needs further</font>
00431 <font class="comment"># research to determine what's going on here. For now, we follow BHS</font>
00432 <font class="comment"># and use the postpositive Unicode zinor for both accents.</font>
00433 <font class="comment">        '02' =&gt; $ZINOR,</font>
00434 <font class="comment"># Pashta is postpositive, and the Unicode equivalent reflects</font>
00435 <font class="comment"># this. However, there is a poetic equivalent -- azla legarmeh --</font>
00436 <font class="comment"># which is not postpositive, but no equivalent code point exists in</font>
00437 <font class="comment"># Unicode. The Michigan encoding does not distinguish between the two,</font>
00438 <font class="comment"># although it could be algorithmically determined.</font>
00439 <font class="comment">        '03' =&gt; $PASHTA,</font>
00440 <font class="comment">        '04' =&gt; $TELISHA_QETANA,</font>
00441 <font class="comment">        '05' =&gt; $PASEQ,</font>
00442 <font class="comment">        '10' =&gt; $YETIV,</font>
00443 <font class="comment"># In the poetic books, prepositive dehi occurs; it's unclear whether</font>
00444 <font class="comment"># tipeha also occurs in the poetic books. Otherwise, we could simply</font>
00445 <font class="comment"># check for what book in the Tanach we are in. Michigan uses the same</font>
00446 <font class="comment"># code for each.</font>
00447 <font class="comment">        '13' =&gt; $TIPEHA, # also $DEHI</font>
00448 <font class="comment"># This is the poetic accent mugrash, which also includes rebia, but is</font>
00449 <font class="comment"># encoded separately as '81' in the Michigan text.</font>
00450 <font class="comment">        '11' =&gt; $GERESH_MUQDAM,</font>
00451 <font class="comment">        '14' =&gt; $TELISHA_GEDOLA,</font>
00452 <font class="comment"># Telisha qetana is postpositive as in '04' above. However, Michigan</font>
00453 <font class="comment"># code '24' is for a medial telisha. Graphically, there is no</font>
00454 <font class="comment"># difference.</font>
00455 <font class="comment">        '24' =&gt; $TELISHA_QETANA,</font>
00456 <font class="comment">        '33' =&gt; $PASHTA,</font>
00457 <font class="comment"># The Michigan code of telisha gedola in medial position. Graphically,</font>
00458 <font class="comment"># there is no difference.</font>
00459 <font class="comment">        '44' =&gt; $TELISHA_GEDOLA,</font>
00460 <font class="comment">        '60' =&gt; $OLE,</font>
00461 <font class="comment">        '61' =&gt; $GERESH,</font>
00462 <font class="comment"># This is the Unicode Hebrew *accent*; there is also another Hebrew</font>
00463 <font class="comment"># *punctuation* called GERSHAYIM 0x05F4. I'm using the more</font>
00464 <font class="comment"># traditional rounded marks, rather than the alternate straight</font>
00465 <font class="comment"># marks.</font>
00466 <font class="comment">        '62' =&gt; $GERSHAYIM,</font>
00467 <font class="comment"># Also known as azla</font>
00468 <font class="comment">        '63' =&gt; $QADMA,</font>
00469 <font class="comment">        '64' =&gt; $ILUY,</font>
00470 <font class="comment">        '65' =&gt; $SHALSHELET,</font>
00471 <font class="comment">        '80' =&gt; $ZAQEF_QATAN,</font>
00472 <font class="comment">        '81' =&gt; $REVIA,</font>
00473 <font class="comment"># Note, this accent is actually sinnorit, but it does not exist as a</font>
00474 <font class="comment"># separate glyph in the Unicode standard. The 'ZINOR' Unicode accent</font>
00475 <font class="comment"># is postpositive, while sinnorit is not. ZARQA is as close as I can</font>
00476 <font class="comment"># get to this.</font>
00477 <font class="comment">        '82' =&gt; $ZARQA,</font>
00478 <font class="comment"># The Unicode form does not match the form used by BHS, but the names</font>
00479 <font class="comment"># are the same.</font>
00480 <font class="comment">        '83' =&gt; $PAZER,</font>
00481 <font class="comment">        '84' =&gt; $QARNEY_PARA,</font>
00482 <font class="comment">        '85' =&gt; $ZAQEF_GADOL,</font>
00483 <font class="comment"># Note Michigan encoding distinguishes between medial metheg '35' (occurring</font>
00484 <font class="comment"># on the left of the vowel), and the ordinary meteg '95' (occurring on the</font>
00485 <font class="comment"># right of the vowel). It is also used for silluq.</font>
00486 <font class="comment">        '35' =&gt; $METAG,</font>
00487 <font class="comment">        '70' =&gt; $MAHAPAKH,</font>
00488 <font class="comment">        '71' =&gt; $MERKHA,</font>
00489 <font class="comment">        '72' =&gt; $MERKHA_KEFULA,</font>
00490 <font class="comment">        '73' =&gt; $TIPEHA, # also '13', '73' also is used for majela</font>
00491 <font class="comment">        '74' =&gt; $MUNAH,</font>
00492 <font class="comment">        '75' =&gt; $METAG, # this is silluq; should appear to the left of the vowel</font>
00493 <font class="comment">        '91' =&gt; $TEVIR,</font>
00494 <font class="comment">        '92' =&gt; $ETNAHTA,</font>
00495 <font class="comment">        '93' =&gt; $YERAH_BEN_YOMO,</font>
00496 <font class="comment">        '94' =&gt; $DARGA,</font>
00497 <font class="comment">        '95' =&gt; $METAG, # should appear to the right of the vowel</font>
00498 <font class="comment"></font>
00499 <font class="comment"># Not used by the Michigan Encoding</font>
00500 <font class="comment"># $UPPER_DOT = '05C4';</font>
00501 <font class="comment">        );</font>
00502 <font class="comment"></font>
00503 <font class="comment"># declare other variables</font>
00504 <font class="comment">        my (@bhsLines,</font>
00505 <font class="comment">        @bhsVerse,</font>
00506 <font class="comment">        @entity_line) = ();</font>
00507 <font class="comment"></font>
00508 <font class="comment">        my ($i,</font>
00509 <font class="comment">        $verse,</font>
00510 <font class="comment">        $word,</font>
00511 <font class="comment">        $character) = 0;</font>
00512 <font class="comment"></font>
00513 <font class="comment">        my ($element,</font>
00514 <font class="comment">        $saveGuttural) = "";</font>
00515 <font class="comment"></font>
00516 <font class="comment"># read in a line</font>
00517 <font class="comment">        while (&lt;&gt;) {</font>
00518 <font class="comment"># Process one verse</font>
00519 <font class="comment"># iterate over every character and change to XML decimal entity</font>
00520 <font class="comment">        CHAR: for ( $i = 0; ($i &lt; scalar(@bhsVerse)); $i++) {</font>
00521 <font class="comment">         # find and convert final kaf, mem, nun, pe, tsade</font>
00522 <font class="comment">         ( # if final form</font>
00523 <font class="comment">          $bhsVerse[$i] =~ /[KMNPC]/</font>
00524 <font class="comment">         )</font>
00525 <font class="comment">           &amp;&amp;</font>
00526 <font class="comment">                (</font>
00527 <font class="comment">                 ( # whitespace or</font>
00528 <font class="comment">                  $bhsVerse[$i+1] =~ /[ \-?]/</font>
00529 <font class="comment">                 )</font>
00530 <font class="comment">                 ||</font>
00531 <font class="comment">                 ( # EOL or</font>
00532 <font class="comment">                  $i == ( scalar(@bhsVerse) - 1 )</font>
00533 <font class="comment">                 )</font>
00534 <font class="comment">                 ||</font>
00535 <font class="comment">                 ( # sof pasuq or</font>
00536 <font class="comment">                  ( $bhsVerse[$i+1] =~ /0/ ) &amp;&amp;</font>
00537 <font class="comment">                  ( $bhsVerse[$i+2] =~ /0/ )</font>
00538 <font class="comment">                 )</font>
00539 <font class="comment">                 ||</font>
00540 <font class="comment">                 ( # one accent followed by white, eol or</font>
00541 <font class="comment">                  (</font>
00542 <font class="comment">                   ( $bhsVerse[$i+1] =~ /\d/ ) &amp;&amp;</font>
00543 <font class="comment">                   ( $bhsVerse[$i+2] =~ /\d/ )</font>
00544 <font class="comment">                  ) &amp;&amp;</font>
00545 <font class="comment">                  (</font>
00546 <font class="comment">                   ( $bhsVerse[$i+3] =~ /[ \-?]/ ) ||</font>
00547 <font class="comment">                   ( $i == ( scalar(@bhsVerse) - 1 ) )</font>
00548 <font class="comment">                  )</font>
00549 <font class="comment">                 )</font>
00550 <font class="comment">                 ||</font>
00551 <font class="comment">                 ( # two accents followed by white, eol</font>
00552 <font class="comment">                  (</font>
00553 <font class="comment">                   ( $bhsVerse[$i+1] =~ /\d/ ) &amp;&amp;</font>
00554 <font class="comment">                   ( $bhsVerse[$i+2] =~ /\d/ ) &amp;&amp;</font>
00555 <font class="comment">                   ( $bhsVerse[$i+3] =~ /\d/ ) &amp;&amp;</font>
00556 <font class="comment">                   ( $bhsVerse[$i+4] =~ /\d/ )</font>
00557 <font class="comment">                  ) &amp;&amp;</font>
00558 <font class="comment">                  (</font>
00559 <font class="comment">                   ( $bhsVerse[$i+5] =~ /[ \-?]/ ) ||</font>
00560 <font class="comment">                   ( $i == ( scalar(@bhsVerse) - 1 ) )</font>
00561 <font class="comment">                  )</font>
00562 <font class="comment">                 )</font>
00563 <font class="comment">                 ||</font>
00564 <font class="comment">                 ( # followed by a vowel and white, eol, sof pasuq</font>
00565 <font class="comment">                  ( $bhsVerse[$i+1] =~ /[:F]/ ) &amp;&amp;</font>
00566 <font class="comment">                  ( # followed by</font>
00567 <font class="comment">                   ( $bhsVerse[$i+2] =~ /[ \-?]/ ) || # whitespace or</font>
00568 <font class="comment">                   ( $i == ( scalar(@bhsVerse) - 1 ) ) || # eol or</font>
00569 <font class="comment">                   ( # sof pasuq</font>
00570 <font class="comment">                    ( $bhsVerse[$i+2] =~ /0/ ) &amp;&amp;</font>
00571 <font class="comment">                    ( $bhsVerse[$i+3] =~ /0/ )</font>
00572 <font class="comment">                   )</font>
00573 <font class="comment">                  )</font>
00574 <font class="comment">                 )</font>
00575 <font class="comment">                ) # end of what follows after final letter</font>
00576 <font class="comment">                  &amp;&amp;</font>
00577 <font class="comment">                    do {</font>
00578 <font class="comment">                         $bhsVerse[$i] =~ /K/ &amp;&amp; eval { push @entity_line,$FINAL_KAF; }</font>
00579 <font class="comment">                           &amp;&amp; next CHAR;</font>
00580 <font class="comment">                         $bhsVerse[$i] =~ /M/ &amp;&amp; eval { push @entity_line,$FINAL_MEM; }</font>
00581 <font class="comment">                           &amp;&amp; next CHAR;</font>
00582 <font class="comment">                         $bhsVerse[$i] =~ /N/ &amp;&amp; eval { push @entity_line,$FINAL_NUN; }</font>
00583 <font class="comment">                           &amp;&amp; next CHAR;</font>
00584 <font class="comment">                         $bhsVerse[$i] =~ /P/ &amp;&amp; eval { push @entity_line,$FINAL_PE; }</font>
00585 <font class="comment">                           &amp;&amp; next CHAR;</font>
00586 <font class="comment">                         $bhsVerse[$i] =~ /C/ &amp;&amp; eval { push @entity_line,$FINAL_TSADI; }</font>
00587 <font class="comment">                           &amp;&amp; next CHAR;</font>
00588 <font class="comment">                    };</font>
00589 <font class="comment">         # find and convert "furtive patach"</font>
00590 <font class="comment">         ( $bhsVerse[$i] =~ /A/ ) &amp;&amp;             # If the letter is a patach</font>
00591 <font class="comment">           ( $bhsVerse[$i-1] =~ /[)HX(]/ ) &amp;&amp;    #  and is preceded by a guttural</font>
00592 <font class="comment">           ( ( $bhsVerse[$i-2] =~ /[AEFOU]/ ) || #  and is preceded by a vowel</font>
00593 <font class="comment">                ( ( $bhsVerse[$i-2] =~ /\./ ) &amp;&amp;    #  or by suruq</font>
00594 <font class="comment">                  ( $bhsVerse[$i-3] =~ /W/ ) ) ||    #</font>
00595 <font class="comment">                ( ( $bhsVerse[$i-2] =~ /W/ ) &amp;&amp;      #  or by holem (written plene)</font>
00596 <font class="comment">                  ( $bhsVerse[$i-3] =~ /O/ ) ) ||    #</font>
00597 <font class="comment">                ( ( $bhsVerse[$i-2] =~ /Y/ ) &amp;&amp;      #  or by hiriq-yod</font>
00598 <font class="comment">                  ( $bhsVerse[$i-3] =~ /I/ ) ) ) &amp;&amp;</font>
00599 <font class="comment">                  do {</font>
00600 <font class="comment">                         $saveGuttural = pop @entity_line; # snip off the guttural</font>
00601 <font class="comment">                         push @entity_line,$PATAH;         # push on the patach</font>
00602 <font class="comment">                         push @entity_line,$saveGuttural;  # push back on the guttural</font>
00603 <font class="comment">                         next CHAR;</font>
00604 <font class="comment">                  };</font>
00605 <font class="comment">         # convert cantillation</font>
00606 <font class="comment">         #   since we have previously dealt with all other cases of</font>
00607 <font class="comment">         #   numbers, two digit patterns are all we have to search for</font>
00608 <font class="comment">         $bhsVerse[$i] =~ /\d/ &amp;&amp; $bhsVerse[$i+1] =~ /\d/ &amp;&amp; do {</font>
00609 <font class="comment">                push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]$bhsVerse[$i+1]"};</font>
00610 <font class="comment">                $i++; # accents are two digits long, so advance past the 2nd digit</font>
00611 <font class="comment">                next CHAR;</font>
00612 <font class="comment">         };</font>
00613 <font class="comment">         # convert katef vowels, which are two characters long</font>
00614 <font class="comment">         $bhsVerse[$i] =~ /:/ &amp;&amp; $bhsVerse[$i+1] =~ /[AEF]/ &amp;&amp; do {</font>
00615 <font class="comment">                push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]$bhsVerse[$i+1]"};</font>
00616 <font class="comment">                $i++;</font>
00617 <font class="comment">                next CHAR;</font>
00618 <font class="comment">         };</font>
00619 <font class="comment">         # convert everything else</font>
00620 <font class="comment">         push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]"};</font>
00621 <font class="comment">        } # end CHAR</font>
00622 <font class="comment"># print the line to standard output with XML character-level encoding</font>
00623 <font class="comment"># each character has the following format:</font>
00624 <font class="comment"># &lt;c id="1kg1.verse#.word#.character#"&gt;&amp;#1234;&lt;/c&gt;</font>
00625 <font class="comment"></font>
00626 <font class="comment"># set up the verse element</font>
00627 <font class="comment">        $word = 1;</font>
00628 <font class="comment">        $character = 1;</font>
00629 <font class="comment">        print "&lt;verse&gt;\n&lt;word&gt;\n";</font>
00630 <font class="comment"># print each character element</font>
00631 <font class="comment"># if there is a space, then close the word element, open a new word</font>
00632 <font class="comment"># element, increment the word number, reset the character number to</font>
00633 <font class="comment"># one.</font>
00634 <font class="comment">        foreach $element (@entity_line) {</font>
00635 <font class="comment">         if ( $element =~ " " ) {</font>
00636 <font class="comment">           $word++;</font>
00637 <font class="comment">           $character = 1;</font>
00638 <font class="comment">           print "&lt;/word&gt;\n&lt;word&gt;\n";</font>
00639 <font class="comment">           next;</font>
00640 <font class="comment">         }</font>
00641 <font class="comment">         print "&lt;c id=\"1kg1.$verse.$word.$character\"&gt;$element&lt;/c&gt;\n";</font>
00642 <font class="comment">         $character++;</font>
00643 <font class="comment">        }</font>
00644 <font class="comment"># close the verse element</font>
00645 <font class="comment">        print "&lt;/word&gt;&lt;/verse&gt;\n";</font>
00646 <font class="comment"># reinitialize variables</font>
00647 <font class="comment">        @bhsVerse = ();</font>
00648 <font class="comment">        @entity_line = ();</font>
00649 <font class="comment">        @bhsLines = ();</font>
00650 <font class="comment">        } # end while</font>
00651 <font class="comment"># close the XML document</font>
00652 <font class="comment">        print "&lt;/body&gt;\n";</font>
00653 <font class="comment">        */</font>
</pre></div><hr><address align="right"><small>Generated on Thu Jun 20 22:12:59 2002 for The Sword Project by
<a href="http://www.doxygen.org/index.html">
<img src="doxygen.png" alt="doxygen" align="middle" border=0 
width=110 height=53></a>1.2.15 </small></address>
</body>
</html>