aboutsummaryrefslogtreecommitdiffstats
path: root/doc/api-documentation/html/hebrewmcim_8cpp-source.html
diff options
context:
space:
mode:
authordanglassey <danglassey>2002-08-14 09:57:17 +0000
committerdanglassey <danglassey>2002-08-14 09:57:17 +0000
commitc9458897ebbb739d8db83c80e06512d8a612f743 (patch)
treef8c5381045887e34388cc6b26cfccc254bf766dc /doc/api-documentation/html/hebrewmcim_8cpp-source.html
downloadsword-sf-cvs-c9458897ebbb739d8db83c80e06512d8a612f743.tar.gz
*** empty log message ***
Diffstat (limited to 'doc/api-documentation/html/hebrewmcim_8cpp-source.html')
-rw-r--r--doc/api-documentation/html/hebrewmcim_8cpp-source.html658
1 files changed, 658 insertions, 0 deletions
diff --git a/doc/api-documentation/html/hebrewmcim_8cpp-source.html b/doc/api-documentation/html/hebrewmcim_8cpp-source.html
new file mode 100644
index 0000000..6ec69fa
--- /dev/null
+++ b/doc/api-documentation/html/hebrewmcim_8cpp-source.html
@@ -0,0 +1,658 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
+<html><head><meta http-equiv="Content-Type" content="text/html;charset=iso-8859-1">
+<title>hebrewmcim.cpp Source File</title>
+<link href="doxygen.css" rel="stylesheet" type="text/css">
+</head><body>
+<!-- Generated by Doxygen 1.2.15 -->
+<center>
+<a class="qindex" href="index.html">Main Page</a> &nbsp; <a class="qindex" href="namespaces.html">Namespace List</a> &nbsp; <a class="qindex" href="hierarchy.html">Class Hierarchy</a> &nbsp; <a class="qindex" href="classes.html">Alphabetical List</a> &nbsp; <a class="qindex" href="annotated.html">Compound List</a> &nbsp; <a class="qindex" href="files.html">File List</a> &nbsp; <a class="qindex" href="functions.html">Compound Members</a> &nbsp; </center>
+<hr><h1>hebrewmcim.cpp</h1><div class="fragment"><pre>00001
+00011 <font class="preprocessor">#include &lt;hebrewmcim.h&gt;</font>
+00012
+00013 HebrewMCIM::HebrewMCIM()
+00014 :<a class="code" href="class_s_w_input_method.html">SWInputMethod</a>() {
+00015
+00016 init();
+00017 }
+00018
+00019
+00020 <font class="keywordtype">int</font> *HebrewMCIM::translate(<font class="keywordtype">char</font> in) {
+00021 <font class="keywordtype">int</font> retVal = 0;
+00022 <font class="keyword">static</font> <font class="keywordtype">int</font> retString[5];
+00023 <font class="keywordtype">int</font> retStringIndex = 0;
+00024
+00025 memset(retString, 0, 5);
+00026
+00027 <font class="keywordflow">if</font> (getState() &gt; 1) {
+00028 <font class="keywordflow">if</font> (getState() &gt;= 12) { <font class="comment">// serious issue with internal structure</font>
+00029 setState(0);
+00030 retString[retStringIndex++] = in;
+00031 <font class="keywordflow">return</font> retString;
+00032 }
+00033 map&lt;int, int&gt;::iterator find = subst2[getState()].find(in);
+00034 <font class="keywordflow">if</font> (find != subst2[getState()].end())
+00035 retVal = find-&gt;second;
+00036 <font class="keywordflow">else</font> retVal = in;
+00037
+00038 setState(0);
+00039 retString[retStringIndex++] = retVal;
+00040 <font class="keywordflow">return</font> retString;
+00041 }
+00042 <font class="keywordflow">else</font> {
+00043 retVal = subst[in];
+00044
+00045 <font class="keywordflow">if</font> (retVal == 0) {
+00046 setState(0);
+00047 retString[retStringIndex++] = in;
+00048 <font class="keywordflow">return</font> retString;
+00049 }
+00050 <font class="keywordflow">if</font> (retVal &gt; 100) {
+00051 setState(1);
+00052 retString[retStringIndex++] = retVal;
+00053 <font class="keywordflow">return</font> retString;
+00054 }
+00055 <font class="keywordflow">if</font> (retVal == 50) { <font class="comment">// multiChar</font>
+00056 setState(1);
+00057 <font class="keywordtype">int</font> *chars = multiChars[in];
+00058 <font class="keywordflow">if</font> (chars != 0) {
+00059 retString[retStringIndex++] = chars[0];
+00060 retString[retStringIndex++] = chars[1];
+00061 <font class="keywordflow">return</font> retString;
+00062 }
+00063 }
+00064 }
+00065 setState(retVal);
+00066 <font class="keywordflow">return</font> 0;
+00067 }
+00068
+00069
+00070 <font class="keywordtype">void</font> HebrewMCIM::init() {
+00071 memset(subst, 0, 255);
+00072
+00073 subst[<font class="charliteral">')'</font>] = 1488;
+00074 subst[<font class="charliteral">'B'</font>] = 1489;
+00075 subst[<font class="charliteral">'G'</font>] = 1490;
+00076 subst[<font class="charliteral">'D'</font>] = 1491;
+00077 subst[<font class="charliteral">'H'</font>] = 1492;
+00078 subst[<font class="charliteral">'W'</font>] = 1493;
+00079 subst[<font class="charliteral">'Z'</font>] = 1494;
+00080 subst[<font class="charliteral">'X'</font>] = 1495;
+00081 subst[<font class="charliteral">'+'</font>] = 1496;
+00082 subst[<font class="charliteral">'Y'</font>] = 1497;
+00083
+00084 subst[<font class="charliteral">'k'</font>] = 1498; <font class="comment">// finals</font>
+00085 subst[<font class="charliteral">'m'</font>] = 1501;
+00086 subst[<font class="charliteral">'n'</font>] = 1503;
+00087 subst[<font class="charliteral">'c'</font>] = 1509;
+00088
+00089 subst[<font class="charliteral">'P'</font>] = 1508;
+00090 subst[<font class="charliteral">'K'</font>] = 1499;
+00091 subst[<font class="charliteral">'L'</font>] = 1500;
+00092 subst[<font class="charliteral">'M'</font>] = 1502;
+00093 subst[<font class="charliteral">'N'</font>] = 1504;
+00094 subst[<font class="charliteral">'S'</font>] = 1505;
+00095 subst[<font class="charliteral">'('</font>] = 1506;
+00096 subst[<font class="charliteral">'p'</font>] = 1507;
+00097 subst[<font class="charliteral">'C'</font>] = 1510;
+00098 subst[<font class="charliteral">'Q'</font>] = 1511;
+00099 subst[<font class="charliteral">'R'</font>] = 1512;
+00100 subst[<font class="charliteral">'#'</font>] = 1513;
+00101
+00102 <font class="comment">// special multiChars</font>
+00103 subst[<font class="charliteral">'&amp;'</font>] = 50;
+00104 subst[<font class="charliteral">'$'</font>] = 50;
+00105
+00106 <font class="keyword">static</font> <font class="keywordtype">int</font> x[] = {1513, 1474};
+00107 multiChars[<font class="charliteral">'&amp;'</font>] = x;
+00108 <font class="keyword">static</font> <font class="keywordtype">int</font> y[] = {1513, 1473};
+00109 multiChars[<font class="charliteral">'$'</font>] = y;
+00110
+00111 subst[<font class="charliteral">'T'</font>] = 1514;
+00112
+00113 <font class="comment">// VOWELS</font>
+00114 subst[<font class="charliteral">'A'</font>] = 1463;
+00115 subst[<font class="charliteral">'F'</font>] = 1464;
+00116 subst[<font class="charliteral">'E'</font>] = 1462;
+00117 subst[<font class="charliteral">'"'</font>] = 1461;
+00118 subst[<font class="charliteral">'I'</font>] = 1460;
+00119 subst[<font class="charliteral">'O'</font>] = 1465;
+00120 subst[<font class="charliteral">'U'</font>] = 1467;
+00121
+00122
+00123
+00124 <font class="comment">// OTHER DIACRITICS</font>
+00125 subst[<font class="charliteral">'.'</font>] = 1468;
+00126 subst[<font class="charliteral">'-'</font>] = 1470;
+00127 subst[<font class="charliteral">','</font>] = 1471;
+00128
+00129 <font class="comment">// Compound input</font>
+00130
+00131 <font class="comment">// CANTILLATION</font>
+00132
+00133 subst[<font class="charliteral">':'</font>] = 2;
+00134 subst2[2][<font class="charliteral">'A'</font>] = 1458;
+00135 subst2[2][<font class="charliteral">'E'</font>] = 1457;
+00136 subst2[2][<font class="charliteral">'F'</font>] = 1459;
+00137
+00138
+00139 <font class="comment">/* Telisha qetana is postpositive as in '04' above. However, Michigan</font>
+00140 <font class="comment"># code '24' is for a medial telisha. Graphically, there is no</font>
+00141 <font class="comment"># difference.</font>
+00142 <font class="comment"> */</font>
+00143 subst[<font class="charliteral">'2'</font>] = 5;
+00144 subst2[5][<font class="charliteral">'4'</font>] = 1449;
+00145
+00146
+00147 <font class="comment">/* Note Michigan encoding distinguishes between medial metheg '35' (occurring</font>
+00148 <font class="comment"># on the left of the vowel), and the ordinary meteg '95' (occurring on the</font>
+00149 <font class="comment"># right of the vowel). It is also used for silluq.</font>
+00150 <font class="comment"> */</font>
+00151 subst[<font class="charliteral">'3'</font>] = 6;
+00152 subst2[6][<font class="charliteral">'3'</font>] = 1433;
+00153 subst2[6][<font class="charliteral">'5'</font>] = 1469;
+00154
+00155
+00156 <font class="comment">/* The Michigan code of telisha gedola in medial position. Graphically,</font>
+00157 <font class="comment"># there is no difference.</font>
+00158 <font class="comment"> */</font>
+00159 subst[<font class="charliteral">'4'</font>] = 7;
+00160 subst2[7][<font class="charliteral">'4'</font>] = 1440;
+00161
+00162 subst[<font class="charliteral">'6'</font>] = 8;
+00163 subst2[8][<font class="charliteral">'0'</font>] = 1451;
+00164 subst2[8][<font class="charliteral">'1'</font>] = 1436;
+00165
+00166 subst[<font class="charliteral">'1'</font>] = 4;
+00167 subst2[4][<font class="charliteral">'0'</font>] = 1434;
+00168
+00169 <font class="comment">/* In the poetic books, prepositive dehi occurs; it's unclear whether</font>
+00170 <font class="comment"># tipeha also occurs in the poetic books. Otherwise, we could simply</font>
+00171 <font class="comment"># check for what book in the Tanach we are in. Michigan uses the same</font>
+00172 <font class="comment"># code for each.</font>
+00173 <font class="comment"> */</font>
+00174
+00175 subst2[4][<font class="charliteral">'3'</font>] = 1430;
+00176
+00177 <font class="comment">/* This is the poetic accent mugrash, which also includes rebia, but is</font>
+00178 <font class="comment"># encoded separately as '81' in the Michigan text.</font>
+00179 <font class="comment"> */</font>
+00180 subst2[4][<font class="charliteral">'1'</font>] = 1437;
+00181 subst2[4][<font class="charliteral">'4'</font>] = 1440;
+00182
+00183
+00184 subst[<font class="charliteral">'0'</font>] = 3;
+00185 subst2[3][<font class="charliteral">'0'</font>] = 1475;
+00186 subst2[3][<font class="charliteral">'1'</font>] = 1426;
+00187
+00188 <font class="comment">/* According to BHS, zarqa and sinnor are both postpositive. However,</font>
+00189 <font class="comment"># the Michigan encoding uses one code for both. The Unicode zarqa</font>
+00190 <font class="comment"># (0x0598) is definitely NOT postpositive. And further, the shape of</font>
+00191 <font class="comment"># the symbol is different in BHS and Unicode. This needs further</font>
+00192 <font class="comment"># research to determine what's going on here. For now, we follow BHS</font>
+00193 <font class="comment"># and use the postpositive Unicode zinor for both accents.</font>
+00194 <font class="comment"> */</font>
+00195
+00196 subst2[3][<font class="charliteral">'2'</font>] = 1454;
+00197
+00198 <font class="comment">/* Pashta is postpositive, and the Unicode equivalent reflects</font>
+00199 <font class="comment"># this. However, there is a poetic equivalent -- azla legarmeh --</font>
+00200 <font class="comment"># which is not postpositive, but no equivalent code point exists in</font>
+00201 <font class="comment"># Unicode. The Michigan encoding does not distinguish between the two,</font>
+00202 <font class="comment"># although it could be algorithmically determined.</font>
+00203 <font class="comment"> */</font>
+00204
+00205 subst2[3][<font class="charliteral">'3'</font>] = 1433;
+00206 subst2[3][<font class="charliteral">'4'</font>] = 1449;
+00207 subst2[3][<font class="charliteral">'5'</font>] = 1472;
+00208
+00209
+00210 <font class="comment">/* This is the Unicode Hebrew *accent*; there is also another Hebrew</font>
+00211 <font class="comment"># *punctuation* called GERSHAYIM 0x05F4. I'm using the more</font>
+00212 <font class="comment"># traditional rounded marks, rather than the alternate straight</font>
+00213 <font class="comment"># marks.</font>
+00214 <font class="comment"> */</font>
+00215
+00216 subst2[8][<font class="charliteral">'2'</font>] = 1438;
+00217
+00218 <font class="comment">// Also known as azla</font>
+00219 subst2[8][<font class="charliteral">'3'</font>] = 1448;
+00220 subst2[8][<font class="charliteral">'4'</font>] = 1452;
+00221 subst2[8][<font class="charliteral">'5'</font>] = 1427;
+00222
+00223
+00224 subst[<font class="charliteral">'8'</font>] = 9;
+00225 subst2[9][<font class="charliteral">'0'</font>] = 1428;
+00226 subst2[9][<font class="charliteral">'1'</font>] = 1431;
+00227
+00228 <font class="comment">/* Note, this accent is actually sinnorit, but it does not exist as a</font>
+00229 <font class="comment"># separate glyph in the Unicode standard. The 'ZINOR' Unicode accent</font>
+00230 <font class="comment"># is postpositive, while sinnorit is not. ZARQA is as close as I can</font>
+00231 <font class="comment"># get to this.</font>
+00232 <font class="comment"> */</font>
+00233 subst2[9][<font class="charliteral">'2'</font>] = 1432;
+00234
+00235 <font class="comment">/* The Unicode form does not match the form used by BHS, but the names</font>
+00236 <font class="comment"># are the same.</font>
+00237 <font class="comment"> */</font>
+00238 subst2[9][<font class="charliteral">'3'</font>] = 1441;
+00239 subst2[9][<font class="charliteral">'4'</font>] = 1439;
+00240 subst2[9][<font class="charliteral">'5'</font>] = 1429;
+00241
+00242 subst[<font class="charliteral">'7'</font>] = 10;
+00243 subst2[10][<font class="charliteral">'0'</font>] = 1444;
+00244 subst2[10][<font class="charliteral">'1'</font>] = 1445;
+00245 subst2[10][<font class="charliteral">'2'</font>] = 1446;
+00246 subst2[10][<font class="charliteral">'3'</font>] = 1430; <font class="comment">// also '13', '73' also is used for majela</font>
+00247 subst2[10][<font class="charliteral">'4'</font>] = 1443;
+00248 subst2[10][<font class="charliteral">'5'</font>] = 1469; <font class="comment">// this is silluq; should appear to the left of the vowel</font>
+00249
+00250 subst[<font class="charliteral">'9'</font>] = 11;
+00251 subst2[11][<font class="charliteral">'1'</font>] = 1435;
+00252 subst2[11][<font class="charliteral">'2'</font>] = 1425;
+00253 subst2[11][<font class="charliteral">'3'</font>] = 1450;
+00254 subst2[11][<font class="charliteral">'4'</font>] = 1447;
+00255 subst2[11][<font class="charliteral">'5'</font>] = 1469; <font class="comment">// should appear to the right of the vowel</font>
+00256
+00257 }
+00258
+00259 <font class="comment">/*</font>
+00260 <font class="comment"></font>
+00261 <font class="comment"></font>
+00262 <font class="comment"># CANTILLATION MARKS</font>
+00263 <font class="comment"></font>
+00264 <font class="comment"> my $ETNAHTA = '&amp;#1425;';</font>
+00265 <font class="comment"># officially the Unicode name for this symbol was "SEGOL." However, that is</font>
+00266 <font class="comment"># not a unique name, conflicting with the vowel of the same name. Further,</font>
+00267 <font class="comment"># the position of the symbol is different. I have changed the name of the</font>
+00268 <font class="comment"># accent to "SEGOLTA," the traditional name for this accent.</font>
+00269 <font class="comment"> my $SEGOLTA = '&amp;#1426;';</font>
+00270 <font class="comment"> my $SHALSHELET = '&amp;#1427;';</font>
+00271 <font class="comment"> my $ZAQEF_QATAN = '&amp;#1428;';</font>
+00272 <font class="comment"> my $ZAQEF_GADOL = '&amp;#1429;';</font>
+00273 <font class="comment"> my $TIPEHA = '&amp;#1430;';</font>
+00274 <font class="comment"> my $REVIA = '&amp;#1431;';</font>
+00275 <font class="comment"> my $ZARQA = '&amp;#1432;';</font>
+00276 <font class="comment"> my $PASHTA = '&amp;#1433;';</font>
+00277 <font class="comment"> my $YETIV = '&amp;#1434;';</font>
+00278 <font class="comment"> my $TEVIR = '&amp;#1435;';</font>
+00279 <font class="comment"> my $GERESH = '&amp;#1436;';</font>
+00280 <font class="comment"> my $GERESH_MUQDAM = '&amp;#1437;';</font>
+00281 <font class="comment"> my $GERSHAYIM = '&amp;#1438;';</font>
+00282 <font class="comment"> my $QARNEY_PARA = '&amp;#1439;';</font>
+00283 <font class="comment"> my $TELISHA_GEDOLA = '&amp;#1440;';</font>
+00284 <font class="comment"> my $PAZER = '&amp;#1441;';</font>
+00285 <font class="comment"> my $MUNAH = '&amp;#1443;';</font>
+00286 <font class="comment"> my $MAHAPAKH = '&amp;#1444;';</font>
+00287 <font class="comment"> my $MERKHA = '&amp;#1445;';</font>
+00288 <font class="comment"> my $MERKHA_KEFULA = '&amp;#1446;';</font>
+00289 <font class="comment"> my $DARGA = '&amp;#1447;';</font>
+00290 <font class="comment"> my $QADMA = '&amp;#1448;';</font>
+00291 <font class="comment"> my $TELISHA_QETANA = '&amp;#1449;';</font>
+00292 <font class="comment"> my $YERAH_BEN_YOMO = '&amp;#1450;';</font>
+00293 <font class="comment"> my $OLE = '&amp;#1451;';</font>
+00294 <font class="comment"> my $ILUY = '&amp;#1452;';</font>
+00295 <font class="comment"> my $DEHI = '&amp;#1453;';</font>
+00296 <font class="comment"> my $ZINOR = '&amp;#1454;';</font>
+00297 <font class="comment"># HEBREW MARK</font>
+00298 <font class="comment"> my $MASORA_CIRCLE = '&amp;#1455;';</font>
+00299 <font class="comment"># HEBREW EXTENDED-A points and punctuation</font>
+00300 <font class="comment"> my $SHEVA = '&amp;#1456;';</font>
+00301 <font class="comment"> my $HATAF_SEGOL = '&amp;#1457;';</font>
+00302 <font class="comment"> my $HATAF_PATAH = '&amp;#1458;';</font>
+00303 <font class="comment"> my $HATAF_QAMATS = '&amp;#1459;';</font>
+00304 <font class="comment"> my $HIRIQ = '&amp;#1460;';</font>
+00305 <font class="comment"> my $TSERE = '&amp;#1461;';</font>
+00306 <font class="comment"> my $SEGOL = '&amp;#1462;';</font>
+00307 <font class="comment"># furtive Patah is not a distinct character</font>
+00308 <font class="comment"> my $PATAH = '&amp;#1463;';</font>
+00309 <font class="comment"> my $QAMATS = '&amp;#1464;';</font>
+00310 <font class="comment"> my $HOLAM = '&amp;#1465;';</font>
+00311 <font class="comment"> my $QUBUTS = '&amp;#1467;';</font>
+00312 <font class="comment"># also used as shuruq</font>
+00313 <font class="comment"># falls within the base letter</font>
+00314 <font class="comment"> my $DAGESH_OR_MAPIQ = '&amp;#1468;';</font>
+00315 <font class="comment"># also used as siluq</font>
+00316 <font class="comment"> my $METAG = '&amp;#1469;';</font>
+00317 <font class="comment"> my $MAQAF = '&amp;#1470;';</font>
+00318 <font class="comment"> my $RAFE = '&amp;#1471;';</font>
+00319 <font class="comment"># Also used for legarmeh</font>
+00320 <font class="comment"># may be treated as spacing punctuation, not as a point</font>
+00321 <font class="comment"> my $PASEQ = '&amp;#1472;';</font>
+00322 <font class="comment"> my $SHIN_DOT = '&amp;#1473;';</font>
+00323 <font class="comment"> my $SIN_DOT = '&amp;#1474;';</font>
+00324 <font class="comment"> my $SOF_PASUQ = '&amp;#1475;';</font>
+00325 <font class="comment"># HEBREW MARK</font>
+00326 <font class="comment"> my $UPPER_DOT = '&amp;#1476;';</font>
+00327 <font class="comment"># HEBREW LETTERS based on ISO 8859-8</font>
+00328 <font class="comment"># aleph</font>
+00329 <font class="comment"># x (alef symbol - 2135)</font>
+00330 <font class="comment"> my $ALEF = '&amp;#1488;';</font>
+00331 <font class="comment"># x (bet symbol - 2136)</font>
+00332 <font class="comment"> my $BET = '&amp;#1489;';</font>
+00333 <font class="comment"># x (gimel symbol - 2137)</font>
+00334 <font class="comment"> my $GIMEL = '&amp;#1490;';</font>
+00335 <font class="comment"># x (dalet symbol - 2138)</font>
+00336 <font class="comment"> my $DALET = '&amp;#1491;';</font>
+00337 <font class="comment"> my $HE = '&amp;#1492;';</font>
+00338 <font class="comment"> my $VAV = '&amp;#1493;';</font>
+00339 <font class="comment"> my $ZAYIN = '&amp;#1494;';</font>
+00340 <font class="comment"> my $HET = '&amp;#1495;';</font>
+00341 <font class="comment"> my $TET = '&amp;#1496;';</font>
+00342 <font class="comment"> my $YOD = '&amp;#1497;';</font>
+00343 <font class="comment"> my $FINAL_KAF = '&amp;#1498;';</font>
+00344 <font class="comment"> my $KAF = '&amp;#1499;';</font>
+00345 <font class="comment"> my $LAMED = '&amp;#1500;';</font>
+00346 <font class="comment"> my $FINAL_MEM = '&amp;#1501;';</font>
+00347 <font class="comment"> my $MEM = '&amp;#1502;';</font>
+00348 <font class="comment"> my $FINAL_NUN = '&amp;#1503;';</font>
+00349 <font class="comment"> my $NUN = '&amp;#1504;';</font>
+00350 <font class="comment"> my $SAMEKH = '&amp;#1505;';</font>
+00351 <font class="comment"> my $AYIN = '&amp;#1506;';</font>
+00352 <font class="comment"> my $FINAL_PE = '&amp;#1507;';</font>
+00353 <font class="comment"> my $PE = '&amp;#1508;';</font>
+00354 <font class="comment"> my $FINAL_TSADI = '&amp;#1509;';</font>
+00355 <font class="comment"># also known as zade</font>
+00356 <font class="comment"> my $TSADI = '&amp;#1510;';</font>
+00357 <font class="comment"> my $QOF = '&amp;#1511;';</font>
+00358 <font class="comment"> my $RESH = '&amp;#1512;';</font>
+00359 <font class="comment"> my $SHIN = '&amp;#1513;';</font>
+00360 <font class="comment"> my $TAV = '&amp;#1514;';</font>
+00361 <font class="comment"># Yiddish digraphs</font>
+00362 <font class="comment"># Hebrew Ligature</font>
+00363 <font class="comment"># tsvey vovn</font>
+00364 <font class="comment"> my $DOUBLE_VAV = '&amp;#1520;';</font>
+00365 <font class="comment"> my $VAV_YOD = '&amp;#1521;';</font>
+00366 <font class="comment"># tsvey yudn</font>
+00367 <font class="comment"> my $DOUBLE_YOD = '&amp;#1522;';</font>
+00368 <font class="comment"></font>
+00369 <font class="comment"># Additional punctuation</font>
+00370 <font class="comment"> my $PUNCT_GERESH = '&amp;#1523;';</font>
+00371 <font class="comment"> my $PUNCT_GERSHAYIM = '&amp;#1524;';</font>
+00372 <font class="comment"># Reserved: 0x05F5"</font>
+00373 <font class="comment"># x (hebrew point judeo-spanish varika - FB1E)</font>
+00374 <font class="comment">#my $JUDEO_SPANISH_VARIKA = pack("U",0xFB1E); # UTF-8 0xFB1E</font>
+00375 <font class="comment"></font>
+00376 <font class="comment">#############################</font>
+00377 <font class="comment"># End of Unicode 2.0 Hebrew #</font>
+00378 <font class="comment">#############################</font>
+00379 <font class="comment"></font>
+00380 <font class="comment"># A hash whose key is a Michigan code, and whose value is a Unicode</font>
+00381 <font class="comment"># equivalent</font>
+00382 <font class="comment"></font>
+00383 <font class="comment"> char subst[] = new char [255];</font>
+00384 <font class="comment"> subst[')'] = 1488;</font>
+00385 <font class="comment"> 'B' =&gt; $BET,</font>
+00386 <font class="comment"> 'G' =&gt; $GIMEL,</font>
+00387 <font class="comment"> 'D' =&gt; $DALET,</font>
+00388 <font class="comment"> 'H' =&gt; $HE,</font>
+00389 <font class="comment"> 'W' =&gt; $VAV,</font>
+00390 <font class="comment"> 'Z' =&gt; $ZAYIN,</font>
+00391 <font class="comment"> 'X' =&gt; $HET,</font>
+00392 <font class="comment"> '+' =&gt; $TET,</font>
+00393 <font class="comment"> 'Y' =&gt; $YOD,</font>
+00394 <font class="comment"> 'K' =&gt; $KAF,</font>
+00395 <font class="comment"> 'L' =&gt; $LAMED,</font>
+00396 <font class="comment"> 'M' =&gt; $MEM,</font>
+00397 <font class="comment"> 'N' =&gt; $NUN,</font>
+00398 <font class="comment"> 'S' =&gt; $SAMEKH,</font>
+00399 <font class="comment"> '(' =&gt; $AYIN,</font>
+00400 <font class="comment"> 'P' =&gt; $PE,</font>
+00401 <font class="comment"> 'C' =&gt; $TSADI,</font>
+00402 <font class="comment"> 'Q' =&gt; $QOF,</font>
+00403 <font class="comment"> 'R' =&gt; $RESH,</font>
+00404 <font class="comment"> '#' =&gt; $SHIN, # the letter shin without a point</font>
+00405 <font class="comment"> '&amp;' =&gt; ($SHIN . $SIN_DOT),</font>
+00406 <font class="comment"> '$' =&gt; ($SHIN . $SHIN_DOT), # '</font>
+00407 <font class="comment"> 'T' =&gt; $TAV,</font>
+00408 <font class="comment"># VOWELS</font>
+00409 <font class="comment"> 'A' =&gt; $PATAH,</font>
+00410 <font class="comment"> 'F' =&gt; $QAMATS,</font>
+00411 <font class="comment"> 'E' =&gt; $SEGOL,</font>
+00412 <font class="comment"> '"' =&gt; $TSERE,</font>
+00413 <font class="comment"> 'I' =&gt; $HIRIQ,</font>
+00414 <font class="comment"> 'O' =&gt; $HOLAM,</font>
+00415 <font class="comment"> 'U' =&gt; $QUBUTS,</font>
+00416 <font class="comment"> ':' =&gt; $SHEVA,</font>
+00417 <font class="comment"> ':A' =&gt; $HATAF_PATAH,</font>
+00418 <font class="comment"> ':E' =&gt; $HATAF_SEGOL,</font>
+00419 <font class="comment"> ':F' =&gt; $HATAF_QAMATS,</font>
+00420 <font class="comment"># OTHER DIACRITICS</font>
+00421 <font class="comment"> '.' =&gt; $DAGESH_OR_MAPIQ,</font>
+00422 <font class="comment"> '-' =&gt; $MAQAF,</font>
+00423 <font class="comment"> ',' =&gt; $RAFE,</font>
+00424 <font class="comment"># CANTILLATION</font>
+00425 <font class="comment"> '00' =&gt; $SOF_PASUQ,</font>
+00426 <font class="comment"> '01' =&gt; $SEGOLTA,</font>
+00427 <font class="comment"># According to BHS, zarqa and sinnor are both postpositive. However,</font>
+00428 <font class="comment"># the Michigan encoding uses one code for both. The Unicode zarqa</font>
+00429 <font class="comment"># (0x0598) is definitely NOT postpositive. And further, the shape of</font>
+00430 <font class="comment"># the symbol is different in BHS and Unicode. This needs further</font>
+00431 <font class="comment"># research to determine what's going on here. For now, we follow BHS</font>
+00432 <font class="comment"># and use the postpositive Unicode zinor for both accents.</font>
+00433 <font class="comment"> '02' =&gt; $ZINOR,</font>
+00434 <font class="comment"># Pashta is postpositive, and the Unicode equivalent reflects</font>
+00435 <font class="comment"># this. However, there is a poetic equivalent -- azla legarmeh --</font>
+00436 <font class="comment"># which is not postpositive, but no equivalent code point exists in</font>
+00437 <font class="comment"># Unicode. The Michigan encoding does not distinguish between the two,</font>
+00438 <font class="comment"># although it could be algorithmically determined.</font>
+00439 <font class="comment"> '03' =&gt; $PASHTA,</font>
+00440 <font class="comment"> '04' =&gt; $TELISHA_QETANA,</font>
+00441 <font class="comment"> '05' =&gt; $PASEQ,</font>
+00442 <font class="comment"> '10' =&gt; $YETIV,</font>
+00443 <font class="comment"># In the poetic books, prepositive dehi occurs; it's unclear whether</font>
+00444 <font class="comment"># tipeha also occurs in the poetic books. Otherwise, we could simply</font>
+00445 <font class="comment"># check for what book in the Tanach we are in. Michigan uses the same</font>
+00446 <font class="comment"># code for each.</font>
+00447 <font class="comment"> '13' =&gt; $TIPEHA, # also $DEHI</font>
+00448 <font class="comment"># This is the poetic accent mugrash, which also includes rebia, but is</font>
+00449 <font class="comment"># encoded separately as '81' in the Michigan text.</font>
+00450 <font class="comment"> '11' =&gt; $GERESH_MUQDAM,</font>
+00451 <font class="comment"> '14' =&gt; $TELISHA_GEDOLA,</font>
+00452 <font class="comment"># Telisha qetana is postpositive as in '04' above. However, Michigan</font>
+00453 <font class="comment"># code '24' is for a medial telisha. Graphically, there is no</font>
+00454 <font class="comment"># difference.</font>
+00455 <font class="comment"> '24' =&gt; $TELISHA_QETANA,</font>
+00456 <font class="comment"> '33' =&gt; $PASHTA,</font>
+00457 <font class="comment"># The Michigan code of telisha gedola in medial position. Graphically,</font>
+00458 <font class="comment"># there is no difference.</font>
+00459 <font class="comment"> '44' =&gt; $TELISHA_GEDOLA,</font>
+00460 <font class="comment"> '60' =&gt; $OLE,</font>
+00461 <font class="comment"> '61' =&gt; $GERESH,</font>
+00462 <font class="comment"># This is the Unicode Hebrew *accent*; there is also another Hebrew</font>
+00463 <font class="comment"># *punctuation* called GERSHAYIM 0x05F4. I'm using the more</font>
+00464 <font class="comment"># traditional rounded marks, rather than the alternate straight</font>
+00465 <font class="comment"># marks.</font>
+00466 <font class="comment"> '62' =&gt; $GERSHAYIM,</font>
+00467 <font class="comment"># Also known as azla</font>
+00468 <font class="comment"> '63' =&gt; $QADMA,</font>
+00469 <font class="comment"> '64' =&gt; $ILUY,</font>
+00470 <font class="comment"> '65' =&gt; $SHALSHELET,</font>
+00471 <font class="comment"> '80' =&gt; $ZAQEF_QATAN,</font>
+00472 <font class="comment"> '81' =&gt; $REVIA,</font>
+00473 <font class="comment"># Note, this accent is actually sinnorit, but it does not exist as a</font>
+00474 <font class="comment"># separate glyph in the Unicode standard. The 'ZINOR' Unicode accent</font>
+00475 <font class="comment"># is postpositive, while sinnorit is not. ZARQA is as close as I can</font>
+00476 <font class="comment"># get to this.</font>
+00477 <font class="comment"> '82' =&gt; $ZARQA,</font>
+00478 <font class="comment"># The Unicode form does not match the form used by BHS, but the names</font>
+00479 <font class="comment"># are the same.</font>
+00480 <font class="comment"> '83' =&gt; $PAZER,</font>
+00481 <font class="comment"> '84' =&gt; $QARNEY_PARA,</font>
+00482 <font class="comment"> '85' =&gt; $ZAQEF_GADOL,</font>
+00483 <font class="comment"># Note Michigan encoding distinguishes between medial metheg '35' (occurring</font>
+00484 <font class="comment"># on the left of the vowel), and the ordinary meteg '95' (occurring on the</font>
+00485 <font class="comment"># right of the vowel). It is also used for silluq.</font>
+00486 <font class="comment"> '35' =&gt; $METAG,</font>
+00487 <font class="comment"> '70' =&gt; $MAHAPAKH,</font>
+00488 <font class="comment"> '71' =&gt; $MERKHA,</font>
+00489 <font class="comment"> '72' =&gt; $MERKHA_KEFULA,</font>
+00490 <font class="comment"> '73' =&gt; $TIPEHA, # also '13', '73' also is used for majela</font>
+00491 <font class="comment"> '74' =&gt; $MUNAH,</font>
+00492 <font class="comment"> '75' =&gt; $METAG, # this is silluq; should appear to the left of the vowel</font>
+00493 <font class="comment"> '91' =&gt; $TEVIR,</font>
+00494 <font class="comment"> '92' =&gt; $ETNAHTA,</font>
+00495 <font class="comment"> '93' =&gt; $YERAH_BEN_YOMO,</font>
+00496 <font class="comment"> '94' =&gt; $DARGA,</font>
+00497 <font class="comment"> '95' =&gt; $METAG, # should appear to the right of the vowel</font>
+00498 <font class="comment"></font>
+00499 <font class="comment"># Not used by the Michigan Encoding</font>
+00500 <font class="comment"># $UPPER_DOT = '05C4';</font>
+00501 <font class="comment"> );</font>
+00502 <font class="comment"></font>
+00503 <font class="comment"># declare other variables</font>
+00504 <font class="comment"> my (@bhsLines,</font>
+00505 <font class="comment"> @bhsVerse,</font>
+00506 <font class="comment"> @entity_line) = ();</font>
+00507 <font class="comment"></font>
+00508 <font class="comment"> my ($i,</font>
+00509 <font class="comment"> $verse,</font>
+00510 <font class="comment"> $word,</font>
+00511 <font class="comment"> $character) = 0;</font>
+00512 <font class="comment"></font>
+00513 <font class="comment"> my ($element,</font>
+00514 <font class="comment"> $saveGuttural) = "";</font>
+00515 <font class="comment"></font>
+00516 <font class="comment"># read in a line</font>
+00517 <font class="comment"> while (&lt;&gt;) {</font>
+00518 <font class="comment"># Process one verse</font>
+00519 <font class="comment"># iterate over every character and change to XML decimal entity</font>
+00520 <font class="comment"> CHAR: for ( $i = 0; ($i &lt; scalar(@bhsVerse)); $i++) {</font>
+00521 <font class="comment"> # find and convert final kaf, mem, nun, pe, tsade</font>
+00522 <font class="comment"> ( # if final form</font>
+00523 <font class="comment"> $bhsVerse[$i] =~ /[KMNPC]/</font>
+00524 <font class="comment"> )</font>
+00525 <font class="comment"> &amp;&amp;</font>
+00526 <font class="comment"> (</font>
+00527 <font class="comment"> ( # whitespace or</font>
+00528 <font class="comment"> $bhsVerse[$i+1] =~ /[ \-?]/</font>
+00529 <font class="comment"> )</font>
+00530 <font class="comment"> ||</font>
+00531 <font class="comment"> ( # EOL or</font>
+00532 <font class="comment"> $i == ( scalar(@bhsVerse) - 1 )</font>
+00533 <font class="comment"> )</font>
+00534 <font class="comment"> ||</font>
+00535 <font class="comment"> ( # sof pasuq or</font>
+00536 <font class="comment"> ( $bhsVerse[$i+1] =~ /0/ ) &amp;&amp;</font>
+00537 <font class="comment"> ( $bhsVerse[$i+2] =~ /0/ )</font>
+00538 <font class="comment"> )</font>
+00539 <font class="comment"> ||</font>
+00540 <font class="comment"> ( # one accent followed by white, eol or</font>
+00541 <font class="comment"> (</font>
+00542 <font class="comment"> ( $bhsVerse[$i+1] =~ /\d/ ) &amp;&amp;</font>
+00543 <font class="comment"> ( $bhsVerse[$i+2] =~ /\d/ )</font>
+00544 <font class="comment"> ) &amp;&amp;</font>
+00545 <font class="comment"> (</font>
+00546 <font class="comment"> ( $bhsVerse[$i+3] =~ /[ \-?]/ ) ||</font>
+00547 <font class="comment"> ( $i == ( scalar(@bhsVerse) - 1 ) )</font>
+00548 <font class="comment"> )</font>
+00549 <font class="comment"> )</font>
+00550 <font class="comment"> ||</font>
+00551 <font class="comment"> ( # two accents followed by white, eol</font>
+00552 <font class="comment"> (</font>
+00553 <font class="comment"> ( $bhsVerse[$i+1] =~ /\d/ ) &amp;&amp;</font>
+00554 <font class="comment"> ( $bhsVerse[$i+2] =~ /\d/ ) &amp;&amp;</font>
+00555 <font class="comment"> ( $bhsVerse[$i+3] =~ /\d/ ) &amp;&amp;</font>
+00556 <font class="comment"> ( $bhsVerse[$i+4] =~ /\d/ )</font>
+00557 <font class="comment"> ) &amp;&amp;</font>
+00558 <font class="comment"> (</font>
+00559 <font class="comment"> ( $bhsVerse[$i+5] =~ /[ \-?]/ ) ||</font>
+00560 <font class="comment"> ( $i == ( scalar(@bhsVerse) - 1 ) )</font>
+00561 <font class="comment"> )</font>
+00562 <font class="comment"> )</font>
+00563 <font class="comment"> ||</font>
+00564 <font class="comment"> ( # followed by a vowel and white, eol, sof pasuq</font>
+00565 <font class="comment"> ( $bhsVerse[$i+1] =~ /[:F]/ ) &amp;&amp;</font>
+00566 <font class="comment"> ( # followed by</font>
+00567 <font class="comment"> ( $bhsVerse[$i+2] =~ /[ \-?]/ ) || # whitespace or</font>
+00568 <font class="comment"> ( $i == ( scalar(@bhsVerse) - 1 ) ) || # eol or</font>
+00569 <font class="comment"> ( # sof pasuq</font>
+00570 <font class="comment"> ( $bhsVerse[$i+2] =~ /0/ ) &amp;&amp;</font>
+00571 <font class="comment"> ( $bhsVerse[$i+3] =~ /0/ )</font>
+00572 <font class="comment"> )</font>
+00573 <font class="comment"> )</font>
+00574 <font class="comment"> )</font>
+00575 <font class="comment"> ) # end of what follows after final letter</font>
+00576 <font class="comment"> &amp;&amp;</font>
+00577 <font class="comment"> do {</font>
+00578 <font class="comment"> $bhsVerse[$i] =~ /K/ &amp;&amp; eval { push @entity_line,$FINAL_KAF; }</font>
+00579 <font class="comment"> &amp;&amp; next CHAR;</font>
+00580 <font class="comment"> $bhsVerse[$i] =~ /M/ &amp;&amp; eval { push @entity_line,$FINAL_MEM; }</font>
+00581 <font class="comment"> &amp;&amp; next CHAR;</font>
+00582 <font class="comment"> $bhsVerse[$i] =~ /N/ &amp;&amp; eval { push @entity_line,$FINAL_NUN; }</font>
+00583 <font class="comment"> &amp;&amp; next CHAR;</font>
+00584 <font class="comment"> $bhsVerse[$i] =~ /P/ &amp;&amp; eval { push @entity_line,$FINAL_PE; }</font>
+00585 <font class="comment"> &amp;&amp; next CHAR;</font>
+00586 <font class="comment"> $bhsVerse[$i] =~ /C/ &amp;&amp; eval { push @entity_line,$FINAL_TSADI; }</font>
+00587 <font class="comment"> &amp;&amp; next CHAR;</font>
+00588 <font class="comment"> };</font>
+00589 <font class="comment"> # find and convert "furtive patach"</font>
+00590 <font class="comment"> ( $bhsVerse[$i] =~ /A/ ) &amp;&amp; # If the letter is a patach</font>
+00591 <font class="comment"> ( $bhsVerse[$i-1] =~ /[)HX(]/ ) &amp;&amp; # and is preceded by a guttural</font>
+00592 <font class="comment"> ( ( $bhsVerse[$i-2] =~ /[AEFOU]/ ) || # and is preceded by a vowel</font>
+00593 <font class="comment"> ( ( $bhsVerse[$i-2] =~ /\./ ) &amp;&amp; # or by suruq</font>
+00594 <font class="comment"> ( $bhsVerse[$i-3] =~ /W/ ) ) || #</font>
+00595 <font class="comment"> ( ( $bhsVerse[$i-2] =~ /W/ ) &amp;&amp; # or by holem (written plene)</font>
+00596 <font class="comment"> ( $bhsVerse[$i-3] =~ /O/ ) ) || #</font>
+00597 <font class="comment"> ( ( $bhsVerse[$i-2] =~ /Y/ ) &amp;&amp; # or by hiriq-yod</font>
+00598 <font class="comment"> ( $bhsVerse[$i-3] =~ /I/ ) ) ) &amp;&amp;</font>
+00599 <font class="comment"> do {</font>
+00600 <font class="comment"> $saveGuttural = pop @entity_line; # snip off the guttural</font>
+00601 <font class="comment"> push @entity_line,$PATAH; # push on the patach</font>
+00602 <font class="comment"> push @entity_line,$saveGuttural; # push back on the guttural</font>
+00603 <font class="comment"> next CHAR;</font>
+00604 <font class="comment"> };</font>
+00605 <font class="comment"> # convert cantillation</font>
+00606 <font class="comment"> # since we have previously dealt with all other cases of</font>
+00607 <font class="comment"> # numbers, two digit patterns are all we have to search for</font>
+00608 <font class="comment"> $bhsVerse[$i] =~ /\d/ &amp;&amp; $bhsVerse[$i+1] =~ /\d/ &amp;&amp; do {</font>
+00609 <font class="comment"> push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]$bhsVerse[$i+1]"};</font>
+00610 <font class="comment"> $i++; # accents are two digits long, so advance past the 2nd digit</font>
+00611 <font class="comment"> next CHAR;</font>
+00612 <font class="comment"> };</font>
+00613 <font class="comment"> # convert katef vowels, which are two characters long</font>
+00614 <font class="comment"> $bhsVerse[$i] =~ /:/ &amp;&amp; $bhsVerse[$i+1] =~ /[AEF]/ &amp;&amp; do {</font>
+00615 <font class="comment"> push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]$bhsVerse[$i+1]"};</font>
+00616 <font class="comment"> $i++;</font>
+00617 <font class="comment"> next CHAR;</font>
+00618 <font class="comment"> };</font>
+00619 <font class="comment"> # convert everything else</font>
+00620 <font class="comment"> push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]"};</font>
+00621 <font class="comment"> } # end CHAR</font>
+00622 <font class="comment"># print the line to standard output with XML character-level encoding</font>
+00623 <font class="comment"># each character has the following format:</font>
+00624 <font class="comment"># &lt;c id="1kg1.verse#.word#.character#"&gt;&amp;#1234;&lt;/c&gt;</font>
+00625 <font class="comment"></font>
+00626 <font class="comment"># set up the verse element</font>
+00627 <font class="comment"> $word = 1;</font>
+00628 <font class="comment"> $character = 1;</font>
+00629 <font class="comment"> print "&lt;verse&gt;\n&lt;word&gt;\n";</font>
+00630 <font class="comment"># print each character element</font>
+00631 <font class="comment"># if there is a space, then close the word entity, open a new word</font>
+00632 <font class="comment"># entity, increment the word number, reset the character number to</font>
+00633 <font class="comment"># one.</font>
+00634 <font class="comment"> foreach $element (@entity_line) {</font>
+00635 <font class="comment"> if ( $element =~ " " ) {</font>
+00636 <font class="comment"> $word++;</font>
+00637 <font class="comment"> $character = 1;</font>
+00638 <font class="comment"> print "&lt;/word&gt;\n&lt;word&gt;\n";</font>
+00639 <font class="comment"> next;</font>
+00640 <font class="comment"> }</font>
+00641 <font class="comment"> print "&lt;c id=\"1kg1.$verse.$word.$character\"&gt;$element&lt;/c&gt;\n";</font>
+00642 <font class="comment"> $character++;</font>
+00643 <font class="comment"> }</font>
+00644 <font class="comment"># close the verse element</font>
+00645 <font class="comment"> print "&lt;/word&gt;&lt;/verse&gt;\n";</font>
+00646 <font class="comment"># reinitialize variables</font>
+00647 <font class="comment"> @bhsVerse = ();</font>
+00648 <font class="comment"> @entity_line = ();</font>
+00649 <font class="comment"> @bhsLines = ();</font>
+00650 <font class="comment"> } # end while</font>
+00651 <font class="comment"># close the XML document</font>
+00652 <font class="comment"> print "&lt;/body&gt;\n";</font>
+00653 <font class="comment"> */</font>
+</pre></div><hr><address align="right"><small>Generated on Thu Jun 20 22:12:59 2002 for The Sword Project by
+<a href="http://www.doxygen.org/index.html">
+<img src="doxygen.png" alt="doxygen" align="middle" border=0
+width=110 height=53></a>1.2.15 </small></address>
+</body>
+</html>