diff options
author | danglassey <danglassey> | 2002-08-14 09:57:17 +0000 |
---|---|---|
committer | danglassey <danglassey> | 2002-08-14 09:57:17 +0000 |
commit | c9458897ebbb739d8db83c80e06512d8a612f743 (patch) | |
tree | f8c5381045887e34388cc6b26cfccc254bf766dc /doc/api-documentation/html/hebrewmcim_8cpp-source.html | |
download | sword-sf-cvs-c9458897ebbb739d8db83c80e06512d8a612f743.tar.gz |
*** empty log message ***
Diffstat (limited to 'doc/api-documentation/html/hebrewmcim_8cpp-source.html')
-rw-r--r-- | doc/api-documentation/html/hebrewmcim_8cpp-source.html | 658 |
1 files changed, 658 insertions, 0 deletions
diff --git a/doc/api-documentation/html/hebrewmcim_8cpp-source.html b/doc/api-documentation/html/hebrewmcim_8cpp-source.html new file mode 100644 index 0000000..6ec69fa --- /dev/null +++ b/doc/api-documentation/html/hebrewmcim_8cpp-source.html @@ -0,0 +1,658 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"> +<html><head><meta http-equiv="Content-Type" content="text/html;charset=iso-8859-1"> +<title>hebrewmcim.cpp Source File</title> +<link href="doxygen.css" rel="stylesheet" type="text/css"> +</head><body> +<!-- Generated by Doxygen 1.2.15 --> +<center> +<a class="qindex" href="index.html">Main Page</a> <a class="qindex" href="namespaces.html">Namespace List</a> <a class="qindex" href="hierarchy.html">Class Hierarchy</a> <a class="qindex" href="classes.html">Alphabetical List</a> <a class="qindex" href="annotated.html">Compound List</a> <a class="qindex" href="files.html">File List</a> <a class="qindex" href="functions.html">Compound Members</a> </center> +<hr><h1>hebrewmcim.cpp</h1><div class="fragment"><pre>00001 +00011 <font class="preprocessor">#include <hebrewmcim.h></font> +00012 +00013 HebrewMCIM::HebrewMCIM() +00014 :<a class="code" href="class_s_w_input_method.html">SWInputMethod</a>() { +00015 +00016 init(); +00017 } +00018 +00019 +00020 <font class="keywordtype">int</font> *HebrewMCIM::translate(<font class="keywordtype">char</font> in) { +00021 <font class="keywordtype">int</font> retVal = 0; +00022 <font class="keyword">static</font> <font class="keywordtype">int</font> retString[5]; +00023 <font class="keywordtype">int</font> retStringIndex = 0; +00024 +00025 memset(retString, 0, 5); +00026 +00027 <font class="keywordflow">if</font> (getState() > 1) { +00028 <font class="keywordflow">if</font> (getState() >= 12) { <font class="comment">// serious issue with internal structure</font> +00029 setState(0); +00030 retString[retStringIndex++] = in; +00031 <font class="keywordflow">return</font> retString; +00032 } +00033 map<int, 
int>::iterator find = subst2[getState()].find(in); +00034 <font class="keywordflow">if</font> (find != subst2[getState()].end()) +00035 retVal = find->second; +00036 <font class="keywordflow">else</font> retVal = in; +00037 +00038 setState(0); +00039 retString[retStringIndex++] = retVal; +00040 <font class="keywordflow">return</font> retString; +00041 } +00042 <font class="keywordflow">else</font> { +00043 retVal = subst[in]; +00044 +00045 <font class="keywordflow">if</font> (retVal == 0) { +00046 setState(0); +00047 retString[retStringIndex++] = in; +00048 <font class="keywordflow">return</font> retString; +00049 } +00050 <font class="keywordflow">if</font> (retVal > 100) { +00051 setState(1); +00052 retString[retStringIndex++] = retVal; +00053 <font class="keywordflow">return</font> retString; +00054 } +00055 <font class="keywordflow">if</font> (retVal == 50) { <font class="comment">// multiChar</font> +00056 setState(1); +00057 <font class="keywordtype">int</font> *chars = multiChars[in]; +00058 <font class="keywordflow">if</font> (chars != 0) { +00059 retString[retStringIndex++] = chars[0]; +00060 retString[retStringIndex++] = chars[1]; +00061 <font class="keywordflow">return</font> retString; +00062 } +00063 } +00064 } +00065 setState(retVal); +00066 <font class="keywordflow">return</font> 0; +00067 } +00068 +00069 +00070 <font class="keywordtype">void</font> HebrewMCIM::init() { +00071 memset(subst, 0, 255); +00072 +00073 subst[<font class="charliteral">')'</font>] = 1488; +00074 subst[<font class="charliteral">'B'</font>] = 1489; +00075 subst[<font class="charliteral">'G'</font>] = 1490; +00076 subst[<font class="charliteral">'D'</font>] = 1491; +00077 subst[<font class="charliteral">'H'</font>] = 1492; +00078 subst[<font class="charliteral">'W'</font>] = 1493; +00079 subst[<font class="charliteral">'Z'</font>] = 1494; +00080 subst[<font class="charliteral">'X'</font>] = 1495; +00081 subst[<font class="charliteral">'+'</font>] = 1496; +00082 subst[<font 
class="charliteral">'Y'</font>] = 1497; +00083 +00084 subst[<font class="charliteral">'k'</font>] = 1498; <font class="comment">// finals</font> +00085 subst[<font class="charliteral">'m'</font>] = 1501; +00086 subst[<font class="charliteral">'n'</font>] = 1503; +00087 subst[<font class="charliteral">'c'</font>] = 1509; +00088 +00089 subst[<font class="charliteral">'P'</font>] = 1508; +00090 subst[<font class="charliteral">'K'</font>] = 1499; +00091 subst[<font class="charliteral">'L'</font>] = 1500; +00092 subst[<font class="charliteral">'M'</font>] = 1502; +00093 subst[<font class="charliteral">'N'</font>] = 1504; +00094 subst[<font class="charliteral">'S'</font>] = 1505; +00095 subst[<font class="charliteral">'('</font>] = 1506; +00096 subst[<font class="charliteral">'p'</font>] = 1507; +00097 subst[<font class="charliteral">'C'</font>] = 1510; +00098 subst[<font class="charliteral">'Q'</font>] = 1511; +00099 subst[<font class="charliteral">'R'</font>] = 1512; +00100 subst[<font class="charliteral">'#'</font>] = 1513; +00101 +00102 <font class="comment">// special multiChars</font> +00103 subst[<font class="charliteral">'&'</font>] = 50; +00104 subst[<font class="charliteral">'$'</font>] = 50; +00105 +00106 <font class="keyword">static</font> <font class="keywordtype">int</font> x[] = {1513, 1474}; +00107 multiChars[<font class="charliteral">'&'</font>] = x; +00108 <font class="keyword">static</font> <font class="keywordtype">int</font> y[] = {1513, 1473}; +00109 multiChars[<font class="charliteral">'$'</font>] = y; +00110 +00111 subst[<font class="charliteral">'T'</font>] = 1514; +00112 +00113 <font class="comment">// VOWELS</font> +00114 subst[<font class="charliteral">'A'</font>] = 1463; +00115 subst[<font class="charliteral">'F'</font>] = 1464; +00116 subst[<font class="charliteral">'E'</font>] = 1462; +00117 subst[<font class="charliteral">'"'</font>] = 1461; +00118 subst[<font class="charliteral">'I'</font>] = 1460; +00119 subst[<font 
class="charliteral">'O'</font>] = 1465; +00120 subst[<font class="charliteral">'U'</font>] = 1467; +00121 +00122 +00123 +00124 <font class="comment">// OTHER DIACRITICS</font> +00125 subst[<font class="charliteral">'.'</font>] = 1468; +00126 subst[<font class="charliteral">'-'</font>] = 1470; +00127 subst[<font class="charliteral">','</font>] = 1471; +00128 +00129 <font class="comment">// Compound input</font> +00130 +00131 <font class="comment">// CANTILLATION</font> +00132 +00133 subst[<font class="charliteral">':'</font>] = 2; +00134 subst2[2][<font class="charliteral">'A'</font>] = 1458; +00135 subst2[2][<font class="charliteral">'E'</font>] = 1457; +00136 subst2[2][<font class="charliteral">'F'</font>] = 1459; +00137 +00138 +00139 <font class="comment">/* Telisha qetana is postpositive as in '04' above. However, Michigan</font> +00140 <font class="comment"># code '24' is for a medial telisha. Graphically, there is no</font> +00141 <font class="comment"># difference.</font> +00142 <font class="comment"> */</font> +00143 subst[<font class="charliteral">'2'</font>] = 5; +00144 subst2[5][<font class="charliteral">'4'</font>] = 1449; +00145 +00146 +00147 <font class="comment">/* Note Michigan encoding distinguishes between medial metheg '35' (occuring</font> +00148 <font class="comment"># on the left of the vowel), and the ordinary meteg '95' (occuring on the</font> +00149 <font class="comment"># right of the vowel). It is also used for silluq.</font> +00150 <font class="comment"> */</font> +00151 subst[<font class="charliteral">'3'</font>] = 6; +00152 subst2[6][<font class="charliteral">'3'</font>] = 1433; +00153 subst2[6][<font class="charliteral">'5'</font>] = 1469; +00154 +00155 +00156 <font class="comment">/* The Michigan code of telisha gedola in medial position. 
Graphically,</font> +00157 <font class="comment"># there is no difference.</font> +00158 <font class="comment"> */</font> +00159 subst[<font class="charliteral">'4'</font>] = 7; +00160 subst2[7][<font class="charliteral">'4'</font>] = 1440; +00161 +00162 subst[<font class="charliteral">'6'</font>] = 8; +00163 subst2[8][<font class="charliteral">'0'</font>] = 1451; +00164 subst2[8][<font class="charliteral">'1'</font>] = 1436; +00165 +00166 subst[<font class="charliteral">'1'</font>] = 4; +00167 subst2[4][<font class="charliteral">'0'</font>] = 1434; +00168 +00169 <font class="comment">/* In the poetic books, prepositive dehi occurs; it's unclear whether</font> +00170 <font class="comment"># tipeha also occurs in the poetic books. Otherwise, we could simply</font> +00171 <font class="comment"># check for what book in the Tanach we are in. Michigan uses the same</font> +00172 <font class="comment"># code for each.</font> +00173 <font class="comment"> */</font> +00174 +00175 subst2[4][<font class="charliteral">'3'</font>] = 1430; +00176 +00177 <font class="comment">/* This is the poetic accent mugrash, which also includes rebia, but is</font> +00178 <font class="comment"># encoded separately as '81' in the Michigan text.</font> +00179 <font class="comment"> */</font> +00180 subst2[4][<font class="charliteral">'1'</font>] = 1437; +00181 subst2[4][<font class="charliteral">'4'</font>] = 1440; +00182 +00183 +00184 subst[<font class="charliteral">'0'</font>] = 3; +00185 subst2[3][<font class="charliteral">'0'</font>] = 1475; +00186 subst2[3][<font class="charliteral">'1'</font>] = 1426; +00187 +00188 <font class="comment">/* According to BHS, zarqa and sinnor are both postpositive. However,</font> +00189 <font class="comment"># the Michigan encoding uses one code for both. The Unicode zarqa</font> +00190 <font class="comment"># (0x0598) is definitely NOT postpositive. 
And further, the shape of</font> +00191 <font class="comment"># the symbol is different in BHS and Uniocde. This needs further</font> +00192 <font class="comment"># research to determine what's going on here. For now, we follow BHS</font> +00193 <font class="comment"># and use the postpositive Unicode zinor or both accents.</font> +00194 <font class="comment"> */</font> +00195 +00196 subst2[3][<font class="charliteral">'2'</font>] = 1454; +00197 +00198 <font class="comment">/* Pashta is postpositive, and the Unicode equivalent reflects</font> +00199 <font class="comment"># this. However, there is a poetic equivalent -- azla legarmeh --</font> +00200 <font class="comment"># which is not postpositive, but no equivalent code point exists in</font> +00201 <font class="comment"># Unicode. The Michigan encoding does not distinguish between the two,</font> +00202 <font class="comment"># although it could be algorithmically determined.</font> +00203 <font class="comment"> */</font> +00204 +00205 subst2[3][<font class="charliteral">'3'</font>] = 1433; +00206 subst2[3][<font class="charliteral">'4'</font>] = 1449; +00207 subst2[3][<font class="charliteral">'5'</font>] = 1472; +00208 +00209 +00210 <font class="comment">/* This is the Unicode Hebrew *accent*; there is also another Hebrew</font> +00211 <font class="comment"># *punctuation* called GERSHAYIM 0x05F4. 
I'm using the more</font> +00212 <font class="comment"># traditional rounded marks, rather than the alternate straight</font> +00213 <font class="comment"># marks.</font> +00214 <font class="comment"> */</font> +00215 +00216 subst2[8][<font class="charliteral">'2'</font>] = 1438; +00217 +00218 <font class="comment">// Also known as azla</font> +00219 subst2[8][<font class="charliteral">'3'</font>] = 1448; +00220 subst2[8][<font class="charliteral">'4'</font>] = 1452; +00221 subst2[8][<font class="charliteral">'5'</font>] = 1427; +00222 +00223 +00224 subst[<font class="charliteral">'8'</font>] = 9; +00225 subst2[9][<font class="charliteral">'0'</font>] = 1428; +00226 subst2[9][<font class="charliteral">'1'</font>] = 1431; +00227 +00228 <font class="comment">/* Note, this accent is actually sinnorit, but it does not exist as a</font> +00229 <font class="comment"># separate glyph in the Unicode standard. The 'ZINOR' Unicode accent</font> +00230 <font class="comment"># is postpositive, while sinnorit is not. 
ZARQA is as close as I can</font> +00231 <font class="comment"># get to this.</font> +00232 <font class="comment"> */</font> +00233 subst2[9][<font class="charliteral">'2'</font>] = 1432; +00234 +00235 <font class="comment">/* The Unicode form does not match the form used by BHS, but the names</font> +00236 <font class="comment"># are the same.</font> +00237 <font class="comment"> */</font> +00238 subst2[9][<font class="charliteral">'3'</font>] = 1441; +00239 subst2[9][<font class="charliteral">'4'</font>] = 1439; +00240 subst2[9][<font class="charliteral">'5'</font>] = 1429; +00241 +00242 subst[<font class="charliteral">'7'</font>] = 10; +00243 subst2[10][<font class="charliteral">'0'</font>] = 1444; +00244 subst2[10][<font class="charliteral">'1'</font>] = 1445; +00245 subst2[10][<font class="charliteral">'2'</font>] = 1446; +00246 subst2[10][<font class="charliteral">'3'</font>] = 1430; <font class="comment">// also '13', '73' also is used for majela</font> +00247 subst2[10][<font class="charliteral">'4'</font>] = 1443; +00248 subst2[10][<font class="charliteral">'5'</font>] = 1469; <font class="comment">// this is silluq; should appear to the left of the vowel</font> +00249 +00250 subst[<font class="charliteral">'9'</font>] = 11; +00251 subst2[11][<font class="charliteral">'1'</font>] = 1435; +00252 subst2[11][<font class="charliteral">'2'</font>] = 1425; +00253 subst2[11][<font class="charliteral">'3'</font>] = 1450; +00254 subst2[11][<font class="charliteral">'4'</font>] = 1447; +00255 subst2[11][<font class="charliteral">'5'</font>] = 1469; <font class="comment">// should appear to the right of the vowel</font> +00256 +00257 } +00258 +00259 <font class="comment">/*</font> +00260 <font class="comment"></font> +00261 <font class="comment"></font> +00262 <font class="comment"># CANTILLION MARKS</font> +00263 <font class="comment"></font> +00264 <font class="comment"> my $ETNAHTA = '&#1425;';</font> +00265 <font class="comment"># officially the Unicode name for 
this symbol was "SEGOL." However, that is</font> +00266 <font class="comment"># not a unique name, conflicting with the vowel of the same name. Further,</font> +00267 <font class="comment"># the position of the symbol is different. I have changed the name of the</font> +00268 <font class="comment"># accent to "SEGOLTA," the traditional name for this accent.</font> +00269 <font class="comment"> my $SEGOLTA = '&#1426;';</font> +00270 <font class="comment"> my $SHALSHELET = '&#1427;';</font> +00271 <font class="comment"> my $ZAQEF_QATAN = '&#1428;';</font> +00272 <font class="comment"> my $ZAQEF_GADOL = '&#1429;';</font> +00273 <font class="comment"> my $TIPEHA = '&#1430;';</font> +00274 <font class="comment"> my $REVIA = '&#1431;';</font> +00275 <font class="comment"> my $ZARQA = '&#1432;';</font> +00276 <font class="comment"> my $PASHTA = '&#1433;';</font> +00277 <font class="comment"> my $YETIV = '&#1434;';</font> +00278 <font class="comment"> my $TEVIR = '&#1435;';</font> +00279 <font class="comment"> my $GERESH = '&#1436;';</font> +00280 <font class="comment"> my $GERESH_MUQDAM = '&#1437;';</font> +00281 <font class="comment"> my $GERSHAYIM = '&#1438;';</font> +00282 <font class="comment"> my $QARNEY_PARA = '&#1439;';</font> +00283 <font class="comment"> my $TELISHA_GEDOLA = '&#1440;';</font> +00284 <font class="comment"> my $PAZER = '&#1441;';</font> +00285 <font class="comment"> my $MUNAH = '&#1443;';</font> +00286 <font class="comment"> my $MAHAPAKH = '&#1444;';</font> +00287 <font class="comment"> my $MERKHA = '&#1445;';</font> +00288 <font class="comment"> my $MERKHA_KEFULA = '&#1446;';</font> +00289 <font class="comment"> my $DARGA = '&#1447;';</font> +00290 <font class="comment"> my $QADMA = '&#1448;';</font> +00291 <font class="comment"> my $TELISHA_QETANA = '&#1449;';</font> +00292 <font class="comment"> my $YERAH_BEN_YOMO = '&#1450;';</font> +00293 <font class="comment"> my $OLE = '&#1451;';</font> +00294 <font class="comment"> my $ILUY = 
'&#1452;';</font> +00295 <font class="comment"> my $DEHI = '&#1453;';</font> +00296 <font class="comment"> my $ZINOR = '&#1454;';</font> +00297 <font class="comment"># HEBREW MARK</font> +00298 <font class="comment"> my $MASORA_CIRCLE = '&#1455;';</font> +00299 <font class="comment"># HEBREW EXTENDED-A points and punctuation</font> +00300 <font class="comment"> my $SHEVA = '&#1456;';</font> +00301 <font class="comment"> my $HATAF_SEGOL = '&#1457;';</font> +00302 <font class="comment"> my $HATAF_PATAH = '&#1458;';</font> +00303 <font class="comment"> my $HATAF_QAMATS = '&#1459;';</font> +00304 <font class="comment"> my $HIRIQ = '&#1460;';</font> +00305 <font class="comment"> my $TSERE = '&#1461;';</font> +00306 <font class="comment"> my $SEGOL = '&#1462;';</font> +00307 <font class="comment"># furtive Patah is not a distinct character</font> +00308 <font class="comment"> my $PATAH = '&#1463;';</font> +00309 <font class="comment"> my $QAMATS = '&#1464;';</font> +00310 <font class="comment"> my $HOLAM = '&#1465;';</font> +00311 <font class="comment"> my $QUBUTS = '&#1467;';</font> +00312 <font class="comment"># also used as shuruq</font> +00313 <font class="comment"># falls within the base letter</font> +00314 <font class="comment"> my $DAGESH_OR_MAPIQ = '&#1468;';</font> +00315 <font class="comment"># also used as siluq</font> +00316 <font class="comment"> my $METAG = '&#1469;';</font> +00317 <font class="comment"> my $MAQAF = '&#1470;';</font> +00318 <font class="comment"> my $RAFE = '&#1471;';</font> +00319 <font class="comment"># Also used for legarmeh</font> +00320 <font class="comment"># may be treated as spacing punctuation, not as a point</font> +00321 <font class="comment"> my $PASEQ = '&#1472;';</font> +00322 <font class="comment"> my $SHIN_DOT = '&#1473;';</font> +00323 <font class="comment"> my $SIN_DOT = '&#1474;';</font> +00324 <font class="comment"> my $SOF_PASUQ = '&#1475;';</font> +00325 <font class="comment"># HEBREW MARK</font> +00326 <font 
class="comment"> my $UPPER_DOT = '&#1476;';</font> +00327 <font class="comment"># HEBREW LETTERS based on ISO 8859-8</font> +00328 <font class="comment"># aleph</font> +00329 <font class="comment"># x (alef symbol - 2135)</font> +00330 <font class="comment"> my $ALEF = '&#1488;';</font> +00331 <font class="comment"># x (bet symbol - 2136)</font> +00332 <font class="comment"> my $BET = '&#1489;';</font> +00333 <font class="comment"># x (gimel symbol - 2137)</font> +00334 <font class="comment"> my $GIMEL = '&#1490;';</font> +00335 <font class="comment"># x (dalet symbol - 2138)</font> +00336 <font class="comment"> my $DALET = '&#1491;';</font> +00337 <font class="comment"> my $HE = '&#1492;';</font> +00338 <font class="comment"> my $VAV = '&#1493;';</font> +00339 <font class="comment"> my $ZAYIN = '&#1494;';</font> +00340 <font class="comment"> my $HET = '&#1495;';</font> +00341 <font class="comment"> my $TET = '&#1496;';</font> +00342 <font class="comment"> my $YOD = '&#1497;';</font> +00343 <font class="comment"> my $FINAL_KAF = '&#1498;';</font> +00344 <font class="comment"> my $KAF = '&#1499;';</font> +00345 <font class="comment"> my $LAMED = '&#1500;';</font> +00346 <font class="comment"> my $FINAL_MEM = '&#1501;';</font> +00347 <font class="comment"> my $MEM = '&#1502;';</font> +00348 <font class="comment"> my $FINAL_NUN = '&#1503;';</font> +00349 <font class="comment"> my $NUN = '&#1504;';</font> +00350 <font class="comment"> my $SAMEKH = '&#1505;';</font> +00351 <font class="comment"> my $AYIN = '&#1506;';</font> +00352 <font class="comment"> my $FINAL_PE = '&#1507;';</font> +00353 <font class="comment"> my $PE = '&#1508;';</font> +00354 <font class="comment"> my $FINAL_TSADI = '&#1509;';</font> +00355 <font class="comment"># also known as zade</font> +00356 <font class="comment"> my $TSADI = '&#1510;';</font> +00357 <font class="comment"> my $QOF = '&#1511;';</font> +00358 <font class="comment"> my $RESH = '&#1512;';</font> +00359 <font class="comment"> my 
$SHIN = '&#1513;';</font> +00360 <font class="comment"> my $TAV = '&#1514;';</font> +00361 <font class="comment"># Yiddish digraphs</font> +00362 <font class="comment"># Hebrew Ligature</font> +00363 <font class="comment"># tsvey vovn</font> +00364 <font class="comment"> my $DOUBLE_VAV = '&#1520;';</font> +00365 <font class="comment"> my $VAV_YOD = '&#1521;';</font> +00366 <font class="comment"># tsvey yudn</font> +00367 <font class="comment"> my $DOUBLE_YOD = '&#1522;';</font> +00368 <font class="comment"></font> +00369 <font class="comment"># Additional punctuation</font> +00370 <font class="comment"> my $PUNCT_GERESH = '&#1523;';</font> +00371 <font class="comment"> my $PUNCT_GERSHAYIM = '&#1524;';</font> +00372 <font class="comment"># Reserved: 0x05F5"</font> +00373 <font class="comment"># x (hebrew point judeo-spanish varika - FB1E)</font> +00374 <font class="comment">#my $JUDEO_SPANISH_VARIKA = pack("U",0xFB1E); # UTF-8 OxFB1E</font> +00375 <font class="comment"></font> +00376 <font class="comment">#############################</font> +00377 <font class="comment"># End of Unicode 2.0 Hebrew #</font> +00378 <font class="comment">#############################</font> +00379 <font class="comment"></font> +00380 <font class="comment"># A hash whose key is a Michagan code, and whose value is a Unicode</font> +00381 <font class="comment"># equvalent</font> +00382 <font class="comment"></font> +00383 <font class="comment"> char subst[] = new char [255];</font> +00384 <font class="comment"> subst[')'] = 1488;</font> +00385 <font class="comment"> 'B' => $BET,</font> +00386 <font class="comment"> 'G' => $GIMEL,</font> +00387 <font class="comment"> 'D' => $DALET,</font> +00388 <font class="comment"> 'H' => $HE,</font> +00389 <font class="comment"> 'W' => $VAV,</font> +00390 <font class="comment"> 'Z' => $ZAYIN,</font> +00391 <font class="comment"> 'X' => $HET,</font> +00392 <font class="comment"> '+' => $TET,</font> +00393 <font class="comment"> 'Y' => $YOD,</font> 
+00394 <font class="comment"> 'K' => $KAF,</font> +00395 <font class="comment"> 'L' => $LAMED,</font> +00396 <font class="comment"> 'M' => $MEM,</font> +00397 <font class="comment"> 'N' => $NUN,</font> +00398 <font class="comment"> 'S' => $SAMEKH,</font> +00399 <font class="comment"> '(' => $AYIN,</font> +00400 <font class="comment"> 'P' => $PE,</font> +00401 <font class="comment"> 'C' => $TSADI,</font> +00402 <font class="comment"> 'Q' => $QOF,</font> +00403 <font class="comment"> 'R' => $RESH,</font> +00404 <font class="comment"> '#' => $SHIN, # the letter shin without a point</font> +00405 <font class="comment"> '&' => ($SHIN . $SIN_DOT),</font> +00406 <font class="comment"> '$' => ($SHIN . $SHIN_DOT), # '</font> +00407 <font class="comment"> 'T' => $TAV,</font> +00408 <font class="comment"># VOWELS</font> +00409 <font class="comment"> 'A' => $PATAH,</font> +00410 <font class="comment"> 'F' => $QAMATS,</font> +00411 <font class="comment"> 'E' => $SEGOL,</font> +00412 <font class="comment"> '"' => $TSERE,</font> +00413 <font class="comment"> 'I' => $HIRIQ,</font> +00414 <font class="comment"> 'O' => $HOLAM,</font> +00415 <font class="comment"> 'U' => $QUBUTS,</font> +00416 <font class="comment"> ':' => $SHEVA,</font> +00417 <font class="comment"> ':A' => $HATAF_PATAH,</font> +00418 <font class="comment"> ':E' => $HATAF_SEGOL,</font> +00419 <font class="comment"> ':F' => $HATAF_QAMATS,</font> +00420 <font class="comment"># OTHER DIACRITICS</font> +00421 <font class="comment"> '.' => $DAGESH_OR_MAPIQ,</font> +00422 <font class="comment"> '-' => $MAQAF,</font> +00423 <font class="comment"> ',' => $RAFE,</font> +00424 <font class="comment"># CANTILLATION</font> +00425 <font class="comment"> '00' => $SOF_PASUQ,</font> +00426 <font class="comment"> '01' => $SEGOLTA,</font> +00427 <font class="comment"># According to BHS, zarqa and sinnor are both postpositive. However,</font> +00428 <font class="comment"># the Michigan encoding uses one code for both. 
The Unicode zarqa</font> +00429 <font class="comment"># (0x0598) is definitely NOT postpositive. And further, the shape of</font> +00430 <font class="comment"># the symbol is different in BHS and Uniocde. This needs further</font> +00431 <font class="comment"># research to determine what's going on here. For now, we follow BHS</font> +00432 <font class="comment"># and use the postpositive Unicode zinor or both accents.</font> +00433 <font class="comment"> '02' => $ZINOR,</font> +00434 <font class="comment"># Pashta is postpositive, and the Unicode equivalent reflects</font> +00435 <font class="comment"># this. However, there is a poetic equivalent -- azla legarmeh --</font> +00436 <font class="comment"># which is not postpositive, but no equivalent code point exists in</font> +00437 <font class="comment"># Unicode. The Michigan encoding does not distinguish between the two,</font> +00438 <font class="comment"># although it could be algorithmically determined.</font> +00439 <font class="comment"> '03' => $PASHTA,</font> +00440 <font class="comment"> '04' => $TELISHA_QETANA,</font> +00441 <font class="comment"> '05' => $PASEQ,</font> +00442 <font class="comment"> '10' => $YETIV,</font> +00443 <font class="comment"># In the poetic books, prepositive dehi occurs; it's unclear whether</font> +00444 <font class="comment"># tipeha also occurs in the poetic books. Otherwise, we could simply</font> +00445 <font class="comment"># check for what book in the Tanach we are in. 
Michigan uses the same</font> +00446 <font class="comment"># code for each.</font> +00447 <font class="comment"> '13' => $TIPEHA, # also $DEHI</font> +00448 <font class="comment"># This is the poetic accent mugrash, which also includes rebia, but is</font> +00449 <font class="comment"># encoded separately as '81' in the Michigan text.</font> +00450 <font class="comment"> '11' => $GERESH_MUQDAM,</font> +00451 <font class="comment"> '14' => $TELISHA_GEDOLA,</font> +00452 <font class="comment"># Telisha qetana is postpositive as in '04' above. However, Michigan</font> +00453 <font class="comment"># code '24' is for a medial telisha. Graphically, there is no</font> +00454 <font class="comment"># difference.</font> +00455 <font class="comment"> '24' => $TELISHA_QETANA,</font> +00456 <font class="comment"> '33' => $PASHTA,</font> +00457 <font class="comment"># The Michigan code of telisha gedola in medial position. Graphically,</font> +00458 <font class="comment"># there is no difference.</font> +00459 <font class="comment"> '44' => $TELISHA_GEDOLA,</font> +00460 <font class="comment"> '60' => $OLE,</font> +00461 <font class="comment"> '61' => $GERESH,</font> +00462 <font class="comment"># This is the Unicode Hebrew *accent*; there is also another Hebrew</font> +00463 <font class="comment"># *punctuation* called GERSHAYIM 0x05F4. 
I'm using the more</font> +00464 <font class="comment"># traditional rounded marks, rather than the alternate straight</font> +00465 <font class="comment"># marks.</font> +00466 <font class="comment"> '62' => $GERSHAYIM,</font> +00467 <font class="comment"># Also known as azla</font> +00468 <font class="comment"> '63' => $QADMA,</font> +00469 <font class="comment"> '64' => $ILUY,</font> +00470 <font class="comment"> '65' => $SHALSHELET,</font> +00471 <font class="comment"> '80' => $ZAQEF_QATAN,</font> +00472 <font class="comment"> '81' => $REVIA,</font> +00473 <font class="comment"># Note, this accent is actually sinnorit, but it does not exist as a</font> +00474 <font class="comment"># separate glyph in the Unicode standard. The 'ZINOR' Unicode accent</font> +00475 <font class="comment"># is postpositive, while sinnorit is not. ZARQA is as close as I can</font> +00476 <font class="comment"># get to this.</font> +00477 <font class="comment"> '82' => $ZARQA,</font> +00478 <font class="comment"># The Unicode form does not match the form used by BHS, but the names</font> +00479 <font class="comment"># are the same.</font> +00480 <font class="comment"> '83' => $PAZER,</font> +00481 <font class="comment"> '84' => $QARNEY_PARA,</font> +00482 <font class="comment"> '85' => $ZAQEF_GADOL,</font> +00483 <font class="comment"># Note Michigan encoding distinguishes between medial metheg '35' (occuring</font> +00484 <font class="comment"># on the left of the vowel), and the ordinary meteg '95' (occuring on the</font> +00485 <font class="comment"># right of the vowel). 
It is also used for silluq.</font> +00486 <font class="comment"> '35' => $METAG,</font> +00487 <font class="comment"> '70' => $MAHAPAKH,</font> +00488 <font class="comment"> '71' => $MERKHA,</font> +00489 <font class="comment"> '72' => $MERKHA_KEFULA,</font> +00490 <font class="comment"> '73' => $TIPEHA, # also '13', '73' also is used for majela</font> +00491 <font class="comment"> '74' => $MUNAH,</font> +00492 <font class="comment"> '75' => $METAG, # this is silluq; should appear to the left of the vowel</font> +00493 <font class="comment"> '91' => $TEVIR,</font> +00494 <font class="comment"> '92' => $ETNAHTA,</font> +00495 <font class="comment"> '93' => $YERAH_BEN_YOMO,</font> +00496 <font class="comment"> '94' => $DARGA,</font> +00497 <font class="comment"> '95' => $METAG, # should appear to the right of the vowel</font> +00498 <font class="comment"></font> +00499 <font class="comment"># Not used by the Michigan Encoding</font> +00500 <font class="comment"># $UPPER_DOT = '05C4';</font> +00501 <font class="comment"> );</font> +00502 <font class="comment"></font> +00503 <font class="comment"># declare other variables</font> +00504 <font class="comment"> my (@bhsLines,</font> +00505 <font class="comment"> @bhsVerse,</font> +00506 <font class="comment"> @entity_line) = ();</font> +00507 <font class="comment"></font> +00508 <font class="comment"> my ($i,</font> +00509 <font class="comment"> $verse,</font> +00510 <font class="comment"> $word,</font> +00511 <font class="comment"> $character) = 0;</font> +00512 <font class="comment"></font> +00513 <font class="comment"> my ($element,</font> +00514 <font class="comment"> $saveGuttural) = "";</font> +00515 <font class="comment"></font> +00516 <font class="comment"># read in a line</font> +00517 <font class="comment"> while (<>) {</font> +00518 <font class="comment"># Process one verse</font> +00519 <font class="comment"># iterate over every character and change to XML decimal entity</font> +00520 <font class="comment"> 
CHAR: for ( $i = 0; ($i < scalar(@bhsVerse)); $i++) {</font> +00521 <font class="comment"> # find and convert final kaf, mem, nun, pe, tsade</font> +00522 <font class="comment"> ( # if final form</font> +00523 <font class="comment"> $bhsVerse[$i] =~ /[KMNPC]/</font> +00524 <font class="comment"> )</font> +00525 <font class="comment"> &&</font> +00526 <font class="comment"> (</font> +00527 <font class="comment"> ( # whitespace or</font> +00528 <font class="comment"> $bhsVerse[$i+1] =~ /[ \-?]/</font> +00529 <font class="comment"> )</font> +00530 <font class="comment"> ||</font> +00531 <font class="comment"> ( # EOL or</font> +00532 <font class="comment"> $i == ( scalar(@bhsVerse) - 1 )</font> +00533 <font class="comment"> )</font> +00534 <font class="comment"> ||</font> +00535 <font class="comment"> ( # sof pasuq or</font> +00536 <font class="comment"> ( $bhsVerse[$i+1] =~ /0/ ) &&</font> +00537 <font class="comment"> ( $bhsVerse[$i+2] =~ /0/ )</font> +00538 <font class="comment"> )</font> +00539 <font class="comment"> ||</font> +00540 <font class="comment"> ( # one accent followed by white, eol or</font> +00541 <font class="comment"> (</font> +00542 <font class="comment"> ( $bhsVerse[$i+1] =~ /\d/ ) &&</font> +00543 <font class="comment"> ( $bhsVerse[$i+2] =~ /\d/ )</font> +00544 <font class="comment"> ) &&</font> +00545 <font class="comment"> (</font> +00546 <font class="comment"> ( $bhsVerse[$i+3] =~ /[ \-?]/ ) ||</font> +00547 <font class="comment"> ( $i == ( scalar(@bhsVerse) - 1 ) )</font> +00548 <font class="comment"> )</font> +00549 <font class="comment"> )</font> +00550 <font class="comment"> ||</font> +00551 <font class="comment"> ( # two accents followed by white, eol</font> +00552 <font class="comment"> (</font> +00553 <font class="comment"> ( $bhsVerse[$i+1] =~ /\d/ ) &&</font> +00554 <font class="comment"> ( $bhsVerse[$i+2] =~ /\d/ ) &&</font> +00555 <font class="comment"> ( $bhsVerse[$i+3] =~ /\d/ ) &&</font> +00556 <font class="comment"> ( 
$bhsVerse[$i+4] =~ /\d/ )</font> +00557 <font class="comment"> ) &&</font> +00558 <font class="comment"> (</font> +00559 <font class="comment"> ( $bhsVerse[$i+5] =~ /[ \-?]/ ) ||</font> +00560 <font class="comment"> ( $i == ( scalar(@bhsVerse) - 1 ) )</font> +00561 <font class="comment"> )</font> +00562 <font class="comment"> )</font> +00563 <font class="comment"> ||</font> +00564 <font class="comment"> ( # followed by a vowel and white, eol, sof pasuq</font> +00565 <font class="comment"> ( $bhsVerse[$i+1] =~ /[:F]/ ) &&</font> +00566 <font class="comment"> ( # followed by</font> +00567 <font class="comment"> ( $bhsVerse[$i+2] =~ /[ \-?]/ ) || # whitespace or</font> +00568 <font class="comment"> ( $i == ( scalar(@bhsVerse) - 1 ) ) || # eol or</font> +00569 <font class="comment"> ( # sof pasuq</font> +00570 <font class="comment"> ( $bhsVerse[$i+2] =~ /0/ ) &&</font> +00571 <font class="comment"> ( $bhsVerse[$i+3] =~ /0/ )</font> +00572 <font class="comment"> )</font> +00573 <font class="comment"> )</font> +00574 <font class="comment"> )</font> +00575 <font class="comment"> ) # end of what follows after final letter</font> +00576 <font class="comment"> &&</font> +00577 <font class="comment"> do {</font> +00578 <font class="comment"> $bhsVerse[$i] =~ /K/ && eval { push @entity_line,$FINAL_KAF; }</font> +00579 <font class="comment"> && next CHAR;</font> +00580 <font class="comment"> $bhsVerse[$i] =~ /M/ && eval { push @entity_line,$FINAL_MEM; }</font> +00581 <font class="comment"> && next CHAR;</font> +00582 <font class="comment"> $bhsVerse[$i] =~ /N/ && eval { push @entity_line,$FINAL_NUN; }</font> +00583 <font class="comment"> && next CHAR;</font> +00584 <font class="comment"> $bhsVerse[$i] =~ /P/ && eval { push @entity_line,$FINAL_PE; }</font> +00585 <font class="comment"> && next CHAR;</font> +00586 <font class="comment"> $bhsVerse[$i] =~ /C/ && eval { push @entity_line,$FINAL_TSADI; }</font> +00587 <font class="comment"> && next CHAR;</font> +00588 <font 
class="comment"> };</font> +00589 <font class="comment"> # find and convert "furtive patach"</font> +00590 <font class="comment"> ( $bhsVerse[$i] =~ /A/ ) && # If the letter is a patach</font> +00591 <font class="comment"> ( $bhsVerse[$i-1] =~ /[)HX(]/ ) && # and is preceded by a guttural</font> +00592 <font class="comment"> ( ( $bhsVerse[$i-2] =~ /[AEFOU]/ ) || # and is preceded by a vowel</font> +00593 <font class="comment"> ( ( $bhsVerse[$i-2] =~ /\./ ) && # or by suruq</font> +00594 <font class="comment"> ( $bhsVerse[$i-3] =~ /W/ ) ) || #</font> +00595 <font class="comment"> ( ( $bhsVerse[$i-2] =~ /W/ ) && # or by holem (written plene)</font> +00596 <font class="comment"> ( $bhsVerse[$i-3] =~ /O/ ) ) || #</font> +00597 <font class="comment"> ( ( $bhsVerse[$i-2] =~ /Y/ ) && # or by hiriq-yod</font> +00598 <font class="comment"> ( $bhsVerse[$i-3] =~ /I/ ) ) ) &&</font> +00599 <font class="comment"> do {</font> +00600 <font class="comment"> $saveGuttural = pop @entity_line; # snip off the guttural</font> +00601 <font class="comment"> push @entity_line,$PATAH; # push on the patach</font> +00602 <font class="comment"> push @entity_line,$saveGuttural; # push back on the guttural</font> +00603 <font class="comment"> next CHAR;</font> +00604 <font class="comment"> };</font> +00605 <font class="comment"> # convert cantillation</font> +00606 <font class="comment"> # since we have previously dealt with all other cases of</font> +00607 <font class="comment"> # numbers, two digit patterns are all we have to search for</font> +00608 <font class="comment"> $bhsVerse[$i] =~ /\d/ && $bhsVerse[$i+1] =~ /\d/ && do {</font> +00609 <font class="comment"> push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]$bhsVerse[$i+1]"};</font> +00610 <font class="comment"> $i++; # accents are two digits long, so advance past the 2nd digit</font> +00611 <font class="comment"> next CHAR;</font> +00612 <font class="comment"> };</font> +00613 <font class="comment"> # convert katef vowels, which
are two characters long</font> +00614 <font class="comment"> $bhsVerse[$i] =~ /:/ && $bhsVerse[$i+1] =~ /[AEF]/ && do {</font> +00615 <font class="comment"> push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]$bhsVerse[$i+1]"};</font> +00616 <font class="comment"> $i++;</font> +00617 <font class="comment"> next CHAR;</font> +00618 <font class="comment"> };</font> +00619 <font class="comment"> # convert everything else</font> +00620 <font class="comment"> push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]"};</font> +00621 <font class="comment"> } # end CHAR</font> +00622 <font class="comment"># print the line to standard output with XML character-level encoding</font> +00623 <font class="comment"># each character has the following format:</font> +00624 <font class="comment"># <c id="1kg1.verse#.word#.character#">&#1234;</c></font> +00625 <font class="comment"></font> +00626 <font class="comment"># set up the verse element</font> +00627 <font class="comment"> $word = 1;</font> +00628 <font class="comment"> $character = 1;</font> +00629 <font class="comment"> print "<verse>\n<word>\n";</font> +00630 <font class="comment"># print each character element</font> +00631 <font class="comment"># if there is a space, then close the word entity, open a new word</font> +00632 <font class="comment"># entity, increment the word number, reset the character number to</font> +00633 <font class="comment"># one.</font> +00634 <font class="comment"> foreach $element (@entity_line) {</font> +00635 <font class="comment"> if ( $element =~ " " ) {</font> +00636 <font class="comment"> $word++;</font> +00637 <font class="comment"> $character = 1;</font> +00638 <font class="comment"> print "</word>\n<word>\n";</font> +00639 <font class="comment"> next;</font> +00640 <font class="comment"> }</font> +00641 <font class="comment"> print "<c id=\"1kg1.$verse.$word.$character\">$element</c>\n";</font> +00642 <font class="comment"> $character++;</font> +00643 <font class="comment"> }</font> 
+00644 <font class="comment"># close the verse element</font> +00645 <font class="comment"> print "</word></verse>\n";</font> +00646 <font class="comment"># reinitialize variables</font> +00647 <font class="comment"> @bhsVerse = ();</font> +00648 <font class="comment"> @entity_line = ();</font> +00649 <font class="comment"> @bhsLines = ();</font> +00650 <font class="comment"> } # end while</font> +00651 <font class="comment"># close the XML document</font> +00652 <font class="comment"> print "</body>\n";</font> +00653 <font class="comment"> */</font> +</pre></div><hr><address align="right"><small>Generated on Thu Jun 20 22:12:59 2002 for The Sword Project by +<a href="http://www.doxygen.org/index.html"> +<img src="doxygen.png" alt="doxygen" align="middle" border=0 +width=110 height=53></a>1.2.15 </small></address> +</body> +</html> |