diff options
author | Martin Gruner <mg.pub@gmx.net> | 2005-10-07 19:46:52 +0000 |
---|---|---|
committer | Martin Gruner <mg.pub@gmx.net> | 2005-10-07 19:46:52 +0000 |
commit | 57a25c7dbdd9127c9f0b6b0e019095fbc5430598 (patch) | |
tree | 25e97abd4eb26f92fae649416075740a6e18f3b1 /modules/mt-lxx-parallel | |
parent | 87f638c6ee7f7f1bf27fffb50a60f765c40d6983 (diff) | |
download | sword-tools-57a25c7dbdd9127c9f0b6b0e019095fbc5430598.tar.gz |
update. still lots to do....
git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@45 07627401-56e2-0310-80f4-f8cd0041bdcd
Diffstat (limited to 'modules/mt-lxx-parallel')
-rw-r--r-- | modules/mt-lxx-parallel/convert.pl | 388 |
1 files changed, 386 insertions, 2 deletions
# modules/mt-lxx-parallel/convert.pl
#
# Converts lines of the parallel aligned Hebrew (MT) / Greek (LXX) text into
# OSIS table rows.  Everything before "use strict;" in the file is outside
# this view and is left untouched.

use strict;
use utf8;    # the note texts below contain literal non-ASCII characters;
             # without this pragma they are byte strings and would be
             # double-encoded when printed to the :utf8 STDOUT layer

binmode(STDOUT, ":utf8"); #see "man perluniintro"

my $prefix = "parallel/";

# Markup wrapped around every morphological segment of a Hebrew word.
my $MorphologicalSegmentStart   = "<seg type=\"morph\">";
my $MorphologicalSegmentEnd     = "</seg>";
my $MorphologicalDivisionMarker = $MorphologicalSegmentEnd . $MorphologicalSegmentStart;

# Characters that may start a transliterated Hebrew word.  The value is
# interpolated into regexp character classes below, so "A-Z" is a range.
my $hebrewLetters = 'A-Z()+#$*&/';

# Transliteration -> Unicode for the Hebrew columns.
my %hebrew2utf8 = (
    ")"  => chr(0x05D0),              #HEBREW LETTER ALEF
    "B"  => chr(0x05D1),              #HEBREW LETTER BET
    "G"  => chr(0x05D2),              #HEBREW LETTER GIMEL
    "D"  => chr(0x05D3),              #HEBREW LETTER DALET
    "H"  => chr(0x05D4),              #HEBREW LETTER HE
    "W"  => chr(0x05D5),              #HEBREW LETTER VAV
    "Z"  => chr(0x05D6),              #HEBREW LETTER ZAYIN
    "X"  => chr(0x05D7),              #HEBREW LETTER HET
    "+"  => chr(0x05D8),              #HEBREW LETTER TET
    "Y"  => chr(0x05D9),              #HEBREW LETTER YOD
    #"K" => chr(0x05DA),              #HEBREW LETTER FINAL KAF # TODO: HANDLE FINAL LETTERS
    "K"  => chr(0x05DB),              #HEBREW LETTER KAF
    "L"  => chr(0x05DC),              #HEBREW LETTER LAMED
    #chr(0x6D) => chr(0x05DD),        #HEBREW LETTER FINAL MEM
    "M"  => chr(0x05DE),              #HEBREW LETTER MEM
    #"N" => chr(0x05DF),              #HEBREW LETTER FINAL NUN
    "N"  => chr(0x05E0),              #HEBREW LETTER NUN
    "S"  => chr(0x05E1),              #HEBREW LETTER SAMEKH
    "("  => chr(0x05E2),              #HEBREW LETTER AYIN
    #"P" => chr(0x05E3),              #HEBREW LETTER FINAL PE
    "P"  => chr(0x05E4),              #HEBREW LETTER PE
    #"C" => chr(0x05E5),              #HEBREW LETTER FINAL TSADI
    "C"  => chr(0x05E6),              #HEBREW LETTER TSADI
    "Q"  => chr(0x05E7),              #HEBREW LETTER QOF / KOF
    "R"  => chr(0x05E8),              #HEBREW LETTER RESH
    "#"  => chr(0x05E9),              #HEBREW LETTER SHIN == SIN/SHIN without dot
    "\$" => chr(0x05E9).chr(0x05C1),  #HEBREW LETTER SHIN + SHIN DOT == SHIN
    "&"  => chr(0x05E9).chr(0x05C2),  #HEBREW LETTER SHIN + SIN DOT == SIN
    "T"  => chr(0x05EA),              #HEBREW LETTER TAV
    "-"  => chr(0x05BE),              #MAQQEF

    # Special stuff
    "*"  => "(ketiv)",                # TODO:FIX
    "**" => "(qere)",                 # two-character key, tried before "*"
    "/"  => $MorphologicalDivisionMarker,

    ","  => ",",                      #separate words in colB

    "{"  => "{",                      # TODO: CHECK IF NECESSARY
    "}"  => "}",
    "."  => ".",
);

# Transliteration -> Unicode for the Greek column.
my %greek2utf8 = (
    "\\" => chr(0x0300), #COMBINING GRAVE ACCENT
    "/"  => chr(0x0301), #COMBINING ACUTE ACCENT
    "+"  => chr(0x0308), #COMBINING DIAERESIS
    "="  => chr(0x0342), #COMBINING GREEK PERISPOMENI / CIRCUMFLEX
    ")"  => chr(0x0313), #COMBINING COMMA ABOVE / SMOOTH BREATHING
    "("  => chr(0x0314), #COMBINING REVERSED COMMA ABOVE / ROUGH BREATHING
    "|"  => chr(0x0345), #COMBINING GREEK YPOGEGRAMMENI / IOTA SUBSCRIPT

    "'"  => "'",
    "{"  => "{",         # TODO: CHECK IF NECESSARY
    "}"  => "}",
    "."  => ".",
    "^"  => "^",
    "?"  => "?",

#   "*A" =>chr(0x0391), #GREEK CAPITAL LETTER ALPHA
#   "*B" =>chr(0x0392), #GREEK CAPITAL LETTER BETA
#   "*G" =>chr(0x0393), #GREEK CAPITAL LETTER GAMMA
#   "*D" =>chr(0x0394), #GREEK CAPITAL LETTER DELTA
#   "*E" =>chr(0x0395), #GREEK CAPITAL LETTER EPSILON
#   "*V" =>chr(0x03DC), #GREEK LETTER DIGAMMA
#   "*Z" =>chr(0x0396), #GREEK CAPITAL LETTER ZETA
#   "*H" =>chr(0x0397), #GREEK CAPITAL LETTER ETA
#   "*Q" =>chr(0x0398), #GREEK CAPITAL LETTER THETA
#   "*I" =>chr(0x0399), #GREEK CAPITAL LETTER IOTA
#   "*K" =>chr(0x039A), #GREEK CAPITAL LETTER KAPPA
#   "*L" =>chr(0x039B), #GREEK CAPITAL LETTER LAMDA
#   "*M" =>chr(0x039C), #GREEK CAPITAL LETTER MU
#   "*N" =>chr(0x039D), #GREEK CAPITAL LETTER NU
#   "*C" =>chr(0x039E), #GREEK CAPITAL LETTER XI
#   "*O" =>chr(0x039F), #GREEK CAPITAL LETTER OMICRON
#   "*P" =>chr(0x03A0), #GREEK CAPITAL LETTER PI
#   "*R" =>chr(0x03A1), #GREEK CAPITAL LETTER RHO
#   "*S" =>chr(0x03A3), #GREEK CAPITAL LETTER SIGMA
#   "*J" =>chr(0x03A3), #GREEK CAPITAL LETTER SIGMA #at end of Word
#   "*T" =>chr(0x03A4), #GREEK CAPITAL LETTER TAU
#   "*U" =>chr(0x03A5), #GREEK CAPITAL LETTER UPSILON
#   "*F" =>chr(0x03A6), #GREEK CAPITAL LETTER PHI
#   "*X" =>chr(0x03A7), #GREEK CAPITAL LETTER CHI
#   "*Y" =>chr(0x03A8), #GREEK CAPITAL LETTER PSI
#   "*W" =>chr(0x03A9), #GREEK CAPITAL LETTER OMEGA

    "A"  => chr(0x03B1), #GREEK SMALL LETTER ALPHA
    "B"  => chr(0x03B2), #GREEK SM LETT BETA / SM LETTER BETA BEGINNING OF WORD
    "G"  => chr(0x03B3), #GREEK SMALL LETTER GAMMA
    "D"  => chr(0x03B4), #GREEK SMALL LETTER DELTA
    "E"  => chr(0x03B5), #GREEK SMALL LETTER EPSILON
    "V"  => chr(0x03DD), #GREEK SMALL LETTER DIGAMMA
    "Z"  => chr(0x03B6), #GREEK SMALL LETTER ZETA
    "H"  => chr(0x03B7), #GREEK SMALL LETTER ETA
    "Q"  => chr(0x03B8), #GREEK SMALL LETTER THETA
    "I"  => chr(0x03B9), #GREEK SMALL LETTER IOTA
    "K"  => chr(0x03BA), #GREEK SMALL LETTER KAPPA
    "L"  => chr(0x03BB), #GREEK SMALL LETTER LAMDA
    "M"  => chr(0x03BC), #GREEK SMALL LETTER MU
    "N"  => chr(0x03BD), #GREEK SMALL LETTER NU
    "C"  => chr(0x03BE), #GREEK SMALL LETTER XI
    "O"  => chr(0x03BF), #GREEK SMALL LETTER OMICRON
    "P"  => chr(0x03C0), #GREEK SMALL LETTER PI
    "#3" => chr(0x03DF), #GREEK SMALL LETTER KOPPA
    "R"  => chr(0x03C1), #GREEK SMALL LETTER RHO
    "S"  => chr(0x03C3), #GREEK SMALL LETTER SIGMA
    "J"  => chr(0x03C2), #GREEK SM LETT FINAL SIGMA / SM LETT SIGMA END OF WORD
    "T"  => chr(0x03C4), #GREEK SMALL LETTER TAU
    "U"  => chr(0x03C5), #GREEK SMALL LETTER UPSILON
    "F"  => chr(0x03C6), #GREEK SMALL LETTER PHI
    "X"  => chr(0x03C7), #GREEK SMALL LETTER CHI
    "Y"  => chr(0x03C8), #GREEK SMALL LETTER PSI
    "W"  => chr(0x03C9), #GREEK SMALL LETTER OMEGA
);

# Notation symbol -> explanatory text emitted inside <note> elements.
my %notes = (
    "{#}"     => "Asterized passage (in Job).",
    "{g}"     => "Reference to difference between the text of Rahlfs and that of the relevant Göttingen edition.",
    "..a"     => "Word included in one of the Aramaic sections.",
    "*"       => "Ketib.",
    "**"      => "Qere.",
    "*z"      => "Qere wela ketib, ketib wela qere.",
    "[ ]"     => "Reference of number of verse in LXX, different from MT.", # TODO: MAKE USE OF IT
    "[[ ]]"   => "Reference number of verse in MT, different from the LXX.",
    "--- {x}" => "Apparent minus created by lack of equivalence between long stretches of text in the LXX and MT.",
    "--+ {x}" => "Apparent plus created by lack of equivalence between long stretches of text in the LXX and MT.",
    "{...}"   => "Equivalent reflected elsewhere in the text, disregarded by indexing program.",
    "~"       => "Difference in sequence between MT and LXX, denoted after the first Hebrew word and before the second one, as well as between two Greek words.",
    "~~~"     => "Equivalent of the Hebrew or Greek word(s) occurring elsewhere in the verse or context (transposition).",
    "{..~}"   => "Stylistic or grammatical transposition.",
    "---"     => "In the Greek column: Hebrew counterpart lacking in the LXX (minus in the LXX).",
    "--+"     => "In col a. of the Hebrew: element \"added\" in the Greek (plus in the LXX).",
    "---+"    => "In col a. of the Hebrew: element \"added\" in the Greek (plus in the LXX).", # TODO: my addition
    "''"      => "Long minus or plus (at least four lines).",
    "{d}"     => "Reference to doublet (occurring between the two elements of the doublet).",
    "{d?}"    => "Reference to doublet (occurring between the two elements of the doublet)?",
    "{..d}"   => "Distributive rendering, occurring once in the translation but referring to more than one Hebrew word.",
    "{..r}"   => "Notation in Hebrew column of elements repeated in the translation.",
    "?"       => "Questionable notation, equivalent, etc.",
    "{p}"     => "Greek preverb representing Hebrew preposition.",
    "{..p}"   => "Preposition added in the LXX in accordance with the rules of the Greek language or translational habits.",
    "{!}"     => "Infinitive absolute.",
    "{!}na"   => "Infinitive absolute.", # TODO: occurs in the text, but unknown meaning
    "{!}nd"   => "Infinitive absolute.", # TODO: occurs in the text, but unknown meaning
    "{!}p"    => "Infinitive absolute.", # TODO: occurs in the text, but unknown meaning
    "{!}pd"   => "Infinitive absolute.", # TODO: occurs in the text, but unknown meaning
    "{!}-"    => "Infinitive absolute.", # TODO: occurs in the text, but unknown meaning
    "{s}"     => "Hebrew M/, MN (comparative, superlative) reflected by Greek comparative or superlative.", # TODO: UTF-8
    "{t}"     => "Transliterated Hebrew word.",
    "\#"      => "Long line continuing in next one, placed both at the end of the line running over and at the beginning of the following line in the opposite column.",
    "{v}"     => "The reading of the main text of the LXX seems to reflect a secondary text, while the \"original\" reading is reflected in a variant.",

    "="       => "Introducing col. b of the Hebrew (a selection of retroverted readings, presumably found in the parent text of the LXX).",
    "=?"      => "? Introducing col. b of the Hebrew (a selection of retroverted readings, presumably found in the parent text of the LXX).", # TODO: my addition, check
    "={d}"    => "Reference to doublet (occurring between the two elements of the doublet).",  # TODO: my addition, check
    "={d?}"   => "Reference to doublet (occurring between the two elements of the doublet)?", # TODO: my addition, check
    "=\%"     => "Introducing categories of translation technique recorded in col. b.",
    "=\%vap"  => "Change from active to passive form in verbs.",
    "=\%vpa"  => "Change from passive to active form in verbs.",
    "=\%p"    => "Difference in preposition or particle.",
    "=\%p+"   => "Addition of preposition or particle.",
    "=\%p-"   => "Omission of preposition or particle.",
    "=;"      => "Retroversion in col. b based on equivalence occurring in immediate or remote context.",
    "G"       => "Hebrew variant, but at this stage no plausible retroversion is suggested.",
    "=+"      => "Difference in numbers between MT and the LXX.",
    "=\@"     => "Etymological exegesis.",
    "=\@...a" => "Etymological exegesis according to Aramaic.",
    "=:"      => "Introducing reconstructed proper noun.",
    "=v"      => "Difference in vocalization (reading).",
    "=r"      => "Incomplete retroversion.",

    "{*}"     => "Agreement of LXX with ketib.",
    "{**}"    => "Agreement of LXX with qere.",

    "."       => "Interchange of consonants between MT and the presumed Hebrew parent text of the LXX.",

    ".m"      => "Metathesis of consonants between MT and the presumed Hebrew parent text of the LXX.",
    ".z"      => "Possible abbreviation.",
    ".s"      => "One word of MT separated into two or more words in the parent text of the LXX.",
    ".j"      => "Two words of MT joined into one word in the parent text of the LXX.",
    ".w"      => "Different word-division reflected in the parent text of the LXX.",

    "<sp"     => "<sp", #TODO: FIX, occurs in text

    "^"       => "^",   #Not sure what these are
    "^^^"     => "^^^",
);

# Interchange notes naming a specific pair of consonants.
# TODO: all of these are my addition.
my %interchangePairs = (
    '.)(' => "א/ע",
    '.(q' => "ע/ק",
    '.q(' => "ק/ע",
    '.)x' => "א/ח",
    '.x)' => "ח/א",
    '.(x' => "ע/ח",
    '.x(' => "ח/ע",
    '.+d' => "ט/ד",
    '.br' => "ב/ר",
    '.rb' => "ר/ב",
    '.rd' => "ר/ד",
    '.dr' => "ד/ר",
    '.rg' => "ר/ג",
    '.rh' => "ר/ה",
    '.rl' => "ר/ל",
    '.mb' => "מ/ב",
    '.mn' => "מ/נ",
    '.nm' => "נ/מ",
    '.nr' => "נ/ר",
    '.ny' => "נ/י",
    '.yn' => "י/נ",
    '.rn' => "ר/נ",
    '.$c' => "שׁ/צ",
    '.qb' => "ק/ב",
    '.wy' => "ו/י",
    '.yw' => "י/ו",
);
foreach my $code (keys %interchangePairs) {
    $notes{$code} = "Interchange of consonants ($interchangePairs{$code}) between MT and the presumed Hebrew parent text of the LXX.";
}

#
# createNote - wraps $noteText in a complete textual <note> element
#
sub createNote {
    my ($noteText) = @_;
    return "<note type=\"textual\">$noteText</note> ";
}

#
# openNote - like createNote, but leaves the <note> element open
#
sub openNote {
    my ($noteText) = @_;
    return "<note type=\"textual\">$noteText ";
}

#
# closeNote - emits $noteText followed by the closing </note> tag
#
sub closeNote {
    my ($noteText) = @_;
    return "$noteText</note> ";
}

#
# translateHebrewNote - turns one notation token of the Hebrew columns into
# OSIS markup.  Dies if the token is not a known notation.
#
sub translateHebrewNote {
    my ($origNote) = @_;
    #print("TranslateHebrewNote $origNote\n");

    # Notes that carry a Hebrew word inside the braces.
    if ($origNote =~ m/\{\.\.d(.+)\}/) {
        return createNote("(" . translateHebrewWordorNote($1) . ") " . $notes{"{..d}"});
    }
    if ($origNote =~ m/\{\.\.r(.+)\}/) {
        return createNote("(" . translateHebrewWordorNote($1) . ") " . $notes{"{..r}"});
    }
    if ($origNote =~ m/\{\.\.\.(.+)\}/) {
        return createNote("(" . translateHebrewWordorNote($1) . ") " . $notes{"{...}"});
    }

    #Special cases: the note includes more than one hebrew word
    if ($origNote =~ m/^\{\.\.\.([^}]+)$/) {
        return openNote("(" . translateHebrewWordorNote($1) . ") " . $notes{"{...}"});
    }
    if ($origNote =~ m/^([^{]+)\}$/) {
        return closeNote("(" . translateHebrewWordorNote($1) . ") " . $notes{"{...}"});
    }

    if ($origNote =~ m/^=[^?$hebrewLetters]*([?$hebrewLetters]+)$/) {
        return createNote($notes{"?"}) . translateHebrewWordorNote($1);
    }

    #special case: no note, but a crossref
    if ($origNote =~ m/<(.+)>/) {
        # the visible text keeps its angle brackets, escaped so that the
        # output stays well-formed XML
        return "<reference osisRef=\"$1\">&lt;$1&gt;</reference> ";
    }

    # "?" directly followed by a word: flag it as questionable, then
    # translate whatever follows the question mark.
    if ($origNote =~ m/^[?](.*)/) {
        my $remainder = $1;
        my $markup    = createNote($notes{"?"});
        $markup .= translateHebrewWordorNote($remainder) if length $remainder;
        return $markup;
    }

    exists $notes{$origNote} or die("Note $origNote not found.\n");
    return createNote($notes{$origNote});
}

#
# translateHebrewWordorNote - returns unicode hebrew with morph separation
# for a transliterated word, or the markup of translateHebrewNote() when the
# token is a notation (starts with a non-Hebrew character or ends with "}").
# Dies on a character that is missing from %hebrew2utf8.
#
sub translateHebrewWordorNote {
    my ($hebrew) = @_;
    $hebrew = "" unless defined $hebrew;

#   print("TranslateHebrew of: $hebrew");

    return translateHebrewNote($hebrew) if $hebrew =~ m/^[^$hebrewLetters]/;
    return translateHebrewNote($hebrew) if $hebrew =~ m/\}$/;

    my $length = length($hebrew);
    my $index  = 0;
    my $result = $MorphologicalSegmentStart;
    CHAR_LOOP: while ($index < $length) {
        # Prefer a two-character key ("**") over its one-character prefix.
        my $token = substr($hebrew, $index, 2);
        if (length($token) < 2 or not exists $hebrew2utf8{$token}) {
            $token = substr($hebrew, $index, 1);
        }
        exists $hebrew2utf8{$token}
            or die("could not find Hebrew: $token of word $hebrew at index $index length $length\n");
        $result .= $hebrew2utf8{$token};
        $index += length($token);
    }
    $result .= $MorphologicalSegmentEnd;
    return $result;
}

#
# translateGreekWordorNote - returns the note text if $greek is a known
# notation, otherwise the unicode form of the transliterated Greek word.
# Characters without an entry in %greek2utf8 are skipped.
#
sub translateGreekWordorNote {
    my ($greek) = @_;
    $greek = "" unless defined $greek;

    #we have a note and not a word
    return $notes{$greek} if exists $notes{$greek};    # TODO: format OSIS note

    my $length = length($greek);
    my $index  = 0;
    my $result = "";
    CHAR_LOOP: while ($index < $length) {
        # Prefer a two-character key ("#3") over a single character.
        # TODO: capital letters ("*A" ...) are not mapped yet.
        my $token = substr($greek, $index, 2);
        if (length($token) < 2 or not exists $greek2utf8{$token}) {
            $token = substr($greek, $index, 1);
        }
#       exists $greek2utf8{$token} or die("Could not find greek: $token\n");
        $result .= $greek2utf8{$token} if exists $greek2utf8{$token};
        $index += length($token);
    }
    return $result;
}

#
# parseLine - converts one input line ("<Hebrew colA>[=<Hebrew colB>]\t<Greek>")
# into an OSIS <row> with three cells: Hebrew ColA, Hebrew ColB, Greek.
# Dies if the line does not have that shape.
#
sub parseLine {
    my ($origLine) = @_;
    my $result = "";

#   printf("parsing %s\n", $origLine);

    $origLine =~ s/--=/--+/;    # TODO: UGLY HACK, this appears in the text but not the notes

    # Copy the captures right away; later matches would overwrite $1..$3.
    my ($colA, $colB, $greekText) = ($origLine =~ m/^([^=]+)?([=].+)?\t(.+)$/)
        or die("No match in parseLine().\n");
    (defined $colA or defined $colB)             or die("Hebrew not found.\n");
    (defined $greekText and length($greekText))  or die("Greek not found.\n");

    my @hebrewWordsColA = defined $colA ? split(/\s+/, $colA) : ();
    my @hebrewWordsColB = defined $colB ? split(/\s+/, $colB) : ();

    # 3 columns= Hebrew ColA, Hebrew ColB, Greek

    $result .= "<row>\n <cell>";
    foreach my $wordA (@hebrewWordsColA) {
        $result .= translateHebrewWordorNote($wordA);
    }
    $result .= "</cell>\n <cell>";

    foreach my $wordB (@hebrewWordsColB) {
        if (substr($wordB, 0, 1) eq "=") {
            my ($marker, $word) = ($wordB =~ m/(=[^$hebrewLetters]*)([$hebrewLetters].*)?/)
                or die("No match in ColB.\n");
            #This isolates the notes introducing colB (=*)
            $result .= translateHebrewWordorNote($marker);
            # a marker may stand alone; only translate a word that is there
            $result .= translateHebrewWordorNote($word) if defined $word and length $word;
        }
        else {
            $result .= translateHebrewWordorNote($wordB);
        }
    }
    $result .= "</cell>\n <cell>";

    # Greek column.
    # TODO: rendering of the Greek words is still disabled, so this cell is
    # emitted empty.  When enabling it, a note followed by "{x}" (e.g.
    # "--- {x}") contains a space and has to be looked up as one key:
    #
    #   my @greekWords = split(/\s+/, $greekText);
    #   for (my $i = 0; $i <= $#greekWords; ++$i) {
    #       my $wordG = $greekWords[$i];
    #       if ($i < $#greekWords and $greekWords[$i + 1] eq "{x}") {
    #           $wordG .= " " . $greekWords[++$i];
    #       }
    #       $result .= translateGreekWordorNote($wordG);
    #   }
    $result .= " </cell>\n</row>";
#   printf("Result: %s\n", $result);
    return $result;
}

#
# grabVerseContent - if the Verse can be found, returns its Content, otherwise nothing
#
# NOTE(review): only fragments of grabVerseContent() (Bookname, chapter,
# verse, @list) are visible in this change set, so the sub is not reproduced
# here.  The change set touches it in two ways:
#   * after "my $index=0;" it adds
#         printf("Parsing $bookname $chapter:$verse...\n");
#   * in both verse loops (the "Obad" special case and the regular
#     chapter:verse case)
#         push(@result, $buffer[$index] );
#     becomes
#         push(@result, &parseLine( $buffer[$index] ) );