blob: dea60c1c0ab2a340053a6fc2ead35f254a63e286 (
plain) (
tree)
|
|
#!/usr/bin/perl
## farsijoin - a simple filter to prepare texts for SE mobile phones
# This is the programme arabjoin by Roman Czyborra (described below),
# with only a few minor changes:
# a) We do not reverse the text.
# b) Some ligatures which are irrelevant for Farsi, and for this purpose
# in particular, have been taken out.
# Mr Czyborra set his script under the following licence:
## Czyborra.com Freeware License
# Does the ".com" stands for communism, communications, computer addiction or commerce?
# I dunno. All of my works you find here are free software, with no stupid M$-¢opyrites
# to introduce artificial scarcities. You may freely copy, use, quote, modify or redistribute
# them as long as you properly attribute my contribution and have
# given a quick thought about whether Roman might perhaps be
# interested to read what you did with his stuff. Horizontal rules don't apply.
# arabjoin - a simple filter to render Arabic text
# © 1998-06-18 roman@czyborra.com
# Freeware license at http://czyborra.com/
# Latest version at http://czyborra.com/unicode/
# PostScript printout at http://czyborra.com/unicode/arabjoin.ps.gz
# This filter takes Arabic text (encoded in UTF-8 using the Unicode
# characters from the U+0600 Arabic block in logical order) as input
# and performs Arabic glyph joining on it and outputs a UTF-8 octet
# stream that is no longer logically arranged but in a visual order
# which gives readable results when formatted with a simple Unicode
# renderer like Yudit that does not handle Arabic differently yet
# but simply outputs all glyphs in left-to-right order.
# This little script also demonstrates that Arabic rendering is not
# that complicated after all (it makes you wonder why some software
# companies are still asking hundreds of dollars from poor students
# who just want to print their Arabic texts) and that even Perl 4 can
# handle Unicode text in UTF-8 without any nifty new add-ons.
# Usage examples:
# echo "أهلاً بالعالم!" | arabjoin
# prints !ﻢﻟﺎﻌﻟﺎﺑ ًﻼﻫﺃ
# which is the Arabic version of "Hello world!"
# | recode ISO-8859-6..UTF-8 | arabjoin | uniprint -f cyberbit.ttf
# prints an Arabic mail of charset=iso-8859-6-i on your printer
# | arabjoin | xviewer yudit
# delegates an Arabic UTF-8 message to a better viewer
# ftp://sunsite.unc.edu/pub/Linux/apps/editors/X/ has uniprint in yudit-1.0
# ftp://ftp.iro.umontreal.ca/pub/contrib/pinard/pretest/ has recode-3.4g
# http://czyborra.com/unicode/ has arabjoin
# http://czyborra.com/unix/ has xviewer
# http://www.bitstream.com/cyberbit.htm or
# ftp://ccic.ifcss.org/pub/software/fonts/unicode/ms-win/ or
# ftp://ftp.irdu.nus.sg/pub/language/bitstream/ has cyberbit.ttf
# This is how we do it: First we learn the presentation forms of each
# Arabic letter from the end of this script:
# Load the presentation-form table stored after __END__.  Each row is
# "<letter> <forms>": the first whitespace-delimited field is the letter
# and the second holds up to four glyphs in the order isolated, final,
# medial, initial.  Anything after the second field is ignored.  Forms a
# row does not supply are stored as undef, and a letter that appears in
# several rows ends up with the forms of its last row.
while (defined(my $row = <DATA>))
{
    # Rows without both fields (blank or stray lines) are skipped rather
    # than being filed under an undefined key.
    my ($letter, $forms) = $row =~ /^(\S+)\s+(\S+)/ or next;

    # Every glyph is one multi-byte UTF-8 chunk: a lead byte followed by
    # its continuation bytes.
    ($isolated{$letter}, $final{$letter}, $medial{$letter}, $initial{$letter}) =
        $forms =~ /([\xC0-\xFF][\x80-\xBF]+)/g;
}
# Then learn the (incomplete set of) transparent characters.  The main
# loop steps over these marks when it looks for the letter that follows
# the current one, so a mark between two letters does not break joining.
my @marks = split (" ", "
ً ٌ ٍ َ ُ ِ ٰ
ۗ ۘ ۙ ۚ ۛ ۜ ۟ ۠ ۡ ۢ ۣ ۤ ۧ ۨ ۪ ۫ ۬ ۭ");
@transparent{@marks} = (1) x @marks;
# Finally we can process our text:
# Main filter loop.  For every input line: split it into UTF-8 chunks,
# replace each Arabic letter by the presentation form that fits its
# neighbours, glue the chunks back together in their input order (this
# variant does NOT reverse the text, unlike the original arabjoin), apply
# the ligature substitutions and print the result followed by a newline.
while (<>)
{
    s/\n$//; # chop off the end of the line; it is printed back below

    # UTF-8 character chunks: one ASCII byte, or a lead byte followed by
    # its continuation bytes.
    my @glyphs = /([\x00-\x7F]|[\xC0-\xFF][\x80-\xBF]+)/g;

    # True while the previous letter joins onto the current one.  It is
    # always false again after the last letter of a line, so starting
    # fresh for every line changes nothing.
    my $joined_before;
    my $next_at;

    # We walk through the line of text and do contextual analysis:
    for (my $at = 0; $at <= $#glyphs; $at = $next_at)
    {
        my $current = $glyphs[$at];

        # Look ahead to the next chunk that is not a transparent mark.
        # The marks in between are left untouched because the outer loop
        # jumps straight to $next_at.
        $next_at = $at + 1;
        $next_at++
            while $next_at <= $#glyphs && $transparent{$glyphs[$next_at]};
        my $following = $next_at <= $#glyphs ? $glyphs[$next_at] : '';

        # The following assignment is the heart of the algorithm.
        # It reduces the Arabic joining algorithm described on
        # pages 6-24 to 6-26 of the Arabic character block description
        # in the Unicode 2.0 Standard to four lines of Perl.  In order:
        # medial, initial, final, isolated form, else the chunk itself.
        # A form the table does not provide falls through to the next
        # alternative.
        $glyphs[$at] = $joined_before && $final{$following} && $medial{$current}
                    || $final{$following} && $initial{$current}
                    || $joined_before && $final{$current}
                    || $isolated{$current}
                    || $current;

        # The current letter joins onto the next one only if it has an
        # initial form and the next letter has a final form.
        $joined_before = $initial{$current} && $final{$following};
    }

    # Reassemble the line in input order; no reversal is performed.
    $_ = join ('', @glyphs);

    # NOTE(review): the patterns below appear to be written in the glyph
    # order that arabjoin's reversed output would contain.  Since this
    # variant does not reverse the text, confirm whether they are meant to
    # match at all before relying on them.

    # The following 8 obligatory LAM+ALEF ligatures are encoded in the
    # U+FE70 Arabic Presentation Forms-B block in Unicode's
    # compatibility zone:
    s/ﺂﻟ/ﻵ/g;
    s/ﺂﻠ/ﻶ/g;
    s/ﺄﻟ/ﻷ/g;
    s/ﺄﻠ/ﻸ/g;
    s/ﺈﻟ/ﻹ/g;
    s/ﺈﻠ/ﻺ/g;
    s/ﺎﻟ/ﻻ/g;
    s/ﺎﻠ/ﻼ/g;

    # Bitstream's Cyberbit font offers 57 of the other 466 optional
    # ligatures in the U+FB50 Arabic Presentation Forms-A block:
    s/ﻢﺗ/ﰎ/g;
    s/ﻲﻓ/ﰲ/g;
    s/ﺞﻟ/ﰿ/g;
    s/ﺢﻟ/ﱀ/g;
    s/ﺦﻟ/ﱁ/g;
    s/ﻢﻟ/ﱂ/g;
    s/ﻰﻟ/ﱃ/g;
    s/ﻲﻟ/ﱄ/g;
    s/ﻢﻧ/ﱎ/g;
    s/ٌّ/ﱞ/g;
    s/ٍّ/ﱟ/g;
    s/َّ/ﱠ/g;
    s/ُّ/ﱡ/g;
    s/ِّ/ﱢ/g;
    s/ﺮﺒ/ﱪ/g;
    s/ﻦﺒ/ﱭ/g;
    s/ﻲﺒ/ﱯ/g;
    s/ﺮﺘ/ﱰ/g;
    s/ﻦﺘ/ﱳ/g;
    s/ﻲﺘ/ﱵ/g;
    s/ﻲﻨ/ﲏ/g;
    s/ﺮﻴ/ﲑ/g;
    s/ﻦﻴ/ﲔ/g;
    s/ﺠﺑ/ﲜ/g;
    s/ﺤﺑ/ﲝ/g;
    s/ﺨﺑ/ﲞ/g;
    s/ﻤﺑ/ﲟ/g;
    s/ﺠﺗ/ﲡ/g;
    s/ﺤﺗ/ﲢ/g;
    s/ﺨﺗ/ﲣ/g;
    s/ﻤﺗ/ﲤ/g;
    s/ﻤﺛ/ﲦ/g;
    s/ﻤﺟ/ﲨ/g;
    s/ﻤﺣ/ﲪ/g;
    s/ﻤﺧ/ﲬ/g;
    s/ﻤﺳ/ﲰ/g;
    s/ﺠﻟ/ﳉ/g;
    s/ﺤﻟ/ﳊ/g;
    s/ﺨﻟ/ﳋ/g;
    s/ﻤﻟ/ﳌ/g;
    s/ﻬﻟ/ﳍ/g;
    s/ﺠﻣ/ﳎ/g;
    s/ﺤﻣ/ﳏ/g;
    s/ﺨﻣ/ﳐ/g;
    s/ﻤﻣ/ﳑ/g;
    s/ﺠﻧ/ﳒ/g;
    s/ﺤﻧ/ﳓ/g;
    s/ﺨﻧ/ﳔ/g;
    s/ﻤﻧ/ﳕ/g;
    s/ﺠﻳ/ﳚ/g;
    s/ﺤﻳ/ﳛ/g;
    s/ﺨﻳ/ﳜ/g;
    s/ﻤﻳ/ﳝ/g;
    s/ﺤﻤﻟ/ﶈ/g;
    s/ﻪﻠﻟﺍ/ﷲ/g;
    # NOTE(review): the next two rules replace a letter sequence with
    # another letter sequence instead of a single ligature code point;
    # they look like damaged rules -- confirm against upstream arabjoin.
    s/ﻢﻠﺳﻭ/ﻪﻴﻠﻋ/g;
    s/ﻪﻟﺎﻠﺟ/ﻞﺟ/g;

    print "$_\n";
}
# The following table lists the presentation variants of each
# character. Each value from the U+0600 block means that the
# necessary glyph variant has not been assigned a code in Unicode's
# U+FA00 compatibility zone. You may want to insert your private
# glyphs or approximation glyphs for them:
__END__
ء ﺀ
آ ﺁﺂ
أ ﺃﺄ
ؤ ﺅﺆ
إ ﺇﺈ
ئ ﺉﺊﺌﺋ
ا ﺍﺎ
ب ﺏﺐﺒﺑ
ة ﺓﺔ
ت ﺕﺖﺘﺗ
ث ﺙﺚﺜﺛ
ج ﺝﺞﺠﺟ
ح ﺡﺢﺤﺣ
خ ﺥﺦﺨﺧ
د ﺩﺪ
ذ ﺫﺬ
ر ﺭﺮ
ز ﺯﺰ
س ﺱﺲﺴﺳ
ش ﺵﺶﺸﺷ
ص ﺹﺺﺼﺻ
ض ﺽﺾﻀﺿ
ط ﻁﻂﻄﻃ
ظ ﻅﻆﻈﻇ
ع ﻉﻊﻌﻋ
غ ﻍﻎﻐﻏ
ـ ــــ
ف ﻑﻒﻔﻓ
ق ﻕﻖﻘﻗ
ك ﻙﻚﻜﻛ
ل ﻝﻞﻠﻟ
م ﻡﻢﻤﻣ
ن ﻥﻦﻨﻧ
ه ﻩﻪﻬﻫ
و ﻭﻮ
ى ﻯﻰ // ﯩﯨ
ي ﻱﻲﻴﻳ
ٱ ﭐ // ﭑ
ٲ ٲٲ
ٳ ٳٳ
ٴ ٴ
ٵ ٵٵ
ٶ ٶٶ
ٷ ﯝٷ
ٸ ٸٸٸٸ
ٹ ﭦﭧﭩﭨ
ٺ ﭞﭟﭡﭠ
ٻ ﭒﭓﭕﭔ
ټ ټټټټ
ٽ ٽٽٽٽ
پ ﭖﭗﭙﭘ
ٿ ﭢﭣﭥﭤ
ڀ ﭚﭛﭝﭜ
ځ ځځځځ
ڂ ڂڂڂڂ
ڃ ﭶﭷﭹﭸ
ڄ ﭲﭳﭵﭴ
څ څڅڅڅ
چ ﭺﭻﭽﭼ
ڇ ﭾﭿﮁﮀ
ڈ ﮈﮉ
ډ ډډ
ڊ ڊڊ
ڋ ڋڋ
ڌ ﮄﮅ
ڍ ﮂﮃ
ڎ ﮆﮇ
ڏ ڏڏ
ڐ ڐڐ
ڑ ﮌﮍ
ڒ ڒڒ
ړ ړړ
ڔ ڔڔ
ڕ ڕڕ
ږ ڕږ
ڗ ڗڗ
ژ ﮊﮋ
ڙ ڙڙ
ښ ښښښښ
ڛ ڛڛڛڛ
ڜ ڜڜڜڜ
ڝ ڝڝڝڝ
ڞ ڞڞڞڞ
ڟ ڟڟڟڟ
ڠ ڠڠڠڠ
ڡ ڡڡڡڡ
ڢ ڢڢڢڢ
ڣ ڣڣڣڣ
ڤ ﭪﭫﭭﭬ
ڥ ڥڥڥڥ
ڦ ﭮﭯﭱﭰ
ڧ ڧڧڧڧ
ڨ ڨڨڨڨ
ک ﮎﮏﮑﮐ
ڪ ڪڪڪڪ
ګ ګګګګ
ڬ ڬڬڬڬ
ڭ ﯓﯔﯖﯕ
ڮ ڮڮڮڮ
گ ﮒﮓﮕﮔ
ڰ ڰڰڰڰ
ڱ ﮚﮛﮝﮜ
ڲ ڲڲڲڲ
ڳ ﮖﮗﮙﮘ
ڴ ڴڴڴڴ
ڵ ڵڵڵڵ
ڶ ڶڶڶڶ
ڷ ڷڷڷڷ
ں ﮞﮟںں
ڻ ﮠﮡﮣﮢ
ڼ ڼڼڼڼ
ڽ ڽڽڽڽ
ھ ﮪﮫﮭﮬ
ۀ ﮤﮥ
ہ ﮦﮧﮩﮨ
ۂ ۂۂ
ۃ ۃۃ
ۄ ۄۄ
ۅ ﯠﯡ
ۆ ﯙﯚ
ۇ ﯗﯘ
ۈ ﯛﯜ
ۉ ﯢﯣ
ۊ ۊۊ
ۋ ﯞﯟ
ی ﯼﯽﯿﯾ
ۍ ۍۍ
ێ ێێێێ
ې ﯤﯥﯧﯦ
ہ ہہہہ
ۂ ۂۂ
ۃ ۃۃ
ۄ ۄۄ
ۅ ۅۅ
ۆ ۆۆ
ۇ ۇۇ
ۈ ۈۈ
ۉ ۉۉ
ۊ ۊۊ
ۋ ۋۋ
ی ﯼﯽﯿﯾ
ۍ ۍۍ
ێ ێێێێ
ې ېېېې
ۑ ۑۑۑۑ
ے ﮮﮯ
ۓ ﮰﮱ
ە ە
|