summaryrefslogblamecommitdiffstats
path: root/tlgu/tlgu.h
blob: 83aeb56f0be028b08ef8d64ee34faf86933a8f46 (plain) (tree)




























































































































































































































                                                                                                          
/* tlgu.h
 *
 * Copyright (C) 2004 Dimitri Marinakis
 *
 * Licensed under the terms of the GNU General Public License.
 * ABSOLUTELY NO WARRANTY.
 * See the file `COPYING' in this directory.
 *
 * Hellenic character codes
 * Relevant Unicode standard tables:
 * Greek and Coptic: 0370 - 03FF
 * Greek Extended: 1F00 - 1FFF
 */

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <ctype.h>


#define INRECSIZE 0x2000
#define OUTRECSIZE 0xFFFFF

/* Beta code escapes and state processing codes */
#define HELLENIC 1
#define ROMAN 2
#define PUNCTUATION 3
#define QUOTATION 4
#define PAGE 5
#define BRACKET 6
#define QUASIBRACKET 7
#define NONTEXT 8
#define SYMBOL 9
#define HELLENIC_UPPER 0xa
#define HELLENIC_SELECT 0xb
#define HELLENIC_SIGMA 0xc
/* Accent is an existing code above 0x1f */
#define ACCENT 0x2f
#define HELLENIC_SIGMA_UPPER 0x10
#define TABHALF 0x11
#define ROMAN_SELECT 0x16
#define PUNCTUATION_SELECT 0x1f
#define QUOTATION_SELECT 0x29
#define PAGE_SELECT 0x33
#define BRACKET_SELECT 0x3d
#define QUASIBRACKET_SELECT 0x47
#define NONTEXT_SELECT 0x51
#define SYMBOL_SELECT 0x5b
#define TABHALF_SELECT 0x61

/* code defines */
#define SIGMEDIAL 0x3c3
#define SIGMEDIALUPPER 0x3a3
#define SIGFINAL 0x3c2
#define SIGFINALUPPER 0x3a3
#define SIGLUNATE 0x3f2
#define SIGLUNATEUPPER 0x3f9

/* accents */
#define PSILI 0x313
#define DASIA 0x314
#define DIALYTIKA 0x308
#define VARIA 0x300
#define OXIA 0x301
#define PERISPOMENI 0x342
#define YPOGEGRAMMENI 0x345
#define CARET 0x302

/* TLG stream translation table -- Unicode
   A B G D E Z H Q I K L M N C O P R S T U F X Y W V; V is digamma
   A value under 0x20 is a state change control code.
   Zero means no character.
 */
unsigned int hellenic[] = {
        /* sp     !     "     #     $     %     &     ' */
         0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
        /*    (       )               *       +     ,     -     .       / */
         ACCENT, ACCENT, HELLENIC_UPPER, ACCENT, 0x2c, 0x2d, 0x2e, ACCENT,
        /*  0     1     2     3     4     5     6     7 */
         0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
        /*  8     9     :     ;     <     =     >     ?     @ */
         0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40,
        /*   a      b      c      d      e      f      g      h */
         0x3b1, 0x3b2, 0x3be, 0x3b4, 0x3b5, 0x3d5, 0x3b3, 0x3b7,
        /*   i      j      k      l      m      n      o      p */
         0x3b9, 0x3c2, 0x3ba, 0x3bb, 0x3bc, 0x3bd, 0x3bf, 0x3c0,
        /*   q      r      s      t      u      v      w     x */
         0x3b8, 0x3c1, 0x3c2, 0x3c4, 0x3c5, 0x3dd, 0x3c9, 0x3c7,
        /*   y      z     [     \     ]     ^     _  sep`*/
         0x3c8, 0x3b6, 0x54, 0x55, 0x56, 0x57, 0x00, 0x00,
        /*   A      B      C      D      E      F      G      H */
         0x391, 0x392, 0x39e, 0x394, 0x395, 0x3a6, 0x393, 0x397,
        /*   I      J      K      L      M      N      O      P */
         0x399, 0x3A3, 0x39a, 0x39b, 0x39c, 0x39d, 0x39f, 0x3a0,
        /*   Q      R      S      T      U      V      W      X */
         0x398, 0x3a1, 0x3a2, 0x3a4, 0x3a5, 0x3dc, 0x3a9, 0x3a7,
        /*   Y      Z     {     |     }     ~   DEL */
         0x3a8, 0x396, 0x7b, 0x7c, 0x7d, 0x00, 0x00};


/* Accents can be described in three groups, all optional
 * In the first group are - mutually exclusive - psili, daseia or dialytika
 * In the second group are - mutually exclusive - oxia, varia or perispomeni
 * In the third group are - mutually exclusive - ypogegrammeni, subscript dot or missing letter dot
 * as the last two are not part of fully-formed characters, will be used as combining diacritical marks
 * The simplified form is then:
 * [ ) or ( or + ]  [ / or \ or = ]  [ | ]
 *
 * This can be described by 5 accent flag bits (reverse order)
 *
 * 0 00 00 --- 0 00 00 no accent
 * |  |  |
 * |  |   ---- 01 psili, 10 dasia, 11 dialytika
 * |   ------- 01 varia, 10 oxia,  11 perispomeni
 * -----------  1 ypogegrammeni
 *
 * The resulting table of accentable characters will have 32-character rows
 * with the formed character codes in the appropriate positions, or zero:
 * plain, psili, dasia, dialytika, varia, psili-varia, dasia-varia, dialytika-varia
 * oxia, psili-oxia, dasia-oxia, dialytika-oxia, perispomeni, psili-perisp, dasia-perisp, dialytika-perisp
 * ditto with ypogegrammeni
 *
 * If zero is returned, combining diacritical marks should be generated from the accent flags.
 */
unsigned int alpha[] = {
	0x03b1, 0x1f00, 0x1f01, 0x0000, 0x1f70, 0x1f02, 0x1f03, 0x0000,
	0x1f71, 0x1f04, 0x1f05, 0x0000, 0x1fb6, 0x1f06, 0x1f07, 0x0000,
	0x1fb3, 0x1f80, 0x1f81, 0x0000, 0x1fb2, 0x1f82, 0x1f83, 0x0000,
	0x1fb4, 0x1f84, 0x1f85, 0x0000, 0x1fb7, 0x1f86, 0x1f87, 0x0000
	};
unsigned int Alpha[] = {
	0x0391, 0x1f08, 0x1f09, 0x0000, 0x1fba, 0x1f0a, 0x1f0b, 0x0000,
	0x1fbb, 0x1f0c, 0x1f0d, 0x0000, 0x0000, 0x1f0e, 0x1f0f, 0x0000,
	0x1fbc, 0x1f88, 0x1f89, 0x0000, 0x0000, 0x1f8a, 0x1f8b, 0x0000,
	0x0000, 0x1f8c, 0x1f8d, 0x0000, 0x0000, 0x1f8e, 0x1f8f, 0x0000
	};
unsigned int epsilon[] = {
	0x03b5, 0x1f10, 0x1f11, 0x0000, 0x1f72, 0x1f12, 0x1f13, 0x0000,
	0x1f73, 0x1f14, 0x1f15, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
	};
 unsigned int Epsilon[] = {
	0x0395, 0x1f18, 0x1f19, 0x0000, 0x1fc8, 0x1f1a, 0x1f1b, 0x0000,
	0x1fc9, 0x1f1c, 0x1f1d, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
	};
unsigned int eta[] = {
	0x03b7, 0x1f20, 0x1f21, 0x0000, 0x1f74, 0x1f22, 0x1f23, 0x0000,
	0x1f75, 0x1f24, 0x1f25, 0x0000, 0x1fc6, 0x1f26, 0x1f27, 0x0000,
	0x1fc3, 0x1f90, 0x1f91, 0x0000, 0x1fc2, 0x1f92, 0x1f93, 0x0000,
	0x1fc4, 0x1f94, 0x1f95, 0x0000, 0x1fc7, 0x1f96, 0x1f97, 0x0000
	};
unsigned int Eta[] = {
	0x0397, 0x1f28, 0x1f29, 0x0000, 0x1fca, 0x1f2a, 0x1f2b, 0x0000,
	0x1fcb, 0x1f2c, 0x1f2d, 0x0000, 0x0000, 0x1f2e, 0x1f2f, 0x0000,
	0x1fcc, 0x1f98, 0x1f99, 0x0000, 0x0000, 0x1f9a, 0x1f9b, 0x0000,
	0x0000, 0x1f9c, 0x1f9d, 0x0000, 0x0000, 0x1f9e, 0x1f9f, 0x0000
	};
unsigned int iota[] = {
	0x03b9, 0x1f30, 0x1f31, 0x03ca, 0x1f76, 0x1f32, 0x1f33, 0x1fd2,
	0x1f77, 0x1f34, 0x1f35, 0x1fd3, 0x1fd6, 0x1f36, 0x1f37, 0x1fd7,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
	};
 unsigned int Iota[] = {
	0x0399, 0x1f38, 0x1f39, 0x03aa, 0x1fda, 0x1f3a, 0x1f3b, 0x0000,
	0x1fdb, 0x1f3c, 0x1f3d, 0x0000, 0x0000, 0x1f3e, 0x1f3f, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
	};
unsigned int omicron[] = {
	0x03bf, 0x1f40, 0x1f41, 0x0000, 0x1f78, 0x1f42, 0x1f43, 0x0000,
	0x1f79, 0x1f44, 0x1f45, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
	};
 unsigned int Omicron[] = {
	0x039f, 0x1f48, 0x1f49, 0x0000, 0x1ff8, 0x1f4a, 0x1f4b, 0x0000,
	0x1ff9, 0x1f4c, 0x1f4d, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
	};
unsigned int ypsilon[] = {
	0x03c5, 0x1f50, 0x1f51, 0x03cb, 0x1f7a, 0x1f52, 0x1f53, 0x1fe2,
	0x1f7b, 0x1f54, 0x1f55, 0x1fe3, 0x1fe6, 0x1f56, 0x1f57, 0x1fe7,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
	};
 unsigned int Ypsilon[] = {
	0x03a5, 0x0000, 0x1f59, 0x03ab, 0x1fea, 0x0000, 0x1f5b, 0x0000,
	0x1feb, 0x0000, 0x1f5d, 0x0000, 0x0000, 0x0000, 0x1f5f, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
	};
unsigned int omega[] = {
	0x03c9, 0x1f60, 0x1f61, 0x0000, 0x1f7c, 0x1f62, 0x1f63, 0x0000,
	0x1f7d, 0x1f64, 0x1f65, 0x0000, 0x1ff6, 0x1f66, 0x1f67, 0x0000,
	0x1ff3, 0x1fa0, 0x1fa1, 0x0000, 0x1ff2, 0x1fa2, 0x1fa3, 0x0000,
	0x1ff4, 0x1fa4, 0x1fa5, 0x0000, 0x1ff7, 0x1fa6, 0x1fa7, 0x0000
	};
unsigned int Omega[] = {
	0x03a9, 0x1f68, 0x1f69, 0x0000, 0x1ffa, 0x1f6a, 0x1f6b, 0x0000,
	0x1ffb, 0x1f6c, 0x1f6d, 0x0000, 0x03a9, 0x1f6e, 0x1f6f, 0x0000,
	0x1ffc, 0x1fa8, 0x1fa9, 0x0000, 0x0000, 0x1faa, 0x1fab, 0x0000,
	0x0000, 0x1fac, 0x1fad, 0x0000, 0x0000, 0x1fae, 0x1faf, 0x0000
	};
unsigned int rho[] = {
	0x03c1, 0x1fe4, 0x1fe5, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
	};
unsigned int Rho[] = {
	0x03a1, 0x0000, 0x1fec, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
	};