diff options
Diffstat (limited to 'z36.c')
-rw-r--r-- | z36.c | 1223 |
1 files changed, 1223 insertions, 0 deletions
@@ -0,0 +1,1223 @@ +/*@z36.c:Hyphenation: Declarations@*******************************************/ +/* */ +/* THE LOUT DOCUMENT FORMATTING SYSTEM (VERSION 3.17) */ +/* COPYRIGHT (C) 1991, 1999 Jeffrey H. Kingston */ +/* */ +/* Jeffrey H. Kingston (jeff@cs.usyd.edu.au) */ +/* Basser Department of Computer Science */ +/* The University of Sydney 2006 */ +/* AUSTRALIA */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either Version 2, or (at your option) */ +/* any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 59 Temple Place, Suite 330, Boston MA 02111-1307 USA */ +/* */ +/* FILE: z36.c */ +/* MODULE: Hyphenation */ +/* EXTERNS: Hyphenate() */ +/* */ +/*****************************************************************************/ +#include "externs.h" +#define NODE_MULT 4 /* what to multiply node indexes by */ +#define MAX_CHAR 256 /* max chars represented in one char */ +#define TRIE_MAGIC 5361534 +#define KILL_CLASS 0 /* characters preventing hyphenation */ +#define PUNCT_CLASS 1 /* characters delimiting hyphenation */ + + +/*****************************************************************************/ +/* */ +/* static tex_codes[] */ +/* */ +/* TeX hyphenation files often contain escape sequences consisting of a */ +/* backslash and two or three characters to denote 8-bit characters. This */ +/* code will read and translate such sequences if they are in the following */ +/* list. */ +/* */ +/*****************************************************************************/ + +static char *tex_codes[] = { + + "Agrave", "`A", "\300", + "Aacute", "'A", "\301", + "Acircumflex", "^A", "\302", + "Atilde", "~A", "\303", + "Adieresis", "\"A", "\304", + "agrave", "`a", "\340", + "aacute", "'a", "\341", + "acircumflex", "^a", "\342", + "atilde", "~a", "\343", + "adieresis", "\"a", "\344", + + "ccedilla", "cc", "\347", + + "Egrave", "`E", "\310", + "Eacute", "'E", "\311", + "Ecircumflex", "^E", "\312", + "Edieresis", "\"E", "\313", + "egrave", "`e", "\350", + "eacute", "'e", "\351", + "ecircumflex", "^e", "\352", + "edieresis", "\"e", "\353", + + "Igrave", "`I", "\314", + "Iacute", "'I", "\315", + "Icircumflex", "^I", "\316", + "Idieresis", "\"I", "\317", + "igrave", "`\\i", "\354", + "iacute", "'\\i", "\355", + "icircumflex", "^\\i", "\356", + "idieresis", "\"\\i","\357", + + "Ograve", "`O", "\322", + "Oacute", "'O", "\323", + "Ocircumflex", "^O", "\324", + "Otilde", "~O", "\325", + "Odieresis", "\"O", "\326", + "ograve", "`o", "\362", + "oacute", "'o", "\363", + "ocircumflex", "^o", "\364", + "otilde", "~o", "\365", + "odieresis", "\"o", "\366", + + "Ugrave", "`U", "\331", + "Uacute", "'U", "\332", + "Ucircumflex", "^U", "\333", + "Udieresis", "\"U", "\334", + "ugrave", "`u", "\371", + "uacute", "'u", "\372", + "ucircumflex", "^u", "\373", + "udieresis", "\"u", "\374", + + "", "", "" +}; + +static void DecodeEscapes(FULL_CHAR *str, FULL_CHAR *fname, int hline_num) +{ FULL_CHAR *p, *q; + int i; + p = q = str; + while( *q != '\0' ) + { + if( *q == '\\' ) + { for( i = 0; tex_codes[i][0] != '\0'; i += 3 ) + { + if( StringBeginsWith(q+1, AsciiToFull(tex_codes[i+1])) ) + break; + } + if( tex_codes[i][0] != '\0' ) + { + StringCopy(p, tex_codes[i+2]); + p += StringLength(AsciiToFull(tex_codes[i+2])); + q += StringLength(AsciiToFull(tex_codes[i+1])) + 1; + } + else + { + Error(36, 1, "in hyphenation file %s, unknown escape sequence (line %d)", + FATAL, no_fpos, fname, hline_num); + } + } + else *p++ = *q++; + } + *p++ = '\0'; +} /* end DecodeEscapes */ + + +/*****************************************************************************/ +/* */ +/* static TRIE HyphTables[] */ +/* */ +/* The packed hyphenation tables, indexed by language. An entry is NULL */ +/* when the table for that language has not yet been read in; TriedFile */ +/* is TRUE after we have tried to read that file, whether or not we were */ +/* successful. */ +/* */ +/*****************************************************************************/ + +typedef struct trie_rec +{ int magic; /* a magic number to make sure ok */ + int class_count; /* the number of character classes */ + unsigned char class[MAX_CHAR]; /* the character classes */ + short *node_mem; /* the node memory */ + int node_lim; /* top of node memory */ + int node_free; /* first free space in node memory */ + FULL_CHAR *string_mem; /* the string memory */ + int string_lim; /* top of string memory */ + int string_first; /* the first (last inserted) string */ +} *TRIE; + +static TRIE HyphTables[MAX_LANGUAGE] = { NULL }; +static BOOLEAN TriedFile[MAX_LANGUAGE] = { FALSE }; + + +/*@::CompressValue(), UncompressValue()@**************************************/ +/* */ +/* CompressValue(p, q) */ +/* */ +/* Compress value string p, placing the result in q. */ +/* */ +/*****************************************************************************/ +#define FirstHalf(y) ( (y) >> 4 ) +#define LastHalf(y) ( (y) & 15 ) +#define AssignFirstHalf(x, y) ( (x) = ((y) << 4) ) +#define AssignLastHalf(x, y) ( (x) |= (y) ) + +#define CompressValue(compressed, uncompressed) \ +/* FULL_CHAR *compressed, *uncompressed; */ \ +{ register FULL_CHAR *p, *q; \ + p = compressed; q = uncompressed; \ + for( ; ; ) \ + { \ + if( *q == (FULL_CHAR) '\0' ) \ + { *p = (FULL_CHAR) '\0'; \ + break; \ + } \ + AssignFirstHalf(*p, *q++ - '0' + 2); \ + if( *q == (FULL_CHAR) '\0' ) \ + { *++p = (FULL_CHAR) '\0'; \ + break; \ + } \ + AssignLastHalf(*p, *q++ - '0' + 2); \ + p++; \ + } \ +} + +/*****************************************************************************/ +/* */ +/* UncompressValue(q, p) */ +/* */ +/* Uncompress value string q, placing the result in p. */ +/* */ +/*****************************************************************************/ + +#define UncompressValue(compressed, uncompressed) \ +/* FULL_CHAR *compressed, *uncompressed; */ \ +{ register FULL_CHAR *p, *q; \ + p = compressed; q = uncompressed; \ + for( ; ; ) \ + { \ + if( FirstHalf(*p) == '\0' ) break; \ + *q++ = FirstHalf(*p) + '0' - 2; \ + if( LastHalf(*p) == '\0' ) break; \ + *q++ = LastHalf(*p) + '0' - 2; \ + p++; \ + } \ + *q = (FULL_CHAR) '\0'; \ +} + +/*@::AltCompressValue(), AltUncompressValue()@********************************/ +/* */ +/* AltCompressValue(compressed, uncompressed) */ +/* */ +/* Compress value string, placing the result in compressed. */ +/* */ +/* This is an alternative compression scheme to the one given above, which */ +/* should give better compression. The result is a sequence of pairs of */ +/* the form (skip, value) saying that we are to skip so many places and */ +/* then insert the given non-zero value. All the other values are zero. */ +/* Skip values are 4-bit numbers (maximum skip is 15, but we will insert */ +/* a 15 skip with a zero value in the rare case of skipping further). */ +/* Values are also 4-bit numbers, known to be non-zero. So the memory */ +/* cost is 8 bits per non-zero value. */ +/* */ +/*****************************************************************************/ +#define CharPack(ch, a, b) (ch = ((a) << 4) | (b)) +#define CharUnPack(ch, a, b) ((a) = (ch) >> 4, (b) = (ch) & 15) + +#define AltCompressValue(compressed, uncompressed) \ +/* FULL_CHAR *compressed, *uncompressed; */ \ +{ register FULL_CHAR *p, *q, *prev; \ + prev = (uncompressed) - 1; p = (compressed); \ + for( q = (uncompressed); *q != (FULL_CHAR) '\0'; q++ ) \ + { \ + if( *q != (FULL_CHAR) '0' || q - prev - 1 >= 15 ) \ + { \ + CharPack(*p++, q - prev - 1, *q - '0' + 2); \ + prev = q; \ + } \ + } \ + *p++ = (FULL_CHAR) '\0'; \ +} + +/*****************************************************************************/ +/* */ +/* AltUncompressValue(q, p) */ +/* */ +/* Uncompress value string q, placing the result in p. */ +/* */ +/*****************************************************************************/ + +#define AltUncompressValue(compressed, uncompressed) \ +{ register FULL_CHAR *p, *q, xval; int i, skip; \ + q = (uncompressed); \ + for( p = (compressed); *p != (FULL_CHAR) '\0'; p++ ) \ + { CharUnPack(*p, skip, xval); \ + for( i = 0; i < skip; i++ ) \ + *q++ = (FULL_CHAR) '0'; \ + *q++ = (FULL_CHAR) (xval + '0' - 2); \ + } \ + *q++ = (FULL_CHAR) '\0'; \ + debug1(DHY, D, "AltUncompressValue returning %s", (uncompressed)); \ +} + +/* *** +static void AltUncompressValue(FULL_CHAR *compressed, FULL_CHAR *uncompressed) +{ register FULL_CHAR *p, *q, xval; int i, skip; + q = uncompressed; + for( p = compressed; *p != (FULL_CHAR) '\0'; p++ ) + { CharUnPack(*p, skip, xval); + for( i = 0; i < skip; i++ ) + *q++ = (FULL_CHAR) '0'; + *q++ = (FULL_CHAR) (xval + '0' - 2); + } + *q++ = (FULL_CHAR) '\0'; + debug1(DHY, D, "AltUncompressValue returning %s", uncompressed); +} +*** */ + +/*@@**************************************************************************/ +/* */ +/* ClassConvert(in, out, T, fname, hline_num) */ +/* */ +/* Set out[i] to the character class of in[i] in T, for all i. */ +/* */ +/*****************************************************************************/ + +#define ClassConvert(in, out, T, fname, hline_num) \ +{ int i; \ + for( i = 0; in[i] != '\0'; i++ ) \ + if( T->class[in[i]] != 0 ) out[i] = T->class[in[i]]; \ + else \ + Error(36, 2, "in hyphenation file %s, line %d: character (octal %o) is not in any class", \ + FATAL, no_fpos, fname, hline_num, in[i]); \ + out[i] = '\0'; \ +} /* end ClassConvert */ + + +/*@::findrep(), TrieRetrieve(), ShowRate()@***********************************/ +/* */ +/* findrep(i, T) Returns one character whose class in T is i. */ +/* */ +/*****************************************************************************/ +#if DEBUG_ON + +static FULL_CHAR findrep(int i, TRIE T) +{ int ch; + for( ch = 0; ch < MAX_CHAR; ch++ ) + if( T->class[ch] == i ) return (FULL_CHAR) ch; + Error(36, 3, "DoTriePrint: findrep failed", INTERN, no_fpos); + return (FULL_CHAR) ch; /* never reached, but gcc doesn't know that */ +} /* end findrep */ + + +#if 0 +/*****************************************************************************/ +/* */ +/* static FULL_CHAR *TrieRetrieve(key, T) */ +/* */ +/* Retrieve the value associated with key in T, or NULL if not present. */ +/* This procedure is not presently in use. */ +/* */ +/*****************************************************************************/ + +static FULL_CHAR *TrieRetrieve(FULL_CHAR *key, TRIE T) +{ FULL_CHAR str[MAX_BUFF]; int i, curr_node, next_node, pos; + debug1(DHY, DD, "TrieRetrieve(%s, T)", key); + ClassConvert(key, str, T, STR_EMPTY, 0); + + /* invariant: curr_node is an existing node of T with prefix str[0..i-1] */ + curr_node = i = 0; + for(;;) + { + /* if next_node is 0, the string was never inserted */ + next_node = T->node_mem[curr_node + str[i]]; + if( next_node == 0 ) return (FULL_CHAR *) NULL; + + /* if next_node < 0 it represents an offset into the string memory */ + if( next_node < 0 ) + { pos = - next_node; + if( str[i] != '\0' ) + { do + { if( str[++i] != T->string_mem[pos++] ) return (FULL_CHAR *) NULL; + } while( str[i] != '\0' ); + } + return &(T->string_mem[pos]); + } + + /* otherwise next_node is the trie node to be searched next */ + curr_node = NODE_MULT*next_node; i++; + } +} /* end TrieRetrieve */ +#endif + + +/*****************************************************************************/ +/* */ +/* static ShowRate(key, start, stop, rate, fp) */ +/* */ +/* Debug print of key[] and rate[] on file fp. */ +/* */ +/*****************************************************************************/ + +static void ShowRate(FULL_CHAR *key, int start, int stop, FULL_CHAR *rate, +FILE *fp) +{ int i; + fprintf(fp, "key: "); + for( i = start; i < stop; i++ ) fprintf(fp, " %c", key[i]); + fprintf(fp, "\nrate:"); + for( i = 0; rate[i] != '\0'; i++ ) fprintf(fp, " %c", rate[i]); + fprintf(fp, "\n"); +} /* end ShowRate */ + + +/*@::DoTriePrint(), TriePrint()@**********************************************/ +/* */ +/* static DoTriePrint(T, node, len, fp) */ +/* */ +/* Print on file fp the subset of the entries of trie T stored in node and */ +/* its descendants. The node has prefix prefix[0..len-1]. */ +/* */ +/*****************************************************************************/ + +static FULL_CHAR prefix[MAX_BUFF]; + +static void DoTriePrint(TRIE T, int node, int len, FILE *fp) +{ int i, next_node, pos; FULL_CHAR str[20]; + for( i = 0; i < T->class_count; i++ ) + { + /* if next_node < 0, have string to print */ + next_node = T->node_mem[node + i]; + if( next_node < 0 ) + { + prefix[len] = '\0'; + fprintf(fp, "%s", prefix); + pos = - next_node; + if( i != 0 ) + { + fprintf(fp, "%c", findrep(i, T)); + while( T->string_mem[pos] != '\0' ) + { fprintf(fp, "%c", findrep(T->string_mem[pos], T)); + pos++; + } + pos++; + } + AltUncompressValue(&(T->string_mem[pos]), str); + fprintf(fp, " %s\n", str); + } + + /* else if next_node > 0 have a child node to explore */ + else if( next_node > 0 ) + { assert( i > 0, "DoTriePrint: i == 0!" ); + prefix[len] = findrep(i, T); + prefix[len+1] = '\0'; + DoTriePrint(T, NODE_MULT*next_node, len+1, fp); + } + } +} /* end DoTriePrint */ + + +/*****************************************************************************/ +/* */ +/* static TriePrint(T, fp) */ +/* */ +/* Print trie T on file fp. */ +/* */ +/*****************************************************************************/ + +static void TriePrint(TRIE T, FILE *fp) +{ int i, ch; + assert( T-> magic == TRIE_MAGIC, "TriePrint: magic!" ); + fprintf(fp, "Classes:"); + for( i = 1; i < T->class_count; i++ ) + { fprintf(fp, " "); + for( ch = 0; ch < MAX_CHAR; ch++ ) + if( T->class[ch] == i ) fprintf(fp, "%c", ch); + } + fprintf(fp, "\n"); + fprintf(fp, "Node space: %d capacity, %d used\n", T->node_lim, T->node_free); + fprintf(fp, "String space: %d capacity, %d used\n", T->string_lim, + T->string_lim - T->string_first); + prefix[0] = '\0'; + DoTriePrint(T, 0, 0, fp); +} /* end TriePrint */ +#endif + + +/*@::NewTrie(), NewTrieString(), NewTrieNode()@*******************************/ +/* */ +/* static TRIE NewTrie(node_lim, string_lim) */ +/* */ +/* Initialize a new trie with this much space for nodes and strings. */ +/* */ +/*****************************************************************************/ + +static TRIE NewTrie(unsigned node_lim, unsigned string_lim) +{ TRIE T; int i; + debug2(DHY, DD, "NewTrie(%d, %d)", node_lim, string_lim); + ifdebug(DMA, DD, DebugRegisterUsage(MEM_HYPH_PATS, 1, + sizeof(struct trie_rec) + node_lim*sizeof(short)+string_lim*sizeof(char))); + T = (TRIE) malloc( sizeof(struct trie_rec) + + node_lim*sizeof(short) + string_lim*sizeof(char)); + if( T == (TRIE) NULL ) + Error(36, 4, "run out of memory while constructing hyphenation table", + FATAL, no_fpos); + T->magic = TRIE_MAGIC; T->class_count = 1; + for( i = 0; i < MAX_CHAR; i++ ) T->class[i] = 0; + T->node_mem = (short *) ( (char *) T + sizeof(struct trie_rec)); + T->node_lim = node_lim; T->node_free = 0; + T->string_mem = (FULL_CHAR *) &(T->node_mem[node_lim]); + T->string_lim = T->string_first = string_lim; + debug0(DHY, DD, "NewTrie returning."); + return T; +} /* end NewTrie */ + + +/*****************************************************************************/ +/* */ +/* static short NewTrieString(str, T) */ +/* */ +/* Copy a new string into T, and return its offset in string_mem; */ +/* */ +/*****************************************************************************/ + +static short NewTrieString(FULL_CHAR *str, TRIE T) +{ short res = T->string_first - StringLength(str) - 1; + if( res >= 0 ) + { T->string_first = res; StringCopy(&(T->string_mem[res]), str); + } + return res; +} /* end NewTrieString */ + + +/*****************************************************************************/ +/* */ +/* ststic int NewTrieNode(T) */ +/* */ +/* Allocate a new empty trie node in T, and return its offset in node_mem. */ +/* */ +/*****************************************************************************/ + +static int NewTrieNode(TRIE T) +{ int i; int res; + if( T->node_free + T->class_count > T->node_lim ) + Error(36, 5, "hyphenation trie node limit exceeded", INTERN, no_fpos); + res = T->node_free; T->node_free += T->class_count; + for( i = res; i < T->node_free; i++ ) T->node_mem[i] = 0; + return res; +} /* end NewTrieNode */ + + +/*@::AddClassToTrie(), TrieInsert()@******************************************/ +/* */ +/* static AddClassToTrie(str, T) */ +/* */ +/* Add a new character class, whose members are the characters of str, to */ +/* trie T. This cannot occur after the first insertion. */ +/* */ +/*****************************************************************************/ + +static void AddClassToTrie(FULL_CHAR *str, TRIE T) +{ int i; + assert( T->string_first == T->string_lim, "AddClassToTrie: after insertion"); + for( i = 0; str[i] != '\0'; i++ ) + if( T->class[str[i]] == 0 ) T->class[str[i]] = T->class_count; + else Error(36, 6, "hyphenation class of %c may not be changed", + INTERN, no_fpos, str[i]); + T->class_count++; +} /* end AddClassToTrie */ + + +/*****************************************************************************/ +/* */ +/* static BOOLEAN TrieInsert(key, value, T, fname, hline_num) */ +/* */ +/* Insert a new key and value into trie T (originating in file fname on */ +/* line hline_num). */ +/* */ +/*****************************************************************************/ + +static BOOLEAN TrieInsert(FULL_CHAR *key, FULL_CHAR *value, TRIE T, +FULL_CHAR *fname, int hline_num) +{ FULL_CHAR str[MAX_BUFF], compressed_value[MAX_BUFF]; + int i, curr_node, next_node, pos, ch; short strpos; + debug2(DHY, DD, "TrieInsert(%s, %s, T)", key, value); + + /* if first insertion, add one node after making sure class_count is even */ + if( T->node_free == 0 ) + { T->class_count = NODE_MULT * ceiling(T->class_count, NODE_MULT); + ch = NewTrieNode(T); + } + + AltCompressValue(compressed_value, value); + + /* invariant: curr_node is an existing node of T with prefix str[0..i-1] */ + ClassConvert(key, str, T, fname, hline_num); + curr_node = i = 0; + for(;;) + { + /* if str is ended, add compressed_value only to string memory */ + if( str[i] == '\0' ) + { if( T->node_mem[curr_node] != 0 ) + Error(36, 7, "hyphenation string %s already inserted", + INTERN, no_fpos, key); + else + { + strpos = NewTrieString(compressed_value, T); + if( strpos < 0 ) + { debug0(DHY, DD, "TrieInsert returning FALSE (trie full)"); + return FALSE; + } + T->node_mem[curr_node] = - strpos; + } + debug0(DHY, DD, "TrieInsert returning TRUE (empty suffix)."); + return TRUE; + } + + /* if next position is unoccupied, store remainder of str and value */ + next_node = T->node_mem[curr_node + str[i]]; + if( next_node == 0 ) + { ch = NewTrieString(compressed_value, T); + if( ch < 0 ) + { debug0(DHY, DD, "TrieInsert returning FALSE (trie full)"); + return FALSE; + } + strpos = NewTrieString(&str[i+1], T); + if( strpos < 0 ) + { debug0(DHY, DD, "TrieInsert returning FALSE (trie full)"); + return FALSE; + } + T->node_mem[curr_node + str[i]] = - strpos; + debug0(DHY, DD, "TrieInsert returning (non-empty suffix)."); + return TRUE; + } + + /* if next position is occupied by a non-empty string, move that */ + /* string down one level and replace it by a trie node */ + if( next_node < 0 ) + { pos = - next_node; + ch = T->string_mem[pos]; + if( T->string_first == pos ) T->string_first++; + T->node_mem[curr_node + str[i]] = next_node = NewTrieNode(T)/NODE_MULT; + T->node_mem[NODE_MULT*next_node + ch] = -(pos+1); + } + + /* now next is the offset of the next node to be searched */ + curr_node = NODE_MULT*next_node; i++; + } +} /* end TrieInsert */ + + +/*@::BeGetChar(), BePutChar(), BeGetShort(), BePutShort(), etc.@**************/ +/* */ +/* BeGetChar(fp, pv) */ +/* BePutChar(fp, v) */ +/* BeGetShort(fp, pv) */ +/* BePutShort(fp, v) */ +/* BeGetInt(fp, pv) */ +/* BePutInt(fp, v) */ +/* */ +/* Get char, short, or int pv from file fp, and put char, short, or int */ +/* onto file fp. These routines are designed so that the file can be */ +/* written or read safely by big-endian and little-endian architectures; */ +/* this is accomplished by reading and writing one byte at a time to and */ +/* from a big-endian format file. All return 0 on success, -1 on failure. */ +/* Thanks to David W. Sanderson for this code. */ +/* */ +/*****************************************************************************/ + +#define BeGetChar(fp, pv) ( (c = getc(fp)) == EOF ? -1 : (*pv = c & 0xFF, 0) ) +#define BePutChar(fp, v) ( putc( (char) (v & 0xFF), fp), 0 ) + +#define BeGetShort(fp, pv) \ +( (c = getc(fp)) == EOF ? -1 : \ + ( *pv = (c & 0xFF) << 8, \ + (c = getc(fp)) == EOF ? -1 : (*pv |= c & 0xFF, 0) \ + ) \ +) + +#define BePutShort(fp, v) \ +( putc((v >> 8) & 0xFF, fp), putc(v & 0xFF, fp), 0 ) + +static int BeGetInt(FILE *fp, int *pv) +{ int c; + if ((c = getc(fp)) == EOF) return -1; + *pv = (c & 0xFF) << 24; + if ((c = getc(fp)) == EOF) return -1; + *pv |= (c & 0xFF) << 16; + if ((c = getc(fp)) == EOF) return -1; + *pv |= (c & 0xFF) << 8; + if ((c = getc(fp)) == EOF) return -1; + *pv |= c & 0xFF; + return 0; +} + +static int BePutInt(FILE *fp, int v) +{ + putc((v >> 24) & 0xFF, fp); + putc((v >> 16) & 0xFF, fp); + putc((v >> 8) & 0xFF, fp); + putc(v & 0xFF, fp); + return 0; +} + + +/*@::CompressTrie(), TrieRead(), AccumulateRating()@**************************/ +/* */ +/* static CompressTrie(T) */ +/* */ +/* Compress trie T and return its length in characters. */ +/* */ +/*****************************************************************************/ + +static void CompressTrie(TRIE T) +{ FULL_CHAR *p, *q; int len, i; + debug0(DHY, DD, "CompressTrie(T), T ="); + debug2(DHY, DD, "Node space: %d capacity, %d used\n", + T->node_lim, T->node_free); + debug2(DHY, DD, "String space: %d capacity, %d used\n", + T->string_lim, T->string_lim - T->string_first); + ifdebug(DHY, DD, TriePrint(T, stderr)); + T->node_lim = T->node_free; + for( i = 0; i < T->node_lim; i++ ) + if( T->node_mem[i] < 0 ) + T->node_mem[i] = - ( -T->node_mem[i] - T->string_first); + p = (FULL_CHAR *) &(T->node_mem[T->node_free]); + q = &(T->string_mem[T->string_first]); + len = T->string_lim - T->string_first; + for( i = 0; i < len; i++ ) *p++ = *q++; + T->string_mem = (FULL_CHAR *) &(T->node_mem[T->node_lim]); + T->string_first = 0; + T->string_lim = len; + len = sizeof(struct trie_rec) + T->node_lim * sizeof(short) + + T->string_lim * sizeof(FULL_CHAR); + debug1(DHY, DD, "CompressTrie returning; len = %d, T =", len); + ifdebug(DHY, DD, TriePrint(T, stderr)); +} /* end CompressTrie */ + + +/*****************************************************************************/ +/* */ +/* static TRIE TrieRead(lnum, success) */ +/* */ +/* Read in a packed trie if possible, otherwise pack an unpacked one. */ +/* The trie is to be for language lnum. */ +/* */ +/* Boolean success is set to true if no errors were encountered. If the */ +/* file read was a placeholder, success will be true but still a null */ +/* TRIE will be returned. */ +/* */ +/*****************************************************************************/ +#define START_STATE 0 +#define CLASSES_STATE 1 +#define EXCEPTIONS_STATE 2 +#define LENGTH_LIMIT_STATE 3 +#define PATTERNS_STATE 4 + +static TRIE TrieRead(LANGUAGE_NUM lnum, BOOLEAN *success) +{ TRIE T; FILE_NUM unpacked_fnum, packed_fnum; OBJECT fname; + FILE *unpacked_fp, *packed_fp; unsigned len; + int prev, i, j, c, state, hline_num, length_limit; +#if DEBUG_ON + int icount = 0; +#endif + debug2(DHY, DD, "TrieRead(%d %s)", lnum, + lnum == 0 ? STR_NONE : LanguageString(lnum)); + + /* get hyphenation file name from language module */ + fname = LanguageHyph(lnum); + assert( fname == nilobj || is_word(type(fname)), "TrieRead: fname!" ); + if( fname == nilobj ) + { *success = FALSE; + return (TRIE) NULL; + } + + /* define and open packed file */ + debug0(DFS, DD, " calling DefineFile from TrieRead (1)"); + packed_fnum = DefineFile(string(fname), HYPH_PACKED_SUFFIX, + &fpos(fname), HYPH_PACKED_FILE, HYPH_PATH); + if( InitializeAll ) + { + /* initializing so want to reconstruct packed files */ + /* thanks to Ian Jackson <ian@chiark.greenend.org.uk> for this */ + packed_fp = NULL; + } + else + { + /* not initializing so use existing packed files if possible */ + packed_fp = OpenFile(packed_fnum, FALSE, FALSE); + } + + if( packed_fp == NULL ) + { + /* no packed file, so define and open unpacked one instead */ + FULL_CHAR str[MAX_BUFF], key[MAX_BUFF], value[MAX_BUFF], + buff[MAX_BUFF+10]; + int bpos, bcount; + debug0(DFS, DD, " calling DefineFile from TrieRead (2)"); + unpacked_fnum = DefineFile(string(fname), HYPH_SUFFIX, + &fpos(fname), HYPH_FILE, HYPH_PATH); + unpacked_fp = OpenFile(unpacked_fnum, FALSE, FALSE); + if( unpacked_fp == NULL ) + { Error(36, 8, "cannot open hyphenation file %s", + WARN, no_fpos, FileName(unpacked_fnum)); + *success = FALSE; + return (TRIE) NULL; + } + + /* check that first line contains magic header or stub */ + if( StringFGets(str, MAX_BUFF, unpacked_fp) == NULL || + ( !StringEqual(str, AsciiToFull("Lout hyphenation information\n")) && + !StringEqual(str, AsciiToFull("Lout hyphenation placeholder\n")) ) + ) + Error(36, 9, "header line of hyphenation file %s missing", + FATAL, no_fpos, FileName(unpacked_fnum)); + + /* if file is just a placeholder, exit silently with success */ + if( !StringEqual(str, AsciiToFull("Lout hyphenation information\n")) ) + { *success = TRUE; + return (TRIE) NULL; + } + + /* read the classes, exceptions, and patterns from the unpacked file */ + T = NewTrie( (unsigned) 120000, (unsigned) 32767); + state = START_STATE; + hline_num = 1; + length_limit = 0; + while( StringFGets(buff, MAX_BUFF, unpacked_fp) != NULL ) + { + hline_num++; bpos = 0; + while( sscanf( (char *) &buff[bpos], "%s%n", str, &bcount) == 1 && + str[0] != '%' ) + { + bpos += bcount; + DecodeEscapes(str, string(fname), hline_num); + + switch( state ) + { + case START_STATE: + + if( !StringEqual(str, AsciiToFull("Classes:")) ) + Error(36, 10, "Classes heading of hyphenation file %s missing", + FATAL, no_fpos, FileName(unpacked_fnum)); + state = CLASSES_STATE; + break; + + + case CLASSES_STATE: + + if( StringEqual(str, AsciiToFull("Exceptions:")) ) + { state = EXCEPTIONS_STATE; + } + else if( StringEqual(str, AsciiToFull("Patterns:")) ) + { state = PATTERNS_STATE; + } + else if( StringEqual(str, AsciiToFull("LengthLimit:")) ) + { state = LENGTH_LIMIT_STATE; + } + else + { debug1(DHY, DD, "adding class %s", str); + AddClassToTrie(str, T); + } + break; + + + case EXCEPTIONS_STATE: + + if( StringEqual(str, AsciiToFull("Patterns:")) ) + { state = PATTERNS_STATE; + } + else if( StringEqual(str, AsciiToFull("LengthLimit:")) ) + { state = LENGTH_LIMIT_STATE; + } + else + { prev = CH_EIGHT; j = 0; + key[j] = '.', value[j++] = prev, prev = CH_EIGHT; + for( i = 0; str[i] != '\0'; i++ ) + { if( str[i] == CH_HYPHEN ) prev = CH_NINE; + else key[j] = str[i], value[j++] = prev, prev = CH_EIGHT; + } + key[j] = '.', value[j++] = prev, prev = CH_EIGHT; + key[j] = '\0'; value[j] = prev; value[j+1] = '\0'; + if( !TrieInsert(key, value, T, string(fname), hline_num) ) + { + Error(36, 11, "hyphenation file %s%s is too large (at line %d)", + WARN, &fpos(fname), string(fname), HYPH_SUFFIX, hline_num); + *success = FALSE; + return (TRIE) NULL; + } + } + break; + + + case LENGTH_LIMIT_STATE: + + if( StringEqual(str, AsciiToFull("Patterns:")) ) + { state = PATTERNS_STATE; + } + else if( sscanf( (char *) str, "%d", &length_limit) != 1 ) + { + Error(36, 20, "bad LengthLimit in hyphenation file %s%s (line %d)", + WARN, &fpos(fname), string(fname), HYPH_SUFFIX, hline_num); + *success = FALSE; + return (TRIE) NULL; + } + break; + + + case PATTERNS_STATE: + + prev = CH_ZERO; j = 0; + for( i = 0; str[i] != '\0'; i++ ) + { if( decimaldigit(str[i]) ) prev = str[i]; + else key[j] = str[i], value[j++] = prev, prev = CH_ZERO; + } + key[j] = '\0'; value[j] = prev; value[j+1] = '\0'; + if( length_limit == 0 || j <= length_limit ) + { + debug3(DHY, DD, "TrieInsert(%s, %s, T) [%d]", key, value, ++icount); + if( !TrieInsert(key, value, T, string(fname), hline_num) ) + { + Error(36, 12, "hyphenation file %s%s is too large (at line %d)", + WARN, &fpos(fname), string(fname), HYPH_SUFFIX, hline_num); + *success = FALSE; + return (TRIE) NULL; + } + } + break; + + + default: + + assert(FALSE, "TrieRead: state"); + break; + + } /* end switch */ + } /* end while */ + } /* end while */ + + if( state != PATTERNS_STATE ) + Error(36, 13, "format error in hyphenation file %s", + FATAL, no_fpos, FileName(unpacked_fnum)); + fclose(unpacked_fp); + CompressTrie(T); + + /* write the compressed trie out to the packed file */ + /* cannot use FileName(packed_fnum) because path won't be right */ + StringCopy(buff, FileName(unpacked_fnum)); + StringCopy(&buff[StringLength(buff) - StringLength(HYPH_SUFFIX)], + HYPH_PACKED_SUFFIX); + packed_fp = StringFOpen(buff, WRITE_BINARY); + if( packed_fp == NULL ) + Error(36, 14, "cannot write to hyphenation file %s", FATAL,no_fpos,buff); + BePutInt(packed_fp, T->magic); + BePutInt(packed_fp, T->class_count); + for( i = 0; i < MAX_CHAR; i++ ) BePutChar(packed_fp, T->class[i]); + /* BePutInt(packed_fp, 0); */ /* placeholder for node_mem now omitted */ + BePutInt(packed_fp, T->node_lim); + BePutInt(packed_fp, T->node_free); + /* BePutInt(packed_fp, 0); */ /* placeholder for string_mem now omitted */ + BePutInt(packed_fp, T->string_lim); + BePutInt(packed_fp, T->string_first); + for( i=0; i < T->node_free; i++ ) BePutShort(packed_fp, T->node_mem[i]); + for( i=0; i < T->string_lim; i++) BePutChar(packed_fp, T->string_mem[i]); + fclose(packed_fp); + + /* free T */ + ifdebug(DMA, DD, DebugRegisterUsage(MEM_HYPH_PATS, 1, + sizeof(struct trie_rec) + 120000*sizeof(short)+32767*sizeof(char))); + free(T); + + /* now try again to open packed_fnum, the file just written */ + packed_fp = OpenFile(packed_fnum, FALSE, FALSE); + if( packed_fp == NULL ) + Error(36, 15, "cannot open hyphenation file %s", + FATAL, no_fpos, FileName(packed_fnum)); + } /* end if( packed_fp == NULL ) */ + + /* now packed hyphenation file is open, read it in */ + fseek(packed_fp, 0L, SEEK_END); + len = (unsigned) ftell(packed_fp); rewind(packed_fp); + ifdebug(DMA, DD, DebugRegisterUsage(MEM_HYPH_PATS, 1, len)); + /* the 2*sizeof(void*) is for the sizes of node_mem and string_mem */ + T = (TRIE) malloc(len + 2*sizeof(void*)); + if( T == (TRIE) NULL ) + Error(36, 16, "run out of memory while reading hyphenation table", + FATAL, no_fpos); + if( BeGetInt(packed_fp, &T->magic) != 0 ) + Error(36, 17, "error on read from packed hyphenation file %s", + FATAL, no_fpos, FileName(packed_fnum)); + if( T->magic != TRIE_MAGIC ) + Error(36, 18, "bad magic number in hyphenation file %s", + FATAL, no_fpos, FileName(packed_fnum)); + BeGetInt(packed_fp, &T->class_count); + for( i = 0; i < MAX_CHAR; i++ ) BeGetChar(packed_fp, &T->class[i]); + /* BeGetInt(packed_fp, &i); */ /* placeholder for node_mem now omitted */ + BeGetInt(packed_fp, &T->node_lim); + BeGetInt(packed_fp, &T->node_free); + /* BeGetInt(packed_fp, &i); */ /* placeholder for string_mem now omitted */ + BeGetInt(packed_fp, &T->string_lim); + BeGetInt(packed_fp, &T->string_first); + T->node_mem = (short *) ( (char *) T + sizeof(struct trie_rec) ); + T->string_mem = (FULL_CHAR *) &(T->node_mem[T->node_lim]); + for( i = 0; i < T->node_free; i++ ) BeGetShort(packed_fp, &T->node_mem[i]); + for( i = 0; i < T->string_lim; i++ ) BeGetChar(packed_fp, &T->string_mem[i]); + fclose(packed_fp); + + /* debug and exit */ + debug0(DHY, DD, "TrieRead returning, T ="); + *success = TRUE; + ifdebug(DHY, DD, TriePrint(T, stderr)); + return T; +} /* end TrieRead */ + + +/*****************************************************************************/ +/* */ +/* AccumulateRating(x, y) */ +/* */ +/* Accumulate the hyphenation rating string x into y. */ +/* */ +/*****************************************************************************/ + +#define AccumulateRating(x, y) \ +{ FULL_CHAR *p = x, *q = y; \ + while( *p ) \ + { if( *p > *q ) *q = *p; \ + p++, q++; \ + } \ +} /* end AccumulateRating */ + + +/*@::ReadHyphTable()@*********************************************************/ +/* */ +/* BOOLEAN ReadHyphTable(lnum) */ +/* */ +/* Read hyphenation table for language lnum. */ +/* */ +/*****************************************************************************/ + +BOOLEAN ReadHyphTable(LANGUAGE_NUM lnum) +{ BOOLEAN res; + debug1(DHY, DD, "ReadHyphTable(%d)", lnum); + assert(lnum > 0, "ReadHyphTable: lnum <= 0!"); + assert(HyphTables[lnum]==(TRIE) NULL && !TriedFile[lnum], "ReadHyphTable!"); + HyphTables[lnum] = TrieRead(lnum, &res); + TriedFile[lnum] = TRUE; + debug2(DHY, DD, "ReadHyphTable(%d) returning %s", lnum, bool(res)); + return res; +} /* end ReadHyphTable */ + + +/*@::Hyphenate@***************************************************************/ +/* */ +/* OBJECT Hyphenate(x) */ +/* */ +/* Hyphenate ACAT object x, returning the hyphenated result. */ +/* */ +/*****************************************************************************/ + +OBJECT Hyphenate(OBJECT x) +{ OBJECT link, y, z, next_link; TRIE T; LANGUAGE_NUM lnum; + FULL_CHAR str[MAX_WORD+2], rate[MAX_WORD+3], val[MAX_WORD+3], + *class, *key, *ss, *s, *p, *rem, *lig, *a, *b; + int start, stop, i, curr_node, next_node, pos; + BOOLEAN hyphenated, success; + assert( type(x) == ACAT, "Hyphenate: type(x) != ACAT!" ); + debug1(DHY, D, "Hyphenate(%s)", EchoObject(x)); + + /* for each word y of x, try to hyphenate it */ + for( link = Down(x); link != x; link = NextDown(link) ) + { Child(y, link); + if( !is_word(type(y)) || string(y)[0] == '\0' || !word_hyph(y) ) + { + if( type(y) == GAP_OBJ && mode(gap(y)) == HYPH_MODE ) + nobreak(gap(y)) = FALSE; + continue; + } + debug1(DHY, DD, "Hyphenate() examining %s", EchoObject(y)); + + /* determine T, the trie to use */ + lnum = word_language(y); + if( lnum == 0 ) + Error(36, 19, "no current language for word %s", + FATAL, &fpos(y), string(y)); + T = HyphTables[lnum]; + + /* if no trie is present, try to get it from a file */ + if( T == (TRIE) NULL ) + { if( !TriedFile[lnum] ) + { T = HyphTables[lnum] = TrieRead(lnum, &success); + TriedFile[lnum] = TRUE; + } + if( T == (TRIE) NULL ) + { debug1(DHY, DD, "Hyphenate continuing (no trie for %s)", string(y)); + continue; + } + } + + /* start := index of first letter of y, stop := index following last */ + key = string(y); class = T->class; + for( start = 0; class[key[start]] == PUNCT_CLASS; start++ ); + for( stop = start; class[key[stop]] > PUNCT_CLASS; stop++ ); + + /* if a - ended the run, hyphenate there only */ + if( key[stop] == CH_HYPHEN ) + { + /* actually, don't hyphenate if the hyphen is last in the word [thanks Uwe] */ + if( key[stop+1] == '\0' ) + continue; + + next_link = NextDown(link); + z = MakeWord(WORD, &key[stop+1], &fpos(y)); + word_font(z) = word_font(y); + word_colour(z) = word_colour(y); + word_language(z) = word_language(y); + word_hyph(z) = word_hyph(y); + underline(z) = underline(y); + debug1(DHY, DD, "Hyphenate (hyph case) making fragment %s", string(z)); + FontWordSize(z); + Link(NextDown(link), z); + New(z, GAP_OBJ); + hspace(z) = vspace(z) = 0; + SetGap(gap(z), FALSE, FALSE, TRUE, FIXED_UNIT, HYPH_MODE, 0); + underline(z) = underline(y); + Link(NextDown(link), z); + Link(z, MakeWord(WORD, STR_GAP_ZERO_HYPH, &fpos(y))); + key[stop + 1] = '\0'; + FontWordSize(y); + /* *** link = PrevDown(next_link); */ + link = NextDown(link); + continue; + } + + /* do not hyphenate if less than 5 letters, or a kill char is nearby */ + if( stop - start < 5 ) continue; + if( key[stop] != '\0' && class[key[stop]] == KILL_CLASS ) continue; + + /* let str[] be the converted substring, let rate[] be all CH_ZERO */ + str[0] = PUNCT_CLASS; rate[0] = CH_ZERO; + for( i = 0; i < stop - start; i++ ) + { str[i+1] = class[key[start + i]]; + rate[i+1] = CH_ZERO; + } + str[i+1] = PUNCT_CLASS; rate[i+1] = CH_ZERO; + str[i+2] = '\0'; rate[i+2] = CH_ZERO; + rate[i+3] = '\0'; + ifdebug(DHY, DD, ShowRate(key, start, stop, rate, stderr)); + + /* for each suffix of str[], accumulate patterns matching its prefixes */ + ss = str; + do + { + ifdebug(DHY, DD, + fprintf(stderr, "trying suffix \""); + for( p = ss; *p != 0; p++ ) fprintf(stderr, "%c", findrep(*p, T)); + fprintf(stderr, "\"\n"); + ); + + /* accumulate all prefixes of ss */ + curr_node = 0; s = ss; + for(;;) + { + /* if curr_node has empty string, that is one prefix */ + pos = T->node_mem[curr_node]; + if( pos < 0 ) + { AltUncompressValue(&(T->string_mem[- pos]), val); + AccumulateRating(val, rate+(ss-str)); + debug1(DHY, DD, " found %s", val); + } + + /* if ss is finished, no other prefixes are possible */ + if( *s == '\0' ) break; + + /* determine next_node and break if empty */ + next_node = T->node_mem[curr_node + *s]; + if( next_node == 0 ) break; + + /* if next_node is a string, check whether it is a prefix of ss */ + if( next_node < 0 ) + { rem = &(T->string_mem[-next_node]); + do + { if( *rem == '\0' ) + { AltUncompressValue(rem+1, val); + AccumulateRating(val, rate+(ss-str)); + debug1(DHY, DD, " found %s", val); + break; + } + } while( *++s == *rem++ ); + break; + } + + /* otherwise go on to the next trie node */ + curr_node = NODE_MULT*next_node; s++; + } + } while( *(++ss + 1) != PUNCT_CLASS ); + ifdebug(DHY, DD, ShowRate(key, start, stop, rate, stderr)); + + /* set rate[i] to CH_ZERO whenever key[start+i-1] lies within a ligature */ + lig = finfo[word_font(y)].lig_table; + for( p = key, i = 2; *p != '\0'; p++, i++ ) + { if( lig[*p] > 1 ) + { a = &lig[ lig[*p] + MAX_CHARS ]; + while( *a++ == *p ) + { b = p+1; + while( *a == *b && *(a+1) != '\0' && *b != '\0' ) a++, b++; + if( *(a+1) == '\0' ) + { rate[i] = CH_ZERO; + break; + } + else + { while( *++a ); + a++; + } + } + } + } + ifdebug(DHY, DD, ShowRate(key, start, stop, rate, stderr)); + + /* now rate[] has accumulated ratings; use it to perform hyphenations */ + hyphenated = FALSE; + next_link = NextDown(link); + for( i = stop - start - 1; i >= 3; i-- ) + { + /* hyphenate at i if rate[i] is odd */ + if( is_odd(rate[i]) ) + { z = MakeWord(WORD, &key[start+i-1], &fpos(y)); + word_font(z) = word_font(y); + word_colour(z) = word_colour(y); + word_language(z) = word_language(y); + word_hyph(z) = word_hyph(y); + underline(z) = underline(y); + debug1(DHY, D, "Hyphenate making fragment %s", string(z)); + FontWordSize(z); + Link(NextDown(link), z); + New(z, GAP_OBJ); + hspace(z) = vspace(z) = 0; + SetGap(gap(z), FALSE, FALSE, TRUE, FIXED_UNIT, HYPH_MODE, 0); + underline(z) = underline(y); + Link(NextDown(link), z); + Link(z, MakeWord(WORD, STR_GAP_ZERO_HYPH, &fpos(y))); + key[start + i - 1] = '\0'; + hyphenated = TRUE; + } + } + if( hyphenated ) + { FontWordSize(y); + link = PrevDown(next_link); + } + + } /* end for each word */ + + debug3(DHY, D, "Hyphenate returning %s,%s %s", + EchoLength(back(x, COLM)), EchoLength(fwd(x, COLM)), EchoObject(x)); + return x; +} /* end Hyphenate */ |