/* tlgu: Translates TLG (D) text files to Unicode text
*
* Copyright (C) 2004, 2005 Dimitri Marinakis
*
* Licensed under the terms of the GNU General Public License.
* ABSOLUTELY NO WARRANTY.
* See the file `COPYING' in this directory.
*
* Usage:
* tlgu [options] infile outfile
*
* Options:
* -r -- primarily Roman text; default betastate = ROMAN, reset on every ID code
* -vwxyz -- reference citations are printed in the form xxx.xxx...xxx
* -(a)b(cd) -- description citations are printed
* -B -- output blank space (tab) after each new line (beginning of line)
* -p -- pagination is observed, otherwise book lines are printed continuously
* -C -- citation debug information is printed
* -S -- special code debug information is printed
* -T -- bracket debug information is printed
* -V -- processing debug information is printed
* -W -- multiple output files, one for each work
*
* Returns: exit code 1 if unsuccessful
*
* Make: gcc tlgu.c -o tlgu
*
* History: This is a re-write of a DOS program (tlgft.asm) written several
* years ago to translate Hellenic texts distributed on the TLG CD-ROM from
* "beta code" to something readable, editable and printable.
*
* Pointers / References:
* TLG Project - www.tlg.uci.edu
* PHI CD ROM Format Description, Packard Humanities Institute, 19 April 1992
* Beta code reference - Text versions: tlgbeta.txt or tlgcode.txt
* a .pdf version is also available.
* ID locator reference - Text version tlgcodes.txt
*
* dm: 14-Jun-2001 ELOT-928
* 14-Jun-2004 Unicode
* 26-Jun-2004 Command-line options
* 26-Feb-2005 Output file separation (-W option)
* 06-Mar-2005 Latin accent characters added (without parentheses)
* 21-Nov-2005 Added -Z -e and imported into sword-tools SVN repository
*/
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include "tlgu.h"
#include "tlgcodes.h"
/****************** PROTOTYPES FROM THE TOP DOWN *******************/
/* File read-write loop: translates input_file into output_file; returns 0 or 1 */
int tlgu (char * input_file, char * output_file);
/* Appends one code point to output_buffer as UTF-8 */
void output_utf(int ucode);
/* Appends every character of outstr to output_buffer via output_utf */
void output_string(char *outstr);
/* Dispatches buffered input to id_code / beta_code; returns the last id_code result */
int process_beta (int input_count);
/* Translates a run of text (high bit clear) bytes */
void beta_code(int input_count);
/* Decodes a run of ID (high bit set) bytes into the citation arrays */
int id_code(int input_count);
/* ORs the bit pattern for one accent character into <accents> */
void store_accents(unsigned char bufferchar);
/* Expands the -Z citation format; returns a pointer to a static buffer */
const char *resolve_cite_format(const char *cformat);
/****************** PROGRAM VERSION INFORMATION *******************/
char *prog_version="1.2"; /* printed by usage_info */
/****************** COMMAND LINE OPTIONS **************************/
/* All flags are 0 by default and set to 1 by main when the option is given */
int opt_roman = 0; /* -r: ROMAN is the default betastate */
int opt_page = 0; /* -p: a form feed is output for the @1 escape */
int opt_blank = 0; /* -B: a tab is output at the beginning of each line */
int opt_acit = 0; /* -a */
int opt_bcit = 0; /* -b */
int opt_ccit = 0; /* -c */
int opt_dcit = 0; /* -d */
int opt_cit_id = 0; /* set by any of -a, -b, -c, -d; process_beta prints the citation header on book change */
int opt_vcit = 0; /* -v: print citation level v (index 21) */
int opt_wcit = 0; /* -w: print citation level w (index 22) */
int opt_xcit = 0; /* -x: print citation level x (index 23) */
int opt_ycit = 0; /* -y: print citation level y (index 24) */
int opt_cprefix = 0; /* -Z: use the custom citation format held in cformat */
char cformat[253]; /* argument of -Z, expanded by resolve_cite_format */
int opt_ecit_blank = 0; /* -e: substitute ecite for blank citation segments */
char ecite[253]; /* argument of -e */
int opt_zcit = 0; /* -z: print citation level z (index 25) */
int opt_verbose = 0; /* -V: processing debug information */
int opt_debug_bracket = 0; /* -T: bracket debug information */
int opt_debug_cit = 0; /* -C: citation debug information */
int opt_debug_special = 0; /* -S: special code debug information */
int opt_multiple = 0; /* -W: id_code requests a new output file on every work change */
/****************** GLOBAL VARIABLES *******************************/
int iptr = 0; /* input buffer pointer, reset before every read */
int optr = 0; /* output buffer pointer, reset after every write */
unsigned char input_buffer[INRECSIZE];
unsigned char output_buffer[OUTRECSIZE];
#define MAXFILELEN 256
/************ GLOBAL BETA CODE PROCESSING VARIABLES **************/
unsigned int outcode; /* NOTE(review): no function in this file reads this global */
int betastate; /* translation state machine */
int previous_state; /* needed for symbol translations */
int start_new_line = 0; /* set by process_beta after ID data; the line prefix is written before the next text byte */
int book_change = 0; /* set by id_code when a work (level b) string is received; cleared by process_beta */
int accents; /* holds accent combinations */
char *accented_chars = "AEHIOUWR"; /* letters that index the accent tables */
char *accent_chars = ")(+/\\=|"; /* accent codes accepted in HELLENIC state */
char *latin_accent_chars = "+/\\=|"; /* accent codes accepted in ROMAN state */
char *escape_codes = "$&%\"@#^[]<>{}"; /* codes routed to handle_escape_codes */
char *punctuation_codes = " .,:;_\"%{}$&"; /* used by which_sigma */
char previous_bcit[52][32]; /* holds previous work (book) citation */
/****************** GLOBAL DESCRIPTOR VARIABLES *****************/
/*
Space is reserved for descriptive data as follows:
citations, binary component -- z, y, x, w, v, n (1 to 16383)
citations, ascii component -- a-z (1 to 15 characters + null, only a-d, n, v-z are actually used)
descriptors, binary component -- a-z (1 to 16383)
descriptors, ascii component -- a-z (1 to 31 characters + null)
Citations ---
a - author citation
b - work citation
c - preferred abbreviation for the work
d - preferred abbreviation for the author
n - if present signifies a document within a work
when it changes, v-z are nulled but are then independent
if n is not present, a change in an upper level nulls out the rest
v-z hierarchical citation levels, high to low
v
w
x - (chapter)
y - (verse) (book)
z - line
Descriptions ---
z - comment sequence number within a work
In the common data structures below, citations will hold the first 26 positions (0-25)
while descriptors will hold the next 26 positions.
*/
int icitation[52]; /* binary component per ID level: 0-25 citations a-z, 26-51 descriptors a-z */
char citation[52][32]; /* ascii component per ID level, same indexing as icitation */
int id_level; /* holds translated current id level as an index to ID arrays */
int id_char; /* holds the pointer for the ascii part of the ID arrays */
int id_command; /* holds the current instruction for ID handling */
int id_process; /* if non-zero, command must be processed */
/****************** HANDLE ARGUMENTS AND SYNTAX *******************/
/* usage_info:
 * Prints the version banner, the license notice and the command-line
 * syntax to standard output.
 * The syntax line names the program before its options, and the option
 * list includes -T, which main accepts.
 */
void usage_info(void)
{
    printf("\ntlgu: TLG beta code file to Unicode translator ver. %s\n", prog_version);
    printf("\ntlgu: Copyright (C) 2004, 2005 Dimitri Marinakis");
    printf("\ntlgu: This program is free software; you are encouraged to redistribute it under");
    printf("\ntlgu: the terms of the GNU General Public License.\n");
    printf("\ntlgu: This program comes with ABSOLUTELY NO WARRANTY. See the GNU General Public");
    printf("\ntlgu: License (e.g. in the file named `COPYING') for more details.\n");
    printf("\ntlgu: Syntax: tlgu [-options...] beta_code_file text_file\n\n");
    printf("tlgu: -r -- primarily Roman text; default betastate = ROMAN, reset on every ID code\n");
    printf("tlgu: -v -w -x -y -z -- work reference citations are printed in the form xxx.xxx...xxx\n");
    printf("tlgu: -Z <custom_cite_format_prefix> -- use special codes %%v %%w %%x %%y %%z in string\n");
    printf("tlgu: -e <custom_blank_cite_seg_string> -- e.g. \"[NONE]\" instead of default \"\"\n");
    printf("tlgu: -a -b -c -d -- books are preceded by a page feed and description citations are printed\n");
    printf("tlgu: -p -- pagination is observed, otherwise book lines are printed continuously\n");
    printf("tlgu: -B -- output blank space (tab) at the beginning of each line\n");
    printf("tlgu: -C -- citation debug information is printed\n");
    printf("tlgu: -S -- special code debug information is printed\n");
    printf("tlgu: -T -- bracket debug information is printed\n");
    printf("tlgu: -V -- processing debug information is printed\n");
    printf("tlgu: -W -- multiple output files, one for each work (book)\n\n");
}
/* main:
 * Parses the command-line options into the opt_* globals and hands the
 * two remaining arguments (input file, output file) to tlgu().
 * Only the first character after '-' of each option argument is examined.
 * -e and -Z consume the following argument as their value.
 * Returns: 0 on success, 1 on a usage error or when tlgu() fails
 */
int main(int argc, char * argv[])
{
    if (argc < 3) {
        usage_info();
        return 1;
    }
    --argc;
    ++argv;
    /* argc > 2 guarantees that argv[1] exists for options taking a value */
    while (argc > 2 && argv[0][0] == '-') {
        switch (argv[0][1]) {
        case 'W':
            opt_multiple = 1;
            break;
        case 'V':
            opt_verbose = 1;
            break;
        case 'S':
            opt_debug_special = 1;
            break;
        case 'T':
            opt_debug_bracket = 1;
            break;
        case 'C':
            opt_debug_cit = 1;
            break;
        case 'B':
            opt_blank = 1;
            break;
        case 'p':
            opt_page = 1;
            break;
        case 'r':
            opt_roman = 1;
            break;
        case 'a':
            opt_acit = 1;
            opt_cit_id = 1;
            break;
        case 'b':
            opt_bcit = 1;
            opt_cit_id = 1;
            break;
        case 'c':
            opt_ccit = 1;
            opt_cit_id = 1;
            break;
        case 'd':
            opt_dcit = 1;
            opt_cit_id = 1;
            break;
        case 'v':
            opt_vcit = 1;
            break;
        case 'w':
            opt_wcit = 1;
            break;
        case 'x':
            opt_xcit = 1;
            break;
        case 'y':
            opt_ycit = 1;
            break;
        case 'z':
            opt_zcit = 1;
            break;
        case 'e':
            /* the value must fit the fixed-size ecite buffer */
            if (strlen(argv[1]) >= sizeof(ecite)) {
                printf("\ntlgu: -e argument too long\n");
                return 1;
            }
            opt_ecit_blank = 1;
            strcpy(ecite, argv[1]);
            argc--;
            argv++;
            break;
        case 'Z':
            /* the value must fit the fixed-size cformat buffer */
            if (strlen(argv[1]) >= sizeof(cformat)) {
                printf("\ntlgu: -Z argument too long\n");
                return 1;
            }
            opt_cprefix = 1;
            strcpy(cformat, argv[1]);
            argc--;
            argv++;
            break;
        default:
            /* unknown option: report usage and fail */
            usage_info();
            return 1;
        }
        argc--;
        argv++;
    }
    /* An option value may have consumed one of the two file names */
    if (argc < 2) {
        usage_info();
        return 1;
    }
    return tlgu(argv[0], argv[1]);
}
/****************** FILE READ-WRITE LOOP **************************/
/* tlgu:
 * Opens <input_file> and <output_file>, then repeatedly reads a block into
 * input_buffer, lets process_beta translate it and writes output_buffer.
 * When process_beta returns -2 (book change, -W option) the current output
 * file is closed, removed if it is empty, and a new file named
 * "<output_file>-<work citation>.txt" is opened.
 * Output files are created with, and finally set to, mode 644.
 * Returns: 0 on success, 1 on any open/read/write/close/chmod failure
 * Changes: iptr, optr, input_buffer, icitation, citation, betastate
 */
int tlgu(char *input_file, char *output_file)
{
    const mode_t out_mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
    int i; /* counter */
    int j; /* counter */
    int infile; /* input file descriptor */
    int outfile; /* output file descriptor */
    int icnt; /* input file bytes read in input buffer */
    int ocnt; /* output file bytes written */
    int bytes_to_process; /* bytes read minus bytes already processed */
    int wehaveinput; /* flag for while */
    int beta_return; /* process beta return code */
    int namelen; /* length of the generated per-book file name */
    int status = 0; /* value returned to the caller */
    char new_file[MAXFILELEN];
    struct stat filestat;

    if (input_file == NULL || output_file == NULL) {
        printf("\ntlgu: missing input or output filename\n");
        return(1);
    }
    /* Open input and output files
     */
    infile = open(input_file, O_RDONLY);
    if (infile < 0) {
        perror("tlgu input file open");
        return(1);
    }
    if (strlen(output_file) >= MAXFILELEN-5) {
        printf("\ntlgu output filename too long - exiting\n");
        close(infile);
        return(1);
    }
    strcpy(new_file, output_file);
    /* O_CREAT requires the mode argument */
    outfile = open(new_file, O_WRONLY | O_CREAT | O_TRUNC, out_mode);
    if (outfile < 0) {
        perror("tlgu output file create");
        close(infile);
        return(1);
    }
    /* Initialize citation
     * and descriptor indicators
     */
    for (i = 0; i < 52; i++) {
        icitation[i] = 0;
        for (j = 0; j < 32; j++) {
            citation[i][j] = 0;
        }
    }
    /* Initialize beta processing defaults
     * e.g. The TLG Canon needs ROMAN as default
     * Hellenic should be reset at each ID CODE
     */
    if (opt_roman) betastate = ROMAN;
    else betastate = HELLENIC;
    /* Read, process and write file blocks,
     * Optionally create one file per book (-W)
     * Note: Local deblocking usually yields higher speeds
     */
    wehaveinput = 1;
    while (wehaveinput) {
        /* Read and process beta code in input_buffer */
        icnt = read(infile, input_buffer, sizeof(input_buffer));
        if (icnt < 0) {
            /* a failed read would otherwise be retried forever */
            perror("tlgu input file read");
            status = 1;
            wehaveinput = 0;
        } else if (icnt == 0) {
            wehaveinput = 0;
        }
        iptr = 0;
        while ((icnt > 0) && (iptr < icnt)) {
            bytes_to_process = icnt - iptr;
            beta_return = process_beta(bytes_to_process);
            /* Write processed data and reset output buffer pointer */
            if (optr > 0) {
                ocnt = write(outfile, output_buffer, optr);
                optr = 0;
                if (ocnt < 0) {
                    perror("tlgu output file write");
                    status = 1;
                    wehaveinput = 0;
                }
            } else if (beta_return != -2) { /* no more bytes to write, no book change request */
                if (opt_verbose) printf("\ntlgu: no more bytes to write");
                wehaveinput = 0; /* signal no more input */
            }
            if (beta_return == -2) {
                /* book change request, close current file and open a new one */
                if (opt_verbose) printf("\ntlgu: book change request: %s", previous_bcit[1]);
                if (close(outfile)) {
                    perror("tlgu output file close");
                    close(infile);
                    return(1);
                }
                if (chmod(new_file, out_mode)) {
                    perror("tlgu output file chmod");
                    close(infile);
                    return(1);
                }
                /* request file information and delete zero-length files;
                 * filestat is only valid when stat succeeds
                 */
                if (stat(new_file, &filestat) == 0 && filestat.st_size == 0) unlink(new_file);
                namelen = snprintf(new_file, sizeof(new_file), "%s-%s.txt", output_file, previous_bcit[1]);
                if (namelen < 0 || (size_t)namelen >= sizeof(new_file)) {
                    printf("\ntlgu output filename too long - exiting\n");
                    close(infile);
                    return(1);
                }
                outfile = open(new_file, O_WRONLY | O_CREAT | O_TRUNC, out_mode);
                if (outfile < 0) {
                    perror("tlgu: new_file create");
                    close(infile);
                    return(1);
                }
            }
        }
    }
    /* Close input and output files,
     * make output file readable
     */
    close(infile);
    if (close(outfile)) {
        perror("tlgu output file close");
        return(1);
    }
    if (chmod(new_file, out_mode)) {
        perror("tlgu output file chmod");
        return(1);
    }
    if (opt_verbose) printf("\ntlgu: processing complete\n");
    return(status);
}
/****************** PROCESSING *************************************/
/* pb_append:
 * Appends <src> to the string in <dest> without writing past <dest_size>
 * bytes; excess characters are dropped.
 */
static void pb_append(char *dest, size_t dest_size, const char *src)
{
    size_t used = strlen(dest);

    if (used + 1 < dest_size)
        strncat(dest, src, dest_size - used - 1);
}

/* pb_append_cite_level:
 * Formats citation level <level> as "<number><text>" and appends it to
 * <dest>. The number is omitted when it is zero. A trailing '.' is always
 * added when the number is zero, and otherwise only if <dot_after_number>
 * is non-zero.
 * NOTE(review): the formatted segment is never empty when the number is zero
 * because of the '.', so the -e substitution below cannot trigger for the
 * levels formatted with a dot; kept as in the original -- confirm intent.
 */
static void pb_append_cite_level(char *dest, size_t dest_size, int level, int dot_after_number)
{
    char nstring[253];

    if (icitation[level] == 0)
        snprintf(nstring, sizeof(nstring), "%s.", citation[level]);
    else if (dot_after_number)
        snprintf(nstring, sizeof(nstring), "%d%s.", icitation[level], citation[level]);
    else
        snprintf(nstring, sizeof(nstring), "%d%s", icitation[level], citation[level]);
    if ((opt_ecit_blank) && (!*nstring)) strcpy(nstring, ecite);
    pb_append(dest, dest_size, nstring);
}

/* process_beta:
 * Processes <input_count> bytes in <input_buffer>, starting at iptr.
 * A beta code stream includes two kinds of data:
 * ID data - always has the high bit set (handled by id_code).
 * Text data - always has the high bit reset (handled by beta_code).
 * Null bytes are skipped. Before the first text byte that follows ID data
 * the line prefix (new line, optional tab or citation) is written.
 * Returns: the last id_code result: 0, -1 for EOF, -2 for book change
 * Changes: iptr, optr (through the called functions), betastate,
 * start_new_line, book_change
 */
int process_beta (int input_count)
{
    unsigned char inchar;
    int processing = 1;
    int iptr_max; /* holds the calculated maximum input pointer value */
    int return_code = 0; /* id_code result; negative values stop processing */
    char outstring[511];

    iptr_max = iptr + input_count;
    if (opt_verbose) printf("\n\ntlgu: process_beta - %d bytes, iptr = %4.4x, iptr_max = %4.4x", input_count, iptr, iptr_max);
    while (processing) {
        if ((iptr >= INRECSIZE) || (iptr >= iptr_max)) {
            /* Finished processing all input */
            break;
        }
        inchar = input_buffer[iptr++];
        if (optr >= OUTRECSIZE) {
            /* Output size is greater than input -- intermediate write */
            printf("\ntlgu: FIXME -- DATA LOSS: ERROR output size iptr - %x optr - %x", iptr, optr);
            break;
        }
        if (inchar == 0) {
            /* do nothing for null characters */
            continue;
        }
        /* The handlers read the byte again - restore the input pointer */
        --iptr;
        if (inchar > 0x7F) {
            /* Reset beta decoding state if roman option specified */
            if (opt_roman) betastate = ROMAN;
            /* Process ID code; only the bytes that remain may be consumed */
            return_code = id_code(iptr_max - iptr);
            if (return_code == -1) {
                if (opt_verbose) printf("\ntlgu: EOF while processing id code");
                processing = 0;
            } else if (return_code == -2) {
                if (opt_verbose) printf("\ntlgu: book change request");
                processing = 0;
            }
            start_new_line = 1;
            continue;
        }
        /* text data < 0x80 */
        if (start_new_line) {
            /* Write info on (book) citation change */
            if (book_change) {
                if (opt_cit_id) {
                    snprintf(outstring, sizeof(outstring), "\n\f[%s] ", citation[0]);
                    output_string(outstring);
                    snprintf(outstring, sizeof(outstring), "[%s] ", citation[1]);
                    output_string(outstring);
                    snprintf(outstring, sizeof(outstring), "[%s] ", citation[2]);
                    output_string(outstring);
                    snprintf(outstring, sizeof(outstring), "[%s]\n", citation[3]);
                    output_string(outstring);
                }
                book_change = 0;
            }
            strcpy(outstring, "\n");
            if (opt_blank) {
                pb_append(outstring, sizeof(outstring), "\t");
            } else if (opt_cprefix) {
                pb_append(outstring, sizeof(outstring), resolve_cite_format(cformat));
            } else if (opt_vcit || opt_wcit || opt_xcit || opt_ycit || opt_zcit) {
                if (opt_vcit) pb_append_cite_level(outstring, sizeof(outstring), 21, 1);
                if (opt_wcit) pb_append_cite_level(outstring, sizeof(outstring), 22, 1);
                if (opt_xcit) pb_append_cite_level(outstring, sizeof(outstring), 23, 1);
                if (opt_ycit) pb_append_cite_level(outstring, sizeof(outstring), 24, 1);
                /* the lowest level has no dot after a non-zero number */
                if (opt_zcit) pb_append_cite_level(outstring, sizeof(outstring), 25, 0);
                /* Separate text from citation using a tab character */
                pb_append(outstring, sizeof(outstring), "\t");
            }
            if (input_buffer[iptr] < 0x80) {
                /* Print only if not followed by another ID byte */
                output_string(outstring);
            }
            start_new_line = 0;
            if (opt_roman) betastate = ROMAN;
            else betastate = HELLENIC;
        }
        /* only the bytes that remain may be consumed */
        beta_code(iptr_max - iptr);
    } /* end while processing*/
    if (opt_verbose) printf("\ntlgu: iptr - %4.4x, optr - %4.4x ", iptr, optr);
    return return_code;
}
/****************** LIBRARY FUNCTIONS ******************************/
/* get_acents:
* gets accents in <accents>
* Returns: number of accents found or zero
* Changes: accents, iptr
*/
int get_accents(void)
{
unsigned char bufferchar;
int processing = 1;
int number_of_accents = 0;
accents = 0;
while (processing) {
if (iptr < INRECSIZE) {
bufferchar = input_buffer[iptr++];
if (betastate == ROMAN) {
if (strchr(latin_accent_chars, bufferchar)) {
store_accents(bufferchar);
number_of_accents++;
} else {
--iptr;
processing = 0;
}
} else if (strchr(accent_chars, bufferchar)) {
store_accents(bufferchar);
number_of_accents++;
} else {
--iptr;
processing = 0;
}
} else {
processing = 0;
}
}
return number_of_accents;
}
/* store_accents:
 * Merges the bits for accent character <bufferchar> into <accents>.
 * Bit layout of <accents>:
 *   bits 0-1: 01 psili, 10 dasia, 11 dialytika
 *   bits 2-3: 01 varia, 10 oxia, 11 perispomeni
 *   bit  4  : ypogegrammeni
 * Characters other than ) ( + \ / = | leave the bits unchanged.
 * Changes: accents
 * Caveat: the new bits are only ORed in; an all-zero accent variable is
 * expected for the two-bit fields to hold a single value
 */
void store_accents(unsigned char bufferchar)
{
    int bits;

    switch (bufferchar) {
    case ')':  bits = 0x01; break;
    case '(':  bits = 0x02; break;
    case '+':  bits = 0x03; break;
    case '\\': bits = 0x04; break;
    case '/':  bits = 0x08; break;
    case '=':  bits = 0x0c; break;
    case '|':  bits = 0x10; break;
    default:   bits = 0x00; break;
    }
    accents = (accents | bits) & 0x1f;
}
/* output_accents:
* Input: <accents>
* 0 00 00 --- 0 00 00 no accent
* | | |
* | | ---- 01 psili, 10 dasia, 11 dialytika
* | ------- 01 varia, 10 oxia, 11 perispomeni
* ----------- 1 ypogegrammeni
* Changes: optr (output_utf)
*/
void output_accents(void)
{
int paccents;
paccents = accents & 3;
if (paccents == 1)
output_utf(PSILI);
else if (paccents == 2)
output_utf(DASIA);
else if (paccents == 3)
output_utf(DIALYTIKA);
paccents = (accents & 0xc) >> 2;
if (paccents == 1)
output_utf(VARIA);
else if (paccents == 2)
output_utf(OXIA);
else if (paccents == 3) {
if (betastate == ROMAN)
output_utf(CARET);
else
output_utf(PERISPOMENI);
}
paccents = accents & 0x10;
if (paccents)
output_utf(YPOGEGRAMMENI);
}
/* getnum:
 * Collects a decimal number from the current <input_buffer> position.
 * Digits are consumed; the first non-digit is left unread.
 * The value is accumulated directly and saturates at INT_MAX, so long
 * digit runs cannot overflow.
 * Returns: the number, zero if no digit is present, or -1 when the end of
 * the buffer is reached or MAXNUMBERS digits were read before a non-digit
 * was seen
 * Changes: iptr
 */
#define MAXNUMBERS 32
int getnum(void)
{
    unsigned char bufferchar;
    int ndigits = 0; /* digits consumed so far */
    int convnumber = 0; /* converted number */
    int digit;

    while ((iptr < INRECSIZE) && (ndigits < MAXNUMBERS)) {
        bufferchar = input_buffer[iptr];
        if ((bufferchar < '0') || (bufferchar > '9')) {
            /* terminator found - leave it for the caller */
            return convnumber;
        }
        iptr++;
        ndigits++;
        digit = bufferchar - '0';
        if (convnumber > (INT_MAX - digit) / 10)
            convnumber = INT_MAX;
        else
            convnumber = convnumber * 10 + digit;
    }
    /* errno is unrelated to this condition, so perror is not used */
    fprintf(stderr, "tlgu: did not complete number\n");
    return -1;
}
/* output_utf:
 * Appends code <ucode> to output_buffer as a UTF-8 sequence of one to
 * three bytes. A zero code and codes above 0xffff produce no output.
 * Nothing is written (and a message is reported) when fewer than three
 * bytes are free in output_buffer.
 * Changes: optr, output_buffer
 */
void output_utf(int ucode)
{
    if ((optr + 3) > OUTRECSIZE) {
        perror("optr out of range");
        return;
    }
    if (ucode == 0) {
        /* do nothing */
        return;
    }
    if (ucode < 0x80) {
        /* single byte */
        output_buffer[optr++] = ucode;
        return;
    }
    if (ucode < 0x800) {
        /* two bytes: 110xxxxx 10xxxxxx */
        output_buffer[optr++] = 0xc0 | (ucode >> 6);
        output_buffer[optr++] = 0x80 | (ucode & 0x3f);
        return;
    }
    if (ucode <= 0xffff) {
        /* three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
        output_buffer[optr++] = 0xe0 | ((ucode & 0xf000) >> 12);
        output_buffer[optr++] = 0x80 | ((ucode & 0x0fc0) >> 6);
        output_buffer[optr++] = 0x80 | (ucode & 0x3f);
    }
    /* higher unicodes are ignored */
}
/* output_string:
 * Passes every character of the null-terminated string <outstr> to
 * output_utf, which appends it to <output_buffer>.
 * Returns: nothing
 * Changes: optr, output_buffer
 */
void output_string(char *outstr)
{
    const char *cursor;

    /* walk to the terminator once instead of calling strlen per iteration */
    for (cursor = outstr; *cursor != '\0'; cursor++) {
        output_utf(*cursor);
    }
}
/* trace_escape:
 * Prints one debug line for escape code <code> with its number and the
 * current work citation and x.y.z citation values.
 */
static void trace_escape(unsigned char code, int number)
{
    printf("%c%d -- %s %d.%d.%d\n", code, number, citation[1], icitation[23], icitation[24], icitation[25]);
}

/* handle_escape_codes:
 * Formatting and character output based on escape codes: $&%"@#^[]<>{}
 * Input: escape code <beta>, optional <number> (as returned by getnum,
 * which may be -1 when no complete number could be read)
 * $ and & switch betastate and clear <accents>; the other codes output a
 * symbol looked up by <number> in the corresponding table. A number
 * outside the table (including a negative one) produces no output.
 * Changes: optr, output_buffer, betastate, accents, quotation_open
 */
void handle_escape_codes(unsigned char beta, int number)
{
    int temp = 0;

    switch (beta)
    {
    case '$':
        betastate = HELLENIC;
        accents = 0;
        break;
    case '&':
        betastate = ROMAN;
        accents = 0;
        break;
    case '%':
        if (opt_debug_special) trace_escape(beta, number);
        if ((number >= 0) && (number < MAX_PUNCTUATION))
            output_utf(punctuation[number]);
        break;
    case '\"':
        if (opt_debug_special) trace_escape(beta, number);
        if ((number >= 0) && (number < MAX_QUOTATION)) {
            /* quotation marks toggle between opening and closing forms */
            if (quotation_open[number]) {
                output_utf(quotation_close_symbol[number]);
                quotation_open[number] = 0;
            } else {
                output_utf(quotation_open_symbol[number]);
                quotation_open[number] = 1;
            }
        }
        break;
    case '@':
        /* FIXME: If citations are active, paging should be disabled */
        if (opt_debug_special) trace_escape(beta, number);
        /* Page formats -- FIXME: incomplete */
        if (number == 0) {
            output_utf(0x20);
            output_utf(0x20);
        } else if (number == 1) {
            if (opt_page) output_utf(0xc);
            /* FIXME: reinstate else output_utf(0xa); */
        } /* FIXME: reinstate else output_utf(0xa); */
        break;
    case '#':
        if (opt_debug_special) trace_escape(beta, number);
        if ((number >= 0) && (number < MAX_TEXT_SYMBOLS)) {
            output_utf(text_symbols[number]);
        }
        break;
    case '^':
        /* quarter-spaces: will output at least one space */
        if (number > 0) temp = number / 4;
        while (temp >= 0) {
            output_utf(0x20);
            temp--;
        }
        break;
    case '[':
        if (opt_debug_bracket) trace_escape(beta, number);
        if ((number >= 0) && (number < MAX_BRACKET)) {
            output_utf(bracket_open_symbol[number]);
        }
        break;
    case ']':
        if (opt_debug_bracket) trace_escape(beta, number);
        if ((number >= 0) && (number < MAX_BRACKET)) {
            output_utf(bracket_close_symbol[number]);
        }
        break;
    case '<':
        if (opt_debug_bracket) trace_escape(beta, number);
        if ((number >= 0) && (number < MAX_QUASI_BRACKET)) {
            output_utf(quasi_bracket_open_symbol[number]);
        }
        break;
    case '>':
        if (opt_debug_bracket) trace_escape(beta, number);
        if ((number >= 0) && (number < MAX_QUASI_BRACKET)) {
            output_utf(quasi_bracket_close_symbol[number]);
        }
        break;
    case '{':
        if (opt_debug_bracket) trace_escape(beta, number);
        if ((number >= 0) && (number < MAX_NON_TEXT)) {
            output_utf(non_text_open_symbol[number]);
        }
        break;
    case '}':
        /* the trace shows the closing brace that was actually read */
        if (opt_debug_bracket) trace_escape(beta, number);
        if ((number >= 0) && (number < MAX_NON_TEXT)) {
            output_utf(non_text_close_symbol[number]);
        }
        break;
    default:
        break;
    }
}
/* which_sigma:
* Tries to decide on which sigma form to use.
* Input: index of input_buffer (iptr) after the sigma
* Returns: output character code
*/
int which_sigma(int nextptr)
{
int scanning;
int nextcode;
/* If the next character is a hyphen, it is a medial sigma
* Otherwise, a few characters are examined in the input buffer:
* if an alphabetic character is found before we hit a space, or
* other punctuation character, it is a medial sigma
* otherwise it is a final sigma (there is one exception in 4085 - POS(.))
*/
if (input_buffer[iptr] == '-')
return(SIGMEDIAL);
else {
scanning = 10;
while(scanning) {
nextcode = input_buffer[nextptr++];
if (isalpha(nextcode))
return(SIGMEDIAL);
if (nextcode > 0x7f)
return(SIGFINAL);
if (strchr(punctuation_codes, nextcode))
return(SIGFINAL);
scanning--;
}
return(SIGMEDIAL);
}
}
/* bc_upper_accented:
 * Returns the upper case code for accentable letter <betachar> combined
 * with the current <accents>, or zero for any other character.
 */
static unsigned int bc_upper_accented(unsigned char betachar)
{
    switch (betachar) {
    case 'A': return Alpha[accents];
    case 'E': return Epsilon[accents];
    case 'H': return Eta[accents];
    case 'I': return Iota[accents];
    case 'O': return Omicron[accents];
    case 'U': return Ypsilon[accents];
    case 'W': return Omega[accents];
    case 'R': return Rho[accents];
    default:  return 0;
    }
}

/* bc_lower_accented:
 * Returns the lower case code for accentable letter <betachar> combined
 * with the current <accents>, or zero for any other character.
 */
static unsigned int bc_lower_accented(unsigned char betachar)
{
    switch (betachar) {
    case 'A': return alpha[accents];
    case 'E': return epsilon[accents];
    case 'H': return eta[accents];
    case 'I': return iota[accents];
    case 'O': return omicron[accents];
    case 'U': return ypsilon[accents];
    case 'W': return omega[accents];
    case 'R': return rho[accents];
    default:  return 0;
    }
}

/* beta_code:
 * Processes up to <input_count> characters in <input_buffer> and
 * writes processed output to <output_buffer>.
 * Processing stops, with the byte left unread, at the first ID byte
 * (high bit set), or when the count or the buffer is exhausted.
 * Escape codes are passed to handle_escape_codes together with the number
 * that follows them. In HELLENIC state '*' introduces an upper case
 * letter (accents may precede or follow the letter) and letters are
 * translated through the accent tables or the <hellenic> table. In ROMAN
 * state letters are copied and followed by their combining accents.
 * Any other character is copied, except '`' which is dropped.
 * Changes: iptr, optr, output_buffer, accents, betastate (escape codes)
 */
void beta_code(int input_count)
{
    int processing;
    int input_pointer_max;
    unsigned char betachar;
    unsigned int outputchar;
    int tmp;

    input_pointer_max = iptr + input_count;
    processing = 1;
    while (processing) {
        if ( (iptr >= INRECSIZE) || (iptr >= input_pointer_max) ) {
            /* Requested number of characters have been processed
             * or no more characters available in buffer
             */
            break;
        }
        betachar = input_buffer[iptr++];
        if (betachar > 0x7F) {
            /* ID data found - restore pointer and stop processing*/
            --iptr;
            break;
        }
        outputchar = 0;
        if (strchr(escape_codes, betachar)) {
            /* Handle escape codes */
            handle_escape_codes(betachar, getnum());
        } else if (betastate == HELLENIC && betachar == '*') {
            /* Handle Hellenic uppercase character */
            get_accents();
            if (iptr >= INRECSIZE) {
                /* nothing follows the '*' in the buffer */
                break;
            }
            betachar = input_buffer[iptr++];
            if (betachar > 0x7F) {
                /* ID data directly after '*' - leave it for id_code */
                --iptr;
                break;
            }
            if (accents == 0) get_accents(); /* FIXME: handle suffix accents differently */
            if (strchr(accented_chars, betachar)) {
                outputchar = bc_upper_accented(betachar);
            } else if (betachar == 'S') {
                tmp = getnum();
                if (tmp == 3) outputchar = SIGLUNATEUPPER;
                else outputchar = SIGMEDIALUPPER;
            } else if (isalpha(betachar)) {
                /* not an accented character */
                outputchar = hellenic[betachar];
            } else if (betachar >= 0x20) {
                /* characters below 0x20 would give a negative index */
                outputchar = hellenic[betachar - 0x20];
            }
            if (outputchar == 0) outputchar = hellenic[betachar]; /* error condition */
            output_utf(outputchar);
        } else if (betastate == HELLENIC && isalpha(betachar)) {
            /* Handle hellenic lower case:
             * Get default character and then try to pin accents
             */
            if (strchr(accented_chars, betachar)) {
                get_accents();
                outputchar = bc_lower_accented(betachar);
            } else if (betachar == 'S') {
                tmp = getnum();
                if (tmp == 1) outputchar = SIGMEDIAL;
                else if (tmp == 2) outputchar = SIGFINAL;
                else if (tmp == 3) outputchar = SIGLUNATE;
                if (outputchar == 0) {
                    /* no explicit form given - decide from what follows */
                    outputchar = which_sigma(iptr);
                }
            }
            if (outputchar == 0) outputchar = hellenic[betachar - 0x20];
            output_utf(outputchar);
        } else if (betastate == ROMAN && isalpha(betachar)) {
            /* Handle Roman characters */
            /* FIXME: need to process roman characters */
            get_accents();
            outputchar = betachar;
            output_utf(outputchar);
            /* ROMAN uses combining accent forms */
            output_accents();
        } else {
            /* FIXME: placeholder */
            if (betachar != '`') outputchar = betachar;
            output_utf(outputchar);
        }
    }
}
/* rcf_append:
 * Appends <src> to the string in <dest> without writing past <dest_size>
 * bytes; excess characters are dropped.
 */
static void rcf_append(char *dest, size_t dest_size, const char *src)
{
    size_t used = strlen(dest);

    if (used + 1 < dest_size)
        strncat(dest, src, dest_size - used - 1);
}

/* rcf_slash_to_colon:
 * Replaces every '/' in <text> with ':' (SWORD key delimiter).
 */
static void rcf_slash_to_colon(char *text)
{
    char *p;

    for (p = text; *p; p++) {
        if (*p == '/') *p = ':';
    }
}

/* resolve_cite_format:
 * Expands the citation format string <cformat>:
 *   %a-%z  citation level a-z, %A-%Z descriptor level a-z
 *          (levels above index 20 are printed as number followed by text,
 *          the number being omitted when zero; lower levels as text only)
 *   %%     a literal percent sign
 *   \t \n \r  tab, new line, carriage return; \<other> that character
 * Blank segments are replaced with <ecite> when the -e option is set.
 * '/' in an expanded citation is replaced with ':'.
 * A '%' or '\' at the very end of the format ends the expansion.
 * Output longer than the internal buffer is truncated.
 * Returns: pointer to a static buffer, overwritten by the next call
 */
const char *resolve_cite_format(const char *cformat) {
    static char outbuf[511];
    char nstring[253];
    const char *c;

    *outbuf = 0;
    for (c = cformat; *c; c++) {
        if (*c == '%') {
            const char c2 = *(c+1);
            int cstart = -1;

            if (c2 == 0) {
                /* nothing follows: stop before stepping past the terminator */
                fprintf(stderr, "unknown escape sequence: %%\n");
                break;
            }
            if ((c2 >= 'a') && (c2 <= 'z')) {
                cstart = c2 - 'a';
            }
            else if ((c2 >= 'A') && (c2 <= 'Z')) {
                cstart = 26 + (c2 - 'A');
            }
            else if (c2 == '%') {
                rcf_append(outbuf, sizeof(outbuf), "%");
            }
            else {
                fprintf(stderr, "unknown escape sequence: %%%c\n", c2);
            }
            c++; /* skip both our '%' and following character (by loop inc) */
            if (cstart > 20) {
                if (icitation[cstart] == 0) snprintf(nstring, sizeof(nstring), "%s", citation[cstart]);
                else snprintf(nstring, sizeof(nstring), "%d%s", icitation[cstart], citation[cstart]);
                if ((opt_ecit_blank) && (!*nstring)) strcpy(nstring, ecite);
                rcf_slash_to_colon(nstring);
                rcf_append(outbuf, sizeof(outbuf), nstring);
            }
            else if (cstart > -1) {
                if (!citation[cstart][0]) {
                    if (opt_ecit_blank) rcf_append(outbuf, sizeof(outbuf), ecite);
                }
                else {
                    /* substitute on a copy so the stored citation is unchanged */
                    snprintf(nstring, sizeof(nstring), "%s", citation[cstart]);
                    rcf_slash_to_colon(nstring);
                    rcf_append(outbuf, sizeof(outbuf), nstring);
                }
            }
        }
        else if (*c == '\\') {
            if (*(c+1) == 0) {
                /* nothing follows: stop before stepping past the terminator */
                break;
            }
            switch (*(c+1)) {
            case 't': rcf_append(outbuf, sizeof(outbuf), "\t"); break;
            case 'n': rcf_append(outbuf, sizeof(outbuf), "\n"); break;
            case 'r': rcf_append(outbuf, sizeof(outbuf), "\r"); break;
            default: *nstring = *(c+1); nstring[1] = 0; rcf_append(outbuf, sizeof(outbuf), nstring); break;
            }
            c++; /* skip both our '\' and following character (by loop inc) */
        }
        else {
            *nstring = *c; nstring[1] = 0; rcf_append(outbuf, sizeof(outbuf), nstring);
        }
    }
    return outbuf;
}
/* id_code:
* <iptr> points to the next character in the <input_buffer> to process;
* <optr> points to the next empty <output_buffer position.
* Returns: 0 or -1 for EOF
*/
int id_code(int input_count)
{
int input_pointer_max;
int return_code;
int scratch;
int processing;
unsigned char idchar;
unsigned char outcode;
return_code = 0;
input_pointer_max = iptr + input_count;
processing = 1;
while (processing) {
if ((iptr < INRECSIZE) && (iptr < input_pointer_max)) {
outcode = 0;
idchar = input_buffer[iptr++];
if ((idchar < 0x80)) { /* text data - restore pointer and return*/
--iptr;
processing = 0;
} else { /* ID data - translate and write */
if (optr < OUTRECSIZE) {
id_process = 0; /* we don't have a command yet */
if (idchar >= 0xF0) {
switch (idchar)
{
case 0xF0: /* EOF */
return_code = -1; /* indicate EOF */
processing = 0;
break;
case 0xFE: /* End of block -- block is padded with nulls */
while (!input_buffer[iptr] && iptr<INRECSIZE) {
iptr++;
}
if (opt_debug_cit) printf("tlgu: EOB %x\n", iptr-1);
break;
case 0xFF: /* End of ASCII string */
if (opt_debug_cit) printf("tlgu: %d %s\n", id_level, citation[id_level]);
if (opt_debug_cit) printf("tlgu: EOS %x\n ", iptr-1);
break;
case 0xF8: /* Exception start */
if (opt_debug_cit) printf("tlgu: Exc start %x\n", iptr-1);
break;
case 0xF9: /* Exception end */
if (opt_debug_cit) printf("tlgu: Exc end %x\n", iptr-1);
break;
default:
break;
}
} else if (idchar >= 0xE0) {
/* The byte following an escape code is an ID byte
* Citation IDs can only be 0=a, 1=b, 2=c and 4=d
*/
if (opt_debug_cit) printf("tlgu: Escape %x", idchar);
id_command = idchar & 0xF; /* get "command" nybble */
idchar = input_buffer[iptr++] & 0x7F; /* get ID level byte */
if (idchar >= 97) { /* descriptors hold the upper part of the array */
id_level = idchar - 97 + 26; /* create an index offset */
if (id_level > 51) {id_level = 51;} /* default to z */
} else {
id_level = idchar & 7; /* must be 0 - 4 */
if (id_level == 4) {id_level = 3;} /* adjust d level */
}
if (opt_debug_cit) printf(" ID level: %d\n", id_level);
id_process = 1; /* command must be processed */
} else if ((idchar >= 0x80) && (id_process == 0)) {
id_command = idchar & 0xF; /* get command first */
scratch = (idchar >> 4) & 0x7; /* try to create an offset */
//printf(" %x %x ", idchar, scratch);
switch (scratch)
{
case 0:
id_level = 25; /* z */
id_process = 1; /* command must be processed */
break;
case 1:
id_level = 24; /* y */
id_process = 1; /* command must be processed */
break;
case 2:
id_level = 23; /* x */
id_process = 1; /* command must be processed */
break;
case 3:
id_level = 22; /* w */
id_process = 1; /* command must be processed */
break;
case 4:
id_level = 21; /* v */
id_process = 1; /* command must be processed */
break;
case 5:
id_level = 13; /* n */
id_process = 1; /* command must be processed */
break;
default:
break;
}
}
if (id_process) {
switch (id_command)
{
case 0:
icitation[id_level]++; /* increment ID */
break;
case 1:
icitation[id_level] = 1; /* literal value */
break;
case 2:
icitation[id_level] = 2; /* literal value */
break;
case 3:
icitation[id_level] = 3; /* literal value */
break;
case 4:
icitation[id_level] = 4; /* literal value */
break;
case 5:
icitation[id_level] = 5; /* literal value */
break;
case 6:
icitation[id_level] = 6; /* literal value */
break;
case 7:
icitation[id_level] = 7; /* literal value */
break;
case 8:
idchar = input_buffer[iptr++]; /* 7 bit binary value */
icitation[id_level] = idchar & 0x7F;
break;
case 9:
idchar = input_buffer[iptr++]; /* 7 bit binary value */
icitation[id_level] = idchar & 0x7F;
idchar = input_buffer[iptr++]; /* single character */
citation[id_level][0] = idchar & 0x7F;
citation[id_level][1] = 0;
break;
case 0xa:
idchar = input_buffer[iptr++]; /* 7 bit binary value */
icitation[id_level] = idchar & 0x7F;
for (id_char=0; id_char < 31; id_char++) {
idchar = input_buffer[iptr++]; /* string */
if (idchar == 0xFF) {
citation[id_level][id_char] = 0; /* end of string */
break;
} else {
citation[id_level][id_char] = idchar & 0x7F;
}
}
break;
case 0xb:
idchar = input_buffer[iptr++]; /* 14 bit binary value */
scratch = (idchar & 0x7F) << 7; /* shift upper */
idchar = input_buffer[iptr++]; /* 14 bit binary value */
idchar &= 0x7F; /* mask sign bit */
scratch = scratch | idchar; /* combine */
icitation[id_level] = scratch;
break;
case 0xc:
idchar = input_buffer[iptr++]; /* 14 bit binary value */
scratch = (idchar & 0x7F) << 7; /* shift upper */
idchar = input_buffer[iptr++]; /* 14 bit binary value */
idchar &= 0x7F; /* mask sign bit */
scratch = scratch | idchar; /* combine */
icitation[id_level] = scratch;
idchar = input_buffer[iptr++]; /* single character */
citation[id_level][0] = idchar & 0x7F;
citation[id_level][1] = 0; /* end of string */
break;
case 0xd:
idchar = input_buffer[iptr++]; /* 14 bit binary value */
scratch = (idchar & 0x7F) << 7; /* shift upper */
idchar = input_buffer[iptr++]; /* 14 bit binary value */
idchar &= 0x7F; /* mask sign bit */
scratch = scratch | idchar; /* combine */
icitation[id_level] = scratch;
for (id_char=0; id_char < 31; id_char++) {
idchar = input_buffer[iptr++]; /* string */
if (idchar == 0xFF) {
citation[id_level][id_char] = 0; /* end of string */
break;
} else {
citation[id_level][id_char] = idchar & 0x7F;
}
}
break;
case 0xe:
/* same binary value, single character */
idchar = input_buffer[iptr++]; /* single character */
citation[id_level][0] = idchar & 0x7F;
citation[id_level][1] = 0; /* end of string */
break;
case 0xf:
icitation[id_level] = 0; /* no binary value */
for (id_char=0; id_char < 31; id_char++) {
idchar = input_buffer[iptr++]; /* string */
if (idchar == 0xFF) {
citation[id_level][id_char] = 0; /* end of string */
break;
} else {
citation[id_level][id_char] = idchar & 0x7F;
}
}
/* Keep tab of book changes, optionally split into books */
if (id_level == 1) {
if (strncmp(citation[1], previous_bcit[1], 31)) {
if (opt_multiple) {
/* Signal outer loop to stop
* after processing citation change
*/
return_code = -2;
processing = 0;
if (opt_verbose) printf("\ntlgu: book citation: %s, previous: %s", citation[1], previous_bcit[1]);
}
strncpy(previous_bcit[1], citation[1], 31);
previous_bcit[1][31] = 0;
}
book_change = 1;
}
break;
default:
printf("tlgu: Unknown id_command: %x, iptr %x\n", id_command, iptr);
break;
}
if (opt_debug_cit) printf("tlgu: Command: %x ID level: %d, Binary: %d, ASCII: %s iptr++ %x\n",\
id_command, id_level,icitation[id_level], citation[id_level], iptr);
/* Adjust lower citation levels */
switch (id_level)
{
case 21:
icitation[22] = 1;
case 22:
icitation[23] = 1;
case 23:
icitation[24] = 1;
case 24:
icitation[25] = 1;
case 25:
outcode = 0;
break;
default:
break;
}
} /* id_process */
if (outcode) {
output_utf(outcode);
}
} else {
--iptr; /* output buffer full - restore pointer and return */
processing = 0;
}
} /* ID data processing */
} else { /* Finished processing all input */
processing = 0;
}
} /* while processing loop */
return return_code;
}