summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- Makefile 13
-rw-r--r-- apropos.1 2
-rw-r--r-- libmandoc.h 9
-rw-r--r-- main.c 30
-rw-r--r-- man.1 2
-rw-r--r-- mandoc.1 26
-rw-r--r-- mandoc.h 2
-rw-r--r-- preconv.1 157
-rw-r--r-- preconv.c 425
-rw-r--r-- read.c 71
10 files changed, 180 insertions, 557 deletions
diff --git a/Makefile b/Makefile
index b07ff5ce..610b3801 100644
--- a/Makefile
+++ b/Makefile
@@ -15,7 +15,7 @@
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-BASEBIN = mandoc preconv demandoc
+BASEBIN = mandoc demandoc
DBBIN = makewhatis
CGIBIN = man.cgi
@@ -145,7 +145,6 @@ DISTFILES = INSTALL \
mdoc.h \
msec.in \
out.h \
- preconv.1 \
predefs.in \
roff.7 \
st.in \
@@ -186,6 +185,7 @@ LIBMANDOC_OBJS = $(LIBMAN_OBJS) \
mandoc.o \
mandoc_aux.o \
msec.o \
+ preconv.o \
read.o
COMPAT_OBJS = compat_fgetln.o \
@@ -226,8 +226,6 @@ MAN_OBJS = $(MANDOC_OBJS)
MAKEWHATIS_OBJS = mandocdb.o mansearch_const.o manpath.o
-PRECONV_OBJS = preconv.o
-
APROPOS_OBJS = mansearch.o mansearch_const.o manpath.o
CGI_OBJS = $(MANDOC_HTML_OBJS) \
@@ -244,7 +242,6 @@ WWW_MANS = apropos.1.html \
demandoc.1.html \
man.1.html \
mandoc.1.html \
- preconv.1.html \
mandoc.3.html \
mandoc_escape.3.html \
mandoc_html.3.html \
@@ -302,7 +299,6 @@ clean:
rm -f libmandoc.a $(LIBMANDOC_OBJS) $(COMPAT_OBJS)
rm -f mandoc $(MANDOC_OBJS) $(APROPOS_OBJS)
rm -f makewhatis $(MAKEWHATIS_OBJS)
- rm -f preconv $(PRECONV_OBJS)
rm -f man.cgi $(CGI_OBJS)
rm -f manpage $(MANPAGE_OBJS)
rm -f demandoc $(DEMANDOC_OBJS)
@@ -321,7 +317,7 @@ base-install: base-build
$(INSTALL_LIB) libmandoc.a $(DESTDIR)$(LIBDIR)
$(INSTALL_LIB) man.h mandoc.h mandoc_aux.h mdoc.h \
$(DESTDIR)$(INCLUDEDIR)
- $(INSTALL_MAN) man.1 mandoc.1 preconv.1 demandoc.1 \
+ $(INSTALL_MAN) man.1 mandoc.1 demandoc.1 \
$(DESTDIR)$(MANDIR)/man1
$(INSTALL_MAN) mandoc.3 mandoc_escape.3 mandoc_malloc.3 \
mchars_alloc.3 tbl.3 $(DESTDIR)$(MANDIR)/man3
@@ -386,9 +382,6 @@ mandoc: $(MAN_OBJS) libmandoc.a
makewhatis: $(MAKEWHATIS_OBJS) libmandoc.a
$(CC) $(LDFLAGS) -o $@ $(MAKEWHATIS_OBJS) libmandoc.a $(DBLIB)
-preconv: $(PRECONV_OBJS)
- $(CC) $(LDFLAGS) -o $@ $(PRECONV_OBJS)
-
manpage: $(MANPAGE_OBJS) libmandoc.a
$(CC) $(LDFLAGS) -o $@ $(MANPAGE_OBJS) libmandoc.a $(DBLIB)
diff --git a/apropos.1 b/apropos.1
index 01d6ac26..fc940d06 100644
--- a/apropos.1
+++ b/apropos.1
@@ -79,7 +79,7 @@ to paginate them.
In
.Fl a
mode, the options
-.Fl IOTW
+.Fl IKOTW
described in the
.Xr mandoc 1
manual are also available.
diff --git a/libmandoc.h b/libmandoc.h
index d74d1e20..027009db 100644
--- a/libmandoc.h
+++ b/libmandoc.h
@@ -30,6 +30,12 @@ enum rofferr {
ROFF_ERR /* badness: puke and stop */
};
+struct buf {
+ char *buf;
+ size_t sz;
+ size_t offs;
+};
+
__BEGIN_DECLS
struct roff;
@@ -66,6 +72,9 @@ int man_endparse(struct man *);
int man_addspan(struct man *, const struct tbl_span *);
int man_addeqn(struct man *, const struct eqn *);
+int preconv_cue(const struct buf *);
+int preconv_encode(struct buf *, struct buf *, int *);
+
void roff_free(struct roff *);
struct roff *roff_alloc(struct mparse *, int);
void roff_reset(struct roff *);
diff --git a/main.c b/main.c
index 2451ff5e..2c358c75 100644
--- a/main.c
+++ b/main.c
@@ -82,6 +82,7 @@ struct curparse {
char outopts[BUFSIZ]; /* buf of output opts */
};
+static int koptions(int *, char *);
static int moptions(int *, char *);
static void mmsg(enum mandocerr, enum mandoclevel,
const char *, int, int, const char *);
@@ -149,14 +150,15 @@ main(int argc, char *argv[])
memset(&curp, 0, sizeof(struct curparse));
curp.outtype = OUTT_ASCII;
curp.wlevel = MANDOCLEVEL_FATAL;
- options = MPARSE_SO;
+ options = MPARSE_SO | MPARSE_UTF8 | MPARSE_LATIN1;
defos = NULL;
use_pager = 1;
show_usage = 0;
outmode = OUTMODE_DEF;
- while (-1 != (c = getopt(argc, argv, "aC:cfhI:iklM:m:O:S:s:T:VW:w"))) {
+ while (-1 != (c = getopt(argc, argv,
+ "aC:cfhI:iK:klM:m:O:S:s:T:VW:w"))) {
switch (c) {
case 'a':
outmode = OUTMODE_ALL;
@@ -192,6 +194,10 @@ main(int argc, char *argv[])
case 'i':
outmode = OUTMODE_INT;
break;
+ case 'K':
+ if ( ! koptions(&options, optarg))
+ return((int)MANDOCLEVEL_BADARG);
+ break;
case 'k':
search.argmode = ARG_EXPR;
break;
@@ -599,6 +605,26 @@ fail:
}
static int
+koptions(int *options, char *arg)
+{
+
+ if ( ! strcmp(arg, "utf-8")) {
+ *options |= MPARSE_UTF8;
+ *options &= ~MPARSE_LATIN1;
+ } else if ( ! strcmp(arg, "iso-8859-1")) {
+ *options |= MPARSE_LATIN1;
+ *options &= ~MPARSE_UTF8;
+ } else if ( ! strcmp(arg, "us-ascii")) {
+ *options &= ~(MPARSE_UTF8 | MPARSE_LATIN1);
+ } else {
+ fprintf(stderr, "%s: -K%s: Bad argument\n",
+ progname, arg);
+ return(0);
+ }
+ return(1);
+}
+
+static int
moptions(int *options, char *arg)
{
diff --git a/man.1 b/man.1
index eb266fe7..4693d165 100644
--- a/man.1
+++ b/man.1
@@ -255,7 +255,7 @@ combination.
The
.Nm
utility also supports the options
-.Fl IOTW
+.Fl IKOTW
described in the
.Xr mandoc 1
manual.
diff --git a/mandoc.1 b/mandoc.1
index b9ed80f7..aa903147 100644
--- a/mandoc.1
+++ b/mandoc.1
@@ -27,6 +27,7 @@
.Sm off
.Op Fl I Cm os Li = Ar name
.Sm on
+.Op Fl K Ns Ar encoding
.Op Fl m Ns Ar format
.Op Fl O Ns Ar option
.Op Fl T Ns Ar output
@@ -89,6 +90,31 @@ macro.
Display only the SYNOPSIS lines.
Implies
.Fl a .
+.It Fl K Ns Ar encoding
+Specify the input encoding.
+The supported
+.Ar encoding
+arguments are
+.Cm us-ascii ,
+.Cm iso-8859-1 ,
+and
+.Cm utf-8 .
+If not specified, autodetection uses the first match:
+.Bl -tag -width iso-8859-1
+.It Cm utf-8
+if the first three bytes of the input file
+are the UTF-8 byte order mark (BOM, 0xefbbbf)
+.It Ar encoding
+if the first or second line of the input file matches the
+.Sy emacs
+mode line format
+.Pp
+.D1 .\e" -*- Oo ...; Oc coding: Ar encoding ; No -*-
+.It Cm utf-8
+if the first non-ASCII byte in the file introduces a valid UTF-8 sequence
+.It Cm iso-8859-1
+otherwise
+.El
.It Fl k
A synonym for
.Xr apropos 1 .
diff --git a/mandoc.h b/mandoc.h
index cf22d6de..14fddd36 100644
--- a/mandoc.h
+++ b/mandoc.h
@@ -393,6 +393,8 @@ struct eqn {
#define MPARSE_MAN 2 /* assume -man */
#define MPARSE_SO 4 /* honour .so requests */
#define MPARSE_QUICK 8 /* abort the parse early */
+#define MPARSE_UTF8 16 /* accept UTF-8 input */
+#define MPARSE_LATIN1 32 /* accept ISO-LATIN-1 input */
enum mandoc_esc {
ESCAPE_ERROR = 0, /* bail! unparsable escape */
diff --git a/preconv.1 b/preconv.1
deleted file mode 100644
index 7b6e647f..00000000
--- a/preconv.1
+++ /dev/null
@@ -1,157 +0,0 @@
-.\" $Id$
-.\"
-.\" Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
-.\"
-.\" Permission to use, copy, modify, and distribute this software for any
-.\" purpose with or without fee is hereby granted, provided that the above
-.\" copyright notice and this permission notice appear in all copies.
-.\"
-.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
-.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
-.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-.\"
-.Dd $Mdocdate$
-.Dt PRECONV 1
-.Os
-.Sh NAME
-.Nm preconv
-.Nd recode multibyte UNIX manuals
-.Sh SYNOPSIS
-.Nm preconv
-.Op Fl D Ar enc
-.Op Fl e Ar enc
-.Op Ar file
-.Sh DESCRIPTION
-The
-.Nm
-utility recodes multibyte
-.Ux
-manual files into
-.Xr mandoc 1
-.Po
-or other troff system supporting the
-.Sq \e[uNNNN]
-escape sequence
-.Pc
-input.
-.Pp
-By default, it parses from standard output, determining encoding as
-described in
-.Sx Algorithm .
-.Pp
-Its arguments are as follows:
-.Bl -tag -width Ds
-.It Fl D Ar enc
-The default encoding.
-.It Fl e Ar enc
-The document's encoding.
-.It Ar file
-The input file.
-.El
-.Pp
-The recoded input is written to standard output: Unicode characters in
-the ASCII range are printed as regular ASCII characters, while those
-above this range are printed using the
-.Sq \e[uNNNN]
-format documented in
-.Xr mandoc_char 7 .
-.Pp
-If input bytes are improperly formed in the current encoding, they're
-passed unmodified to standard output.
-For some encodings, such as UTF-8, unrecoverable input sequences will
-cause
-.Nm
-to stop processing and exit.
-.Ss Algorithm
-An encoding is chosen according to the following steps:
-.Bl -enum
-.It
-From the argument passed to
-.Fl e Ar enc .
-.It
-If a BOM exists, UTF\-8 encoding is selected.
-.It
-From the coding tags parsed from
-.Qq File Variables
-on the first two lines of input.
-A file variable is an input line of the form
-.Pp
-.Dl \%.\e\(dq -*- key: val [; key: val ]* -*-
-.Pp
-A coding tag variable is where
-.Cm key
-is
-.Qq coding
-and
-.Cm val
-is the name of the encoding.
-A typical file variable with a coding tag is
-.Pp
-.Dl \%.\e\(dq -*- mode: troff; coding: utf-8 -*-
-.It
-From the argument passed to
-.Fl D Ar enc .
-.It
-If all else fails, Latin\-1 is used.
-.El
-.Pp
-The
-.Nm
-utility recognises the UTF\-8, us\-ascii, and latin\-1 encodings as
-passed to the
-.Fl e
-and
-.Fl D
-arguments, or as coding tags.
-Encodings are matched case-insensitively.
-.\" .Sh IMPLEMENTATION NOTES
-.\" Not used in OpenBSD.
-.\" .Sh RETURN VALUES
-.\" For sections 2, 3, & 9 only.
-.\" .Sh ENVIRONMENT
-.\" For sections 1, 6, 7, & 8 only.
-.\" .Sh FILES
-.Sh EXIT STATUS
-.Ex -std
-.Sh EXAMPLES
-Explicitly page a UTF\-8 manual
-.Pa foo.1
-in the current locale:
-.Pp
-.Dl $ preconv \-e utf\-8 foo.1 | mandoc -Tlocale | less
-.\" .Sh DIAGNOSTICS
-.\" For sections 1, 4, 6, 7, & 8 only.
-.\" .Sh ERRORS
-.\" For sections 2, 3, & 9 only.
-.Sh SEE ALSO
-.Xr mandoc 1 ,
-.Xr mandoc_char 7
-.Sh STANDARDS
-The
-.Nm
-utility references the US-ASCII character set standard, ANSI_X3.4\-1968;
-the Latin\-1 character set standard, ISO/IEC 8859\-1:1998; the UTF\-8
-character set standard; and UCS (Unicode), ISO/IEC 10646.
-.Sh HISTORY
-The
-.Nm
-utility first appeared in the GNU troff
-.Pq Dq groff
-system in December 2005, authored by Tomohiro Kubota and Werner
-Lemberg.
-The implementation that is part of the
-.Xr mandoc 1
-utility appeared in May 2011.
-.Sh AUTHORS
-The
-.Nm
-utility was written by
-.An Kristaps Dzonsons Aq Mt kristaps@bsd.lv .
-.\" .Sh CAVEATS
-.\" .Sh BUGS
-.\" .Sh SECURITY CONSIDERATIONS
-.\" Not used in OpenBSD.
diff --git a/preconv.c b/preconv.c
index 64ed5686..d427c3cd 100644
--- a/preconv.c
+++ b/preconv.c
@@ -1,6 +1,7 @@
/* $Id$ */
/*
* Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
+ * Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
@@ -17,117 +18,24 @@
#include "config.h"
#include <sys/types.h>
-#if HAVE_MMAP
-#include <sys/stat.h>
-#include <sys/mman.h>
-#endif
-#include <assert.h>
-#include <fcntl.h>
#include <stdio.h>
-#include <stdlib.h>
#include <string.h>
-#include <unistd.h>
+#include "mandoc.h"
+#include "libmandoc.h"
-/*
- * The read_whole_file() and resize_buf() functions are copied from
- * read.c, including all dependency code.
- */
-
-enum enc {
- ENC_UTF_8, /* UTF-8 */
- ENC_US_ASCII, /* US-ASCII */
- ENC_LATIN_1, /* Latin-1 */
- ENC__MAX
-};
-
-struct buf {
- char *buf; /* binary input buffer */
- size_t sz; /* size of binary buffer */
- size_t offs; /* starting buffer offset */
-};
-
-struct encode {
- const char *name;
- int (*conv)(const struct buf *);
-};
-
-static int cue_enc(const struct buf *, size_t *, enum enc *);
-static int conv_latin_1(const struct buf *);
-static int conv_us_ascii(const struct buf *);
-static int conv_utf_8(const struct buf *);
-static int read_whole_file(const char *, int,
- struct buf *, int *);
-static void resize_buf(struct buf *, size_t);
-static void usage(void);
-
-static const struct encode encs[ENC__MAX] = {
- { "utf-8", conv_utf_8 }, /* ENC_UTF_8 */
- { "us-ascii", conv_us_ascii }, /* ENC_US_ASCII */
- { "latin-1", conv_latin_1 }, /* ENC_LATIN_1 */
-};
-
-static const char *progname;
-
-static void
-usage(void)
-{
-
- fprintf(stderr, "usage: %s "
- "[-D enc] "
- "[-e ENC] "
- "[file]\n", progname);
-}
-
-static int
-conv_latin_1(const struct buf *b)
-{
- size_t i;
- unsigned char cu;
- const char *cp;
-
- cp = b->buf + (int)b->offs;
-
- /*
- * Latin-1 falls into the first 256 code-points of Unicode, so
- * there's no need for any sort of translation. Just make the
- * 8-bit characters use the Unicode escape.
- * Note that binary values 128 < v < 160 are passed through
- * unmodified to mandoc.
- */
-
- for (i = b->offs; i < b->sz; i++) {
- cu = (unsigned char)*cp++;
- cu < 160U ? putchar(cu) : printf("\\[u%.4X]", cu);
- }
-
- return(1);
-}
-
-static int
-conv_us_ascii(const struct buf *b)
-{
-
- /*
- * US-ASCII has no conversion since it falls into the first 128
- * bytes of Unicode.
- */
-
- fwrite(b->buf, 1, b->sz, stdout);
- return(1);
-}
-
-static int
-conv_utf_8(const struct buf *b)
+int
+preconv_encode(struct buf *ib, struct buf *ob, int *filenc)
{
int state, be;
unsigned int accum;
size_t i;
unsigned char cu;
- const char *cp;
const long one = 1L;
- cp = b->buf + (int)b->offs;
+ if ( ! (*filenc & MPARSE_UTF8))
+ goto latin;
+
state = 0;
accum = 0U;
be = 0;
@@ -137,23 +45,26 @@ conv_utf_8(const struct buf *b)
if ( ! (*((const char *)(&one))))
be = 1;
- for (i = b->offs; i < b->sz; i++) {
- cu = (unsigned char)*cp++;
+ for (i = ib->offs; i < ib->sz; i++) {
+ cu = ib->buf[i];
if (state) {
if ( ! (cu & 128) || (cu & 64)) {
/* Bad sequence header. */
- return(0);
+ break;
}
/* Accept only legitimate bit patterns. */
if (cu > 191 || cu < 128) {
/* Bad in-sequence bits. */
- return(0);
+ break;
}
accum |= (cu & 63) << --state * 6;
+ if (state)
+ continue;
+
/*
* Accum is held in little-endian order as
* stipulated by the UTF-8 sequence coding. We
@@ -161,18 +72,21 @@ conv_utf_8(const struct buf *b)
* architecture requires it.
*/
- if (0 == state && be)
+ if (be)
accum = (accum >> 24) |
((accum << 8) & 0x00FF0000) |
((accum >> 8) & 0x0000FF00) |
(accum << 24);
- if (0 == state) {
- accum < 128U ? putchar(accum) :
- printf("\\[u%.4X]", accum);
- accum = 0U;
- }
- } else if (cu & (1 << 7)) {
+ if (accum < 0x80)
+ ob->buf[ob->offs++] = accum;
+ else
+ ob->offs += snprintf(ob->buf + ob->offs,
+ 11, "\\[u%.4X]", accum);
+ ib->offs = i + 1;
+ *filenc &= ~MPARSE_LATIN1;
+ return(1);
+ } else {
/*
* Entering a UTF-8 state: if we encounter a
* UTF-8 bitmask, calculate the expected UTF-8
@@ -184,154 +98,69 @@ conv_utf_8(const struct buf *b)
/* Accept only legitimate bit patterns. */
- switch (state) {
+ switch (state--) {
case (4):
if (cu <= 244 && cu >= 240) {
accum = (cu & 7) << 18;
- break;
+ continue;
}
/* Bad 4-sequence start bits. */
- return(0);
+ break;
case (3):
if (cu <= 239 && cu >= 224) {
accum = (cu & 15) << 12;
- break;
+ continue;
}
/* Bad 3-sequence start bits. */
- return(0);
+ break;
case (2):
if (cu <= 223 && cu >= 194) {
accum = (cu & 31) << 6;
- break;
+ continue;
}
/* Bad 2-sequence start bits. */
- return(0);
+ break;
default:
/* Bad sequence bit mask. */
- return(0);
+ break;
}
- state--;
- } else
- putchar(cu);
- }
-
- if (0 != state) {
- /* Bad trailing bits. */
- return(0);
- }
-
- return(1);
-}
-
-static void
-resize_buf(struct buf *buf, size_t initial)
-{
-
- buf->sz = buf->sz > initial / 2 ?
- 2 * buf->sz : initial;
-
- buf->buf = realloc(buf->buf, buf->sz);
- if (NULL == buf->buf) {
- perror(NULL);
- exit(EXIT_FAILURE);
- }
-}
-
-static int
-read_whole_file(const char *f, int fd,
- struct buf *fb, int *with_mmap)
-{
- size_t off;
- ssize_t ssz;
-
-#if HAVE_MMAP
- struct stat st;
- if (-1 == fstat(fd, &st)) {
- perror(f);
- return(0);
+ break;
+ }
}
- /*
- * If we're a regular file, try just reading in the whole entry
- * via mmap(). This is faster than reading it into blocks, and
- * since each file is only a few bytes to begin with, I'm not
- * concerned that this is going to tank any machines.
- */
+ /* FALLTHROUGH: Invalid or incomplete UTF-8 sequence. */
- if (S_ISREG(st.st_mode) && st.st_size >= (1U << 31)) {
- fprintf(stderr, "%s: input too large\n", f);
+latin:
+ if ( ! (*filenc & MPARSE_LATIN1))
return(0);
- }
-
- if (S_ISREG(st.st_mode)) {
- *with_mmap = 1;
- fb->sz = (size_t)st.st_size;
- fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
- if (fb->buf != MAP_FAILED)
- return(1);
- }
-#endif
-
- /*
- * If this isn't a regular file (like, say, stdin), then we must
- * go the old way and just read things in bit by bit.
- */
-
- *with_mmap = 0;
- off = 0;
- fb->sz = 0;
- fb->buf = NULL;
- for (;;) {
- if (off == fb->sz && fb->sz == (1U << 31)) {
- fprintf(stderr, "%s: input too large\n", f);
- break;
- }
-
- if (off == fb->sz)
- resize_buf(fb, 65536);
- ssz = read(fd, fb->buf + (int)off, fb->sz - off);
- if (ssz == 0) {
- fb->sz = off;
- return(1);
- }
- if (ssz == -1) {
- perror(f);
- break;
- }
- off += (size_t)ssz;
- }
+ ob->offs += snprintf(ob->buf + ob->offs, 11,
+ "\\[u%.4X]", (unsigned char)ib->buf[ib->offs++]);
- free(fb->buf);
- fb->buf = NULL;
- return(0);
+ *filenc &= ~MPARSE_UTF8;
+ return(1);
}
-static int
-cue_enc(const struct buf *b, size_t *offs, enum enc *enc)
+int
+preconv_cue(const struct buf *b)
{
const char *ln, *eoln, *eoph;
- size_t sz, phsz, nsz;
- int i;
+ size_t sz, phsz;
- ln = b->buf + (int)*offs;
- sz = b->sz - *offs;
+ ln = b->buf + b->offs;
+ sz = b->sz - b->offs;
/* Look for the end-of-line. */
if (NULL == (eoln = memchr(ln, '\n', sz)))
- return(-1);
-
- /* Set next-line marker. */
-
- *offs = (size_t)((eoln + 1) - b->buf);
+ eoln = ln + sz;
/* Check if we have the correct header/trailer. */
if ((sz = (size_t)(eoln - ln)) < 10 ||
memcmp(ln, ".\\\" -*-", 7) ||
memcmp(eoln - 3, "-*-", 3))
- return(0);
+ return(MPARSE_UTF8 | MPARSE_LATIN1);
/* Move after the header and adjust for the trailer. */
@@ -355,8 +184,8 @@ cue_enc(const struct buf *b, size_t *offs, enum enc *enc)
/* Only account for the "coding" phrase. */
- if ((phsz = (size_t)(eoph - ln)) < 7 ||
- strncasecmp(ln, "coding:", 7)) {
+ if ((phsz = eoph - ln) < 7 ||
+ strncasecmp(ln, "coding:", 7)) {
sz -= phsz;
ln += phsz;
continue;
@@ -370,153 +199,15 @@ cue_enc(const struct buf *b, size_t *offs, enum enc *enc)
sz--;
}
if (0 == sz)
- break;
+ return(0);
/* Check us against known encodings. */
- for (i = 0; i < (int)ENC__MAX; i++) {
- nsz = strlen(encs[i].name);
- if (phsz < nsz)
- continue;
- if (strncasecmp(ln, encs[i].name, nsz))
- continue;
-
- *enc = (enum enc)i;
- return(1);
- }
-
- /* Unknown encoding. */
-
- *enc = ENC__MAX;
- return(1);
- }
-
- return(0);
-}
-
-int
-main(int argc, char *argv[])
-{
- int i, ch, map, fd, rc;
- struct buf b;
- const char *fn;
- enum enc enc, def;
- unsigned char bom[3] = { 0xEF, 0xBB, 0xBF };
- size_t offs;
- extern int optind;
- extern char *optarg;
-
- progname = strrchr(argv[0], '/');
- if (progname == NULL)
- progname = argv[0];
- else
- ++progname;
-
- fn = "<stdin>";
- fd = STDIN_FILENO;
- rc = EXIT_FAILURE;
- enc = def = ENC__MAX;
- map = 0;
-
- memset(&b, 0, sizeof(struct buf));
-
- while (-1 != (ch = getopt(argc, argv, "D:e:rdvh")))
- switch (ch) {
- case ('D'):
- /* FALLTHROUGH */
- case ('e'):
- for (i = 0; i < (int)ENC__MAX; i++) {
- if (strcasecmp(optarg, encs[i].name))
- continue;
- break;
- }
- if (i < (int)ENC__MAX) {
- if ('D' == ch)
- def = (enum enc)i;
- else
- enc = (enum enc)i;
- break;
- }
-
- fprintf(stderr, "%s: Bad encoding\n", optarg);
- return(EXIT_FAILURE);
- case ('r'):
- /* FALLTHROUGH */
- case ('d'):
- /* FALLTHROUGH */
- case ('v'):
- /* Compatibility with GNU preconv. */
- break;
- case ('h'):
- /* Compatibility with GNU preconv. */
- /* FALLTHROUGH */
- default:
- usage();
- return(EXIT_FAILURE);
- }
-
- argc -= optind;
- argv += optind;
-
- /*
- * Open and read the first argument on the command-line.
- * If we don't have one, we default to stdin.
- */
-
- if (argc > 0) {
- fn = *argv;
- fd = open(fn, O_RDONLY, 0);
- if (-1 == fd) {
- perror(fn);
- return(EXIT_FAILURE);
- }
- }
-
- if ( ! read_whole_file(fn, fd, &b, &map))
- goto out;
-
- /* Try to read the UTF-8 BOM. */
-
- if (ENC__MAX == enc)
- if (b.sz > 3 && 0 == memcmp(b.buf, bom, 3)) {
- b.offs = 3;
- enc = ENC_UTF_8;
- }
-
- /* Try reading from the "-*-" cue. */
-
- if (ENC__MAX == enc) {
- offs = b.offs;
- ch = cue_enc(&b, &offs, &enc);
- if (0 == ch)
- ch = cue_enc(&b, &offs, &enc);
- }
-
- /*
- * No encoding has been detected.
- * Thus, we either fall into our default encoder, if specified,
- * or use Latin-1 if all else fails.
- */
-
- if (ENC__MAX == enc)
- enc = ENC__MAX == def ? ENC_LATIN_1 : def;
-
- if ( ! (*encs[(int)enc].conv)(&b)) {
- fprintf(stderr, "%s: Bad encoding\n", fn);
- goto out;
+ if (phsz > 4 && !strncasecmp(ln, "utf-8", 5))
+ return(MPARSE_UTF8);
+ if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11))
+ return(MPARSE_LATIN1);
+ return(0);
}
-
- rc = EXIT_SUCCESS;
-out:
-#if HAVE_MMAP
- if (map)
- munmap(b.buf, b.sz);
- else
-#endif
- free(b.buf);
-
- if (fd > STDIN_FILENO)
- close(fd);
-
- return(rc);
+ return(MPARSE_UTF8 | MPARSE_LATIN1);
}
diff --git a/read.c b/read.c
index b595a1ae..43368b25 100644
--- a/read.c
+++ b/read.c
@@ -45,11 +45,6 @@
#define REPARSE_LIMIT 1000
-struct buf {
- char *buf; /* binary input buffer */
- size_t sz; /* size of binary buffer */
-};
-
struct mparse {
struct man *pman; /* persistent man parser */
struct mdoc *pmdoc; /* persistent mdoc parser */
@@ -65,6 +60,7 @@ struct mparse {
enum mandoclevel file_status; /* status of current parse */
enum mandoclevel wlevel; /* ignore messages below this */
int options; /* parser options */
+ int filenc; /* encoding of the current file */
int reparse_count; /* finite interp. stack */
int line; /* line number in the file */
};
@@ -326,13 +322,20 @@ mparse_buf_r(struct mparse *curp, struct buf blk, int start)
lnn = curp->line;
pos = 0;
- for (i = 0; i < (int)blk.sz; ) {
+ for (i = blk.offs; i < (int)blk.sz; ) {
if (0 == pos && '\0' == blk.buf[i])
break;
if (start) {
curp->line = lnn;
curp->reparse_count = 0;
+
+ if (lnn < 3 &&
+ curp->filenc & MPARSE_UTF8 &&
+ curp->filenc & MPARSE_LATIN1) {
+ blk.offs = i;
+ curp->filenc = preconv_cue(&blk);
+ }
}
while (i < (int)blk.sz && (start || '\0' != blk.buf[i])) {
@@ -353,27 +356,40 @@ mparse_buf_r(struct mparse *curp, struct buf blk, int start)
}
/*
- * Make sure we have space for at least
- * one backslash and one other character
- * and the trailing NUL byte.
+ * Make sure we have space for the worst
+ * case of 11 bytes: "\\[u10ffff]\0"
*/
- if (pos + 2 >= (int)ln.sz)
+ if (pos + 11 > (int)ln.sz)
resize_buf(&ln, 256);
/*
- * Warn about bogus characters. If you're using
- * non-ASCII encoding, you're screwing your
- * readers. Since I'd rather this not happen,
- * I'll be helpful and replace these characters
- * with "?", so we don't display gibberish.
- * Note to manual writers: use special characters.
+ * Encode 8-bit input.
*/
- c = (unsigned char) blk.buf[i];
+ c = blk.buf[i];
+ if (c & 0x80) {
+ blk.offs = i;
+ ln.offs = pos;
+ if (curp->filenc && preconv_encode(
+ &blk, &ln, &curp->filenc)) {
+ pos = ln.offs;
+ i = blk.offs;
+ } else {
+ mandoc_vmsg(MANDOCERR_BADCHAR,
+ curp, curp->line, pos,
+ "0x%x", c);
+ ln.buf[pos++] = '?';
+ i++;
+ }
+ continue;
+ }
+
+ /*
+ * Exclude control characters.
+ */
- if ( ! (isascii(c) &&
- (isgraph(c) || isblank(c)))) {
+ if (c == 0x7f || (c < 0x20 && c != 0x09)) {
mandoc_vmsg(MANDOCERR_BADCHAR, curp,
curp->line, pos, "0x%x", c);
i++;
@@ -633,6 +649,7 @@ read_whole_file(struct mparse *curp, const char *file, int fd,
return(0);
}
*with_mmap = 1;
+ fb->offs = 0;
fb->sz = (size_t)st.st_size;
fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
if (fb->buf != MAP_FAILED)
@@ -664,6 +681,7 @@ read_whole_file(struct mparse *curp, const char *file, int fd,
ssz = read(fd, fb->buf + (int)off, fb->sz - off);
if (ssz == 0) {
fb->sz = off;
+ fb->offs = 0;
return(1);
}
if (ssz == -1) {
@@ -735,6 +753,15 @@ mparse_parse_buffer(struct mparse *curp, struct buf blk, const char *file)
curp->line = 1;
recursion_depth++;
+ /* Skip a UTF-8 byte order mark. */
+ if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 &&
+ (unsigned char)blk.buf[0] == 0xef &&
+ (unsigned char)blk.buf[1] == 0xbb &&
+ (unsigned char)blk.buf[2] == 0xbf) {
+ blk.offs = 3;
+ curp->filenc &= ~MPARSE_LATIN1;
+ }
+
mparse_buf_r(curp, blk, 1);
if (0 == --recursion_depth && MANDOCLEVEL_FATAL > curp->file_status)
@@ -752,6 +779,7 @@ mparse_readmem(struct mparse *curp, const void *buf, size_t len,
blk.buf = UNCONST(buf);
blk.sz = len;
+ blk.offs = 0;
mparse_parse_buffer(curp, blk, file);
return(curp->file_status);
@@ -762,6 +790,7 @@ mparse_readfd(struct mparse *curp, int fd, const char *file)
{
struct buf blk;
int with_mmap;
+ int save_filenc;
if (-1 == fd && -1 == (fd = open(file, O_RDONLY, 0))) {
curp->file_status = MANDOCLEVEL_SYSERR;
@@ -780,7 +809,11 @@ mparse_readfd(struct mparse *curp, int fd, const char *file)
*/
if (read_whole_file(curp, file, fd, &blk, &with_mmap)) {
+ save_filenc = curp->filenc;
+ curp->filenc = curp->options &
+ (MPARSE_UTF8 | MPARSE_LATIN1);
mparse_parse_buffer(curp, blk, file);
+ curp->filenc = save_filenc;
#if HAVE_MMAP
if (with_mmap)
munmap(blk.buf, blk.sz);