integrate preconv(1) into mandoc(1);

enhances functionality and reduces code and docs by more than 300 lines
author: Ingo Schwarze <schwarze@openbsd.org> 2014-10-25 01:03:52 +0000
committer: Ingo Schwarze <schwarze@openbsd.org> 2014-10-25 01:03:52 +0000
commit: dd148a56f3f3e29132148b8f2bace859b7590d34 (patch)
tree: 5290383cf43df4fcf23e706458fc63fd96eb8cbf
parent: e3f177878d1b2ecad452ce2d7d08861420b2ffbb (diff)
download: mandoc-dd148a56f3f3e29132148b8f2bace859b7590d34.tar.gz
10 files changed, 180 insertions, 557 deletions
diff --git a/Makefile b/Makefile
index b07ff5ce..610b3801 100644
--- a/Makefile
+++ b/Makefile
@@ -15,7 +15,7 @@
 # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 
-BASEBIN		 = mandoc preconv demandoc
+BASEBIN		 = mandoc demandoc
 DBBIN		 = makewhatis
 CGIBIN		 = man.cgi
 
@@ -145,7 +145,6 @@ DISTFILES	 = INSTALL \
 		   mdoc.h \
 		   msec.in \
 		   out.h \
-		   preconv.1 \
 		   predefs.in \
 		   roff.7 \
 		   st.in \
@@ -186,6 +185,7 @@ LIBMANDOC_OBJS	 = $(LIBMAN_OBJS) \
 		   mandoc.o \
 		   mandoc_aux.o \
 		   msec.o \
+		   preconv.o \
 		   read.o
 
 COMPAT_OBJS	 = compat_fgetln.o \
@@ -226,8 +226,6 @@ MAN_OBJS	 = $(MANDOC_OBJS)
 
 MAKEWHATIS_OBJS	 = mandocdb.o mansearch_const.o manpath.o
 
-PRECONV_OBJS	 = preconv.o
-
 APROPOS_OBJS	 = mansearch.o mansearch_const.o manpath.o
 
 CGI_OBJS	 = $(MANDOC_HTML_OBJS) \
@@ -244,7 +242,6 @@ WWW_MANS	 = apropos.1.html \
 		   demandoc.1.html \
 		   man.1.html \
 		   mandoc.1.html \
-		   preconv.1.html \
 		   mandoc.3.html \
 		   mandoc_escape.3.html \
 		   mandoc_html.3.html \
@@ -302,7 +299,6 @@ clean:
 	rm -f libmandoc.a $(LIBMANDOC_OBJS) $(COMPAT_OBJS)
 	rm -f mandoc $(MANDOC_OBJS) $(APROPOS_OBJS)
 	rm -f makewhatis $(MAKEWHATIS_OBJS)
-	rm -f preconv $(PRECONV_OBJS)
 	rm -f man.cgi $(CGI_OBJS)
 	rm -f manpage $(MANPAGE_OBJS)
 	rm -f demandoc $(DEMANDOC_OBJS)
@@ -321,7 +317,7 @@ base-install: base-build
 	$(INSTALL_LIB) libmandoc.a $(DESTDIR)$(LIBDIR)
 	$(INSTALL_LIB) man.h mandoc.h mandoc_aux.h mdoc.h \
 		$(DESTDIR)$(INCLUDEDIR)
-	$(INSTALL_MAN) man.1 mandoc.1 preconv.1 demandoc.1 \
+	$(INSTALL_MAN) man.1 mandoc.1 demandoc.1 \
 		$(DESTDIR)$(MANDIR)/man1
 	$(INSTALL_MAN) mandoc.3 mandoc_escape.3 mandoc_malloc.3 \
 		mchars_alloc.3 tbl.3 $(DESTDIR)$(MANDIR)/man3
@@ -386,9 +382,6 @@ mandoc: $(MAN_OBJS) libmandoc.a
 makewhatis: $(MAKEWHATIS_OBJS) libmandoc.a
 	$(CC) $(LDFLAGS) -o $@ $(MAKEWHATIS_OBJS) libmandoc.a $(DBLIB)
 
-preconv: $(PRECONV_OBJS)
-	$(CC) $(LDFLAGS) -o $@ $(PRECONV_OBJS)
-
 manpage: $(MANPAGE_OBJS) libmandoc.a
 	$(CC) $(LDFLAGS) -o $@ $(MANPAGE_OBJS) libmandoc.a $(DBLIB)
 
diff --git a/apropos.1 b/apropos.1
index 01d6ac26..fc940d06 100644
--- a/apropos.1
+++ b/apropos.1
@@ -79,7 +79,7 @@ to paginate them.
 In
 .Fl a
 mode, the options
-.Fl IOTW
+.Fl IKOTW
 described in the
 .Xr mandoc 1
 manual are also available.
diff --git a/libmandoc.h b/libmandoc.h
index d74d1e20..027009db 100644
--- a/libmandoc.h
+++ b/libmandoc.h
@@ -30,6 +30,12 @@ enum	rofferr {
 	ROFF_ERR /* badness: puke and stop */
 };
 
+struct	buf {
+	char	*buf;
+	size_t	 sz;
+	size_t	 offs;
+};
+
 __BEGIN_DECLS
 
 struct	roff;
@@ -66,6 +72,9 @@ int		 man_endparse(struct man *);
 int		 man_addspan(struct man *, const struct tbl_span *);
 int		 man_addeqn(struct man *, const struct eqn *);
 
+int		 preconv_cue(const struct buf *);
+int		 preconv_encode(struct buf *, struct buf *, int *);
+
 void		 roff_free(struct roff *);
 struct roff	*roff_alloc(struct mparse *, int);
 void		 roff_reset(struct roff *);
diff --git a/main.c b/main.c
index 2451ff5e..2c358c75 100644
--- a/main.c
+++ b/main.c
@@ -82,6 +82,7 @@ struct	curparse {
 	char		  outopts[BUFSIZ]; /* buf of output opts */
 };
 
+static	int		  koptions(int *, char *);
 static	int		  moptions(int *, char *);
 static	void		  mmsg(enum mandocerr, enum mandoclevel,
 				const char *, int, int, const char *);
@@ -149,14 +150,15 @@ main(int argc, char *argv[])
 	memset(&curp, 0, sizeof(struct curparse));
 	curp.outtype = OUTT_ASCII;
 	curp.wlevel  = MANDOCLEVEL_FATAL;
-	options = MPARSE_SO;
+	options = MPARSE_SO | MPARSE_UTF8 | MPARSE_LATIN1;
 	defos = NULL;
 
 	use_pager = 1;
 	show_usage = 0;
 	outmode = OUTMODE_DEF;
 
-	while (-1 != (c = getopt(argc, argv, "aC:cfhI:iklM:m:O:S:s:T:VW:w"))) {
+	while (-1 != (c = getopt(argc, argv,
+			"aC:cfhI:iK:klM:m:O:S:s:T:VW:w"))) {
 		switch (c) {
 		case 'a':
 			outmode = OUTMODE_ALL;
@@ -192,6 +194,10 @@ main(int argc, char *argv[])
 		case 'i':
 			outmode = OUTMODE_INT;
 			break;
+		case 'K':
+			if ( ! koptions(&options, optarg))
+				return((int)MANDOCLEVEL_BADARG);
+			break;
 		case 'k':
 			search.argmode = ARG_EXPR;
 			break;
@@ -599,6 +605,26 @@ fail:
 }
 
 static int
+koptions(int *options, char *arg)
+{
+
+	if ( ! strcmp(arg, "utf-8")) {
+		*options |=  MPARSE_UTF8;
+		*options &= ~MPARSE_LATIN1;
+	} else if ( ! strcmp(arg, "iso-8859-1")) {
+		*options |=  MPARSE_LATIN1;
+		*options &= ~MPARSE_UTF8;
+	} else if ( ! strcmp(arg, "us-ascii")) {
+		*options &= ~(MPARSE_UTF8 | MPARSE_LATIN1);
+	} else {
+		fprintf(stderr, "%s: -K%s: Bad argument\n",
+		    progname, arg);
+		return(0);
+	}
+	return(1);
+}
+
+static int
 moptions(int *options, char *arg)
 {
 
diff --git a/man.1 b/man.1
index eb266fe7..4693d165 100644
--- a/man.1
+++ b/man.1
@@ -255,7 +255,7 @@ combination.
 The
 .Nm
 utility also supports the options
-.Fl IOTW
+.Fl IKOTW
 described in the
 .Xr mandoc 1
 manual.
diff --git a/mandoc.1 b/mandoc.1
index b9ed80f7..aa903147 100644
--- a/mandoc.1
+++ b/mandoc.1
@@ -27,6 +27,7 @@
 .Sm off
 .Op Fl I Cm os Li = Ar name
 .Sm on
+.Op Fl K Ns Ar encoding
 .Op Fl m Ns Ar format
 .Op Fl O Ns Ar option
 .Op Fl T Ns Ar output
@@ -89,6 +90,31 @@ macro.
 Display only the SYNOPSIS lines.
 Implies
 .Fl a .
+.It Fl K Ns Ar encoding
+Specify the input encoding.
+The supported
+.Ar encoding
+arguments are
+.Cm us-ascii ,
+.Cm iso-8859-1 ,
+and
+.Cm utf-8 .
+If not specified, autodetection uses the first match:
+.Bl -tag -width iso-8859-1
+.It Cm utf-8
+if the first three bytes of the input file
+are the UTF-8 byte order mark (BOM, 0xefbbbf)
+.It Ar encoding
+if the first or second line of the input file matches the
+.Sy emacs
+mode line format
+.Pp
+.D1 .\e" -*- Oo ...; Oc coding: Ar encoding ; No -*-
+.It Cm utf-8
+if the first non-ASCII byte in the file introduces a valid UTF-8 sequence
+.It Cm iso-8859-1
+otherwise
+.El
 .It Fl k
 A synonym for
 .Xr apropos 1 .
diff --git a/mandoc.h b/mandoc.h
index cf22d6de..14fddd36 100644
--- a/mandoc.h
+++ b/mandoc.h
@@ -393,6 +393,8 @@ struct	eqn {
 #define	MPARSE_MAN	2  /* assume -man */
 #define	MPARSE_SO	4  /* honour .so requests */
 #define	MPARSE_QUICK	8  /* abort the parse early */
+#define	MPARSE_UTF8	16 /* accept UTF-8 input */
+#define	MPARSE_LATIN1	32 /* accept ISO-LATIN-1 input */
 
 enum	mandoc_esc {
 	ESCAPE_ERROR = 0, /* bail! unparsable escape */
diff --git a/preconv.1 b/preconv.1
deleted file mode 100644
index 7b6e647f..00000000
--- a/preconv.1
+++ /dev/null
@@ -1,157 +0,0 @@
-.\"	$Id$
-.\"
-.\" Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
-.\"
-.\" Permission to use, copy, modify, and distribute this software for any
-.\" purpose with or without fee is hereby granted, provided that the above
-.\" copyright notice and this permission notice appear in all copies.
-.\"
-.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
-.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
-.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-.\"
-.Dd $Mdocdate$
-.Dt PRECONV 1
-.Os
-.Sh NAME
-.Nm preconv
-.Nd recode multibyte UNIX manuals
-.Sh SYNOPSIS
-.Nm preconv
-.Op Fl D Ar enc
-.Op Fl e Ar enc
-.Op Ar file
-.Sh DESCRIPTION
-The
-.Nm
-utility recodes multibyte
-.Ux
-manual files into
-.Xr mandoc 1
-.Po
-or other troff system supporting the
-.Sq \e[uNNNN]
-escape sequence
-.Pc
-input.
-.Pp
-By default, it parses from standard output, determining encoding as
-described in
-.Sx Algorithm .
-.Pp
-Its arguments are as follows:
-.Bl -tag -width Ds
-.It Fl D Ar enc
-The default encoding.
-.It Fl e Ar enc
-The document's encoding.
-.It Ar file
-The input file.
-.El
-.Pp
-The recoded input is written to standard output: Unicode characters in
-the ASCII range are printed as regular ASCII characters, while those
-above this range are printed using the
-.Sq \e[uNNNN]
-format documented in
-.Xr mandoc_char 7 .
-.Pp
-If input bytes are improperly formed in the current encoding, they're
-passed unmodified to standard output.
-For some encodings, such as UTF-8, unrecoverable input sequences will
-cause
-.Nm
-to stop processing and exit.
-.Ss Algorithm
-An encoding is chosen according to the following steps:
-.Bl -enum
-.It
-From the argument passed to
-.Fl e Ar enc .
-.It
-If a BOM exists, UTF\-8 encoding is selected.
-.It
-From the coding tags parsed from
-.Qq File Variables
-on the first two lines of input.
-A file variable is an input line of the form
-.Pp
-.Dl \%.\e\(dq -*- key: val [; key: val ]* -*-
-.Pp
-A coding tag variable is where
-.Cm key
-is
-.Qq coding
-and
-.Cm val
-is the name of the encoding.
-A typical file variable with a coding tag is
-.Pp
-.Dl \%.\e\(dq -*- mode: troff; coding: utf-8 -*-
-.It
-From the argument passed to
-.Fl D Ar enc .
-.It
-If all else fails, Latin\-1 is used.
-.El
-.Pp
-The
-.Nm
-utility recognises the UTF\-8, us\-ascii, and latin\-1 encodings as
-passed to the
-.Fl e
-and
-.Fl D
-arguments, or as coding tags.
-Encodings are matched case-insensitively.
-.\" .Sh IMPLEMENTATION NOTES
-.\" Not used in OpenBSD.
-.\" .Sh RETURN VALUES
-.\" For sections 2, 3, & 9 only.
-.\" .Sh ENVIRONMENT
-.\" For sections 1, 6, 7, & 8 only.
-.\" .Sh FILES
-.Sh EXIT STATUS
-.Ex -std
-.Sh EXAMPLES
-Explicitly page a UTF\-8 manual
-.Pa foo.1
-in the current locale:
-.Pp
-.Dl $ preconv \-e utf\-8 foo.1 | mandoc -Tlocale | less
-.\" .Sh DIAGNOSTICS
-.\" For sections 1, 4, 6, 7, & 8 only.
-.\" .Sh ERRORS
-.\" For sections 2, 3, & 9 only.
-.Sh SEE ALSO
-.Xr mandoc 1 ,
-.Xr mandoc_char 7
-.Sh STANDARDS
-The
-.Nm
-utility references the US-ASCII character set standard, ANSI_X3.4\-1968;
-the Latin\-1 character set standard, ISO/IEC 8859\-1:1998; the UTF\-8
-character set standard; and UCS (Unicode), ISO/IEC 10646.
-.Sh HISTORY
-The
-.Nm
-utility first appeared in the GNU troff
-.Pq Dq groff
-system in December 2005, authored by Tomohiro Kubota and Werner
-Lemberg.
-The implementation that is part of the
-.Xr mandoc 1
-utility appeared in May 2011.
-.Sh AUTHORS
-The
-.Nm
-utility was written by
-.An Kristaps Dzonsons Aq Mt kristaps@bsd.lv .
-.\" .Sh CAVEATS
-.\" .Sh BUGS
-.\" .Sh SECURITY CONSIDERATIONS
-.\" Not used in OpenBSD.
diff --git a/preconv.c b/preconv.c
index 64ed5686..d427c3cd 100644
--- a/preconv.c
+++ b/preconv.c
@@ -1,6 +1,7 @@
 /*	$Id$ */
 /*
  * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
+ * Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -17,117 +18,24 @@
 #include "config.h"
 
 #include <sys/types.h>
-#if HAVE_MMAP
-#include <sys/stat.h>
-#include <sys/mman.h>
-#endif
 
-#include <assert.h>
-#include <fcntl.h>
 #include <stdio.h>
-#include <stdlib.h>
 #include <string.h>
-#include <unistd.h>
+#include "mandoc.h"
+#include "libmandoc.h"
 
-/* 
- * The read_whole_file() and resize_buf() functions are copied from
- * read.c, including all dependency code.
- */
-
-enum	enc {
-	ENC_UTF_8, /* UTF-8 */
-	ENC_US_ASCII, /* US-ASCII */
-	ENC_LATIN_1, /* Latin-1 */
-	ENC__MAX
-};
-
-struct	buf {
-	char		 *buf; /* binary input buffer */
-	size_t	 	  sz; /* size of binary buffer */
-	size_t		  offs; /* starting buffer offset */
-};
-
-struct	encode {
-	const char	 *name;
-	int		(*conv)(const struct buf *);
-};
-
-static	int	 cue_enc(const struct buf *, size_t *, enum enc *);
-static	int	 conv_latin_1(const struct buf *);
-static	int	 conv_us_ascii(const struct buf *);
-static	int	 conv_utf_8(const struct buf *);
-static	int	 read_whole_file(const char *, int, 
-			struct buf *, int *);
-static	void	 resize_buf(struct buf *, size_t);
-static	void	 usage(void);
-
-static	const struct encode encs[ENC__MAX] = {
-	{ "utf-8", conv_utf_8 }, /* ENC_UTF_8 */
-	{ "us-ascii", conv_us_ascii }, /* ENC_US_ASCII */
-	{ "latin-1", conv_latin_1 }, /* ENC_LATIN_1 */
-};
-
-static	const char	 *progname;
-
-static void
-usage(void)
-{
-
-	fprintf(stderr, "usage: %s "
-			"[-D enc] "
-			"[-e ENC] "
-			"[file]\n", progname);
-}
-
-static int
-conv_latin_1(const struct buf *b)
-{
-	size_t		 i;
-	unsigned char	 cu;
-	const char	*cp;
-
-	cp = b->buf + (int)b->offs;
-
-	/*
-	 * Latin-1 falls into the first 256 code-points of Unicode, so
-	 * there's no need for any sort of translation.  Just make the
-	 * 8-bit characters use the Unicode escape.
-	 * Note that binary values 128 < v < 160 are passed through
-	 * unmodified to mandoc.
-	 */
-
-	for (i = b->offs; i < b->sz; i++) {
-		cu = (unsigned char)*cp++;
-		cu < 160U ? putchar(cu) : printf("\\[u%.4X]", cu);
-	}
-
-	return(1);
-}
-
-static int
-conv_us_ascii(const struct buf *b)
-{
-
-	/*
-	 * US-ASCII has no conversion since it falls into the first 128
-	 * bytes of Unicode.
-	 */
-
-	fwrite(b->buf, 1, b->sz, stdout);
-	return(1);
-}
-
-static int
-conv_utf_8(const struct buf *b)
+int
+preconv_encode(struct buf *ib, struct buf *ob, int *filenc)
 {
 	int		 state, be;
 	unsigned int	 accum;
 	size_t		 i;
 	unsigned char	 cu;
-	const char	*cp;
 	const long	 one = 1L;
 
-	cp = b->buf + (int)b->offs;
+	if ( ! (*filenc & MPARSE_UTF8))
+		goto latin;
+
 	state = 0;
 	accum = 0U;
 	be = 0;
@@ -137,23 +45,26 @@ conv_utf_8(const struct buf *b)
 	if ( ! (*((const char *)(&one))))
 		be = 1;
 
-	for (i = b->offs; i < b->sz; i++) {
-		cu = (unsigned char)*cp++;
+	for (i = ib->offs; i < ib->sz; i++) {
+		cu = ib->buf[i];
 		if (state) {
 			if ( ! (cu & 128) || (cu & 64)) {
 				/* Bad sequence header. */
-				return(0);
+				break;
 			}
 
 			/* Accept only legitimate bit patterns. */
 
 			if (cu > 191 || cu < 128) {
 				/* Bad in-sequence bits. */
-				return(0);
+				break;
 			}
 
 			accum |= (cu & 63) << --state * 6;
 
+			if (state)
+				continue;
+
 			/*
 			 * Accum is held in little-endian order as
 			 * stipulated by the UTF-8 sequence coding.  We
@@ -161,18 +72,21 @@ conv_utf_8(const struct buf *b)
 			 * architecture requires it.
 			 */
 
-			if (0 == state && be) 
+			if (be)
 				accum = (accum >> 24) | 
 					((accum << 8) & 0x00FF0000) |
 					((accum >> 8) & 0x0000FF00) |
 					(accum << 24);
 
-			if (0 == state) {
-				accum < 128U ? putchar(accum) : 
-					printf("\\[u%.4X]", accum);
-				accum = 0U;
-			}
-		} else if (cu & (1 << 7)) {
+			if (accum < 0x80)
+				ob->buf[ob->offs++] = accum;
+			else
+				ob->offs += snprintf(ob->buf + ob->offs,
+				    11, "\\[u%.4X]", accum);
+			ib->offs = i + 1;
+			*filenc &= ~MPARSE_LATIN1;
+			return(1);
+		} else {
 			/*
 			 * Entering a UTF-8 state:  if we encounter a
 			 * UTF-8 bitmask, calculate the expected UTF-8
@@ -184,154 +98,69 @@ conv_utf_8(const struct buf *b)
 
 			/* Accept only legitimate bit patterns. */
 
-			switch (state) {
+			switch (state--) {
 			case (4):
 				if (cu <= 244 && cu >= 240) {
 					accum = (cu & 7) << 18;
-					break;
+					continue;
 				}
 				/* Bad 4-sequence start bits. */
-				return(0);
+				break;
 			case (3):
 				if (cu <= 239 && cu >= 224) {
 					accum = (cu & 15) << 12;
-					break;
+					continue;
 				}
 				/* Bad 3-sequence start bits. */
-				return(0);
+				break;
 			case (2):
 				if (cu <= 223 && cu >= 194) {
 					accum = (cu & 31) << 6;
-					break;
+					continue;
 				}
 				/* Bad 2-sequence start bits. */
-				return(0);
+				break;
 			default:
 				/* Bad sequence bit mask. */
-				return(0);
+				break;
 			}
-			state--;
-		} else
-			putchar(cu);
-	}
-
-	if (0 != state) {
-		/* Bad trailing bits. */
-		return(0);
-	}
-
-	return(1);
-}
-
-static void
-resize_buf(struct buf *buf, size_t initial)
-{
-
-	buf->sz = buf->sz > initial / 2 ? 
-		2 * buf->sz : initial;
-
-	buf->buf = realloc(buf->buf, buf->sz);
-	if (NULL == buf->buf) {
-		perror(NULL);
-		exit(EXIT_FAILURE);
-	}
-}
-
-static int
-read_whole_file(const char *f, int fd, 
-		struct buf *fb, int *with_mmap)
-{
-	size_t		 off;
-	ssize_t		 ssz;
-
-#if HAVE_MMAP
-	struct stat	 st;
-	if (-1 == fstat(fd, &st)) {
-		perror(f);
-		return(0);
+			break;
+		}
 	}
 
-	/*
-	 * If we're a regular file, try just reading in the whole entry
-	 * via mmap().  This is faster than reading it into blocks, and
-	 * since each file is only a few bytes to begin with, I'm not
-	 * concerned that this is going to tank any machines.
-	 */
+	/* FALLTHROUGH: Invalid or incomplete UTF-8 sequence. */
 
-	if (S_ISREG(st.st_mode) && st.st_size >= (1U << 31)) {
-		fprintf(stderr, "%s: input too large\n", f);
+latin:
+	if ( ! (*filenc & MPARSE_LATIN1))
 		return(0);
-	} 
-	
-	if (S_ISREG(st.st_mode)) {
-		*with_mmap = 1;
-		fb->sz = (size_t)st.st_size;
-		fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
-		if (fb->buf != MAP_FAILED)
-			return(1);
-	}
-#endif
-
-	/*
-	 * If this isn't a regular file (like, say, stdin), then we must
-	 * go the old way and just read things in bit by bit.
-	 */
-
-	*with_mmap = 0;
-	off = 0;
-	fb->sz = 0;
-	fb->buf = NULL;
-	for (;;) {
-		if (off == fb->sz && fb->sz == (1U << 31)) {
-			fprintf(stderr, "%s: input too large\n", f);
-			break;
-		} 
-		
-		if (off == fb->sz)
-			resize_buf(fb, 65536);
 
-		ssz = read(fd, fb->buf + (int)off, fb->sz - off);
-		if (ssz == 0) {
-			fb->sz = off;
-			return(1);
-		}
-		if (ssz == -1) {
-			perror(f);
-			break;
-		}
-		off += (size_t)ssz;
-	}
+	ob->offs += snprintf(ob->buf + ob->offs, 11,
+	    "\\[u%.4X]", (unsigned char)ib->buf[ib->offs++]);
 
-	free(fb->buf);
-	fb->buf = NULL;
-	return(0);
+	*filenc &= ~MPARSE_UTF8;
+	return(1);
 }
 
-static int
-cue_enc(const struct buf *b, size_t *offs, enum enc *enc)
+int
+preconv_cue(const struct buf *b)
 {
 	const char	*ln, *eoln, *eoph;
-	size_t		 sz, phsz, nsz;
-	int		 i;
+	size_t		 sz, phsz;
 
-	ln = b->buf + (int)*offs;
-	sz = b->sz - *offs;
+	ln = b->buf + b->offs;
+	sz = b->sz - b->offs;
 
 	/* Look for the end-of-line. */
 
 	if (NULL == (eoln = memchr(ln, '\n', sz)))
-		return(-1);
-
-	/* Set next-line marker. */
-
-	*offs = (size_t)((eoln + 1) - b->buf);
+		eoln = ln + sz;
 
 	/* Check if we have the correct header/trailer. */
 
 	if ((sz = (size_t)(eoln - ln)) < 10 || 
 			memcmp(ln, ".\\\" -*-", 7) ||
 			memcmp(eoln - 3, "-*-", 3))
-		return(0);
+		return(MPARSE_UTF8 | MPARSE_LATIN1);
 
 	/* Move after the header and adjust for the trailer. */
 
@@ -355,8 +184,8 @@ cue_enc(const struct buf *b, size_t *offs, enum enc *enc)
 
 		/* Only account for the "coding" phrase. */
 
-		if ((phsz = (size_t)(eoph - ln)) < 7 ||
-				strncasecmp(ln, "coding:", 7)) {
+		if ((phsz = eoph - ln) < 7 ||
+		    strncasecmp(ln, "coding:", 7)) {
 			sz -= phsz;
 			ln += phsz;
 			continue;
@@ -370,153 +199,15 @@ cue_enc(const struct buf *b, size_t *offs, enum enc *enc)
 			sz--;
 		}
 		if (0 == sz)
-			break;
+			return(0);
 
 		/* Check us against known encodings. */
 
-		for (i = 0; i < (int)ENC__MAX; i++) {
-			nsz = strlen(encs[i].name);
-			if (phsz < nsz)
-				continue;
-			if (strncasecmp(ln, encs[i].name, nsz))
-				continue;
-
-			*enc = (enum enc)i;
-			return(1);
-		}
-
-		/* Unknown encoding. */
-
-		*enc = ENC__MAX;
-		return(1);
-	}
-
-	return(0);
-}
-
-int
-main(int argc, char *argv[])
-{
-	int	 	 i, ch, map, fd, rc;
-	struct buf	 b;
-	const char	*fn;
-	enum enc	 enc, def;
-	unsigned char 	 bom[3] = { 0xEF, 0xBB, 0xBF };
-	size_t		 offs;
-	extern int	 optind;
-	extern char	*optarg;
-
-	progname = strrchr(argv[0], '/');
-	if (progname == NULL)
-		progname = argv[0];
-	else
-		++progname;
-
-	fn = "<stdin>";
-	fd = STDIN_FILENO;
-	rc = EXIT_FAILURE;
-	enc = def = ENC__MAX;
-	map = 0;
-
-	memset(&b, 0, sizeof(struct buf));
-
-	while (-1 != (ch = getopt(argc, argv, "D:e:rdvh")))
-		switch (ch) {
-		case ('D'):
-			/* FALLTHROUGH */
-		case ('e'):
-			for (i = 0; i < (int)ENC__MAX; i++) {
-				if (strcasecmp(optarg, encs[i].name))
-					continue;
-				break;
-			}
-			if (i < (int)ENC__MAX) {
-				if ('D' == ch)
-					def = (enum enc)i;
-				else
-					enc = (enum enc)i;
-				break;
-			}
-
-			fprintf(stderr, "%s: Bad encoding\n", optarg);
-			return(EXIT_FAILURE);
-		case ('r'):
-			/* FALLTHROUGH */
-		case ('d'):
-			/* FALLTHROUGH */
-		case ('v'):
-			/* Compatibility with GNU preconv. */
-			break;
-		case ('h'):
-			/* Compatibility with GNU preconv. */
-			/* FALLTHROUGH */
-		default:
-			usage();
-			return(EXIT_FAILURE);
-		}
-
-	argc -= optind;
-	argv += optind;
-	
-	/* 
-	 * Open and read the first argument on the command-line.
-	 * If we don't have one, we default to stdin.
-	 */
-
-	if (argc > 0) {
-		fn = *argv;
-		fd = open(fn, O_RDONLY, 0);
-		if (-1 == fd) {
-			perror(fn);
-			return(EXIT_FAILURE);
-		}
-	}
-
-	if ( ! read_whole_file(fn, fd, &b, &map))
-		goto out;
-
-	/* Try to read the UTF-8 BOM. */
-
-	if (ENC__MAX == enc)
-		if (b.sz > 3 && 0 == memcmp(b.buf, bom, 3)) {
-			b.offs = 3;
-			enc = ENC_UTF_8;
-		}
-
-	/* Try reading from the "-*-" cue. */
-
-	if (ENC__MAX == enc) {
-		offs = b.offs;
-		ch = cue_enc(&b, &offs, &enc);
-		if (0 == ch)
-			ch = cue_enc(&b, &offs, &enc);
-	}
-
-	/*
-	 * No encoding has been detected.
-	 * Thus, we either fall into our default encoder, if specified,
-	 * or use Latin-1 if all else fails.
-	 */
-
-	if (ENC__MAX == enc) 
-		enc = ENC__MAX == def ? ENC_LATIN_1 : def;
-
-	if ( ! (*encs[(int)enc].conv)(&b)) {
-		fprintf(stderr, "%s: Bad encoding\n", fn);
-		goto out;
+		if (phsz > 4 && !strncasecmp(ln, "utf-8", 5))
+			return(MPARSE_UTF8);
+		if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11))
+			return(MPARSE_LATIN1);
+		return(0);
 	}
-
-	rc = EXIT_SUCCESS;
-out:
-#if HAVE_MMAP
-	if (map)
-		munmap(b.buf, b.sz);
-	else 
-#endif
-		free(b.buf);
-
-	if (fd > STDIN_FILENO)
-		close(fd);
-
-	return(rc);
+	return(MPARSE_UTF8 | MPARSE_LATIN1);
 }
diff --git a/read.c b/read.c
index b595a1ae..43368b25 100644
--- a/read.c
+++ b/read.c
@@ -45,11 +45,6 @@
 
 #define	REPARSE_LIMIT	1000
 
-struct	buf {
-	char		 *buf; /* binary input buffer */
-	size_t		  sz; /* size of binary buffer */
-};
-
 struct	mparse {
 	struct man	 *pman; /* persistent man parser */
 	struct mdoc	 *pmdoc; /* persistent mdoc parser */
@@ -65,6 +60,7 @@ struct	mparse {
 	enum mandoclevel  file_status; /* status of current parse */
 	enum mandoclevel  wlevel; /* ignore messages below this */
 	int		  options; /* parser options */
+	int		  filenc; /* encoding of the current file */
 	int		  reparse_count; /* finite interp. stack */
 	int		  line; /* line number in the file */
 };
@@ -326,13 +322,20 @@ mparse_buf_r(struct mparse *curp, struct buf blk, int start)
 	lnn = curp->line;
 	pos = 0;
 
-	for (i = 0; i < (int)blk.sz; ) {
+	for (i = blk.offs; i < (int)blk.sz; ) {
 		if (0 == pos && '\0' == blk.buf[i])
 			break;
 
 		if (start) {
 			curp->line = lnn;
 			curp->reparse_count = 0;
+
+			if (lnn < 3 &&
+			    curp->filenc & MPARSE_UTF8 &&
+			    curp->filenc & MPARSE_LATIN1) {
+				blk.offs = i;
+				curp->filenc = preconv_cue(&blk);
+			}
 		}
 
 		while (i < (int)blk.sz && (start || '\0' != blk.buf[i])) {
@@ -353,27 +356,40 @@ mparse_buf_r(struct mparse *curp, struct buf blk, int start)
 			}
 
 			/*
-			 * Make sure we have space for at least
-			 * one backslash and one other character
-			 * and the trailing NUL byte.
+			 * Make sure we have space for the worst
+			 * case of 11 bytes: "\\[u10ffff]\0"
 			 */
 
-			if (pos + 2 >= (int)ln.sz)
+			if (pos + 11 > (int)ln.sz)
 				resize_buf(&ln, 256);
 
 			/*
-			 * Warn about bogus characters.  If you're using
-			 * non-ASCII encoding, you're screwing your
-			 * readers.  Since I'd rather this not happen,
-			 * I'll be helpful and replace these characters
-			 * with "?", so we don't display gibberish.
-			 * Note to manual writers: use special characters.
+			 * Encode 8-bit input.
 			 */
 
-			c = (unsigned char) blk.buf[i];
+			c = blk.buf[i];
+			if (c & 0x80) {
+				blk.offs = i;
+				ln.offs = pos;
+				if (curp->filenc && preconv_encode(
+				    &blk, &ln, &curp->filenc)) {
+					pos = ln.offs;
+					i = blk.offs;
+				} else {
+					mandoc_vmsg(MANDOCERR_BADCHAR,
+					    curp, curp->line, pos,
+					    "0x%x", c);
+					ln.buf[pos++] = '?';
+					i++;
+				}
+				continue;
+			}
+
+			/*
+			 * Exclude control characters.
+			 */
 
-			if ( ! (isascii(c) &&
-			    (isgraph(c) || isblank(c)))) {
+			if (c == 0x7f || (c < 0x20 && c != 0x09)) {
 				mandoc_vmsg(MANDOCERR_BADCHAR, curp,
 				    curp->line, pos, "0x%x", c);
 				i++;
@@ -633,6 +649,7 @@ read_whole_file(struct mparse *curp, const char *file, int fd,
 			return(0);
 		}
 		*with_mmap = 1;
+		fb->offs = 0;
 		fb->sz = (size_t)st.st_size;
 		fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
 		if (fb->buf != MAP_FAILED)
@@ -664,6 +681,7 @@ read_whole_file(struct mparse *curp, const char *file, int fd,
 		ssz = read(fd, fb->buf + (int)off, fb->sz - off);
 		if (ssz == 0) {
 			fb->sz = off;
+			fb->offs = 0;
 			return(1);
 		}
 		if (ssz == -1) {
@@ -735,6 +753,15 @@ mparse_parse_buffer(struct mparse *curp, struct buf blk, const char *file)
 	curp->line = 1;
 	recursion_depth++;
 
+	/* Skip an UTF-8 byte order mark. */
+	if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 &&
+	    (unsigned char)blk.buf[0] == 0xef &&
+	    (unsigned char)blk.buf[1] == 0xbb &&
+	    (unsigned char)blk.buf[2] == 0xbf) {
+		blk.offs = 3;
+		curp->filenc &= ~MPARSE_LATIN1;
+	}
+
 	mparse_buf_r(curp, blk, 1);
 
 	if (0 == --recursion_depth && MANDOCLEVEL_FATAL > curp->file_status)
@@ -752,6 +779,7 @@ mparse_readmem(struct mparse *curp, const void *buf, size_t len,
 
 	blk.buf = UNCONST(buf);
 	blk.sz = len;
+	blk.offs = 0;
 
 	mparse_parse_buffer(curp, blk, file);
 	return(curp->file_status);
@@ -762,6 +790,7 @@ mparse_readfd(struct mparse *curp, int fd, const char *file)
 {
 	struct buf	 blk;
 	int		 with_mmap;
+	int		 save_filenc;
 
 	if (-1 == fd && -1 == (fd = open(file, O_RDONLY, 0))) {
 		curp->file_status = MANDOCLEVEL_SYSERR;
@@ -780,7 +809,11 @@ mparse_readfd(struct mparse *curp, int fd, const char *file)
 	 */
 
 	if (read_whole_file(curp, file, fd, &blk, &with_mmap)) {
+		save_filenc = curp->filenc;
+		curp->filenc = curp->options &
+		    (MPARSE_UTF8 | MPARSE_LATIN1);
 		mparse_parse_buffer(curp, blk, file);
+		curp->filenc = save_filenc;
 #if HAVE_MMAP
 		if (with_mmap)
 			munmap(blk.buf, blk.sz);
author	Ingo Schwarze <schwarze@openbsd.org>	2014-10-25 01:03:52 +0000
committer	Ingo Schwarze <schwarze@openbsd.org>	2014-10-25 01:03:52 +0000
commit	dd148a56f3f3e29132148b8f2bace859b7590d34 (patch)
tree	5290383cf43df4fcf23e706458fc63fd96eb8cbf
parent	e3f177878d1b2ecad452ce2d7d08861420b2ffbb (diff)
download	mandoc-dd148a56f3f3e29132148b8f2bace859b7590d34.tar.gz