6 files changed, 677 insertions, 646 deletions
diff --git a/Makefile b/Makefile
index 17e1efb7..0ee080be 100644
--- a/Makefile
+++ b/Makefile
@@ -122,6 +122,7 @@ SRCS		 = arch.c \
 		   preconv.c \
 		   read.c \
 		   roff.c \
+		   roff_escape.c \
 		   roff_html.c \
 		   roff_term.c \
 		   roff_validate.c \
@@ -235,6 +236,7 @@ LIBMDOC_OBJS	 = att.o \
 
 LIBROFF_OBJS	 = eqn.o \
 		   roff.o \
+		   roff_escape.o \
 		   roff_validate.o \
 		   tbl.o \
 		   tbl_data.o \
diff --git a/mandoc.c b/mandoc.c
index 2abbbd56..a1ddc72b 100644
--- a/mandoc.c
+++ b/mandoc.c
@@ -1,7 +1,8 @@
 /* $Id$ */
 /*
- * Copyright (c) 2011-2015, 2017-2022 Ingo Schwarze <schwarze@openbsd.org>
- * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
+ * Copyright (c) 2010, 2011, 2015, 2017, 2018, 2019, 2020, 2021
+ *               Ingo Schwarze <schwarze@openbsd.org>
+ * Copyright (c) 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -14,6 +15,11 @@
  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Utility functions to handle end of sentence punctuation
+ * and dates and times, for use by mdoc(7) and man(7) parsers.
+ * Utility functions to handle fonts and numbers,
+ * for use by mandoc(1) parsers and formatters.
  */
 #include "config.h"
 
@@ -91,388 +97,6 @@ mandoc_font(const char *cp, int sz)
 	}
 }
 
-enum mandoc_esc
-mandoc_escape(const char **end, const char **start, int *sz)
-{
-	const char	*local_start;
-	int		 local_sz, c, i;
-	char		 term;
-	enum mandoc_esc	 gly;
-
-	/*
-	 * When the caller doesn't provide return storage,
-	 * use local storage.
-	 */
-
-	if (NULL == start)
-		start = &local_start;
-	if (NULL == sz)
-		sz = &local_sz;
-
-	/*
-	 * Treat "\E" just like "\";
-	 * it only makes a difference in copy mode.
-	 */
-
-	while (**end == 'E')
-		++*end;
-
-	/*
-	 * Beyond the backslash, at least one input character
-	 * is part of the escape sequence.  With one exception
-	 * (see below), that character won't be returned.
-	 */
-
-	gly = ESCAPE_ERROR;
-	*start = ++*end;
-	*sz = 0;
-	term = '\0';
-
-	switch ((*start)[-1]) {
-	/*
-	 * First the glyphs.  There are several different forms of
-	 * these, but each eventually returns a substring of the glyph
-	 * name.
-	 */
-	case '(':
-		gly = ESCAPE_SPECIAL;
-		*sz = 2;
-		break;
-	case '[':
-		if (**start == ' ') {
-			++*end;
-			return ESCAPE_ERROR;
-		}
-		gly = ESCAPE_SPECIAL;
-		term = ']';
-		break;
-	case 'C':
-		if ('\'' != **start)
-			return ESCAPE_ERROR;
-		*start = ++*end;
-		gly = ESCAPE_SPECIAL;
-		term = '\'';
-		break;
-
-	/*
-	 * Escapes taking no arguments at all.
-	 */
-	case '!':
-	case '?':
-		return ESCAPE_UNSUPP;
-	case '%':
-	case '&':
-	case ')':
-	case ',':
-	case '/':
-	case '^':
-	case 'a':
-	case 'd':
-	case 'r':
-	case 't':
-	case 'u':
-	case '{':
-	case '|':
-	case '}':
-		return ESCAPE_IGNORE;
-	case 'c':
-		return ESCAPE_NOSPACE;
-	case 'p':
-		return ESCAPE_BREAK;
-
-	/*
-	 * The \z escape is supposed to output the following
-	 * character without advancing the cursor position.
-	 * Since we are mostly dealing with terminal mode,
-	 * let us just skip the next character.
-	 */
-	case 'z':
-		return ESCAPE_SKIPCHAR;
-
-	/*
-	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
-	 * 'X' is the trigger.  These have opaque sub-strings.
-	 */
-	case 'F':
-	case 'f':
-	case 'g':
-	case 'k':
-	case 'M':
-	case 'm':
-	case 'n':
-	case 'O':
-	case 'V':
-	case 'Y':
-	case '*':
-		switch ((*start)[-1]) {
-		case 'f':
-			gly = ESCAPE_FONT;
-			break;
-		case '*':
-			gly = ESCAPE_DEVICE;
-			break;
-		default:
-			gly = ESCAPE_IGNORE;
-			break;
-		}
-		switch (**start) {
-		case '(':
-			if ((*start)[-1] == 'O')
-				gly = ESCAPE_ERROR;
-			*start = ++*end;
-			*sz = 2;
-			break;
-		case '[':
-			if ((*start)[-1] == 'O')
-				gly = (*start)[1] == '5' ?
-				    ESCAPE_UNSUPP : ESCAPE_ERROR;
-			*start = ++*end;
-			term = ']';
-			break;
-		default:
-			if ((*start)[-1] == 'O') {
-				switch (**start) {
-				case '0':
-					gly = ESCAPE_UNSUPP;
-					break;
-				case '1':
-				case '2':
-				case '3':
-				case '4':
-					break;
-				default:
-					gly = ESCAPE_ERROR;
-					break;
-				}
-			}
-			*sz = 1;
-			break;
-		}
-		break;
-
-	/*
-	 * These escapes are of the form \X'Y', where 'X' is the trigger
-	 * and 'Y' is any string.  These have opaque sub-strings.
-	 * The \B and \w escapes are handled in roff.c, roff_res().
-	 */
-	case 'A':
-	case 'b':
-	case 'D':
-	case 'R':
-	case 'X':
-	case 'Z':
-		gly = ESCAPE_IGNORE;
-		/* FALLTHROUGH */
-	case 'o':
-		if (**start == '\0')
-			return ESCAPE_ERROR;
-		if (gly == ESCAPE_ERROR)
-			gly = ESCAPE_OVERSTRIKE;
-		term = **start;
-		*start = ++*end;
-		break;
-
-	/*
-	 * These escapes are of the form \X'N', where 'X' is the trigger
-	 * and 'N' resolves to a numerical expression.
-	 */
-	case 'h':
-	case 'H':
-	case 'L':
-	case 'l':
-	case 'S':
-	case 'v':
-	case 'x':
-		if (strchr(" %&()*+-./0123456789:<=>", **start)) {
-			if ('\0' != **start)
-				++*end;
-			return ESCAPE_ERROR;
-		}
-		switch ((*start)[-1]) {
-		case 'h':
-			gly = ESCAPE_HORIZ;
-			break;
-		case 'l':
-			gly = ESCAPE_HLINE;
-			break;
-		default:
-			gly = ESCAPE_IGNORE;
-			break;
-		}
-		term = **start;
-		*start = ++*end;
-		break;
-
-	/*
-	 * Special handling for the numbered character escape.
-	 * XXX Do any other escapes need similar handling?
-	 */
-	case 'N':
-		if ('\0' == **start)
-			return ESCAPE_ERROR;
-		(*end)++;
-		if (isdigit((unsigned char)**start)) {
-			*sz = 1;
-			return ESCAPE_IGNORE;
-		}
-		(*start)++;
-		while (isdigit((unsigned char)**end))
-			(*end)++;
-		*sz = *end - *start;
-		if ('\0' != **end)
-			(*end)++;
-		return ESCAPE_NUMBERED;
-
-	/*
-	 * Sizes get a special category of their own.
-	 */
-	case 's':
-		gly = ESCAPE_IGNORE;
-
-		/* See +/- counts as a sign. */
-		if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
-			*start = ++*end;
-
-		switch (**end) {
-		case '(':
-			*start = ++*end;
-			*sz = 2;
-			break;
-		case '[':
-			*start = ++*end;
-			term = ']';
-			break;
-		case '\'':
-			*start = ++*end;
-			term = '\'';
-			break;
-		case '3':
-		case '2':
-		case '1':
-			*sz = (*end)[-1] == 's' &&
-			    isdigit((unsigned char)(*end)[1]) ? 2 : 1;
-			break;
-		default:
-			*sz = 1;
-			break;
-		}
-
-		break;
-
-	/*
-	 * Several special characters can be encoded as
-	 * one-byte escape sequences without using \[].
-	 */
-	case ' ':
-	case '\'':
-	case '-':
-	case '.':
-	case '0':
-	case ':':
-	case '_':
-	case '`':
-	case 'e':
-	case '~':
-		gly = ESCAPE_SPECIAL;
-		/* FALLTHROUGH */
-	default:
-		if (gly == ESCAPE_ERROR)
-			gly = ESCAPE_UNDEF;
-		*start = --*end;
-		*sz = 1;
-		break;
-	}
-
-	/*
-	 * Read up to the terminating character,
-	 * paying attention to nested escapes.
-	 */
-
-	if ('\0' != term) {
-		while (**end != term) {
-			switch (**end) {
-			case '\0':
-				return ESCAPE_ERROR;
-			case '\\':
-				(*end)++;
-				if (ESCAPE_ERROR ==
-				    mandoc_escape(end, NULL, NULL))
-					return ESCAPE_ERROR;
-				break;
-			default:
-				(*end)++;
-				break;
-			}
-		}
-		*sz = (*end)++ - *start;
-
-		/*
-		 * The file chars.c only provides one common list
-		 * of character names, but \[-] == \- is the only
-		 * one of the characters with one-byte names that
-		 * allows enclosing the name in brackets.
-		 */
-		if (gly == ESCAPE_SPECIAL && *sz == 1 && **start != '-')
-			return ESCAPE_ERROR;
-	} else {
-		assert(*sz > 0);
-		if ((size_t)*sz > strlen(*start))
-			return ESCAPE_ERROR;
-		*end += *sz;
-	}
-
-	/* Run post-processors. */
-
-	switch (gly) {
-	case ESCAPE_FONT:
-		gly = mandoc_font(*start, *sz);
-		break;
-	case ESCAPE_SPECIAL:
-		if (**start == 'c') {
-			if (*sz < 6 || *sz > 7 ||
-			    strncmp(*start, "char", 4) != 0 ||
-			    (int)strspn(*start + 4, "0123456789") + 4 < *sz)
-				break;
-			c = 0;
-			for (i = 4; i < *sz; i++)
-				c = 10 * c + ((*start)[i] - '0');
-			if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
-				break;
-			*start += 4;
-			*sz -= 4;
-			gly = ESCAPE_NUMBERED;
-			break;
-		}
-
-		/*
-		 * Unicode escapes are defined in groff as \[u0000]
-		 * to \[u10FFFF], where the contained value must be
-		 * a valid Unicode codepoint.  Here, however, only
-		 * check the length and range.
-		 */
-		if (**start != 'u' || *sz < 5 || *sz > 7)
-			break;
-		if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
-			break;
-		if (*sz == 6 && (*start)[1] == '0')
-			break;
-		if (*sz == 5 && (*start)[1] == 'D' &&
-		    strchr("89ABCDEF", (*start)[2]) != NULL)
-			break;
-		if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
-		    + 1 == *sz)
-			gly = ESCAPE_UNICODE;
-		break;
-	case ESCAPE_DEVICE:
-		assert(*sz == 2 && (*start)[0] == '.' && (*start)[1] == 'T');
-		break;
-	default:
-		break;
-	}
-
-	return gly;
-}
-
 static int
 a2time(time_t *t, const char *fmt, const char *p)
 {
diff --git a/mandoc.h b/mandoc.h
index 37e948d0..3c016a8a 100644
--- a/mandoc.h
+++ b/mandoc.h
@@ -285,11 +285,12 @@ enum	mandocerr {
 };
 
 enum	mandoc_esc {
-	ESCAPE_ERROR = 0, /* bail! unparsable escape */
-	ESCAPE_UNSUPP, /* unsupported escape; ignore it */
-	ESCAPE_IGNORE, /* escape to be ignored */
-	ESCAPE_UNDEF, /* undefined escape; print literal character */
-	ESCAPE_SPECIAL, /* a regular special character */
+	ESCAPE_EXPAND = 0, /* interpolation and iterative call needed */
+	ESCAPE_ERROR, /* non-fatal error: unparsable escape */
+	ESCAPE_UNSUPP, /* unsupported escape: warn and ignore */
+	ESCAPE_IGNORE, /* valid escape to be ignored */
+	ESCAPE_UNDEF, /* undefined escape: print literal character */
+	ESCAPE_SPECIAL, /* special character escape */
 	ESCAPE_FONT, /* a generic font mode */
 	ESCAPE_FONTBOLD, /* bold font mode */
 	ESCAPE_FONTITALIC, /* italic font mode */
diff --git a/roff.c b/roff.c
index c75c635f..c76be9b8 100644
--- a/roff.c
+++ b/roff.c
@@ -207,6 +207,8 @@ static	int		 roff_evalpar(struct roff *, int,
 static	int		 roff_evalstrcond(const char *, int *);
 static	int		 roff_expand(struct roff *, struct buf *,
 				int, int, char);
+static	void		 roff_expand_patch(struct buf *, int,
+				const char *, int);
 static	void		 roff_free1(struct roff *);
 static	void		 roff_freereg(struct roffreg *);
 static	void		 roff_freestr(struct roffkv *);
@@ -1233,9 +1235,15 @@ deroff(char **dest, const struct roff_node *n)
 
 /* --- main functions of the roff parser ---------------------------------- */
 
+/*
+ * Save comments preceding the title macro, for example in order to
+ * preserve Copyright and license headers in HTML output,
+ * provide diagnostics about RCS ids and trailing whitespace in comments,
+ * then discard comments including preceding whitespace.
+ * This function also handles input line continuation.
+ */
 static int
-roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos,
-    char newesc)
+roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos, char ec)
 {
 	struct roff_node *n;	/* used for header comments */
 	const char	*start;	/* start of the string to process */
@@ -1245,15 +1253,39 @@ roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos,
 	int		 rcsid;	/* kind of RCS id seen */
 
 	for (start = stesc = buf->buf + pos;; stesc++) {
+		/*
+		 * XXX Ugly hack: Remove the newline character that
+		 * mparse_buf_r() appended to mark the end of input
+		 * if it is not preceded by an escape character.
+		 */
+		if (stesc[0] == '\n') {
+			assert(stesc[1] == '\0');
+			stesc[0] = '\0';
+		}
+
 		/* The line ends without continuation or comment. */
 		if (stesc[0] == '\0')
 			return ROFF_CONT;
 
 		/* Unescaped byte: skip it. */
-		if (stesc[0] != newesc)
+		if (stesc[0] != ec)
 			continue;
 
-		/* Backslash at end of line requests line continuation. */
+		/*
+		 * XXX Ugly hack: Do not attempt to append another line
+		 * if the function mparse_buf_r() appended a newline
+		 * character to indicate the end of input.
+		 */
+		if (stesc[1] == '\n') {
+			assert(stesc[2] == '\0');
+			stesc[0] = '\0';
+			return ROFF_CONT;
+		}
+
+		/*
+		 * An escape character at the end of an input line
+		 * requests line continuation.
+		 */
 		if (stesc[1] == '\0') {
 			stesc[0] = '\0';
 			return ROFF_IGN | ROFF_APPEND;
@@ -1264,7 +1296,7 @@ roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos,
 			break;
 
 		/* Escaped escape character: skip them both. */
-		if (stesc[1] == newesc)
+		if (stesc[1] == ec)
 			stesc++;
 	}
 
@@ -1331,325 +1363,218 @@ roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos,
  * which typically produce output glyphs or change formatter state.
  */
 static int
-roff_expand(struct roff *r, struct buf *buf, int ln, int pos, char newesc)
+roff_expand(struct roff *r, struct buf *buf, int ln, int pos, char ec)
 {
-	struct mctx	*ctx;	/* current macro call context */
-	char		 ubuf[24]; /* buffer to print the number */
-	const char	*start;	/* start of the string to process */
-	char		*stesc;	/* start of an escape sequence ('\\') */
-	const char	*esct;	/* type of esccape sequence */
-	const char	*stnam;	/* start of the name, after "[(*" */
-	const char	*cp;	/* end of the name, e.g. before ']' */
-	const char	*res;	/* the string to be substituted */
-	char		*nbuf;	/* new buffer to copy buf->buf to */
-	size_t		 maxl;  /* expected length of the escape name */
-	size_t		 naml;	/* actual length of the escape name */
-	size_t		 asz;	/* length of the replacement */
-	size_t		 rsz;	/* length of the rest of the string */
-	int		 inaml;	/* length returned from mandoc_escape() */
+	char		 ubuf[24];	/* buffer to print a number */
+	struct mctx	*ctx;		/* current macro call context */
+	const char	*res;		/* the string to be pasted */
+	const char	*src;		/* source for copying */
+	char		*dst;		/* destination for copying */
+	int		 iesc;		/* index of leading escape char */
+	int		 inam;		/* index of the escape name */
+	int		 iarg;		/* index beginning the argument */
+	int		 iendarg;	/* index right after the argument */
+	int		 iend;		/* index right after the sequence */
+	int		 deftype;	/* type of definition to paste */
+	int		 argi;		/* macro argument index */
+	int		 quote_args;	/* true for \\$@, false for \\$* */
+	int		 asz;		/* length of the replacement */
+	int		 rsz;		/* length of the rest of the string */
+	int		 npos;		/* position in numeric expression */
 	int		 expand_count;	/* to avoid infinite loops */
-	int		 npos;	/* position in numeric expression */
-	int		 arg_complete; /* argument not interrupted by eol */
-	int		 quote_args; /* true for \\$@, false for \\$* */
-	int		 deftype; /* type of definition to paste */
-	enum mandocerr	 err;	/* for escape sequence problems */
-	char		 sign;	/* increment number register */
-	char		 term;	/* character terminating the escape */
-
-	start = buf->buf + pos;
-	stesc = strchr(start, '\0') - 1;
-	if (stesc >= start && *stesc == '\n')
-		*stesc-- = '\0';
 
 	expand_count = 0;
-	while (stesc >= start) {
-		if (*stesc != newesc) {
+	while (buf->buf[pos] != '\0') {
 
-			/*
-			 * If we have a non-standard escape character,
-			 * escape literal backslashes because all
-			 * processing in subsequent functions uses
-			 * the standard escaping rules.
-			 */
+		/*
+		 * Skip plain ASCII characters.
+		 * If we have a non-standard escape character,
+		 * escape literal backslashes because all processing in
+		 * subsequent functions uses the standard escaping rules.
+		 */
 
-			if (newesc != ASCII_ESC && *stesc == '\\') {
-				*stesc = '\0';
-				buf->sz = mandoc_asprintf(&nbuf, "%s\\e%s",
-				    buf->buf, stesc + 1) + 1;
-				start = nbuf + pos;
-				stesc = nbuf + (stesc - buf->buf);
-				free(buf->buf);
-				buf->buf = nbuf;
+		if (buf->buf[pos] != ec) {
+			if (ec != ASCII_ESC && buf->buf[pos] == '\\') {
+				roff_expand_patch(buf, pos, "\\e", pos + 1);
+				pos++;
 			}
-
-			/* Search backwards for the next escape. */
-
-			stesc--;
+			pos++;
 			continue;
 		}
 
-		/* If it is escaped, skip it. */
-
-		for (cp = stesc - 1; cp >= start; cp--)
-			if (*cp != r->escape)
-				break;
-
-		if ((stesc - cp) % 2 == 0) {
-			while (stesc > cp)
-				*stesc-- = '\\';
-			continue;
-		} else if (stesc[1] == '\0') {
-			*stesc-- = '\0';
-			continue;
-		} else
-			*stesc = '\\';
-
-		/* Decide whether to expand or to check only. */
+		/*
+		 * Parse escape sequences,
+		 * issue diagnostic messages when appropriate,
+		 * and skip sequences that do not need expansion.
+		 * If we have a non-standard escape character, translate
+		 * it to backslashes and translate backslashes to \e.
+		 */
 
-		term = '\0';
-		cp = stesc + 1;
-		while (*cp == 'E')
-			cp++;
-		esct = cp;
-		switch (*esct) {
-		case '*':
-		case '$':
-			res = NULL;
-			break;
-		case 'B':
-		case 'w':
-			term = cp[1];
-			/* FALLTHROUGH */
-		case 'n':
-			sign = cp[1];
-			if (sign == '+' || sign == '-')
-				cp++;
-			res = ubuf;
-			break;
-		default:
-			err = MANDOCERR_OK;
-			switch(mandoc_escape(&cp, &stnam, &inaml)) {
-			case ESCAPE_SPECIAL:
-				if (mchars_spec2cp(stnam, inaml) >= 0)
-					break;
-				/* FALLTHROUGH */
-			case ESCAPE_ERROR:
-				err = MANDOCERR_ESC_BAD;
-				break;
-			case ESCAPE_UNDEF:
-				err = MANDOCERR_ESC_UNDEF;
-				break;
-			case ESCAPE_UNSUPP:
-				err = MANDOCERR_ESC_UNSUPP;
-				break;
-			default:
-				break;
+		if (roff_escape(buf->buf, ln, pos,
+		    &iesc, &iarg, &iendarg, &iend) != ESCAPE_EXPAND) {
+			while (pos < iend) {
+				if (buf->buf[pos] == ec) {
+					buf->buf[pos] = '\\';
+					if (pos + 1 < iend)
+						pos++;
+				} else if (buf->buf[pos] == '\\') {
+					roff_expand_patch(buf,
+					    pos, "\\e", pos + 1);
+					pos++;
+					iend++;
+				}
+				pos++;
 			}
-			if (err != MANDOCERR_OK)
-				mandoc_msg(err, ln, (int)(stesc - buf->buf),
-				    "%.*s", (int)(cp - stesc), stesc);
-			stesc--;
 			continue;
 		}
 
-		if (EXPAND_LIMIT < ++expand_count) {
-			mandoc_msg(MANDOCERR_ROFFLOOP,
-			    ln, (int)(stesc - buf->buf), NULL);
-			return ROFF_IGN;
-		}
-
 		/*
-		 * The third character decides the length
-		 * of the name of the string or register.
-		 * Save a pointer to the name.
+		 * Treat "\E" just like "\";
+		 * it only makes a difference in copy mode.
 		 */
 
-		if (term == '\0') {
-			switch (*++cp) {
-			case '\0':
-				maxl = 0;
-				break;
-			case '(':
-				cp++;
-				maxl = 2;
-				break;
-			case '[':
-				cp++;
-				term = ']';
-				maxl = 0;
-				break;
-			default:
-				maxl = 1;
-				break;
-			}
-		} else {
-			cp += 2;
-			maxl = 0;
-		}
-		stnam = cp;
+		inam = iesc + 1;
+		while (buf->buf[inam] == 'E')
+			inam++;
 
-		/* Advance to the end of the name. */
+		/* Handle expansion. */
 
-		naml = 0;
-		arg_complete = 1;
-		while (maxl == 0 || naml < maxl) {
-			if (*cp == '\0') {
-				mandoc_msg(MANDOCERR_ESC_BAD, ln,
-				    (int)(stesc - buf->buf), "%s", stesc);
-				arg_complete = 0;
-				break;
-			}
-			if (maxl == 0 && *cp == term) {
-				cp++;
-				break;
-			}
-			if (*cp++ != '\\' || *esct != 'w') {
-				naml++;
-				continue;
-			}
-			switch (mandoc_escape(&cp, NULL, NULL)) {
-			case ESCAPE_SPECIAL:
-			case ESCAPE_UNICODE:
-			case ESCAPE_NUMBERED:
-			case ESCAPE_UNDEF:
-			case ESCAPE_OVERSTRIKE:
-				naml++;
+		res = NULL;
+		switch (buf->buf[inam]) {
+		case '*':
+			if (iendarg == iarg)
 				break;
-			default:
+			deftype = ROFFDEF_USER | ROFFDEF_PRE;
+			if ((res = roff_getstrn(r, buf->buf + iarg,
+			    iendarg - iarg, &deftype)) != NULL)
 				break;
-			}
-		}
 
-		/*
-		 * Retrieve the replacement string; if it is
-		 * undefined, resume searching for escapes.
-		 */
+			/*
+			 * If not overridden,
+			 * let \*(.T through to the formatters.
+			 */
 
-		switch (*esct) {
-		case '*':
-			if (arg_complete) {
-				deftype = ROFFDEF_USER | ROFFDEF_PRE;
-				res = roff_getstrn(r, stnam, naml, &deftype);
-
-				/*
-				 * If not overriden, let \*(.T
-				 * through to the formatters.
-				 */
-
-				if (res == NULL && naml == 2 &&
-				    stnam[0] == '.' && stnam[1] == 'T') {
-					roff_setstrn(&r->strtab,
-					    ".T", 2, NULL, 0, 0);
-					stesc--;
-					continue;
-				}
+			if (iendarg - iarg == 2 &&
+			    buf->buf[iarg] == '.' &&
+			    buf->buf[iarg + 1] == 'T') {
+				roff_setstrn(&r->strtab, ".T", 2, NULL, 0, 0);
+				pos = iend;
+				continue;
 			}
+
+			mandoc_msg(MANDOCERR_STR_UNDEF, ln, iesc,
+			    "%.*s", iendarg - iarg, buf->buf + iarg);
 			break;
+
 		case '$':
 			if (r->mstackpos < 0) {
-				mandoc_msg(MANDOCERR_ARG_UNDEF, ln,
-				    (int)(stesc - buf->buf), "%.3s", stesc);
+				mandoc_msg(MANDOCERR_ARG_UNDEF, ln, iesc,
+				    "%.*s", iend - iesc, buf->buf + iesc);
 				break;
 			}
 			ctx = r->mstack + r->mstackpos;
-			npos = esct[1] - '1';
-			if (npos >= 0 && npos <= 8) {
-				res = npos < ctx->argc ?
-				    ctx->argv[npos] : "";
+			argi = buf->buf[iarg] - '1';
+			if (argi >= 0 && argi <= 8) {
+				if (argi < ctx->argc)
+					res = ctx->argv[argi];
 				break;
 			}
-			if (esct[1] == '*')
+			if (buf->buf[iarg] == '*')
 				quote_args = 0;
-			else if (esct[1] == '@')
+			else if (buf->buf[iarg] == '@')
 				quote_args = 1;
 			else {
-				mandoc_msg(MANDOCERR_ARG_NONUM, ln,
-				    (int)(stesc - buf->buf), "%.3s", stesc);
+				mandoc_msg(MANDOCERR_ARG_NONUM, ln, iesc,
+				    "%.*s", iend - iesc, buf->buf + iesc);
 				break;
 			}
 			asz = 0;
-			for (npos = 0; npos < ctx->argc; npos++) {
-				if (npos)
+			for (argi = 0; argi < ctx->argc; argi++) {
+				if (argi)
 					asz++;  /* blank */
 				if (quote_args)
 					asz += 2;  /* quotes */
-				asz += strlen(ctx->argv[npos]);
+				asz += strlen(ctx->argv[argi]);
 			}
-			if (asz != 3) {
-				rsz = buf->sz - (stesc - buf->buf) - 3;
-				if (asz < 3)
-					memmove(stesc + asz, stesc + 3, rsz);
-				buf->sz += asz - 3;
-				nbuf = mandoc_realloc(buf->buf, buf->sz);
-				start = nbuf + pos;
-				stesc = nbuf + (stesc - buf->buf);
-				buf->buf = nbuf;
-				if (asz > 3)
-					memmove(stesc + asz, stesc + 3, rsz);
+			if (asz != iend - iesc) {
+				rsz = buf->sz - iend;
+				if (asz < iend - iesc)
+					memmove(buf->buf + iesc + asz,
+					    buf->buf + iend, rsz);
+				buf->sz = iesc + asz + rsz;
+				buf->buf = mandoc_realloc(buf->buf, buf->sz);
+				if (asz > iend - iesc)
+					memmove(buf->buf + iesc + asz,
+					    buf->buf + iend, rsz);
 			}
-			for (npos = 0; npos < ctx->argc; npos++) {
-				if (npos)
-					*stesc++ = ' ';
+			dst = buf->buf + iesc;
+			for (argi = 0; argi < ctx->argc; argi++) {
+				if (argi)
+					*dst++ = ' ';
 				if (quote_args)
-					*stesc++ = '"';
-				cp = ctx->argv[npos];
-				while (*cp != '\0')
-					*stesc++ = *cp++;
+					*dst++ = '"';
+				src = ctx->argv[argi];
+				while (*src != '\0')
+					*dst++ = *src++;
 				if (quote_args)
-					*stesc++ = '"';
+					*dst++ = '"';
 			}
 			continue;
 		case 'B':
 			npos = 0;
-			ubuf[0] = arg_complete &&
-			    roff_evalnum(r, ln, stnam, &npos,
-			      NULL, ROFFNUM_SCALE) &&
-			    stnam + npos + 1 == cp ? '1' : '0';
+			ubuf[0] = iendarg > iarg && iend > iendarg &&
+			    roff_evalnum(r, ln, buf->buf + iarg, &npos,
+					 NULL, ROFFNUM_SCALE) &&
+			    npos == iendarg - iarg ? '1' : '0';
 			ubuf[1] = '\0';
+			res = ubuf;
 			break;
 		case 'n':
-			if (arg_complete)
+			if (iendarg > iarg)
 				(void)snprintf(ubuf, sizeof(ubuf), "%d",
-				    roff_getregn(r, stnam, naml, sign));
+				    roff_getregn(r, buf->buf + iarg,
+				    iendarg - iarg, buf->buf[inam + 1]));
 			else
 				ubuf[0] = '\0';
+			res = ubuf;
 			break;
 		case 'w':
-			/* use even incomplete args */
-			(void)snprintf(ubuf, sizeof(ubuf), "%d",
-			    24 * (int)naml);
+			(void)snprintf(ubuf, sizeof(ubuf),
+			    "%d", (iendarg - iarg) * 24);
+			res = ubuf;
+			break;
+		default:
 			break;
 		}
-
-		if (res == NULL) {
-			if (*esct == '*')
-				mandoc_msg(MANDOCERR_STR_UNDEF,
-				    ln, (int)(stesc - buf->buf),
-				    "%.*s", (int)naml, stnam);
+		if (res == NULL)
 			res = "";
-		} else if (buf->sz + strlen(res) > SHRT_MAX) {
-			mandoc_msg(MANDOCERR_ROFFLOOP,
-			    ln, (int)(stesc - buf->buf), NULL);
+		if (++expand_count > EXPAND_LIMIT ||
+		    buf->sz + strlen(res) > SHRT_MAX) {
+			mandoc_msg(MANDOCERR_ROFFLOOP, ln, iesc, NULL);
 			return ROFF_IGN;
 		}
-
-		/* Replace the escape sequence by the string. */
-
-		*stesc = '\0';
-		buf->sz = mandoc_asprintf(&nbuf, "%s%s%s",
-		    buf->buf, res, cp) + 1;
-
-		/* Prepare for the next replacement. */
-
-		start = nbuf + pos;
-		stesc = nbuf + (stesc - buf->buf) + strlen(res);
-		free(buf->buf);
-		buf->buf = nbuf;
+		roff_expand_patch(buf, iesc, res, iend);
 	}
 	return ROFF_CONT;
 }
 
 /*
+ * Replace the substring from the start position (inclusive)
+ * to end position (exclusive) with the repl(acement) string.
+ */
+static void
+roff_expand_patch(struct buf *buf, int start, const char *repl, int end)
+{
+	char	*obuf, *nbuf;	/* buffer before and after patching */
+
+	obuf = buf->buf;
+	obuf[start] = '\0';	/* end the head so the first %s stops here */
+	buf->sz = mandoc_asprintf(&nbuf, "%s%s%s", obuf, repl, obuf + end) + 1;
+	buf->buf = nbuf;
+	free(obuf);		/* the tail was copied, the old buffer can go */
+}
+
+/*
  * Parse a quoted or unquoted roff-style request or macro argument.
  * Return a pointer to the parsed argument, which is either the original
  * pointer or advanced by one byte in case the argument is quoted.
diff --git a/roff_escape.c b/roff_escape.c
new file mode 100644
index 00000000..1b5dc640
--- /dev/null
+++ b/roff_escape.c
@@ -0,0 +1,477 @@
+/* $OpenBSD$ */
+/*
+ * Copyright (c) 2011, 2012, 2013, 2014, 2015, 2017, 2018, 2020, 2022
+ *               Ingo Schwarze <schwarze@openbsd.org>
+ * Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Parser for roff(7) escape sequences.
+ * To be used by all mandoc(1) parsers and formatters.
+ */
+#include <assert.h>
+#include <ctype.h>
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "mandoc.h"
+#include "roff.h"
+#include "roff_int.h"
+
+/*
+ * Traditional escape sequence interpreter for general use
+ * including in high-level formatters.  This function does not issue
+ * diagnostics and is not usable for expansion in the roff(7) parser.
+ * It is documented in the mandoc_escape(3) manual page.
+ */
+enum mandoc_esc
+mandoc_escape(const char **rendarg, const char **rarg, int *rargl)
+{
+	int		 iarg, iendarg, iend;	/* offsets from the escape */
+	enum mandoc_esc	 rval;			/* result of roff_escape() */
+
+	rval = roff_escape(--*rendarg, 0, 0, NULL, &iarg, &iendarg, &iend);
+	assert(rval != ESCAPE_EXPAND);
+	if (rarg != NULL)		/* optional: start of the argument */
+		*rarg = *rendarg + iarg;
+	if (rargl != NULL)		/* optional: length of the argument */
+		*rargl = iendarg - iarg;
+	*rendarg += iend;		/* point right after the sequence */
+	return rval;
+}
+
+/*
+ * Full-featured escape sequence parser.
+ * If it encounters a nested escape sequence that requires expansion
+ * by the parser and re-parsing, the positions of that inner escape
+ * sequence are returned in *resc ... *rend.
+ * Otherwise, *resc is set to aesc and the positions of the escape
+ * sequence starting at aesc are returned.
+ * Diagnostic messages are generated if and only if resc != NULL,
+ * that is, if and only if called by roff_expand().
+ */
+enum mandoc_esc
+roff_escape(const char *buf, const int ln, const int aesc,
+    int *resc, int *rarg, int *rendarg, int *rend)
+{
+	int		 iesc;		/* index of leading escape char */
+	int		 iarg;		/* index beginning the argument */
+	int		 iendarg;	/* index right after the argument */
+	int		 iend;		/* index right after the sequence */
+	int		 sesc, sarg, sendarg, send; /* for sub-escape */
+	int		 maxl;		/* expected length of the argument */
+	int		 argl;		/* actual length of the argument */
+	int		 c, i;		/* for \[char...] parsing */
+	enum mandoc_esc	 rval;		/* return value */
+	enum mandocerr	 err;		/* diagnostic code */
+	char		 esc_name;
+	char		 term;		/* byte terminating the argument */
+
+	/*
+	 * Treat "\E" just like "\";
+	 * it only makes a difference in copy mode.
+	 */
+
+	iesc = iarg = aesc;
+	do {
+		iarg++;
+	} while (buf[iarg] == 'E');
+
+	/*
+	 * Sort the following cases first by syntax category,
+	 * then by escape sequence type, and finally by ASCII code.
+	 */
+
+	esc_name = buf[iarg];
+	iendarg = iend = ++iarg;
+	maxl = INT_MAX;
+	term = '\0';
+	switch (esc_name) {
+
+	/* Escape sequences taking no arguments at all. */
+
+	case '!':
+	case '?':
+		rval = ESCAPE_UNSUPP;
+		goto out;
+
+	case '%':
+	case '&':
+	case ')':
+	case ',':
+	case '/':
+	case '^':
+	case 'a':
+	case 'd':
+	case 'r':
+	case 't':
+	case 'u':
+	case '{':
+	case '|':
+	case '}':
+		rval = ESCAPE_IGNORE;
+		goto out;
+
+	case '\\':
+	default:
+		iarg--;
+		rval = ESCAPE_UNDEF;
+		goto out;
+
+	case ' ':
+	case '\'':
+	case '-':
+	case '.':
+	case '0':
+	case ':':
+	case '_':
+	case '`':
+	case 'e':
+	case '~':
+		iarg--;
+		argl = 1;
+		rval = ESCAPE_SPECIAL;
+		goto out;
+	case 'p':
+		rval = ESCAPE_BREAK;
+		goto out;
+	case 'c':
+		rval = ESCAPE_NOSPACE;
+		goto out;
+	case 'z':
+		rval = ESCAPE_SKIPCHAR;
+		goto out;
+
+	/* Standard argument format. */
+
+	case '$':
+	case '*':
+	case 'n':
+		rval = ESCAPE_EXPAND;
+		break;
+	case 'F':
+	case 'M':
+	case 'O':
+	case 'V':
+	case 'Y':
+	case 'g':
+	case 'k':
+	case 'm':
+		rval = ESCAPE_IGNORE;
+		break;
+	case '(':
+	case '[':
+		rval = ESCAPE_SPECIAL;
+		iendarg = iend = --iarg;
+		break;
+	case 'f':
+		rval = ESCAPE_FONT;
+		break;
+
+	/* Quoted arguments */
+
+	case 'B':
+	case 'w':
+		rval = ESCAPE_EXPAND;
+		term = '\b';
+		break;
+	case 'A':
+	case 'D':
+	case 'H':
+	case 'L':
+	case 'R':
+	case 'S':
+	case 'X':
+	case 'Z':
+	case 'b':
+	case 'v':
+	case 'x':
+		rval = ESCAPE_IGNORE;
+		term = '\b';
+		break;
+	case 'C':
+		if (buf[iarg] != '\'') {
+			rval = ESCAPE_ERROR;
+			goto out;
+		}
+		rval = ESCAPE_SPECIAL;
+		term = '\b';
+		break;
+	case 'N':
+		rval = ESCAPE_NUMBERED;
+		term = '\b';
+		break;
+	case 'h':
+		rval = ESCAPE_HORIZ;
+		term = '\b';
+		break;
+	case 'l':
+		rval = ESCAPE_HLINE;
+		term = '\b';
+		break;
+	case 'o':
+		rval = ESCAPE_OVERSTRIKE;
+		term = '\b';
+		break;
+
+	/* Sizes support both forms, with additional peculiarities. */
+
+	case 's':
+		rval = ESCAPE_IGNORE;
+		if (buf[iarg] == '+' || buf[iarg] == '-'||
+		    buf[iarg] == ASCII_HYPH)
+			iarg++;
+		switch (buf[iarg]) {
+		case '(':
+			maxl = 2;
+			iarg++;
+			break;
+		case '[':
+			term = ']';
+			iarg++;
+			break;
+		case '\'':
+			term = '\'';
+			iarg++;
+			break;
+		case '1':
+		case '2':
+		case '3':
+			if (buf[iarg - 1] == 's' &&
+			    isdigit((unsigned char)buf[iarg + 1])) {
+				maxl = 2;
+				break;
+			}
+			/* FALLTHROUGH */
+		default:
+			maxl = 1;
+			break;
+		}
+		iendarg = iend = iarg;
+	}
+
+	/* Decide how to end the argument. */
+
+	if ((term == '\b' || (term == '\0' && maxl == INT_MAX)) &&
+	    buf[iarg] == buf[iesc] && roff_escape(buf, ln, iendarg,
+	    &sesc, &sarg, &sendarg, &send) == ESCAPE_EXPAND)
+		goto out_sub;
+
+	if (term == '\b') {
+		if ((esc_name == 'N' && isdigit((unsigned char)buf[iarg])) ||
+		    (esc_name == 'h' && strchr(" %&()*+-./0123456789:<=>",
+		     buf[iarg]) != NULL)) {
+			iendarg = iend = iarg + 1;
+			rval = ESCAPE_ERROR;
+			goto out;
+		}
+		term = buf[iarg++];
+	} else if (term == '\0' && maxl == INT_MAX) {
+		if (esc_name == 'n' && (buf[iarg] == '+' || buf[iarg] == '-'))
+			iarg++;
+		switch (buf[iarg]) {
+		case '(':
+			maxl = 2;
+			iarg++;
+			break;
+		case '[':
+			if (buf[++iarg] == ' ') {
+				iendarg = iend = iarg + 1;
+				rval = ESCAPE_ERROR;
+				goto out;
+			}
+			term = ']';
+			break;
+		default:
+			maxl = 1;
+			break;
+		}
+	}
+
+	/* Advance to the end of the argument. */
+
+	iendarg = iarg;
+	while (maxl > 0) {
+		if (buf[iendarg] == '\0') {
+			/* Ignore an incomplete argument except for \w. */
+			if (esc_name != 'w')
+				iendarg = iarg;
+			break;
+		}
+		if (buf[iendarg] == term) {
+			iend = iendarg + 1;
+			break;
+		}
+		if (esc_name == 'N' &&
+		    isdigit((unsigned char)buf[iendarg]) == 0) {
+			iend = iendarg + 1;
+			break;
+		}
+		if (buf[iendarg] == buf[iesc]) {
+			if (roff_escape(buf, ln, iendarg,
+			    &sesc, &sarg, &sendarg, &send) == ESCAPE_EXPAND)
+				goto out_sub;
+			iendarg = iend = send;
+		} else {
+			if (maxl != INT_MAX)
+				maxl--;
+			iend = ++iendarg;
+		}
+	}
+	if (resc != NULL && ((maxl != INT_MAX && maxl != 0) ||
+	    (term != '\0' && buf[iendarg] != term)))
+		mandoc_msg(MANDOCERR_ESC_BAD, ln, iesc, "%s", buf + iesc);
+
+	/* Post-process depending on the content of the argument. */
+
+	argl = iendarg - iarg;
+	switch (esc_name) {
+	case '*':
+		if (resc == NULL && argl == 2 &&
+		    buf[iarg] == '.' && buf[iarg + 1] == 'T')
+			rval = ESCAPE_DEVICE;
+		break;
+	case 'O':
+		switch (buf[iarg]) {
+		case '0':
+			rval = ESCAPE_UNSUPP;
+			break;
+		case '1':
+		case '2':
+		case '3':
+		case '4':
+			rval = argl == 1 ? ESCAPE_IGNORE : ESCAPE_ERROR;
+			break;
+		case '5':
+			rval = buf[iarg - 1] == '[' ? ESCAPE_UNSUPP :
+			    ESCAPE_ERROR;
+			break;
+		default:
+			rval = ESCAPE_ERROR;
+			break;
+		}
+		break;
+	default:
+		break;
+	}
+
+	switch (rval) {
+	case ESCAPE_FONT:
+		rval = mandoc_font(buf + iarg, argl);
+		break;
+
+	case ESCAPE_SPECIAL:
+
+		/*
+		 * The file chars.c only provides one common list of
+		 * character names, but \[-] == \- is the only one of
+		 * the characters with one-byte names that allows
+		 * enclosing the name in brackets.
+		 */
+
+		if (term != '\0' && argl == 1 && buf[iarg] != '-') {
+			rval = ESCAPE_ERROR;
+			break;
+		}
+
+		/* Treat \[char...] as an alias for \N'...'. */
+
+		if (buf[iarg] == 'c') {
+			if (argl < 6 || argl > 7 ||
+			    strncmp(buf + iarg, "char", 4) != 0 ||
+			    (int)strspn(buf + iarg + 4, "0123456789")
+			     + 4 < argl)
+				break;
+			c = 0;
+			for (i = iarg; i < iendarg; i++)
+				c = 10 * c + (buf[i] - '0');
+			if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
+				break;
+			iarg += 4;
+			rval = ESCAPE_NUMBERED;
+			break;
+		}
+
+		/*
+		 * Unicode escapes are defined in groff as \[u0000]
+		 * to \[u10FFFF], where the contained value must be
+		 * a valid Unicode codepoint.  Here, however, only
+		 * check the length and range.
+		 */
+
+		if (buf[iarg] != 'u' || argl < 5 || argl > 7)
+			break;
+		if (argl == 7 &&
+		    (buf[iarg + 1] != '1' || buf[iarg + 2] != '0'))
+			break;
+		if (argl == 6 && buf[iarg + 1] == '0')
+			break;
+		if (argl == 5 && buf[iarg + 1] == 'D' &&
+		    strchr("89ABCDEF", buf[iarg + 2]) != NULL)
+			break;
+		if ((int)strspn(buf + iarg + 1, "0123456789ABCDEFabcdef")
+		    + 1 == argl)
+			rval = ESCAPE_UNICODE;
+		break;
+	default:
+		break;
+	}
+	goto out;
+
+out_sub:
+	iesc = sesc;
+	iarg = sarg;
+	iendarg = sendarg;
+	iend = send;
+	rval = ESCAPE_EXPAND;
+
+out:
+	if (rarg != NULL)
+		*rarg = iarg;
+	if (rendarg != NULL)
+		*rendarg = iendarg;
+	if (rend != NULL)
+		*rend = iend;
+	if (resc == NULL)
+		return rval;
+
+	/*
+	 * Diagnostic messages are only issued when called
+	 * from the parser, not when called from the formatters.
+	 */
+
+	*resc = iesc;
+	switch (rval) {
+	case ESCAPE_ERROR:
+		err = MANDOCERR_ESC_BAD;
+		break;
+	case ESCAPE_UNSUPP:
+		err = MANDOCERR_ESC_UNSUPP;
+		break;
+	case ESCAPE_UNDEF:
+		if (esc_name == '\\')
+			return rval;
+		err = MANDOCERR_ESC_UNDEF;
+		break;
+	case ESCAPE_SPECIAL:
+		if (mchars_spec2cp(buf + iarg, argl) >= 0)
+			return rval;
+		err = MANDOCERR_ESC_BAD;
+		break;
+	default:
+		return rval;
+	}
+	mandoc_msg(err, ln, iesc, "%.*s", iend - iesc, buf + iesc);
+	return rval;
+}
diff --git a/roff_int.h b/roff_int.h
index f7d688fd..ba7032b0 100644
--- a/roff_int.h
+++ b/roff_int.h
@@ -1,6 +1,6 @@
 /* $OpenBSD: roff_int.h,v 1.16 2019/01/05 00:36:46 schwarze Exp $	*/
 /*
- * Copyright (c) 2013-2015, 2017-2020 Ingo Schwarze <schwarze@openbsd.org>
+ * Copyright (c) 2013-2015, 2017-2022 Ingo Schwarze <schwarze@openbsd.org>
  * Copyright (c) 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
  *
  * Permission to use, copy, modify, and distribute this software for any
@@ -82,6 +82,8 @@ struct ohash	 *roffhash_alloc(enum roff_tok, enum roff_tok);
 enum roff_tok	  roffhash_find(struct ohash *, const char *, size_t);
 void		  roffhash_free(struct ohash *);
 
+enum mandoc_esc	  roff_escape(const char *, const int, const int, /* input buffer; ln forwarded to mandoc_msg(); index in the buffer of the escape to parse */
+			int *, int *, int *, int *); /* optional outputs, each may be NULL: indices of escape start, argument start, argument end, end of sequence; diagnostics are issued only when the first is non-NULL */
 void		  roff_state_reset(struct roff_man *);
 void		  roff_validate(struct roff_man *);