diff options
-rw-r--r-- | Makefile | 2 | ||||
-rw-r--r-- | mandoc.c | 392 | ||||
-rw-r--r-- | mandoc.h | 11 | ||||
-rw-r--r-- | roff.c | 437 | ||||
-rw-r--r-- | roff_escape.c | 477 | ||||
-rw-r--r-- | roff_int.h | 4 |
6 files changed, 677 insertions, 646 deletions
@@ -122,6 +122,7 @@ SRCS = arch.c \ preconv.c \ read.c \ roff.c \ + roff_escape.c \ roff_html.c \ roff_term.c \ roff_validate.c \ @@ -235,6 +236,7 @@ LIBMDOC_OBJS = att.o \ LIBROFF_OBJS = eqn.o \ roff.o \ + roff_escape.o \ roff_validate.o \ tbl.o \ tbl_data.o \ @@ -1,7 +1,8 @@ /* $Id$ */ /* - * Copyright (c) 2011-2015, 2017-2022 Ingo Schwarze <schwarze@openbsd.org> - * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv> + * Copyright (c) 2010, 2011, 2015, 2017, 2018, 2019, 2020, 2021 + * Ingo Schwarze <schwarze@openbsd.org> + * Copyright (c) 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -14,6 +15,11 @@ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Utility functions to handle end of sentence punctuation + * and dates and times, for use by mdoc(7) and man(7) parsers. + * Utility functions to handle fonts and numbers, + * for use by mandoc(1) parsers and formatters. */ #include "config.h" @@ -91,388 +97,6 @@ mandoc_font(const char *cp, int sz) } } -enum mandoc_esc -mandoc_escape(const char **end, const char **start, int *sz) -{ - const char *local_start; - int local_sz, c, i; - char term; - enum mandoc_esc gly; - - /* - * When the caller doesn't provide return storage, - * use local storage. - */ - - if (NULL == start) - start = &local_start; - if (NULL == sz) - sz = &local_sz; - - /* - * Treat "\E" just like "\"; - * it only makes a difference in copy mode. - */ - - while (**end == 'E') - ++*end; - - /* - * Beyond the backslash, at least one input character - * is part of the escape sequence. With one exception - * (see below), that character won't be returned. 
- */ - - gly = ESCAPE_ERROR; - *start = ++*end; - *sz = 0; - term = '\0'; - - switch ((*start)[-1]) { - /* - * First the glyphs. There are several different forms of - * these, but each eventually returns a substring of the glyph - * name. - */ - case '(': - gly = ESCAPE_SPECIAL; - *sz = 2; - break; - case '[': - if (**start == ' ') { - ++*end; - return ESCAPE_ERROR; - } - gly = ESCAPE_SPECIAL; - term = ']'; - break; - case 'C': - if ('\'' != **start) - return ESCAPE_ERROR; - *start = ++*end; - gly = ESCAPE_SPECIAL; - term = '\''; - break; - - /* - * Escapes taking no arguments at all. - */ - case '!': - case '?': - return ESCAPE_UNSUPP; - case '%': - case '&': - case ')': - case ',': - case '/': - case '^': - case 'a': - case 'd': - case 'r': - case 't': - case 'u': - case '{': - case '|': - case '}': - return ESCAPE_IGNORE; - case 'c': - return ESCAPE_NOSPACE; - case 'p': - return ESCAPE_BREAK; - - /* - * The \z escape is supposed to output the following - * character without advancing the cursor position. - * Since we are mostly dealing with terminal mode, - * let us just skip the next character. - */ - case 'z': - return ESCAPE_SKIPCHAR; - - /* - * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where - * 'X' is the trigger. These have opaque sub-strings. - */ - case 'F': - case 'f': - case 'g': - case 'k': - case 'M': - case 'm': - case 'n': - case 'O': - case 'V': - case 'Y': - case '*': - switch ((*start)[-1]) { - case 'f': - gly = ESCAPE_FONT; - break; - case '*': - gly = ESCAPE_DEVICE; - break; - default: - gly = ESCAPE_IGNORE; - break; - } - switch (**start) { - case '(': - if ((*start)[-1] == 'O') - gly = ESCAPE_ERROR; - *start = ++*end; - *sz = 2; - break; - case '[': - if ((*start)[-1] == 'O') - gly = (*start)[1] == '5' ? 
- ESCAPE_UNSUPP : ESCAPE_ERROR; - *start = ++*end; - term = ']'; - break; - default: - if ((*start)[-1] == 'O') { - switch (**start) { - case '0': - gly = ESCAPE_UNSUPP; - break; - case '1': - case '2': - case '3': - case '4': - break; - default: - gly = ESCAPE_ERROR; - break; - } - } - *sz = 1; - break; - } - break; - - /* - * These escapes are of the form \X'Y', where 'X' is the trigger - * and 'Y' is any string. These have opaque sub-strings. - * The \B and \w escapes are handled in roff.c, roff_res(). - */ - case 'A': - case 'b': - case 'D': - case 'R': - case 'X': - case 'Z': - gly = ESCAPE_IGNORE; - /* FALLTHROUGH */ - case 'o': - if (**start == '\0') - return ESCAPE_ERROR; - if (gly == ESCAPE_ERROR) - gly = ESCAPE_OVERSTRIKE; - term = **start; - *start = ++*end; - break; - - /* - * These escapes are of the form \X'N', where 'X' is the trigger - * and 'N' resolves to a numerical expression. - */ - case 'h': - case 'H': - case 'L': - case 'l': - case 'S': - case 'v': - case 'x': - if (strchr(" %&()*+-./0123456789:<=>", **start)) { - if ('\0' != **start) - ++*end; - return ESCAPE_ERROR; - } - switch ((*start)[-1]) { - case 'h': - gly = ESCAPE_HORIZ; - break; - case 'l': - gly = ESCAPE_HLINE; - break; - default: - gly = ESCAPE_IGNORE; - break; - } - term = **start; - *start = ++*end; - break; - - /* - * Special handling for the numbered character escape. - * XXX Do any other escapes need similar handling? - */ - case 'N': - if ('\0' == **start) - return ESCAPE_ERROR; - (*end)++; - if (isdigit((unsigned char)**start)) { - *sz = 1; - return ESCAPE_IGNORE; - } - (*start)++; - while (isdigit((unsigned char)**end)) - (*end)++; - *sz = *end - *start; - if ('\0' != **end) - (*end)++; - return ESCAPE_NUMBERED; - - /* - * Sizes get a special category of their own. - */ - case 's': - gly = ESCAPE_IGNORE; - - /* See +/- counts as a sign. 
*/ - if ('+' == **end || '-' == **end || ASCII_HYPH == **end) - *start = ++*end; - - switch (**end) { - case '(': - *start = ++*end; - *sz = 2; - break; - case '[': - *start = ++*end; - term = ']'; - break; - case '\'': - *start = ++*end; - term = '\''; - break; - case '3': - case '2': - case '1': - *sz = (*end)[-1] == 's' && - isdigit((unsigned char)(*end)[1]) ? 2 : 1; - break; - default: - *sz = 1; - break; - } - - break; - - /* - * Several special characters can be encoded as - * one-byte escape sequences without using \[]. - */ - case ' ': - case '\'': - case '-': - case '.': - case '0': - case ':': - case '_': - case '`': - case 'e': - case '~': - gly = ESCAPE_SPECIAL; - /* FALLTHROUGH */ - default: - if (gly == ESCAPE_ERROR) - gly = ESCAPE_UNDEF; - *start = --*end; - *sz = 1; - break; - } - - /* - * Read up to the terminating character, - * paying attention to nested escapes. - */ - - if ('\0' != term) { - while (**end != term) { - switch (**end) { - case '\0': - return ESCAPE_ERROR; - case '\\': - (*end)++; - if (ESCAPE_ERROR == - mandoc_escape(end, NULL, NULL)) - return ESCAPE_ERROR; - break; - default: - (*end)++; - break; - } - } - *sz = (*end)++ - *start; - - /* - * The file chars.c only provides one common list - * of character names, but \[-] == \- is the only - * one of the characters with one-byte names that - * allows enclosing the name in brackets. - */ - if (gly == ESCAPE_SPECIAL && *sz == 1 && **start != '-') - return ESCAPE_ERROR; - } else { - assert(*sz > 0); - if ((size_t)*sz > strlen(*start)) - return ESCAPE_ERROR; - *end += *sz; - } - - /* Run post-processors. 
*/ - - switch (gly) { - case ESCAPE_FONT: - gly = mandoc_font(*start, *sz); - break; - case ESCAPE_SPECIAL: - if (**start == 'c') { - if (*sz < 6 || *sz > 7 || - strncmp(*start, "char", 4) != 0 || - (int)strspn(*start + 4, "0123456789") + 4 < *sz) - break; - c = 0; - for (i = 4; i < *sz; i++) - c = 10 * c + ((*start)[i] - '0'); - if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff) - break; - *start += 4; - *sz -= 4; - gly = ESCAPE_NUMBERED; - break; - } - - /* - * Unicode escapes are defined in groff as \[u0000] - * to \[u10FFFF], where the contained value must be - * a valid Unicode codepoint. Here, however, only - * check the length and range. - */ - if (**start != 'u' || *sz < 5 || *sz > 7) - break; - if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0')) - break; - if (*sz == 6 && (*start)[1] == '0') - break; - if (*sz == 5 && (*start)[1] == 'D' && - strchr("89ABCDEF", (*start)[2]) != NULL) - break; - if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef") - + 1 == *sz) - gly = ESCAPE_UNICODE; - break; - case ESCAPE_DEVICE: - assert(*sz == 2 && (*start)[0] == '.' && (*start)[1] == 'T'); - break; - default: - break; - } - - return gly; -} - static int a2time(time_t *t, const char *fmt, const char *p) { @@ -285,11 +285,12 @@ enum mandocerr { }; enum mandoc_esc { - ESCAPE_ERROR = 0, /* bail! 
unparsable escape */ - ESCAPE_UNSUPP, /* unsupported escape; ignore it */ - ESCAPE_IGNORE, /* escape to be ignored */ - ESCAPE_UNDEF, /* undefined escape; print literal character */ - ESCAPE_SPECIAL, /* a regular special character */ + ESCAPE_EXPAND = 0, /* interpolation and iterative call needed */ + ESCAPE_ERROR, /* non-fatal error: unparsable escape */ + ESCAPE_UNSUPP, /* unsupported escape: warn and ignore */ + ESCAPE_IGNORE, /* valid escape to be ignored */ + ESCAPE_UNDEF, /* undefined escape: print literal character */ + ESCAPE_SPECIAL, /* special character escape */ ESCAPE_FONT, /* a generic font mode */ ESCAPE_FONTBOLD, /* bold font mode */ ESCAPE_FONTITALIC, /* italic font mode */ @@ -207,6 +207,8 @@ static int roff_evalpar(struct roff *, int, static int roff_evalstrcond(const char *, int *); static int roff_expand(struct roff *, struct buf *, int, int, char); +static void roff_expand_patch(struct buf *, int, + const char *, int); static void roff_free1(struct roff *); static void roff_freereg(struct roffreg *); static void roff_freestr(struct roffkv *); @@ -1233,9 +1235,15 @@ deroff(char **dest, const struct roff_node *n) /* --- main functions of the roff parser ---------------------------------- */ +/* + * Save comments preceding the title macro, for example in order to + * preserve Copyright and license headers in HTML output, + * provide diagnostics about RCS ids and trailing whitespace in comments, + * then discard comments including preceding whitespace. + * This function also handles input line continuation. 
+ */ static int -roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos, - char newesc) +roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos, char ec) { struct roff_node *n; /* used for header comments */ const char *start; /* start of the string to process */ @@ -1245,15 +1253,39 @@ roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos, int rcsid; /* kind of RCS id seen */ for (start = stesc = buf->buf + pos;; stesc++) { + /* + * XXX Ugly hack: Remove the newline character that + * mparse_buf_r() appended to mark the end of input + * if it is not preceded by an escape character. + */ + if (stesc[0] == '\n') { + assert(stesc[1] == '\0'); + stesc[0] = '\0'; + } + /* The line ends without continuation or comment. */ if (stesc[0] == '\0') return ROFF_CONT; /* Unescaped byte: skip it. */ - if (stesc[0] != newesc) + if (stesc[0] != ec) continue; - /* Backslash at end of line requests line continuation. */ + /* + * XXX Ugly hack: Do not attempt to append another line + * if the function mparse_buf_r() appended a newline + * character to indicate the end of input. + */ + if (stesc[1] == '\n') { + assert(stesc[2] == '\0'); + stesc[0] = '\0'; + return ROFF_CONT; + } + + /* + * An escape character at the end of an input line + * requests line continuation. + */ if (stesc[1] == '\0') { stesc[0] = '\0'; return ROFF_IGN | ROFF_APPEND; @@ -1264,7 +1296,7 @@ roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos, break; /* Escaped escape character: skip them both. */ - if (stesc[1] == newesc) + if (stesc[1] == ec) stesc++; } @@ -1331,325 +1363,218 @@ roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos, * which typically produce output glyphs or change formatter state. 
*/ static int -roff_expand(struct roff *r, struct buf *buf, int ln, int pos, char newesc) +roff_expand(struct roff *r, struct buf *buf, int ln, int pos, char ec) { - struct mctx *ctx; /* current macro call context */ - char ubuf[24]; /* buffer to print the number */ - const char *start; /* start of the string to process */ - char *stesc; /* start of an escape sequence ('\\') */ - const char *esct; /* type of esccape sequence */ - const char *stnam; /* start of the name, after "[(*" */ - const char *cp; /* end of the name, e.g. before ']' */ - const char *res; /* the string to be substituted */ - char *nbuf; /* new buffer to copy buf->buf to */ - size_t maxl; /* expected length of the escape name */ - size_t naml; /* actual length of the escape name */ - size_t asz; /* length of the replacement */ - size_t rsz; /* length of the rest of the string */ - int inaml; /* length returned from mandoc_escape() */ + char ubuf[24]; /* buffer to print a number */ + struct mctx *ctx; /* current macro call context */ + const char *res; /* the string to be pasted */ + const char *src; /* source for copying */ + char *dst; /* destination for copying */ + int iesc; /* index of leading escape char */ + int inam; /* index of the escape name */ + int iarg; /* index beginning the argument */ + int iendarg; /* index right after the argument */ + int iend; /* index right after the sequence */ + int deftype; /* type of definition to paste */ + int argi; /* macro argument index */ + int quote_args; /* true for \\$@, false for \\$* */ + int asz; /* length of the replacement */ + int rsz; /* length of the rest of the string */ + int npos; /* position in numeric expression */ int expand_count; /* to avoid infinite loops */ - int npos; /* position in numeric expression */ - int arg_complete; /* argument not interrupted by eol */ - int quote_args; /* true for \\$@, false for \\$* */ - int deftype; /* type of definition to paste */ - enum mandocerr err; /* for escape sequence problems */ - char 
sign; /* increment number register */ - char term; /* character terminating the escape */ - - start = buf->buf + pos; - stesc = strchr(start, '\0') - 1; - if (stesc >= start && *stesc == '\n') - *stesc-- = '\0'; expand_count = 0; - while (stesc >= start) { - if (*stesc != newesc) { + while (buf->buf[pos] != '\0') { - /* - * If we have a non-standard escape character, - * escape literal backslashes because all - * processing in subsequent functions uses - * the standard escaping rules. - */ + /* + * Skip plain ASCII characters. + * If we have a non-standard escape character, + * escape literal backslashes because all processing in + * subsequent functions uses the standard escaping rules. + */ - if (newesc != ASCII_ESC && *stesc == '\\') { - *stesc = '\0'; - buf->sz = mandoc_asprintf(&nbuf, "%s\\e%s", - buf->buf, stesc + 1) + 1; - start = nbuf + pos; - stesc = nbuf + (stesc - buf->buf); - free(buf->buf); - buf->buf = nbuf; + if (buf->buf[pos] != ec) { + if (ec != ASCII_ESC && buf->buf[pos] == '\\') { + roff_expand_patch(buf, pos, "\\e", pos + 1); + pos++; } - - /* Search backwards for the next escape. */ - - stesc--; + pos++; continue; } - /* If it is escaped, skip it. */ - - for (cp = stesc - 1; cp >= start; cp--) - if (*cp != r->escape) - break; - - if ((stesc - cp) % 2 == 0) { - while (stesc > cp) - *stesc-- = '\\'; - continue; - } else if (stesc[1] == '\0') { - *stesc-- = '\0'; - continue; - } else - *stesc = '\\'; - - /* Decide whether to expand or to check only. */ + /* + * Parse escape sequences, + * issue diagnostic messages when appropriate, + * and skip sequences that do not need expansion. + * If we have a non-standard escape character, translate + * it to backslashes and translate backslashes to \e. 
+ */ - term = '\0'; - cp = stesc + 1; - while (*cp == 'E') - cp++; - esct = cp; - switch (*esct) { - case '*': - case '$': - res = NULL; - break; - case 'B': - case 'w': - term = cp[1]; - /* FALLTHROUGH */ - case 'n': - sign = cp[1]; - if (sign == '+' || sign == '-') - cp++; - res = ubuf; - break; - default: - err = MANDOCERR_OK; - switch(mandoc_escape(&cp, &stnam, &inaml)) { - case ESCAPE_SPECIAL: - if (mchars_spec2cp(stnam, inaml) >= 0) - break; - /* FALLTHROUGH */ - case ESCAPE_ERROR: - err = MANDOCERR_ESC_BAD; - break; - case ESCAPE_UNDEF: - err = MANDOCERR_ESC_UNDEF; - break; - case ESCAPE_UNSUPP: - err = MANDOCERR_ESC_UNSUPP; - break; - default: - break; + if (roff_escape(buf->buf, ln, pos, + &iesc, &iarg, &iendarg, &iend) != ESCAPE_EXPAND) { + while (pos < iend) { + if (buf->buf[pos] == ec) { + buf->buf[pos] = '\\'; + if (pos + 1 < iend) + pos++; + } else if (buf->buf[pos] == '\\') { + roff_expand_patch(buf, + pos, "\\e", pos + 1); + pos++; + iend++; + } + pos++; } - if (err != MANDOCERR_OK) - mandoc_msg(err, ln, (int)(stesc - buf->buf), - "%.*s", (int)(cp - stesc), stesc); - stesc--; continue; } - if (EXPAND_LIMIT < ++expand_count) { - mandoc_msg(MANDOCERR_ROFFLOOP, - ln, (int)(stesc - buf->buf), NULL); - return ROFF_IGN; - } - /* - * The third character decides the length - * of the name of the string or register. - * Save a pointer to the name. + * Treat "\E" just like "\"; + * it only makes a difference in copy mode. */ - if (term == '\0') { - switch (*++cp) { - case '\0': - maxl = 0; - break; - case '(': - cp++; - maxl = 2; - break; - case '[': - cp++; - term = ']'; - maxl = 0; - break; - default: - maxl = 1; - break; - } - } else { - cp += 2; - maxl = 0; - } - stnam = cp; + inam = iesc + 1; + while (buf->buf[inam] == 'E') + inam++; - /* Advance to the end of the name. */ + /* Handle expansion. 
*/ - naml = 0; - arg_complete = 1; - while (maxl == 0 || naml < maxl) { - if (*cp == '\0') { - mandoc_msg(MANDOCERR_ESC_BAD, ln, - (int)(stesc - buf->buf), "%s", stesc); - arg_complete = 0; - break; - } - if (maxl == 0 && *cp == term) { - cp++; - break; - } - if (*cp++ != '\\' || *esct != 'w') { - naml++; - continue; - } - switch (mandoc_escape(&cp, NULL, NULL)) { - case ESCAPE_SPECIAL: - case ESCAPE_UNICODE: - case ESCAPE_NUMBERED: - case ESCAPE_UNDEF: - case ESCAPE_OVERSTRIKE: - naml++; + res = NULL; + switch (buf->buf[inam]) { + case '*': + if (iendarg == iarg) break; - default: + deftype = ROFFDEF_USER | ROFFDEF_PRE; + if ((res = roff_getstrn(r, buf->buf + iarg, + iendarg - iarg, &deftype)) != NULL) break; - } - } - /* - * Retrieve the replacement string; if it is - * undefined, resume searching for escapes. - */ + /* + * If not overridden, + * let \*(.T through to the formatters. + */ - switch (*esct) { - case '*': - if (arg_complete) { - deftype = ROFFDEF_USER | ROFFDEF_PRE; - res = roff_getstrn(r, stnam, naml, &deftype); - - /* - * If not overriden, let \*(.T - * through to the formatters. - */ - - if (res == NULL && naml == 2 && - stnam[0] == '.' && stnam[1] == 'T') { - roff_setstrn(&r->strtab, - ".T", 2, NULL, 0, 0); - stesc--; - continue; - } + if (iendarg - iarg == 2 && + buf->buf[iarg] == '.' && + buf->buf[iarg + 1] == 'T') { + roff_setstrn(&r->strtab, ".T", 2, NULL, 0, 0); + pos = iend; + continue; } + + mandoc_msg(MANDOCERR_STR_UNDEF, ln, iesc, + "%.*s", iendarg - iarg, buf->buf + iarg); break; + case '$': if (r->mstackpos < 0) { - mandoc_msg(MANDOCERR_ARG_UNDEF, ln, - (int)(stesc - buf->buf), "%.3s", stesc); + mandoc_msg(MANDOCERR_ARG_UNDEF, ln, iesc, + "%.*s", iend - iesc, buf->buf + iesc); break; } ctx = r->mstack + r->mstackpos; - npos = esct[1] - '1'; - if (npos >= 0 && npos <= 8) { - res = npos < ctx->argc ? 
- ctx->argv[npos] : ""; + argi = buf->buf[iarg] - '1'; + if (argi >= 0 && argi <= 8) { + if (argi < ctx->argc) + res = ctx->argv[argi]; break; } - if (esct[1] == '*') + if (buf->buf[iarg] == '*') quote_args = 0; - else if (esct[1] == '@') + else if (buf->buf[iarg] == '@') quote_args = 1; else { - mandoc_msg(MANDOCERR_ARG_NONUM, ln, - (int)(stesc - buf->buf), "%.3s", stesc); + mandoc_msg(MANDOCERR_ARG_NONUM, ln, iesc, + "%.*s", iend - iesc, buf->buf + iesc); break; } asz = 0; - for (npos = 0; npos < ctx->argc; npos++) { - if (npos) + for (argi = 0; argi < ctx->argc; argi++) { + if (argi) asz++; /* blank */ if (quote_args) asz += 2; /* quotes */ - asz += strlen(ctx->argv[npos]); + asz += strlen(ctx->argv[argi]); } - if (asz != 3) { - rsz = buf->sz - (stesc - buf->buf) - 3; - if (asz < 3) - memmove(stesc + asz, stesc + 3, rsz); - buf->sz += asz - 3; - nbuf = mandoc_realloc(buf->buf, buf->sz); - start = nbuf + pos; - stesc = nbuf + (stesc - buf->buf); - buf->buf = nbuf; - if (asz > 3) - memmove(stesc + asz, stesc + 3, rsz); + if (asz != iend - iesc) { + rsz = buf->sz - iend; + if (asz < iend - iesc) + memmove(buf->buf + iesc + asz, + buf->buf + iend, rsz); + buf->sz = iesc + asz + rsz; + buf->buf = mandoc_realloc(buf->buf, buf->sz); + if (asz > iend - iesc) + memmove(buf->buf + iesc + asz, + buf->buf + iend, rsz); } - for (npos = 0; npos < ctx->argc; npos++) { - if (npos) - *stesc++ = ' '; + dst = buf->buf + iesc; + for (argi = 0; argi < ctx->argc; argi++) { + if (argi) + *dst++ = ' '; if (quote_args) - *stesc++ = '"'; - cp = ctx->argv[npos]; - while (*cp != '\0') - *stesc++ = *cp++; + *dst++ = '"'; + src = ctx->argv[argi]; + while (*src != '\0') + *dst++ = *src++; if (quote_args) - *stesc++ = '"'; + *dst++ = '"'; } continue; case 'B': npos = 0; - ubuf[0] = arg_complete && - roff_evalnum(r, ln, stnam, &npos, - NULL, ROFFNUM_SCALE) && - stnam + npos + 1 == cp ? 
'1' : '0'; + ubuf[0] = iendarg > iarg && iend > iendarg && + roff_evalnum(r, ln, buf->buf + iarg, &npos, + NULL, ROFFNUM_SCALE) && + npos == iendarg - iarg ? '1' : '0'; ubuf[1] = '\0'; + res = ubuf; break; case 'n': - if (arg_complete) + if (iendarg > iarg) (void)snprintf(ubuf, sizeof(ubuf), "%d", - roff_getregn(r, stnam, naml, sign)); + roff_getregn(r, buf->buf + iarg, + iendarg - iarg, buf->buf[inam + 1])); else ubuf[0] = '\0'; + res = ubuf; break; case 'w': - /* use even incomplete args */ - (void)snprintf(ubuf, sizeof(ubuf), "%d", - 24 * (int)naml); + (void)snprintf(ubuf, sizeof(ubuf), + "%d", (iendarg - iarg) * 24); + res = ubuf; + break; + default: break; } - - if (res == NULL) { - if (*esct == '*') - mandoc_msg(MANDOCERR_STR_UNDEF, - ln, (int)(stesc - buf->buf), - "%.*s", (int)naml, stnam); + if (res == NULL) res = ""; - } else if (buf->sz + strlen(res) > SHRT_MAX) { - mandoc_msg(MANDOCERR_ROFFLOOP, - ln, (int)(stesc - buf->buf), NULL); + if (++expand_count > EXPAND_LIMIT || + buf->sz + strlen(res) > SHRT_MAX) { + mandoc_msg(MANDOCERR_ROFFLOOP, ln, iesc, NULL); return ROFF_IGN; } - - /* Replace the escape sequence by the string. */ - - *stesc = '\0'; - buf->sz = mandoc_asprintf(&nbuf, "%s%s%s", - buf->buf, res, cp) + 1; - - /* Prepare for the next replacement. */ - - start = nbuf + pos; - stesc = nbuf + (stesc - buf->buf) + strlen(res); - free(buf->buf); - buf->buf = nbuf; + roff_expand_patch(buf, iesc, res, iend); } return ROFF_CONT; } /* + * Replace the substring from the start position (inclusive) + * to end position (exclusive) with the repl(acement) string. + */ +static void +roff_expand_patch(struct buf *buf, int start, const char *repl, int end) +{ + char *nbuf; + + buf->buf[start] = '\0'; + buf->sz = mandoc_asprintf(&nbuf, "%s%s%s", buf->buf, repl, + buf->buf + end) + 1; + free(buf->buf); + buf->buf = nbuf; +} + +/* * Parse a quoted or unquoted roff-style request or macro argument. 
* Return a pointer to the parsed argument, which is either the original * pointer or advanced by one byte in case the argument is quoted. diff --git a/roff_escape.c b/roff_escape.c new file mode 100644 index 00000000..1b5dc640 --- /dev/null +++ b/roff_escape.c @@ -0,0 +1,477 @@ +/* $OpenBSD$ */ +/* + * Copyright (c) 2011, 2012, 2013, 2014, 2015, 2017, 2018, 2020, 2022 + * Ingo Schwarze <schwarze@openbsd.org> + * Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Parser for roff(7) escape sequences. + * To be used by all mandoc(1) parsers and formatters. + */ +#include <assert.h> +#include <ctype.h> +#include <limits.h> +#include <stdio.h> +#include <string.h> + +#include "mandoc.h" +#include "roff.h" +#include "roff_int.h" + +/* + * Traditional escape sequence interpreter for general use + * including in high-level formatters. This function does not issue + * diagnostics and is not usable for expansion in the roff(7) parser. + * It is documented in the mandoc_escape(3) manual page. 
+ */ +enum mandoc_esc +mandoc_escape(const char **rendarg, const char **rarg, int *rargl) +{ + int iarg, iendarg, iend; + enum mandoc_esc rval; + + rval = roff_escape(--*rendarg, 0, 0, NULL, &iarg, &iendarg, &iend); + assert(rval != ESCAPE_EXPAND); + if (rarg != NULL) + *rarg = *rendarg + iarg; + if (rargl != NULL) + *rargl = iendarg - iarg; + *rendarg += iend; + return rval; +} + +/* + * Full-featured escape sequence parser. + * If it encounters a nested escape sequence that requires expansion + * by the parser and re-parsing, the positions of that inner escape + * sequence are returned in *resc ... *rend. + * Otherwise, *resc is set to aesc and the positions of the escape + * sequence starting at aesc are returned. + * Diagnostic messages are generated if and only if resc != NULL, + * that is, if and only if called by roff_expand(). + */ +enum mandoc_esc +roff_escape(const char *buf, const int ln, const int aesc, + int *resc, int *rarg, int *rendarg, int *rend) +{ + int iesc; /* index of leading escape char */ + int iarg; /* index beginning the argument */ + int iendarg; /* index right after the argument */ + int iend; /* index right after the sequence */ + int sesc, sarg, sendarg, send; /* for sub-escape */ + int maxl; /* expected length of the argument */ + int argl; /* actual length of the argument */ + int c, i; /* for \[char...] parsing */ + enum mandoc_esc rval; /* return value */ + enum mandocerr err; /* diagnostic code */ + char esc_name; + char term; /* byte terminating the argument */ + + /* + * Treat "\E" just like "\"; + * it only makes a difference in copy mode. + */ + + iesc = iarg = aesc; + do { + iarg++; + } while (buf[iarg] == 'E'); + + /* + * Sort the following cases first by syntax category, + * then by escape sequence type, and finally by ASCII code. + */ + + esc_name = buf[iarg]; + iendarg = iend = ++iarg; + maxl = INT_MAX; + term = '\0'; + switch (esc_name) { + + /* Escape sequences taking no arguments at all. 
*/ + + case '!': + case '?': + rval = ESCAPE_UNSUPP; + goto out; + + case '%': + case '&': + case ')': + case ',': + case '/': + case '^': + case 'a': + case 'd': + case 'r': + case 't': + case 'u': + case '{': + case '|': + case '}': + rval = ESCAPE_IGNORE; + goto out; + + case '\\': + default: + iarg--; + rval = ESCAPE_UNDEF; + goto out; + + case ' ': + case '\'': + case '-': + case '.': + case '0': + case ':': + case '_': + case '`': + case 'e': + case '~': + iarg--; + argl = 1; + rval = ESCAPE_SPECIAL; + goto out; + case 'p': + rval = ESCAPE_BREAK; + goto out; + case 'c': + rval = ESCAPE_NOSPACE; + goto out; + case 'z': + rval = ESCAPE_SKIPCHAR; + goto out; + + /* Standard argument format. */ + + case '$': + case '*': + case 'n': + rval = ESCAPE_EXPAND; + break; + case 'F': + case 'M': + case 'O': + case 'V': + case 'Y': + case 'g': + case 'k': + case 'm': + rval = ESCAPE_IGNORE; + break; + case '(': + case '[': + rval = ESCAPE_SPECIAL; + iendarg = iend = --iarg; + break; + case 'f': + rval = ESCAPE_FONT; + break; + + /* Quoted arguments */ + + case 'B': + case 'w': + rval = ESCAPE_EXPAND; + term = '\b'; + break; + case 'A': + case 'D': + case 'H': + case 'L': + case 'R': + case 'S': + case 'X': + case 'Z': + case 'b': + case 'v': + case 'x': + rval = ESCAPE_IGNORE; + term = '\b'; + break; + case 'C': + if (buf[iarg] != '\'') { + rval = ESCAPE_ERROR; + goto out; + } + rval = ESCAPE_SPECIAL; + term = '\b'; + break; + case 'N': + rval = ESCAPE_NUMBERED; + term = '\b'; + break; + case 'h': + rval = ESCAPE_HORIZ; + term = '\b'; + break; + case 'l': + rval = ESCAPE_HLINE; + term = '\b'; + break; + case 'o': + rval = ESCAPE_OVERSTRIKE; + term = '\b'; + break; + + /* Sizes support both forms, with additional peculiarities. 
*/ + + case 's': + rval = ESCAPE_IGNORE; + if (buf[iarg] == '+' || buf[iarg] == '-'|| + buf[iarg] == ASCII_HYPH) + iarg++; + switch (buf[iarg]) { + case '(': + maxl = 2; + iarg++; + break; + case '[': + term = ']'; + iarg++; + break; + case '\'': + term = '\''; + iarg++; + break; + case '1': + case '2': + case '3': + if (buf[iarg - 1] == 's' && + isdigit((unsigned char)buf[iarg + 1])) { + maxl = 2; + break; + } + /* FALLTHROUGH */ + default: + maxl = 1; + break; + } + iendarg = iend = iarg; + } + + /* Decide how to end the argument. */ + + if ((term == '\b' || (term == '\0' && maxl == INT_MAX)) && + buf[iarg] == buf[iesc] && roff_escape(buf, ln, iendarg, + &sesc, &sarg, &sendarg, &send) == ESCAPE_EXPAND) + goto out_sub; + + if (term == '\b') { + if ((esc_name == 'N' && isdigit((unsigned char)buf[iarg])) || + (esc_name == 'h' && strchr(" %&()*+-./0123456789:<=>", + buf[iarg]) != NULL)) { + iendarg = iend = iarg + 1; + rval = ESCAPE_ERROR; + goto out; + } + term = buf[iarg++]; + } else if (term == '\0' && maxl == INT_MAX) { + if (esc_name == 'n' && (buf[iarg] == '+' || buf[iarg] == '-')) + iarg++; + switch (buf[iarg]) { + case '(': + maxl = 2; + iarg++; + break; + case '[': + if (buf[++iarg] == ' ') { + iendarg = iend = iarg + 1; + rval = ESCAPE_ERROR; + goto out; + } + term = ']'; + break; + default: + maxl = 1; + break; + } + } + + /* Advance to the end of the argument. */ + + iendarg = iarg; + while (maxl > 0) { + if (buf[iendarg] == '\0') { + /* Ignore an incomplete argument except for \w. 
*/ + if (esc_name != 'w') + iendarg = iarg; + break; + } + if (buf[iendarg] == term) { + iend = iendarg + 1; + break; + } + if (esc_name == 'N' && + isdigit((unsigned char)buf[iendarg]) == 0) { + iend = iendarg + 1; + break; + } + if (buf[iendarg] == buf[iesc]) { + if (roff_escape(buf, ln, iendarg, + &sesc, &sarg, &sendarg, &send) == ESCAPE_EXPAND) + goto out_sub; + iendarg = iend = send; + } else { + if (maxl != INT_MAX) + maxl--; + iend = ++iendarg; + } + } + if (resc != NULL && ((maxl != INT_MAX && maxl != 0) || + (term != '\0' && buf[iendarg] != term))) + mandoc_msg(MANDOCERR_ESC_BAD, ln, iesc, "%s", buf + iesc); + + /* Post-process depending on the content of the argument. */ + + argl = iendarg - iarg; + switch (esc_name) { + case '*': + if (resc == NULL && argl == 2 && + buf[iarg] == '.' && buf[iarg + 1] == 'T') + rval = ESCAPE_DEVICE; + break; + case 'O': + switch (buf[iarg]) { + case '0': + rval = ESCAPE_UNSUPP; + break; + case '1': + case '2': + case '3': + case '4': + rval = argl == 1 ? ESCAPE_IGNORE : ESCAPE_ERROR; + break; + case '5': + rval = buf[iarg - 1] == '[' ? ESCAPE_UNSUPP : + ESCAPE_ERROR; + break; + default: + rval = ESCAPE_ERROR; + break; + } + break; + default: + break; + } + + switch (rval) { + case ESCAPE_FONT: + rval = mandoc_font(buf + iarg, argl); + break; + + case ESCAPE_SPECIAL: + + /* + * The file chars.c only provides one common list of + * character names, but \[-] == \- is the only one of + * the characters with one-byte names that allows + * enclosing the name in brackets. + */ + + if (term != '\0' && argl == 1 && buf[iarg] != '-') { + rval = ESCAPE_ERROR; + break; + } + + /* Treat \[char...] as an alias for \N'...'. 
*/ + + if (buf[iarg] == 'c') { + if (argl < 6 || argl > 7 || + strncmp(buf + iarg, "char", 4) != 0 || + (int)strspn(buf + iarg + 4, "0123456789") + + 4 < argl) + break; + c = 0; + /* Accumulate only the decimal digits following the "char" prefix, as the old mandoc_escape() did */ + for (i = iarg + 4; i < iendarg; i++) + c = 10 * c + (buf[i] - '0'); + if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff) + break; + iarg += 4; + rval = ESCAPE_NUMBERED; + break; + } + + /* + * Unicode escapes are defined in groff as \[u0000] + * to \[u10FFFF], where the contained value must be + * a valid Unicode codepoint. Here, however, only + * check the length and range. + */ + + if (buf[iarg] != 'u' || argl < 5 || argl > 7) + break; + if (argl == 7 && + (buf[iarg + 1] != '1' || buf[iarg + 2] != '0')) + break; + if (argl == 6 && buf[iarg + 1] == '0') + break; + if (argl == 5 && buf[iarg + 1] == 'D' && + strchr("89ABCDEF", buf[iarg + 2]) != NULL) + break; + if ((int)strspn(buf + iarg + 1, "0123456789ABCDEFabcdef") + + 1 == argl) + rval = ESCAPE_UNICODE; + break; + default: + break; + } + goto out; + +out_sub: + iesc = sesc; + iarg = sarg; + iendarg = sendarg; + iend = send; + rval = ESCAPE_EXPAND; + +out: + if (rarg != NULL) + *rarg = iarg; + if (rendarg != NULL) + *rendarg = iendarg; + if (rend != NULL) + *rend = iend; + if (resc == NULL) + return rval; + + /* + * Diagnostic messages are only issued when called + * from the parser, not when called from the formatters. 
+ */ + + *resc = iesc; + switch (rval) { + case ESCAPE_ERROR: + err = MANDOCERR_ESC_BAD; + break; + case ESCAPE_UNSUPP: + err = MANDOCERR_ESC_UNSUPP; + break; + case ESCAPE_UNDEF: + if (esc_name == '\\') + return rval; + err = MANDOCERR_ESC_UNDEF; + break; + case ESCAPE_SPECIAL: + if (mchars_spec2cp(buf + iarg, argl) >= 0) + return rval; + err = MANDOCERR_ESC_BAD; + break; + default: + return rval; + } + mandoc_msg(err, ln, iesc, "%.*s", iend - iesc, buf + iesc); + return rval; +} @@ -1,6 +1,6 @@ /* $OpenBSD: roff_int.h,v 1.16 2019/01/05 00:36:46 schwarze Exp $ */ /* - * Copyright (c) 2013-2015, 2017-2020 Ingo Schwarze <schwarze@openbsd.org> + * Copyright (c) 2013-2015, 2017-2022 Ingo Schwarze <schwarze@openbsd.org> * Copyright (c) 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> * * Permission to use, copy, modify, and distribute this software for any @@ -82,6 +82,8 @@ struct ohash *roffhash_alloc(enum roff_tok, enum roff_tok); enum roff_tok roffhash_find(struct ohash *, const char *, size_t); void roffhash_free(struct ohash *); +enum mandoc_esc roff_escape(const char *, const int, const int, + int *, int *, int *, int *); void roff_state_reset(struct roff_man *); void roff_validate(struct roff_man *); |