summaryrefslogtreecommitdiffstats
path: root/mandoc.c
diff options
context:
space:
mode:
author Ingo Schwarze <schwarze@openbsd.org> 2018-12-15 19:30:25 +0000
committer Ingo Schwarze <schwarze@openbsd.org> 2018-12-15 19:30:25 +0000
commit 0e3f0b740ea18224c3b2c07114be601dd8be97bb (patch)
tree c930c6fd7e739e926a7fad1c372897af5ea601fb /mandoc.c
parent 2b0b19a54638a1b40d908611acc8498a911df29c (diff)
download mandoc-0e3f0b740ea18224c3b2c07114be601dd8be97bb.tar.gz
Several improvements to escape sequence handling.
* Add the missing special character \_ (underscore).
* Partial implementations of \a (leader character) and \E (uninterpreted escape character).
* Parse and ignore \r (reverse line feed).
* Add a WARNING message about undefined escape sequences.
* Add an UNSUPP message about unsupported escape sequences.
* Mark \! and \? (transparent throughput) and \O (suppress output) as unsupported.
* Treat the various variants of zero-width spaces as one-byte escape sequences rather than as special characters, to avoid defining bogus forms with square brackets.
* For special characters with one-byte names, do not define bogus forms with square brackets, except for \[-], which is valid.
* In the form with square brackets, undefined special characters do not fall back to printing the name verbatim, not even for one-byte names.
* Starting a special character name with a blank is an error.
* Undefined escape sequences never abort formatting of the input string, not even in HTML output mode.
* Document the newly handled escapes, and a few that were missing.
* Regression tests for most of the above.
Diffstat (limited to 'mandoc.c')
-rw-r--r--mandoc.c94
1 files changed, 78 insertions, 16 deletions
diff --git a/mandoc.c b/mandoc.c
index c20fe33c..aec05f73 100644
--- a/mandoc.c
+++ b/mandoc.c
@@ -56,6 +56,14 @@ mandoc_escape(const char **end, const char **start, int *sz)
sz = &local_sz;
/*
+ * Treat "\E" just like "\";
+ * it only makes a difference in copy mode.
+ */
+
+ if (**end == 'E')
+ ++*end;
+
+ /*
* Beyond the backslash, at least one input character
* is part of the escape sequence. With one exception
* (see below), that character won't be returned.
@@ -77,6 +85,10 @@ mandoc_escape(const char **end, const char **start, int *sz)
*sz = 2;
break;
case '[':
+ if (**start == ' ') {
+ ++*end;
+ return ESCAPE_ERROR;
+ }
gly = ESCAPE_SPECIAL;
term = ']';
break;
@@ -91,11 +103,26 @@ mandoc_escape(const char **end, const char **start, int *sz)
/*
* Escapes taking no arguments at all.
*/
- case 'd':
- case 'u':
+ case '!':
+ case '?':
+ return ESCAPE_UNSUPP;
+ case '%':
+ case '&':
+ case ')':
case ',':
case '/':
+ case '^':
+ case 'a':
+ case 'd':
+ case 'r':
+ case 't':
+ case 'u':
+ case '{':
+ case '|':
+ case '}':
return ESCAPE_IGNORE;
+ case 'c':
+ return ESCAPE_NOSPACE;
case 'p':
return ESCAPE_BREAK;
@@ -113,28 +140,46 @@ mandoc_escape(const char **end, const char **start, int *sz)
* 'X' is the trigger. These have opaque sub-strings.
*/
case 'F':
+ case 'f':
case 'g':
case 'k':
case 'M':
case 'm':
case 'n':
+ case 'O':
case 'V':
case 'Y':
- gly = ESCAPE_IGNORE;
- /* FALLTHROUGH */
- case 'f':
- if (ESCAPE_ERROR == gly)
- gly = ESCAPE_FONT;
+ gly = (*start)[-1] == 'f' ? ESCAPE_FONT : ESCAPE_IGNORE;
switch (**start) {
case '(':
+ if ((*start)[-1] == 'O')
+ gly = ESCAPE_ERROR;
*start = ++*end;
*sz = 2;
break;
case '[':
+ if ((*start)[-1] == 'O')
+ gly = (*start)[1] == '5' ?
+ ESCAPE_UNSUPP : ESCAPE_ERROR;
*start = ++*end;
term = ']';
break;
default:
+ if ((*start)[-1] == 'O') {
+ switch (**start) {
+ case '0':
+ gly = ESCAPE_UNSUPP;
+ break;
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ break;
+ default:
+ gly = ESCAPE_ERROR;
+ break;
+ }
+ }
*sz = 1;
break;
}
@@ -257,18 +302,29 @@ mandoc_escape(const char **end, const char **start, int *sz)
break;
/*
- * Anything else is assumed to be a glyph.
- * In this case, pass back the character after the backslash.
+ * Several special characters can be encoded as
+ * one-byte escape sequences without using \[].
*/
- default:
+ case ' ':
+ case '\'':
+ case '-':
+ case '.':
+ case '0':
+ case ':':
+ case '_':
+ case '`':
+ case 'e':
+ case '~':
gly = ESCAPE_SPECIAL;
+ /* FALLTHROUGH */
+ default:
+ if (gly == ESCAPE_ERROR)
+ gly = ESCAPE_UNDEF;
*start = --*end;
*sz = 1;
break;
}
- assert(ESCAPE_ERROR != gly);
-
/*
* Read up to the terminating character,
* paying attention to nested escapes.
@@ -291,6 +347,15 @@ mandoc_escape(const char **end, const char **start, int *sz)
}
}
*sz = (*end)++ - *start;
+
+ /*
+ * The file chars.c only provides one common list
+ * of character names, but \[-] == \- is the only
+ * one of the characters with one-byte names that
+ * allows enclosing the name in brackets.
+ */
+ if (gly == ESCAPE_SPECIAL && *sz == 1 && **start != '-')
+ return ESCAPE_ERROR;
} else {
assert(*sz > 0);
if ((size_t)*sz > strlen(*start))
@@ -346,10 +411,6 @@ mandoc_escape(const char **end, const char **start, int *sz)
break;
case ESCAPE_SPECIAL:
if (**start == 'c') {
- if (*sz == 1) {
- gly = ESCAPE_NOSPACE;
- break;
- }
if (*sz < 6 || *sz > 7 ||
strncmp(*start, "char", 4) != 0 ||
(int)strspn(*start + 4, "0123456789") + 4 < *sz)
@@ -431,6 +492,7 @@ mandoc_getarg(char **cpp, int ln, int *pos)
* backslashes and backslash-t to literal tabs.
*/
switch (cp[1]) {
+ case 'a':
case 't':
cp[0] = '\t';
/* FALLTHROUGH */