From 714ad8829b754f1142d7471b3dbf5e2400f8e96d Mon Sep 17 00:00:00 2001 From: Kristaps Dzonsons Date: Tue, 20 Jul 2010 14:56:42 +0000 Subject: Strip non-graphable input characters from input. The manuals specifically say that this is not allowed, and were it allowed, output would be inconsistent across output media (-Tps will puke, non-your-charset terminals will puke, etc.). With this done, simplify check_text() to only check escapes and for tabs. Add in a new tab warning, too. --- main.c | 22 ++++++++++++++++++++++ man_validate.c | 44 +++++++++++++++++++++++++------------------- mandoc.h | 1 + mdoc_validate.c | 38 +++++++++++++++++++------------------- 4 files changed, 67 insertions(+), 38 deletions(-) diff --git a/main.c b/main.c index 8a997e9a..9d5a90bd 100644 --- a/main.c +++ b/main.c @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -110,6 +111,7 @@ static const char * const mandocerrs[MANDOCERR_MAX] = { "list type must come first", "bad standard", "bad library", + "tab in non-literal context", "bad escape sequence", "unterminated quoted string", "argument requires the width argument", @@ -491,6 +493,26 @@ fdesc(struct curparse *curp) ++lnn; break; } + + /* + * Warn about bogus characters. If you're using + * non-ASCII encoding, you're screwing your + * readers. Since I'd rather this not happen, + * I'll be helpful and drop these characters so + * we don't display gibberish. Note to manual + * writers: use special characters. + */ + + if ( ! isgraph((u_char)blk.buf[i]) && + ! isblank((u_char)blk.buf[i])) { + if ( ! mmsg(MANDOCERR_BADCHAR, curp, + lnn_start, pos, + "ignoring byte")) + goto bailout; + i++; + continue; + } + /* Trailing backslash is like a plain character. */ if ('\\' != blk.buf[i] || i + 1 == (int)blk.sz) { if (pos >= (int)ln.sz) diff --git a/man_validate.c b/man_validate.c index 57f8be9f..0d96953c 100644 --- a/man_validate.c +++ b/man_validate.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "mandoc.h" #include "libman.h" @@ -206,32 +207,37 @@ check_text(CHKARGS) { char *p; int pos, c; - - assert(n->string); + size_t sz; for (p = n->string, pos = n->pos + 1; *p; p++, pos++) { - if ('\\' == *p) { - c = mandoc_special(p); - if (c) { - p += c - 1; - pos += c - 1; - continue; - } + sz = strcspn(p, "\t\\"); + p += (int)sz; + + if ('\0' == *p) + break; + + pos += (int)sz; - c = man_pmsg(m, n->line, pos, MANDOCERR_BADESCAPE); - if ( ! (MAN_IGN_ESCAPE & m->pflags) && ! c) - return(c); + if ('\t' == *p) { + if (MAN_LITERAL & m->flags) + continue; + if (man_pmsg(m, n->line, pos, MANDOCERR_BADTAB)) + continue; + return(0); } - /* - * FIXME: we absolutely cannot let \b get through or it - * will destroy some assumptions in terms of format. - */ + /* Check the special character. */ - if ('\t' == *p || isprint((u_char)*p) || ASCII_HYPH == *p) + c = mandoc_special(p); + if (c) { + p += c - 1; + pos += c - 1; continue; - if ( ! man_pmsg(m, n->line, pos, MANDOCERR_BADCHAR)) - return(0); + } + + c = man_pmsg(m, n->line, pos, MANDOCERR_BADESCAPE); + if ( ! (MAN_IGN_ESCAPE & m->pflags) && ! c) + return(c); } return(1); diff --git a/mandoc.h b/mandoc.h index 8b3dd794..c7ac2a3e 100644 --- a/mandoc.h +++ b/mandoc.h @@ -39,6 +39,7 @@ enum mandocerr { MANDOCERR_LISTFIRST, /* list type must come first */ MANDOCERR_BADSTANDARD, /* bad standard */ MANDOCERR_BADLIB, /* bad library */ + MANDOCERR_BADTAB, /* tab in non-literal context */ MANDOCERR_BADESCAPE, /* bad escape sequence */ MANDOCERR_BADQUOTE, /* unterminated quoted string */ MANDOCERR_NOWIDTHARG, /* argument requires the width argument */ diff --git a/mdoc_validate.c b/mdoc_validate.c index 6bd6c14d..cc00a1ae 100644 --- a/mdoc_validate.c +++ b/mdoc_validate.c @@ -453,26 +453,29 @@ check_argv(struct mdoc *m, struct mdoc_node *n, struct mdoc_argv *v) static int -check_text(struct mdoc *mdoc, int line, int pos, char *p) +check_text(struct mdoc *m, int ln, int pos, char *p) { int c; - - /* - * FIXME: we absolutely cannot let \b get through or it will - * destroy some assumptions in terms of format. - */ + size_t sz; for ( ; *p; p++, pos++) { + sz = strcspn(p, "\t\\"); + p += (int)sz; + + if ('\0' == *p) + break; + + pos += (int)sz; + if ('\t' == *p) { - if ( ! (MDOC_LITERAL & mdoc->flags)) - if ( ! mdoc_pmsg(mdoc, line, pos, MANDOCERR_BADCHAR)) - return(0); - } else if ( ! isprint((u_char)*p) && ASCII_HYPH != *p) - if ( ! mdoc_pmsg(mdoc, line, pos, MANDOCERR_BADCHAR)) - return(0); + if (MDOC_LITERAL & m->flags) + continue; + if (mdoc_pmsg(m, ln, pos, MANDOCERR_BADTAB)) + continue; + return(0); + } - if ('\\' != *p) - continue; + /* Check the special character. */ c = mandoc_special(p); if (c) { @@ -481,8 +484,8 @@ check_text(struct mdoc *mdoc, int line, int pos, char *p) continue; } - c = mdoc_pmsg(mdoc, line, pos, MANDOCERR_BADESCAPE); - if ( ! (MDOC_IGN_ESCAPE & mdoc->pflags) && ! c) + c = mdoc_pmsg(m, ln, pos, MANDOCERR_BADESCAPE); + if ( ! (MDOC_IGN_ESCAPE & m->pflags) && ! c) return(c); } @@ -490,8 +493,6 @@ check_text(struct mdoc *mdoc, int line, int pos, char *p) } - - static int check_parent(PRE_ARGS, enum mdoct tok, enum mdoc_type t) { @@ -509,7 +510,6 @@ check_parent(PRE_ARGS, enum mdoct tok, enum mdoc_type t) } - static int pre_display(PRE_ARGS) { -- cgit