diff options
author | Kristaps Dzonsons <kristaps@bsd.lv> | 2011-03-20 11:41:24 +0000 |
---|---|---|
committer | Kristaps Dzonsons <kristaps@bsd.lv> | 2011-03-20 11:41:24 +0000 |
commit | 0dd455c3616d3ebf78cdbd98891180fa3e4db3ed (patch) | |
tree | 337b6245a2f58a67a8c98a109601c0f1280995df | |
parent | a94b19b8c2bc397ca6842a84bf286e1456f51d98 (diff) | |
download | mandoc-0dd455c3616d3ebf78cdbd98891180fa3e4db3ed.tar.gz |
Split the document parsing sequence out of main.c and into read.c,
putting the interface into mandoc.h. This effectively reduces the
role of main.c to command-line handling, invoking the parser, and
sending the parser's output to the output handler. The sequence of parsing
(pfile(), pdesc(), etc.) has changed very little apart from the clean-up of
some state variables (curp->fd, etc.).
-rw-r--r-- | Makefile | 6 | ||||
-rw-r--r-- | main.c | 675 | ||||
-rw-r--r-- | mandoc.h | 25 | ||||
-rw-r--r-- | read.c | 611 |
4 files changed, 709 insertions, 608 deletions
@@ -66,15 +66,15 @@ MANSRCS = man_macro.c man.c man_hash.c man_validate.c \ MAINLNS = main.ln mdoc_term.ln chars.ln term.ln tree.ln \ compat.ln man_term.ln html.ln mdoc_html.ln \ man_html.ln out.ln term_ps.ln term_ascii.ln \ - tbl_term.ln tbl_html.ln + tbl_term.ln tbl_html.ln read.ln MAINOBJS = main.o mdoc_term.o chars.o term.o tree.o compat.o \ man_term.o html.o mdoc_html.o man_html.o out.o \ - term_ps.o term_ascii.o tbl_term.o tbl_html.o + term_ps.o term_ascii.o tbl_term.o tbl_html.o read.o MAINSRCS = main.c mdoc_term.c chars.c term.c tree.c compat.c \ man_term.c html.c mdoc_html.c man_html.c out.c \ - term_ps.c term_ascii.c tbl_term.c tbl_html.c + term_ps.c term_ascii.c tbl_term.c tbl_html.c read.c LLNS = llib-llibmdoc.ln llib-llibman.ln llib-lmandoc.ln \ llib-llibmandoc.ln llib-llibroff.ln @@ -19,12 +19,7 @@ #include "config.h" #endif -#include <sys/mman.h> -#include <sys/stat.h> - #include <assert.h> -#include <ctype.h> -#include <fcntl.h> #include <stdio.h> #include <stdint.h> #include <stdlib.h> @@ -37,14 +32,6 @@ #include "man.h" #include "roff.h" -#ifndef MAP_FILE -#define MAP_FILE 0 -#endif - -#define REPARSE_LIMIT 1000 - -/* FIXME: Intel's compiler? LLVM? pcc? 
*/ - #if !defined(__GNUC__) || (__GNUC__ < 2) # if !defined(lint) # define __attribute__(x) @@ -55,43 +42,21 @@ typedef void (*out_mdoc)(void *, const struct mdoc *); typedef void (*out_man)(void *, const struct man *); typedef void (*out_free)(void *); -struct buf { - char *buf; - size_t sz; -}; - -enum intt { - INTT_AUTO, - INTT_MDOC, - INTT_MAN -}; - enum outt { - OUTT_ASCII = 0, - OUTT_TREE, - OUTT_HTML, - OUTT_XHTML, - OUTT_LINT, - OUTT_PS, - OUTT_PDF + OUTT_ASCII = 0, /* -Tascii */ + OUTT_TREE, /* -Ttree */ + OUTT_HTML, /* -Thtml */ + OUTT_XHTML, /* -Txhtml */ + OUTT_LINT, /* -Tlint */ + OUTT_PS, /* -Tps */ + OUTT_PDF /* -Tpdf */ }; struct curparse { - enum mandoclevel exit_status; /* status of all file parses */ + struct mparse *mp; const char *file; /* current file-name */ - enum mandoclevel file_status; /* error status of current parse */ - int fd; /* current file-descriptor */ - int line; /* line number in the file */ enum mandoclevel wlevel; /* ignore messages below this */ int wstop; /* stop after a file with a warning */ - enum intt inttype; /* which parser to use */ - struct man *pman; /* persistent man parser */ - struct mdoc *pmdoc; /* persistent mdoc parser */ - struct man *man; /* man parser */ - struct mdoc *mdoc; /* mdoc parser */ - struct roff *roff; /* roff parser (!NULL) */ - struct regset regs; /* roff registers */ - int reparse_count; /* finite interpolation stack */ enum outt outtype; /* which output to use */ out_mdoc outmdoc; /* mdoc output ptr */ out_man outman; /* man output ptr */ @@ -227,15 +192,13 @@ static const char * const mandocerrs[MANDOCERR_MAX] = { "static buffer exhausted", }; -static void parsebuf(struct curparse *, struct buf, int); -static void pdesc(struct curparse *); -static void fdesc(struct curparse *); -static void ffile(const char *, struct curparse *); -static int pfile(const char *, struct curparse *); -static int moptions(enum intt *, char *); +static void evt_close(void *, const char *); +static int 
evt_open(void *, const char *); +static int moptions(enum mparset *, char *); static void mmsg(enum mandocerr, void *, int, int, const char *); -static void pset(const char *, int, struct curparse *); +static void parse(struct curparse *, int, + const char *, enum mandoclevel *); static int toptions(struct curparse *, char *); static void usage(void) __attribute__((noreturn)); static void version(void) __attribute__((noreturn)); @@ -248,6 +211,8 @@ main(int argc, char *argv[]) { int c; struct curparse curp; + enum mparset type; + enum mandoclevel rc; progname = strrchr(argv[0], '/'); if (progname == NULL) @@ -257,16 +222,15 @@ main(int argc, char *argv[]) memset(&curp, 0, sizeof(struct curparse)); - curp.inttype = INTT_AUTO; + type = MPARSE_AUTO; curp.outtype = OUTT_ASCII; curp.wlevel = MANDOCLEVEL_FATAL; - curp.exit_status = MANDOCLEVEL_OK; /* LINTED */ while (-1 != (c = getopt(argc, argv, "m:O:T:VW:"))) switch (c) { case ('m'): - if ( ! moptions(&curp.inttype, optarg)) + if ( ! moptions(&type, optarg)) return((int)MANDOCLEVEL_BADARG); break; case ('O'): @@ -289,50 +253,44 @@ main(int argc, char *argv[]) /* NOTREACHED */ } + curp.mp = mparse_alloc(type, evt_open, evt_close, mmsg, &curp); + argc -= optind; argv += optind; - if (NULL == *argv) { - curp.file = "<stdin>"; - curp.fd = STDIN_FILENO; + rc = MANDOCLEVEL_OK; - fdesc(&curp); - } + if (NULL == *argv) + parse(&curp, STDIN_FILENO, "<stdin>", &rc); while (*argv) { - ffile(*argv, &curp); - if (MANDOCLEVEL_OK != curp.exit_status && curp.wstop) + parse(&curp, -1, *argv, &rc); + if (MANDOCLEVEL_OK != rc && curp.wstop) break; ++argv; } if (curp.outfree) (*curp.outfree)(curp.outdata); - if (curp.pmdoc) - mdoc_free(curp.pmdoc); - if (curp.pman) - man_free(curp.pman); - if (curp.roff) - roff_free(curp.roff); - - return((int)curp.exit_status); -} + if (curp.mp) + mparse_free(curp.mp); + return((int)rc); +} static void version(void) { - (void)printf("%s %s\n", progname, VERSION); + printf("%s %s\n", progname, VERSION); 
exit((int)MANDOCLEVEL_OK); } - static void usage(void) { - (void)fprintf(stderr, "usage: %s " + fprintf(stderr, "usage: %s " "[-V] " "[-foption] " "[-mformat] " @@ -345,202 +303,49 @@ usage(void) exit((int)MANDOCLEVEL_BADARG); } -static void -ffile(const char *file, struct curparse *curp) -{ - - /* - * Called once per input file. Get the file ready for reading, - * pass it through to the parser-driver, then close it out. - * XXX: don't do anything special as this is only called for - * files; stdin goes directly to fdesc(). - */ - - curp->file = file; - - if (-1 == (curp->fd = open(curp->file, O_RDONLY, 0))) { - perror(curp->file); - curp->exit_status = MANDOCLEVEL_SYSERR; - return; - } - - fdesc(curp); - - if (-1 == close(curp->fd)) - perror(curp->file); -} - static int -pfile(const char *file, struct curparse *curp) +evt_open(void *arg, const char *file) { - const char *savefile; - int fd, savefd; - if (-1 == (fd = open(file, O_RDONLY, 0))) { - perror(file); - curp->file_status = MANDOCLEVEL_SYSERR; - return(0); - } - - savefile = curp->file; - savefd = curp->fd; - - curp->file = file; - curp->fd = fd; - - pdesc(curp); - - curp->file = savefile; - curp->fd = savefd; - - if (-1 == close(fd)) - perror(file); - - return(MANDOCLEVEL_FATAL > curp->file_status ? 1 : 0); + evt_close(arg, file); + return(1); } - static void -resize_buf(struct buf *buf, size_t initial) -{ - - buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial; - buf->buf = mandoc_realloc(buf->buf, buf->sz); -} - - -static int -read_whole_file(struct curparse *curp, struct buf *fb, int *with_mmap) +evt_close(void *arg, const char *file) { - struct stat st; - size_t off; - ssize_t ssz; - - if (-1 == fstat(curp->fd, &st)) { - perror(curp->file); - return(0); - } - - /* - * If we're a regular file, try just reading in the whole entry - * via mmap(). 
This is faster than reading it into blocks, and - * since each file is only a few bytes to begin with, I'm not - * concerned that this is going to tank any machines. - */ - - if (S_ISREG(st.st_mode)) { - if (st.st_size >= (1U << 31)) { - fprintf(stderr, "%s: input too large\n", - curp->file); - return(0); - } - *with_mmap = 1; - fb->sz = (size_t)st.st_size; - fb->buf = mmap(NULL, fb->sz, PROT_READ, - MAP_FILE|MAP_SHARED, curp->fd, 0); - if (fb->buf != MAP_FAILED) - return(1); - } - - /* - * If this isn't a regular file (like, say, stdin), then we must - * go the old way and just read things in bit by bit. - */ - - *with_mmap = 0; - off = 0; - fb->sz = 0; - fb->buf = NULL; - for (;;) { - if (off == fb->sz) { - if (fb->sz == (1U << 31)) { - fprintf(stderr, "%s: input too large\n", - curp->file); - break; - } - resize_buf(fb, 65536); - } - ssz = read(curp->fd, fb->buf + (int)off, fb->sz - off); - if (ssz == 0) { - fb->sz = off; - return(1); - } - if (ssz == -1) { - perror(curp->file); - break; - } - off += (size_t)ssz; - } + struct curparse *p; - free(fb->buf); - fb->buf = NULL; - return(0); + p = (struct curparse *)arg; + p->file = file; } - static void -fdesc(struct curparse *curp) +parse(struct curparse *curp, int fd, + const char *file, enum mandoclevel *level) { + enum mandoclevel rc; + struct mdoc *mdoc; + struct man *man; - /* - * Called once per file with an opened file descriptor. All - * pre-file-parse operations (whether stdin or a file) should go - * here. - * - * This calls down into the nested parser, which drills down and - * fully parses a file and all its dependences (i.e., `so'). It - * then runs the cleanup validators and pushes to output. - */ - - /* Zero the parse type. */ - - curp->mdoc = NULL; - curp->man = NULL; - curp->file_status = MANDOCLEVEL_OK; + /* Begin by parsing the file itself. */ - /* Make sure the mandotory roff parser is initialised. 
*/ - - if (NULL == curp->roff) { - curp->roff = roff_alloc(&curp->regs, curp, mmsg); - assert(curp->roff); - } - - /* Fully parse the file. */ - - pdesc(curp); - - if (MANDOCLEVEL_FATAL <= curp->file_status) - goto cleanup; - - /* NOTE a parser may not have been assigned, yet. */ - - if ( ! (curp->man || curp->mdoc)) { - fprintf(stderr, "%s: Not a manual\n", curp->file); - curp->file_status = MANDOCLEVEL_FATAL; - goto cleanup; - } + assert(file); + assert(fd >= -1); - /* Clean up the parse routine ASTs. */ + rc = mparse_readfd(curp->mp, fd, file); - if (curp->mdoc && ! mdoc_endparse(curp->mdoc)) { - assert(MANDOCLEVEL_FATAL <= curp->file_status); - goto cleanup; - } + /* Stop immediately if the parse has failed. */ - if (curp->man && ! man_endparse(curp->man)) { - assert(MANDOCLEVEL_FATAL <= curp->file_status); + if (MANDOCLEVEL_FATAL <= rc) goto cleanup; - } - - assert(curp->roff); - roff_endparse(curp->roff); /* - * With -Wstop and warnings or errors of at least - * the requested level, do not produce output. + * With -Wstop and warnings or errors of at least the requested + * level, do not produce output. */ - if (MANDOCLEVEL_OK != curp->file_status && curp->wstop) + if (MANDOCLEVEL_OK != rc && curp->wstop) goto cleanup; /* If unset, allocate output dev now (if applicable). */ @@ -594,373 +399,33 @@ fdesc(struct curparse *curp) } } + mparse_result(curp->mp, &mdoc, &man); + /* Execute the out device, if it exists. */ - if (curp->man && curp->outman) - (*curp->outman)(curp->outdata, curp->man); - if (curp->mdoc && curp->outmdoc) - (*curp->outmdoc)(curp->outdata, curp->mdoc); + if (man && curp->outman) + (*curp->outman)(curp->outdata, man); + if (mdoc && curp->outmdoc) + (*curp->outmdoc)(curp->outdata, mdoc); cleanup: - memset(&curp->regs, 0, sizeof(struct regset)); - - /* Reset the current-parse compilers. 
*/ - - if (curp->mdoc) - mdoc_reset(curp->mdoc); - if (curp->man) - man_reset(curp->man); - - assert(curp->roff); - roff_reset(curp->roff); - - if (curp->exit_status < curp->file_status) - curp->exit_status = curp->file_status; - - return; -} - -static void -pdesc(struct curparse *curp) -{ - struct buf blk; - int with_mmap; - - /* - * Run for each opened file; may be called more than once for - * each full parse sequence if the opened file is nested (i.e., - * from `so'). Simply sucks in the whole file and moves into - * the parse phase for the file. - */ - - if ( ! read_whole_file(curp, &blk, &with_mmap)) { - curp->file_status = MANDOCLEVEL_SYSERR; - return; - } - - /* Line number is per-file. */ - - curp->line = 1; - - parsebuf(curp, blk, 1); - - if (with_mmap) - munmap(blk.buf, blk.sz); - else - free(blk.buf); -} - -/* - * Main parse routine for an opened file. This is called for each - * opened file and simply loops around the full input file, possibly - * nesting (i.e., with `so'). - */ -static void -parsebuf(struct curparse *curp, struct buf blk, int start) -{ - const struct tbl_span *span; - struct buf ln; - enum rofferr rr; - int i, of, rc; - int pos; /* byte number in the ln buffer */ - int lnn; /* line number in the real file */ - unsigned char c; - - memset(&ln, 0, sizeof(struct buf)); - - lnn = curp->line; - pos = 0; - - for (i = 0; i < (int)blk.sz; ) { - if (0 == pos && '\0' == blk.buf[i]) - break; - - if (start) { - curp->line = lnn; - curp->reparse_count = 0; - } - - while (i < (int)blk.sz && (start || '\0' != blk.buf[i])) { - - /* - * When finding an unescaped newline character, - * leave the character loop to process the line. - * Skip a preceding carriage return, if any. - */ - - if ('\r' == blk.buf[i] && i + 1 < (int)blk.sz && - '\n' == blk.buf[i + 1]) - ++i; - if ('\n' == blk.buf[i]) { - ++i; - ++lnn; - break; - } - - /* - * Warn about bogus characters. If you're using - * non-ASCII encoding, you're screwing your - * readers. 
Since I'd rather this not happen, - * I'll be helpful and drop these characters so - * we don't display gibberish. Note to manual - * writers: use special characters. - */ - - c = (unsigned char) blk.buf[i]; - - if ( ! (isascii(c) && - (isgraph(c) || isblank(c)))) { - mmsg(MANDOCERR_BADCHAR, curp, - curp->line, pos, "ignoring byte"); - i++; - continue; - } - - /* Trailing backslash = a plain char. */ - - if ('\\' != blk.buf[i] || i + 1 == (int)blk.sz) { - if (pos >= (int)ln.sz) - resize_buf(&ln, 256); - ln.buf[pos++] = blk.buf[i++]; - continue; - } - - /* - * Found escape and at least one other character. - * When it's a newline character, skip it. - * When there is a carriage return in between, - * skip that one as well. - */ - - if ('\r' == blk.buf[i + 1] && i + 2 < (int)blk.sz && - '\n' == blk.buf[i + 2]) - ++i; - if ('\n' == blk.buf[i + 1]) { - i += 2; - ++lnn; - continue; - } - - if ('"' == blk.buf[i + 1]) { - i += 2; - /* Comment, skip to end of line */ - for (; i < (int)blk.sz; ++i) { - if ('\n' == blk.buf[i]) { - ++i; - ++lnn; - break; - } - } - - /* Backout trailing whitespaces */ - for (; pos > 0; --pos) { - if (ln.buf[pos - 1] != ' ') - break; - if (pos > 2 && ln.buf[pos - 2] == '\\') - break; - } - break; - } - - /* Some other escape sequence, copy & cont. */ - - if (pos + 1 >= (int)ln.sz) - resize_buf(&ln, 256); - - ln.buf[pos++] = blk.buf[i++]; - ln.buf[pos++] = blk.buf[i++]; - } - - if (pos >= (int)ln.sz) - resize_buf(&ln, 256); - - ln.buf[pos] = '\0'; - - /* - * A significant amount of complexity is contained by - * the roff preprocessor. It's line-oriented but can be - * expressed on one line, so we need at times to - * readjust our starting point and re-run it. The roff - * preprocessor can also readjust the buffers with new - * data, so we pass them in wholesale. 
- */ - - of = 0; - -rerun: - rr = roff_parseln - (curp->roff, curp->line, - &ln.buf, &ln.sz, of, &of); - - switch (rr) { - case (ROFF_REPARSE): - if (REPARSE_LIMIT >= ++curp->reparse_count) - parsebuf(curp, ln, 0); - else - mmsg(MANDOCERR_ROFFLOOP, curp, - curp->line, pos, NULL); - pos = 0; - continue; - case (ROFF_APPEND): - pos = (int)strlen(ln.buf); - continue; - case (ROFF_RERUN): - goto rerun; - case (ROFF_IGN): - pos = 0; - continue; - case (ROFF_ERR): - assert(MANDOCLEVEL_FATAL <= curp->file_status); - break; - case (ROFF_SO): - if (pfile(ln.buf + of, curp)) { - pos = 0; - continue; - } else - break; - default: - break; - } - - /* - * If we encounter errors in the recursive parsebuf() - * call, make sure we don't continue parsing. - */ - - if (MANDOCLEVEL_FATAL <= curp->file_status) - break; - - /* - * If input parsers have not been allocated, do so now. - * We keep these instanced betwen parsers, but set them - * locally per parse routine since we can use different - * parsers with each one. - */ - - if ( ! (curp->man || curp->mdoc)) - pset(ln.buf + of, pos - of, curp); - - /* - * Lastly, push down into the parsers themselves. One - * of these will have already been set in the pset() - * routine. - * If libroff returns ROFF_TBL, then add it to the - * currently open parse. Since we only get here if - * there does exist data (see tbl_data.c), we're - * guaranteed that something's been allocated. - * Do the same for ROFF_EQN. - */ - - rc = -1; - - if (ROFF_TBL == rr) - while (NULL != (span = roff_span(curp->roff))) { - rc = curp->man ? - man_addspan(curp->man, span) : - mdoc_addspan(curp->mdoc, span); - if (0 == rc) - break; - } - else if (ROFF_EQN == rr) - rc = curp->mdoc ? - mdoc_addeqn(curp->mdoc, - roff_eqn(curp->roff)) : - man_addeqn(curp->man, - roff_eqn(curp->roff)); - else if (curp->man || curp->mdoc) - rc = curp->man ? 
- man_parseln(curp->man, - curp->line, ln.buf, of) : - mdoc_parseln(curp->mdoc, - curp->line, ln.buf, of); - - if (0 == rc) { - assert(MANDOCLEVEL_FATAL <= curp->file_status); - break; - } - - /* Temporary buffers typically are not full. */ - - if (0 == start && '\0' == blk.buf[i]) - break; - - /* Start the next input line. */ - - pos = 0; - } - - free(ln.buf); -} - -static void -pset(const char *buf, int pos, struct curparse *curp) -{ - int i; - - /* - * Try to intuit which kind of manual parser should be used. If - * passed in by command-line (-man, -mdoc), then use that - * explicitly. If passed as -mandoc, then try to guess from the - * line: either skip dot-lines, use -mdoc when finding `.Dt', or - * default to -man, which is more lenient. - * - * Separate out pmdoc/pman from mdoc/man: the first persists - * through all parsers, while the latter is used per-parse. - */ - - if ('.' == buf[0] || '\'' == buf[0]) { - for (i = 1; buf[i]; i++) - if (' ' != buf[i] && '\t' != buf[i]) - break; - if ('\0' == buf[i]) - return; - } + mparse_reset(curp->mp); - switch (curp->inttype) { - case (INTT_MDOC): - if (NULL == curp->pmdoc) - curp->pmdoc = mdoc_alloc - (&curp->regs, curp, mmsg); - assert(curp->pmdoc); - curp->mdoc = curp->pmdoc; - return; - case (INTT_MAN): - if (NULL == curp->pman) - curp->pman = man_alloc - (&curp->regs, curp, mmsg); - assert(curp->pman); - curp->man = curp->pman; - return; - default: - break; - } - - if (pos >= 3 && 0 == memcmp(buf, ".Dd", 3)) { - if (NULL == curp->pmdoc) - curp->pmdoc = mdoc_alloc - (&curp->regs, curp, mmsg); - assert(curp->pmdoc); - curp->mdoc = curp->pmdoc; - return; - } - - if (NULL == curp->pman) - curp->pman = man_alloc(&curp->regs, curp, mmsg); - assert(curp->pman); - curp->man = curp->pman; + if (*level < rc) + *level = rc; } static int -moptions(enum intt *tflags, char *arg) +moptions(enum mparset *tflags, char *arg) { if (0 == strcmp(arg, "doc")) - *tflags = INTT_MDOC; + *tflags = MPARSE_MDOC; else if (0 == strcmp(arg, 
"andoc")) - *tflags = INTT_AUTO; + *tflags = MPARSE_AUTO; else if (0 == strcmp(arg, "an")) - *tflags = INTT_MAN; + *tflags = MPARSE_MAN; else { fprintf(stderr, "%s: Bad argument\n", arg); return(0); @@ -978,8 +443,7 @@ toptions(struct curparse *curp, char *arg) else if (0 == strcmp(arg, "lint")) { curp->outtype = OUTT_LINT; curp->wlevel = MANDOCLEVEL_WARNING; - } - else if (0 == strcmp(arg, "tree")) + } else if (0 == strcmp(arg, "tree")) curp->outtype = OUTT_TREE; else if (0 == strcmp(arg, "html")) curp->outtype = OUTT_HTML; @@ -1051,12 +515,13 @@ mmsg(enum mandocerr t, void *arg, int ln, int col, const char *msg) if (level < cp->wlevel) return; - fprintf(stderr, "%s:%d:%d: %s: %s", - cp->file, ln, col + 1, mandoclevels[level], mandocerrs[t]); + fprintf(stderr, "%s:%d:%d: %s: %s", cp->file, ln, col + 1, + mandoclevels[level], mandocerrs[t]); + if (msg) fprintf(stderr, ": %s", msg); + fputc('\n', stderr); - if (cp->file_status < level) - cp->file_status = level; + mparse_setstatus(cp->mp, level); } @@ -328,11 +328,36 @@ enum mdelim { DELIM_CLOSE }; +/* + * The type of parse sequence. This value is usually passed via the + * mandoc(1) command line of -man and -mdoc. It's almost exclusively + * -mandoc but the others have been retained for compatibility. 
+ */ +enum mparset { + MPARSE_AUTO, /* magically determine the document type */ + MPARSE_MDOC, /* assume -mdoc */ + MPARSE_MAN /* assume -man */ +}; + typedef void (*mandocmsg)(enum mandocerr, void *, int, int, const char *); +typedef int (*mevt_open)(void *, const char *); +typedef void (*mevt_close)(void *, const char *); + +struct mparse; +struct mdoc; +struct man; __BEGIN_DECLS +void mparse_free(struct mparse *); +void mparse_reset(struct mparse *); +struct mparse *mparse_alloc(enum mparset, mevt_open, + mevt_close, mandocmsg, void *); +void mparse_setstatus(struct mparse *, enum mandoclevel); +enum mandoclevel mparse_readfd(struct mparse *, int, const char *); +void mparse_result(struct mparse *, struct mdoc **, struct man **); + void *mandoc_calloc(size_t, size_t); void *mandoc_malloc(size_t); void *mandoc_realloc(void *, size_t); @@ -0,0 +1,611 @@ +/* $Id$ */ +/* + * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> + * Copyright (c) 2010, 2011 Ingo Schwarze <schwarze@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ +#include <sys/mman.h> +#include <sys/stat.h> + +#include <assert.h> +#include <ctype.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "mandoc.h" +#include "mdoc.h" +#include "man.h" +#include "roff.h" + +#ifndef MAP_FILE +#define MAP_FILE 0 +#endif + +#define REPARSE_LIMIT 1000 + +struct buf { + char *buf; /* binary input buffer */ + size_t sz; /* size of binary buffer */ +}; + +struct mparse { + enum mandoclevel file_status; /* status of current parse */ + int line; /* line number in the file */ + enum mparset inttype; /* which parser to use */ + struct man *pman; /* persistent man parser */ + struct mdoc *pmdoc; /* persistent mdoc parser */ + struct man *man; /* man parser */ + struct mdoc *mdoc; /* mdoc parser */ + struct roff *roff; /* roff parser (!NULL) */ + struct regset regs; /* roff registers */ + int reparse_count; /* finite interp. stack */ + mandocmsg mmsg; /* warning/error message handler */ + void *arg; /* argument to mmsg */ + mevt_open evt_open; /* file-open event */ + mevt_close evt_close; /* file-close event */ + const char *svfile; +}; + +static void resize_buf(struct buf *, size_t); +static void mparse_buf_r(struct mparse *, struct buf, int); +static void mparse_readfd_r(struct mparse *, int, const char *, int); +static void pset(const char *, int, struct mparse *); +static void pdesc(struct mparse *, const char *, int); +static int read_whole_file(const char *, int, struct buf *, int *); +static void mparse_end(struct mparse *); + +static void +resize_buf(struct buf *buf, size_t initial) +{ + + buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial; + buf->buf = mandoc_realloc(buf->buf, buf->sz); +} + +static void +pset(const char *buf, int pos, struct mparse *curp) +{ + int i; + + /* + * Try to intuit which kind of manual parser should be used. If + * passed in by command-line (-man, -mdoc), then use that + * explicitly. 
If passed as -mandoc, then try to guess from the + * line: either skip dot-lines, use -mdoc when finding `.Dt', or + * default to -man, which is more lenient. + * + * Separate out pmdoc/pman from mdoc/man: the first persists + * through all parsers, while the latter is used per-parse. + */ + + if ('.' == buf[0] || '\'' == buf[0]) { + for (i = 1; buf[i]; i++) + if (' ' != buf[i] && '\t' != buf[i]) + break; + if ('\0' == buf[i]) + return; + } + + switch (curp->inttype) { + case (MPARSE_MDOC): + if (NULL == curp->pmdoc) + curp->pmdoc = mdoc_alloc + (&curp->regs, curp->arg, curp->mmsg); + assert(curp->pmdoc); + curp->mdoc = curp->pmdoc; + return; + case (MPARSE_MAN): + if (NULL == curp->pman) + curp->pman = man_alloc + (&curp->regs, curp->arg, curp->mmsg); + assert(curp->pman); + curp->man = curp->pman; + return; + default: + break; + } + + if (pos >= 3 && 0 == memcmp(buf, ".Dd", 3)) { + if (NULL == curp->pmdoc) + curp->pmdoc = mdoc_alloc + (&curp->regs, curp->arg, curp->mmsg); + assert(curp->pmdoc); + curp->mdoc = curp->pmdoc; + return; + } + + if (NULL == curp->pman) + curp->pman = man_alloc + (&curp->regs, curp->arg, curp->mmsg); + assert(curp->pman); + curp->man = curp->pman; +} + +/* + * Main parse routine for an opened file. This is called for each + * opened file and simply loops around the full input file, possibly + * nesting (i.e., with `so'). 
+ */ +static void +mparse_buf_r(struct mparse *curp, struct buf blk, int start) +{ + const struct tbl_span *span; + struct buf ln; + enum rofferr rr; + int i, of, rc; + int pos; /* byte number in the ln buffer */ + int lnn; /* line number in the real file */ + unsigned char c; + + memset(&ln, 0, sizeof(struct buf)); + + lnn = curp->line; + pos = 0; + + for (i = 0; i < (int)blk.sz; ) { + if (0 == pos && '\0' == blk.buf[i]) + break; + + if (start) { + curp->line = lnn; + curp->reparse_count = 0; + } + + while (i < (int)blk.sz && (start || '\0' != blk.buf[i])) { + + /* + * When finding an unescaped newline character, + * leave the character loop to process the line. + * Skip a preceding carriage return, if any. + */ + + if ('\r' == blk.buf[i] && i + 1 < (int)blk.sz && + '\n' == blk.buf[i + 1]) + ++i; + if ('\n' == blk.buf[i]) { + ++i; + ++lnn; + break; + } + + /* + * Warn about bogus characters. If you're using + * non-ASCII encoding, you're screwing your + * readers. Since I'd rather this not happen, + * I'll be helpful and drop these characters so + * we don't display gibberish. Note to manual + * writers: use special characters. + */ + + c = (unsigned char) blk.buf[i]; + + if ( ! (isascii(c) && + (isgraph(c) || isblank(c)))) { + curp->mmsg(MANDOCERR_BADCHAR, curp->arg, + curp->line, pos, "ignoring byte"); + i++; + continue; + } + + /* Trailing backslash = a plain char. */ + + if ('\\' != blk.buf[i] || i + 1 == (int)blk.sz) { + if (pos >= (int)ln.sz) + resize_buf(&ln, 256); + ln.buf[pos++] = blk.buf[i++]; + continue; + } + + /* + * Found escape and at least one other character. + * When it's a newline character, skip it. + * When there is a carriage return in between, + * skip that one as well. 
+ */ + + if ('\r' == blk.buf[i + 1] && i + 2 < (int)blk.sz && + '\n' == blk.buf[i + 2]) + ++i; + if ('\n' == blk.buf[i + 1]) { + i += 2; + ++lnn; + continue; + } + + if ('"' == blk.buf[i + 1]) { + i += 2; + /* Comment, skip to end of line */ + for (; i < (int)blk.sz; ++i) { + if ('\n' == blk.buf[i]) { + ++i; + ++lnn; + break; + } + } + + /* Backout trailing whitespaces */ + for (; pos > 0; --pos) { + if (ln.buf[pos - 1] != ' ') + break; + if (pos > 2 && ln.buf[pos - 2] == '\\') + break; + } + break; + } + + /* Some other escape sequence, copy & cont. */ + + if (pos + 1 >= (int)ln.sz) + resize_buf(&ln, 256); + + ln.buf[pos++] = blk.buf[i++]; + ln.buf[pos++] = blk.buf[i++]; + } + + if (pos >= (int)ln.sz) + resize_buf(&ln, 256); + + ln.buf[pos] = '\0'; + + /* + * A significant amount of complexity is contained by + * the roff preprocessor. It's line-oriented but can be + * expressed on one line, so we need at times to + * readjust our starting point and re-run it. The roff + * preprocessor can also readjust the buffers with new + * data, so we pass them in wholesale. + */ + + of = 0; + +rerun: + rr = roff_parseln + (curp->roff, curp->line, + &ln.buf, &ln.sz, of, &of); + + switch (rr) { + case (ROFF_REPARSE): + if (REPARSE_LIMIT >= ++curp->reparse_count) + mparse_buf_r(curp, ln, 0); + else + curp->mmsg(MANDOCERR_ROFFLOOP, curp->arg, + curp->line, pos, NULL); + pos = 0; + continue; + case (ROFF_APPEND): + pos = (int)strlen(ln.buf); + continue; + case (ROFF_RERUN): + goto rerun; + case (ROFF_IGN): + pos = 0; + continue; + case (ROFF_ERR): + assert(MANDOCLEVEL_FATAL <= curp->file_status); + break; + case (ROFF_SO): + mparse_readfd_r(curp, -1, ln.buf + of, 1); + if (MANDOCLEVEL_FATAL <= curp->file_status) + break; + pos = 0; + continue; + default: + break; + } + + /* + * If we encounter errors in the recursive parse, make + * sure we don't continue parsing. 
+ */ + + if (MANDOCLEVEL_FATAL <= curp->file_status) + break; + + /* + * If input parsers have not been allocated, do so now. + * We keep these instanced betwen parsers, but set them + * locally per parse routine since we can use different + * parsers with each one. + */ + + if ( ! (curp->man || curp->mdoc)) + pset(ln.buf + of, pos - of, curp); + + /* + * Lastly, push down into the parsers themselves. One + * of these will have already been set in the pset() + * routine. + * If libroff returns ROFF_TBL, then add it to the + * currently open parse. Since we only get here if + * there does exist data (see tbl_data.c), we're + * guaranteed that something's been allocated. + * Do the same for ROFF_EQN. + */ + + rc = -1; + + if (ROFF_TBL == rr) + while (NULL != (span = roff_span(curp->roff))) { + rc = curp->man ? + man_addspan(curp->man, span) : + mdoc_addspan(curp->mdoc, span); + if (0 == rc) + break; + } + else if (ROFF_EQN == rr) + rc = curp->mdoc ? + mdoc_addeqn(curp->mdoc, + roff_eqn(curp->roff)) : + man_addeqn(curp->man, + roff_eqn(curp->roff)); + else if (curp->man || curp->mdoc) + rc = curp->man ? + man_parseln(curp->man, + curp->line, ln.buf, of) : + mdoc_parseln(curp->mdoc, + curp->line, ln.buf, of); + + if (0 == rc) { + assert(MANDOCLEVEL_FATAL <= curp->file_status); + break; + } + + /* Temporary buffers typically are not full. */ + + if (0 == start && '\0' == blk.buf[i]) + break; + + /* Start the next input line. */ + + pos = 0; + } + + free(ln.buf); +} + +static void +pdesc(struct mparse *curp, const char *file, int fd) +{ + struct buf blk; + int with_mmap; + + /* + * Run for each opened file; may be called more than once for + * each full parse sequence if the opened file is nested (i.e., + * from `so'). Simply sucks in the whole file and moves into + * the parse phase for the file. + */ + + if ( ! read_whole_file(file, fd, &blk, &with_mmap)) { + curp->file_status = MANDOCLEVEL_SYSERR; + return; + } + + /* Line number is per-file. 
*/ + + curp->line = 1; + + mparse_buf_r(curp, blk, 1); + + if (with_mmap) + munmap(blk.buf, blk.sz); + else + free(blk.buf); +} + +static int +read_whole_file(const char *file, int fd, struct buf *fb, int *with_mmap) +{ + struct stat st; + size_t off; + ssize_t ssz; + + if (-1 == fstat(fd, &st)) { + perror(file); + return(0); + } + + /* + * If we're a regular file, try just reading in the whole entry + * via mmap(). This is faster than reading it into blocks, and + * since each file is only a few bytes to begin with, I'm not + * concerned that this is going to tank any machines. + */ + + if (S_ISREG(st.st_mode)) { + if (st.st_size >= (1U << 31)) { + fprintf(stderr, "%s: input too large\n", file); + return(0); + } + *with_mmap = 1; + fb->sz = (size_t)st.st_size; + fb->buf = mmap(NULL, fb->sz, PROT_READ, + MAP_FILE|MAP_SHARED, fd, 0); + if (fb->buf != MAP_FAILED) + return(1); + } + + /* + * If this isn't a regular file (like, say, stdin), then we must + * go the old way and just read things in bit by bit. + */ + + *with_mmap = 0; + off = 0; + fb->sz = 0; + fb->buf = NULL; + for (;;) { + if (off == fb->sz) { + if (fb->sz == (1U << 31)) { + fprintf(stderr, "%s: input too large\n", file); + break; + } + resize_buf(fb, 65536); + } + ssz = read(fd, fb->buf + (int)off, fb->sz - off); + if (ssz == 0) { + fb->sz = off; + return(1); + } + if (ssz == -1) { + perror(file); + break; + } + off += (size_t)ssz; + } + + free(fb->buf); + fb->buf = NULL; + return(0); +} + +static void +mparse_end(struct mparse *curp) +{ + + if (MANDOCLEVEL_FATAL <= curp->file_status) + return; + + if (curp->mdoc && ! mdoc_endparse(curp->mdoc)) { + assert(MANDOCLEVEL_FATAL <= curp->file_status); + return; + } + + if (curp->man && ! man_endparse(curp->man)) { + assert(MANDOCLEVEL_FATAL <= curp->file_status); + return; + } + +#if 0 + /* NOTE a parser may not have been assigned, yet. */ + + if ( ! (curp->man || curp->mdoc)) { + /* FIXME: make into an mandoc.h error. 
*/ + fprintf(stderr, "%s: Not a manual\n", curp->file); + curp->file_status = MANDOCLEVEL_FATAL; + goto cleanup; + } +#endif + + roff_endparse(curp->roff); +} + +static void +mparse_readfd_r(struct mparse *curp, int fd, const char *file, int re) +{ + const char *svfile; + + if ( ! (*curp->evt_open)(curp->arg, file)) { + curp->file_status = MANDOCLEVEL_SYSERR; + return; + } + + if (-1 == fd) + if (-1 == (fd = open(file, O_RDONLY, 0))) { + perror(file); + curp->file_status = MANDOCLEVEL_SYSERR; + return; + } + + svfile = curp->svfile; + curp->svfile = file; + + pdesc(curp, file, fd); + + if (0 == re && MANDOCLEVEL_FATAL > curp->file_status) + mparse_end(curp); + + if (STDIN_FILENO != fd && -1 == close(fd)) + perror(file); + + (*curp->evt_close)(curp->arg, svfile); + curp->svfile = svfile; +} + +enum mandoclevel +mparse_readfd(struct mparse *curp, int fd, const char *file) +{ + + mparse_readfd_r(curp, fd, file, 0); + return(curp->file_status); +} + +void +mparse_setstatus(struct mparse *curp, enum mandoclevel lvl) +{ + + if (curp->file_status < lvl) + curp->file_status = lvl; +} + +struct mparse * +mparse_alloc(enum mparset inttype, mevt_open eopen, + mevt_close eclose, mandocmsg mmsg, void *arg) +{ + struct mparse *curp; + + curp = mandoc_calloc(1, sizeof(struct mparse)); + + curp->mmsg = mmsg; + curp->arg = arg; + curp->inttype = inttype; + curp->evt_open = eopen; + curp->evt_close = eclose; + + curp->roff = roff_alloc(&curp->regs, arg, mmsg); + return(curp); +} + +void +mparse_reset(struct mparse *curp) +{ + + memset(&curp->regs, 0, sizeof(struct regset)); + + roff_reset(curp->roff); + + if (curp->mdoc) + mdoc_reset(curp->mdoc); + if (curp->man) + man_reset(curp->man); + + curp->file_status = MANDOCLEVEL_OK; + curp->mdoc = NULL; + curp->man = NULL; +} + +void +mparse_free(struct mparse *curp) +{ + + if (curp->pmdoc) + mdoc_free(curp->pmdoc); + if (curp->pman) + man_free(curp->pman); + if (curp->roff) + roff_free(curp->roff); + + free(curp); +} + +void 
+mparse_result(struct mparse *curp, struct mdoc **mdoc, struct man **man) +{ + + *mdoc = curp->mdoc; + *man = curp->man; +} |