diff options
author | Ingo Schwarze <schwarze@openbsd.org> | 2014-10-25 01:03:52 +0000 |
---|---|---|
committer | Ingo Schwarze <schwarze@openbsd.org> | 2014-10-25 01:03:52 +0000 |
commit | dd148a56f3f3e29132148b8f2bace859b7590d34 (patch) | |
tree | 5290383cf43df4fcf23e706458fc63fd96eb8cbf /read.c | |
parent | e3f177878d1b2ecad452ce2d7d08861420b2ffbb (diff) | |
download | mandoc-dd148a56f3f3e29132148b8f2bace859b7590d34.tar.gz |
integrate preconv(1) into mandoc(1);
enhances functionality and reduces code and docs by more than 300 lines
Diffstat (limited to 'read.c')
-rw-r--r-- | read.c | 71 |
1 file changed, 52 insertions, 19 deletions
@@ -45,11 +45,6 @@ #define REPARSE_LIMIT 1000 -struct buf { - char *buf; /* binary input buffer */ - size_t sz; /* size of binary buffer */ -}; - struct mparse { struct man *pman; /* persistent man parser */ struct mdoc *pmdoc; /* persistent mdoc parser */ @@ -65,6 +60,7 @@ struct mparse { enum mandoclevel file_status; /* status of current parse */ enum mandoclevel wlevel; /* ignore messages below this */ int options; /* parser options */ + int filenc; /* encoding of the current file */ int reparse_count; /* finite interp. stack */ int line; /* line number in the file */ }; @@ -326,13 +322,20 @@ mparse_buf_r(struct mparse *curp, struct buf blk, int start) lnn = curp->line; pos = 0; - for (i = 0; i < (int)blk.sz; ) { + for (i = blk.offs; i < (int)blk.sz; ) { if (0 == pos && '\0' == blk.buf[i]) break; if (start) { curp->line = lnn; curp->reparse_count = 0; + + if (lnn < 3 && + curp->filenc & MPARSE_UTF8 && + curp->filenc & MPARSE_LATIN1) { + blk.offs = i; + curp->filenc = preconv_cue(&blk); + } } while (i < (int)blk.sz && (start || '\0' != blk.buf[i])) { @@ -353,27 +356,40 @@ mparse_buf_r(struct mparse *curp, struct buf blk, int start) } /* - * Make sure we have space for at least - * one backslash and one other character - * and the trailing NUL byte. + * Make sure we have space for the worst + * case of 11 bytes: "\\[u10ffff]\0" */ - if (pos + 2 >= (int)ln.sz) + if (pos + 11 > (int)ln.sz) resize_buf(&ln, 256); /* - * Warn about bogus characters. If you're using - * non-ASCII encoding, you're screwing your - * readers. Since I'd rather this not happen, - * I'll be helpful and replace these characters - * with "?", so we don't display gibberish. - * Note to manual writers: use special characters. + * Encode 8-bit input. 
*/ - c = (unsigned char) blk.buf[i]; + c = blk.buf[i]; + if (c & 0x80) { + blk.offs = i; + ln.offs = pos; + if (curp->filenc && preconv_encode( + &blk, &ln, &curp->filenc)) { + pos = ln.offs; + i = blk.offs; + } else { + mandoc_vmsg(MANDOCERR_BADCHAR, + curp, curp->line, pos, + "0x%x", c); + ln.buf[pos++] = '?'; + i++; + } + continue; + } + + /* + * Exclude control characters. + */ - if ( ! (isascii(c) && - (isgraph(c) || isblank(c)))) { + if (c == 0x7f || (c < 0x20 && c != 0x09)) { mandoc_vmsg(MANDOCERR_BADCHAR, curp, curp->line, pos, "0x%x", c); i++; @@ -633,6 +649,7 @@ read_whole_file(struct mparse *curp, const char *file, int fd, return(0); } *with_mmap = 1; + fb->offs = 0; fb->sz = (size_t)st.st_size; fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0); if (fb->buf != MAP_FAILED) @@ -664,6 +681,7 @@ read_whole_file(struct mparse *curp, const char *file, int fd, ssz = read(fd, fb->buf + (int)off, fb->sz - off); if (ssz == 0) { fb->sz = off; + fb->offs = 0; return(1); } if (ssz == -1) { @@ -735,6 +753,15 @@ mparse_parse_buffer(struct mparse *curp, struct buf blk, const char *file) curp->line = 1; recursion_depth++; + /* Skip an UTF-8 byte order mark. 
*/ + if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 && + (unsigned char)blk.buf[0] == 0xef && + (unsigned char)blk.buf[1] == 0xbb && + (unsigned char)blk.buf[2] == 0xbf) { + blk.offs = 3; + curp->filenc &= ~MPARSE_LATIN1; + } + mparse_buf_r(curp, blk, 1); if (0 == --recursion_depth && MANDOCLEVEL_FATAL > curp->file_status) @@ -752,6 +779,7 @@ mparse_readmem(struct mparse *curp, const void *buf, size_t len, blk.buf = UNCONST(buf); blk.sz = len; + blk.offs = 0; mparse_parse_buffer(curp, blk, file); return(curp->file_status); @@ -762,6 +790,7 @@ mparse_readfd(struct mparse *curp, int fd, const char *file) { struct buf blk; int with_mmap; + int save_filenc; if (-1 == fd && -1 == (fd = open(file, O_RDONLY, 0))) { curp->file_status = MANDOCLEVEL_SYSERR; @@ -780,7 +809,11 @@ mparse_readfd(struct mparse *curp, int fd, const char *file) */ if (read_whole_file(curp, file, fd, &blk, &with_mmap)) { + save_filenc = curp->filenc; + curp->filenc = curp->options & + (MPARSE_UTF8 | MPARSE_LATIN1); mparse_parse_buffer(curp, blk, file); + curp->filenc = save_filenc; #if HAVE_MMAP if (with_mmap) munmap(blk.buf, blk.sz); |