diff options
author | Kristaps Dzonsons <kristaps@bsd.lv> | 2011-04-09 15:29:40 +0000 |
---|---|---|
committer | Kristaps Dzonsons <kristaps@bsd.lv> | 2011-04-09 15:29:40 +0000 |
commit | 523c7db3c47602feb761c95b387c6c93138264f5 (patch) | |
tree | b36433935eca0c86910b6a754aef3fa1363fe8bb /mandoc.c | |
parent | e99813aa631956ea6bc1323879f2817f2fe6f761 (diff) | |
download | mandoc-523c7db3c47602feb761c95b387c6c93138264f5.tar.gz |
Remove a2roffdeco() and mandoc_special() functions and replace them with
a public (mandoc.h) function mandoc_escape(), which merges the
functionality of both prior functions.
Reason: code duplication. The a2roffdeco() and mandoc_special()
functions were pretty much the same thing and both quite complex. This
allows one function to receive improvements in (e.g.) subexpression
handling and performance, instead of having to replicate functionality.
As such, the mandoc_escape() function already handles a superset of the
escapes handled in previous versions and has improvements in performance
(using strcspn(), for example) and reliable handling of subexpressions.
This code Works For Me, but may need work to catch any regressions.
Since the benefits are great (leaner code, simpler API), I'd rather have
it in-tree than floating as a patch.
Diffstat (limited to 'mandoc.c')
-rw-r--r-- | mandoc.c | 444 |
1 files changed, 304 insertions, 140 deletions
@@ -35,199 +35,363 @@ static int a2time(time_t *, const char *, const char *); static char *time2a(time_t); +static int numescape(const char *); -int -mandoc_special(char *p) +/* + * Pass over recursive numerical expressions. This context of this + * function is important: it's only called within character-terminating + * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial + * recursion: we don't care about what's in these blocks. + * This returns the number of characters skipped or -1 if an error + * occurs (the caller should bail). + */ +static int +numescape(const char *start) +{ + int i; + size_t sz; + const char *cp; + + i = 0; + + /* The expression consists of a subexpression. */ + + if ('\\' == start[i]) { + cp = &start[++i]; + /* + * Read past the end of the subexpression. + * Bail immediately on errors. + */ + if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL)) + return(-1); + return(i + cp - &start[i]); + } + + if ('(' != start[i++]) + return(0); + + /* + * A parenthesised subexpression. Read until the closing + * parenthesis, making sure to handle any nested subexpressions + * that might ruin our parse. + */ + + while (')' != start[i]) { + sz = strcspn(&start[i], ")\\"); + i += (int)sz; + + if ('\0' == start[i]) + return(-1); + else if ('\\' != start[i]) + continue; + + cp = &start[++i]; + if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL)) + return(-1); + i += cp - &start[i]; + } + + /* Read past the terminating ')'. */ + return(++i); +} + +/* + * Handle an escaped sequeence. This should be called with any + * string subsequent a `\'. Pass a pointer to this substring as "end"; + * it will be set to the supremum of the parsed escape sequence. If + * this returns ESCAPE_ERROR, the string is bogus and should be thrown + * away. If not ESCAPE_ERROR or ESCAPE_IGNORE, "start" is set to the + * first relevant character of the substring (font, glyph, whatever) of + * length sz. Both "start" and "sz" may be NULL. + */ +enum mandoc_esc +mandoc_escape(const char **end, const char **start, int *sz) { - int len, i; - char term; - char *sv; - - len = 0; + char c, term, numeric; + int i, lim, ssz, rlim; + const char *cp, *rstart; + enum mandoc_esc gly; + + cp = *end; + rstart = cp; + if (start) + *start = rstart; + i = 0; + gly = ESCAPE_ERROR; term = '\0'; - sv = p; + numeric = 0; - assert('\\' == *p); - p++; + switch ((c = cp[i++])) { + /* + * First the glyphs. There are several different forms of + * these, but each eventually returns a substring of the glyph + * name. + */ + case ('('): + gly = ESCAPE_SPECIAL; + lim = 2; + break; + case ('['): + gly = ESCAPE_SPECIAL; + term = ']'; + break; + case ('C'): + if ('\'' != cp[i]) + return(ESCAPE_ERROR); + gly = ESCAPE_SPECIAL; + term = '\''; + break; - switch (*p++) { -#if 0 - case ('Z'): + /* + * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where + * 'X' is the trigger. These have opaque sub-strings. + */ + case ('F'): /* FALLTHROUGH */ - case ('X'): + case ('g'): /* FALLTHROUGH */ - case ('x'): + case ('k'): /* FALLTHROUGH */ - case ('S'): + case ('M'): /* FALLTHROUGH */ - case ('R'): + case ('m'): /* FALLTHROUGH */ - case ('N'): + case ('n'): /* FALLTHROUGH */ - case ('l'): + case ('V'): /* FALLTHROUGH */ - case ('L'): + case ('Y'): + if (ESCAPE_ERROR == gly) + gly = ESCAPE_IGNORE; /* FALLTHROUGH */ - case ('H'): + case ('*'): + if (ESCAPE_ERROR == gly) + gly = ESCAPE_PREDEF; /* FALLTHROUGH */ - case ('h'): + case ('f'): + if (ESCAPE_ERROR == gly) + gly = ESCAPE_FONT; + + rstart= &cp[i]; + if (start) + *start = rstart; + + switch (cp[i++]) { + case ('('): + lim = 2; + break; + case ('['): + term = ']'; + break; + default: + lim = 1; + i--; + break; + } + break; + + /* + * These escapes are of the form \X'Y', where 'X' is the trigger + * and 'Y' is any string. These have opaque sub-strings. + */ + case ('A'): /* FALLTHROUGH */ - case ('D'): + case ('b'): /* FALLTHROUGH */ - case ('C'): + case ('D'): /* FALLTHROUGH */ - case ('b'): + case ('o'): /* FALLTHROUGH */ - case ('B'): + case ('R'): /* FALLTHROUGH */ - case ('a'): + case ('X'): /* FALLTHROUGH */ - case ('A'): - if (*p++ != '\'') - return(0); + case ('Z'): + if ('\'' != cp[i++]) + return(ESCAPE_ERROR); + gly = ESCAPE_IGNORE; term = '\''; break; -#endif + + /* + * These escapes are of the form \X'N', where 'X' is the trigger + * and 'N' resolves to a numerical expression. + */ + case ('B'): + /* FALLTHROUGH */ case ('h'): /* FALLTHROUGH */ + case ('H'): + /* FALLTHROUGH */ + case ('L'): + /* FALLTHROUGH */ + case ('l'): + /* FALLTHROUGH */ + case ('N'): + if (ESCAPE_ERROR == gly) + gly = ESCAPE_NUMBERED; + /* FALLTHROUGH */ + case ('S'): + /* FALLTHROUGH */ case ('v'): /* FALLTHROUGH */ + case ('w'): + /* FALLTHROUGH */ + case ('x'): + if (ESCAPE_ERROR == gly) + gly = ESCAPE_IGNORE; + if ('\'' != cp[i++]) + return(ESCAPE_ERROR); + term = numeric = '\''; + break; + + /* + * Sizes get a special category of their own. + */ case ('s'): - if (ASCII_HYPH == *p) - *p = '-'; + gly = ESCAPE_IGNORE; - i = 0; - if ('+' == *p || '-' == *p) { - p++; - i = 1; - } + rstart = &cp[i]; + if (start) + *start = rstart; + + /* See +/- counts as a sign. */ + c = cp[i]; + if ('+' == c || '-' == c || ASCII_HYPH == c) + ++i; - switch (*p++) { + switch (cp[i++]) { case ('('): - len = 2; + lim = 2; break; case ('['): - term = ']'; + term = numeric = ']'; break; case ('\''): - term = '\''; + term = numeric = '\''; break; - case ('0'): - i = 1; - /* FALLTHROUGH */ default: - len = 1; - p--; + lim = 1; + i--; break; } - if (ASCII_HYPH == *p) - *p = '-'; - if ('+' == *p || '-' == *p) { - if (i) - return(0); - p++; - } - - /* Handle embedded numerical subexp or escape. */ - - if ('(' == *p) { - while (*p && ')' != *p) - if ('\\' == *p++) { - i = mandoc_special(--p); - if (0 == i) - return(0); - p += i; - } - - if (')' == *p++) - break; + /* See +/- counts as a sign. */ + c = cp[i]; + if ('+' == c || '-' == c || ASCII_HYPH == c) + ++i; - return(0); - } else if ('\\' == *p) { - if (0 == (i = mandoc_special(p))) - return(0); - p += i; - } + break; + /* + * Anything else is assumed to be a glyph. + */ + default: + gly = ESCAPE_SPECIAL; + lim = 1; + i--; break; -#if 0 - case ('Y'): - /* FALLTHROUGH */ - case ('V'): - /* FALLTHROUGH */ - case ('$'): - /* FALLTHROUGH */ - case ('n'): - /* FALLTHROUGH */ -#endif - case ('k'): - /* FALLTHROUGH */ - case ('M'): - /* FALLTHROUGH */ - case ('m'): - /* FALLTHROUGH */ - case ('f'): - /* FALLTHROUGH */ - case ('F'): - /* FALLTHROUGH */ - case ('*'): - switch (*p++) { - case ('('): - len = 2; + } + + assert(ESCAPE_ERROR != gly); + + rstart = &cp[i]; + if (start) + *start = rstart; + + /* + * If a terminating block has been specified, we need to + * handle the case of recursion, which could have their + * own terminating blocks that mess up our parse. This, by the + * way, means that the "start" and "size" values will be + * effectively meaningless. + */ + + ssz = 0; + if (numeric && -1 == (ssz = numescape(&cp[i]))) + return(ESCAPE_ERROR); + + i += ssz; + rlim = -1; + + /* + * We have a character terminator. Try to read up to that + * character. If we can't (i.e., we hit the nil), then return + * an error; if we can, calculate our length, read past the + * terminating character, and exit. + */ + + if ('\0' != term) { + *end = strchr(&cp[i], term); + if ('\0' == *end) + return(ESCAPE_ERROR); + + rlim = *end - &cp[i]; + if (sz) + *sz = rlim; + (*end)++; + goto out; + } + + assert(lim > 0); + + /* + * We have a numeric limit. If the string is shorter than that, + * stop and return an error. Else adjust our endpoint, length, + * and return the current glyph. + */ + + if ((size_t)lim > strlen(&cp[i])) + return(ESCAPE_ERROR); + + rlim = lim; + if (sz) + *sz = rlim; + + *end = &cp[i] + lim; + +out: + assert(rlim >= 0 && rstart); + + /* Run post-processors. */ + + switch (gly) { + case (ESCAPE_FONT): + if (1 != rlim) break; - case ('['): - term = ']'; + switch (*rstart) { + case ('3'): + /* FALLTHROUGH */ + case ('B'): + gly = ESCAPE_FONTBOLD; break; - default: - len = 1; - p--; + case ('2'): + /* FALLTHROUGH */ + case ('I'): + gly = ESCAPE_FONTITALIC; break; - } - break; - case ('('): - len = 2; - break; - case ('['): - term = ']'; - break; - case ('z'): - len = 1; - if ('\\' == *p) { - if (0 == (i = mandoc_special(p))) - return(0); - p += i; - return(*p ? (int)(p - sv) : 0); - } - break; - case ('o'): - /* FALLTHROUGH */ - case ('w'): - if ('\'' == *p++) { - term = '\''; + case ('P'): + gly = ESCAPE_FONTPREV; + break; + case ('1'): + /* FALLTHROUGH */ + case ('R'): + gly = ESCAPE_FONTROMAN; break; } - /* FALLTHROUGH */ + case (ESCAPE_SPECIAL): + if (1 != rlim) + break; + if ('c' == *rstart) + gly = ESCAPE_NOSPACE; + break; default: - len = 1; - p--; break; } - if (term) { - for ( ; *p && term != *p; p++) - if (ASCII_HYPH == *p) - *p = '-'; - return(*p ? (int)(p - sv) : 0); - } - - for (i = 0; *p && i < len; i++, p++) - if (ASCII_HYPH == *p) - *p = '-'; - return(i == len ? (int)(p - sv) : 0); + return(gly); } - void * mandoc_calloc(size_t num, size_t size) { |