summary | refs | log | tree | commit | diff | stats
path: root/mandocdb.c
diff options
context:
space:
mode:
author: Ingo Schwarze <schwarze@openbsd.org> 2014-12-05 14:26:40 +0000
committer: Ingo Schwarze <schwarze@openbsd.org> 2014-12-05 14:26:40 +0000
commit: 4d6a4a431dd9e4c2c45a3f9691744d7f31fd2c34 (patch)
tree: d41560fffa992cf860ea62785a1b5f8b2b997710 /mandocdb.c
parent: 00da30523681abb2b25dcfc0c8b45ce7cd9892b2 (diff)
download: mandoc-4d6a4a431dd9e4c2c45a3f9691744d7f31fd2c34.tar.gz
Render text before, not after accumulating flag bits, such that flags
for different representations of the same string end up in the same database entry. Improves name classification for 500 manuals.
Diffstat (limited to 'mandocdb.c')
-rw-r--r-- mandocdb.c 223
1 files changed, 107 insertions, 116 deletions
diff --git a/mandocdb.c b/mandocdb.c
index a7c6b4b8..5638eb84 100644
--- a/mandocdb.c
+++ b/mandocdb.c
@@ -84,10 +84,9 @@ enum op {
};
struct str {
- char *rendered; /* key in UTF-8 or ASCII form */
const struct mpage *mpage; /* if set, the owning parse */
uint64_t mask; /* bitmask in sequence */
- char key[]; /* may contain escape sequences */
+ char key[]; /* rendered text */
};
struct inodev {
@@ -139,7 +138,7 @@ struct mdoc_handler {
};
static void dbclose(int);
-static void dbadd(struct mpage *, struct mchars *);
+static void dbadd(struct mpage *);
static void dbadd_mlink(const struct mlink *mlink);
static void dbadd_mlink_name(const struct mlink *mlink);
static int dbopen(int);
@@ -153,7 +152,7 @@ static void mlink_check(struct mpage *, struct mlink *);
static void mlink_free(struct mlink *);
static void mlinks_undupe(struct mpage *);
static void mpages_free(void);
-static void mpages_merge(struct mchars *, struct mparse *);
+static void mpages_merge(struct mparse *);
static void names_check(void);
static void parse_cat(struct mpage *, int);
static void parse_man(struct mpage *, const struct man_meta *,
@@ -179,11 +178,10 @@ static int parse_mdoc_Sh(struct mpage *, const struct mdoc_meta *,
static int parse_mdoc_Xr(struct mpage *, const struct mdoc_meta *,
const struct mdoc_node *);
static void putkey(const struct mpage *, char *, uint64_t);
-static void putkeys(const struct mpage *,
- const char *, size_t, uint64_t);
+static void putkeys(const struct mpage *, char *, size_t, uint64_t);
static void putmdockey(const struct mpage *,
const struct mdoc_node *, uint64_t);
-static void render_key(struct mchars *, struct str *);
+static int render_string(char **, size_t *);
static void say(const char *, const char *, ...);
static int set_basedir(const char *, int);
static int treescan(void);
@@ -200,6 +198,7 @@ static int write_utf8; /* write UTF-8 output; else ASCII */
static int exitcode; /* to be returned by main */
static enum op op; /* operational mode */
static char basedir[PATH_MAX]; /* current base directory */
+static struct mchars *mchars; /* table of named characters */
static struct ohash mpages; /* table of distinct manual pages */
static struct ohash mlinks; /* table of directory entries */
static struct ohash names; /* table of all names */
@@ -341,7 +340,6 @@ main(int argc, char *argv[])
int ch, i;
size_t j, sz;
const char *path_arg;
- struct mchars *mc;
struct manpaths dirs;
struct mparse *mp;
struct ohash_info mpages_info, mlinks_info;
@@ -441,9 +439,9 @@ main(int argc, char *argv[])
}
exitcode = (int)MANDOCLEVEL_OK;
- mc = mchars_alloc();
+ mchars = mchars_alloc();
mp = mparse_alloc(mparse_options, MANDOCLEVEL_FATAL, NULL,
- mc, NULL);
+ mchars, NULL);
ohash_init(&mpages, 6, &mpages_info);
ohash_init(&mlinks, 6, &mlinks_info);
@@ -479,7 +477,7 @@ main(int argc, char *argv[])
goto out;
}
if (OP_DELETE != op)
- mpages_merge(mc, mp);
+ mpages_merge(mp);
dbclose(OP_DEFAULT == op ? 0 : 1);
} else {
/*
@@ -526,7 +524,7 @@ main(int argc, char *argv[])
if (0 == dbopen(0))
continue;
- mpages_merge(mc, mp);
+ mpages_merge(mp);
if (warnings && !nodb &&
! (MPARSE_QUICK & mparse_options))
names_check();
@@ -542,7 +540,7 @@ main(int argc, char *argv[])
out:
manpath_free(&dirs);
mparse_free(mp);
- mchars_free(mc);
+ mchars_free(mchars);
mpages_free();
ohash_delete(&mpages);
ohash_delete(&mlinks);
@@ -1089,7 +1087,7 @@ mlink_check(struct mpage *mpage, struct mlink *mlink)
* and filename to determine whether the file is parsable or not.
*/
static void
-mpages_merge(struct mchars *mc, struct mparse *mp)
+mpages_merge(struct mparse *mp)
{
char any[] = "any";
struct ohash_info str_info;
@@ -1243,7 +1241,7 @@ mpages_merge(struct mchars *mc, struct mparse *mp)
mlink = mlink->next)
mlink_check(mpage, mlink);
- dbadd(mpage, mc);
+ dbadd(mpage);
nextpage:
if (mparse_wait(mp) != MANDOCLEVEL_OK) {
@@ -1602,7 +1600,7 @@ static int
parse_mdoc_Fd(struct mpage *mpage, const struct mdoc_meta *meta,
const struct mdoc_node *n)
{
- const char *start, *end;
+ char *start, *end;
size_t sz;
if (SEC_SYNOPSIS != n->sec ||
@@ -1771,18 +1769,19 @@ parse_mdoc_body(struct mpage *mpage, const struct mdoc_meta *meta,
* When we finish the manual, we'll dump the table.
*/
static void
-putkeys(const struct mpage *mpage,
- const char *cp, size_t sz, uint64_t v)
+putkeys(const struct mpage *mpage, char *cp, size_t sz, uint64_t v)
{
struct ohash *htab;
struct str *s;
const char *end;
unsigned int slot;
- int i;
+ int i, mustfree;
if (0 == sz)
return;
+ mustfree = render_string(&cp, &sz);
+
if (TYPE_Nm & v) {
htab = &names;
v &= name_mask;
@@ -1815,6 +1814,9 @@ putkeys(const struct mpage *mpage,
}
s->mpage = mpage;
s->mask = v;
+
+ if (mustfree)
+ free(cp);
}
/*
@@ -1870,20 +1872,19 @@ utf8(unsigned int cp, char out[7])
}
/*
- * Store the rendered version of a key, or alias the pointer
- * if the key contains no escape sequences.
+ * If the string contains escape sequences,
+ * replace it with an allocated rendering and return 1,
+ * such that the caller can free it after use.
+ * Otherwise, do nothing and return 0.
*/
-static void
-render_key(struct mchars *mc, struct str *key)
+static int
+render_string(char **public, size_t *psz)
{
- size_t sz, bsz, pos;
+ const char *src, *scp, *addcp, *seq;
+ char *dst;
+ size_t ssz, dsz, addsz;
char utfbuf[7], res[6];
- char *buf;
- const char *seq, *cpp, *val;
- int len, u;
- enum mandoc_esc esc;
-
- assert(NULL == key->rendered);
+ int seqlen, unicode;
res[0] = '\\';
res[1] = '\t';
@@ -1892,68 +1893,62 @@ render_key(struct mchars *mc, struct str *key)
res[4] = ASCII_BREAK;
res[5] = '\0';
- val = key->key;
- bsz = strlen(val);
+ src = scp = *public;
+ ssz = *psz;
+ dst = NULL;
+ dsz = 0;
- /*
- * Pre-check: if we have no stop-characters, then set the
- * pointer as ourselvse and get out of here.
- */
- if (strcspn(val, res) == bsz) {
- key->rendered = key->key;
- return;
- }
+ while (scp < src + *psz) {
- /* Pre-allocate by the length of the input */
+ /* Leave normal characters unchanged. */
- buf = mandoc_malloc(++bsz);
- pos = 0;
+ if (strchr(res, *scp) == NULL) {
+ if (dst != NULL)
+ dst[dsz++] = *scp;
+ scp++;
+ continue;
+ }
- while ('\0' != *val) {
/*
- * Halt on the first escape sequence.
- * This also halts on the end of string, in which case
- * we just copy, fallthrough, and exit the loop.
+ * Found something that requires replacing,
+ * make sure we have a destination buffer.
*/
- if ((sz = strcspn(val, res)) > 0) {
- memcpy(&buf[pos], val, sz);
- pos += sz;
- val += sz;
+
+ if (dst == NULL) {
+ dst = mandoc_malloc(ssz + 1);
+ dsz = scp - src;
+ memcpy(dst, src, dsz);
}
- switch (*val) {
- case ASCII_HYPH:
- buf[pos++] = '-';
- val++;
- continue;
+ /* Handle single-char special characters. */
+
+ switch (*scp) {
+ case '\\':
+ break;
case '\t':
/* FALLTHROUGH */
case ASCII_NBRSP:
- buf[pos++] = ' ';
- val++;
+ dst[dsz++] = ' ';
+ scp++;
+ continue;
+ case ASCII_HYPH:
+ dst[dsz++] = '-';
/* FALLTHROUGH */
case ASCII_BREAK:
+ scp++;
continue;
default:
- break;
+ abort();
}
- if ('\\' != *val)
- break;
-
- /* Read past the slash. */
-
- val++;
/*
- * Parse the escape sequence and see if it's a
- * predefined character or special character.
+ * Found an escape sequence.
+ * Read past the slash, then parse it.
+ * Ignore everything except characters.
*/
- esc = mandoc_escape((const char **)&val,
- &seq, &len);
- if (ESCAPE_ERROR == esc)
- break;
- if (ESCAPE_SPECIAL != esc)
+ scp++;
+ if (mandoc_escape(&scp, &seq, &seqlen) != ESCAPE_SPECIAL)
continue;
/*
@@ -1962,32 +1957,44 @@ render_key(struct mchars *mc, struct str *key)
*/
if (write_utf8) {
- if ((u = mchars_spec2cp(mc, seq, len)) <= 0)
+ unicode = mchars_spec2cp(mchars, seq, seqlen);
+ if (unicode <= 0)
continue;
- cpp = utfbuf;
- if (0 == (sz = utf8(u, utfbuf)))
+ addsz = utf8(unicode, utfbuf);
+ if (addsz == 0)
continue;
- sz = strlen(cpp);
+ addcp = utfbuf;
} else {
- cpp = mchars_spec2str(mc, seq, len, &sz);
- if (NULL == cpp)
+ addcp = mchars_spec2str(mchars, seq, seqlen, &addsz);
+ if (addcp == NULL)
continue;
- if (ASCII_NBRSP == *cpp) {
- cpp = " ";
- sz = 1;
+ if (*addcp == ASCII_NBRSP) {
+ addcp = " ";
+ addsz = 1;
}
}
/* Copy the rendered glyph into the stream. */
- bsz += sz;
- buf = mandoc_realloc(buf, bsz);
- memcpy(&buf[pos], cpp, sz);
- pos += sz;
+ ssz += addsz;
+ dst = mandoc_realloc(dst, ssz + 1);
+ memcpy(dst + dsz, addcp, addsz);
+ dsz += addsz;
}
+ if (dst != NULL) {
+ *public = dst;
+ *psz = dsz;
+ }
+
+ /* Trim trailing whitespace and NUL-terminate. */
- buf[pos] = '\0';
- key->rendered = buf;
+ while (*psz > 0 && (*public)[*psz - 1] == ' ')
+ --*psz;
+ if (dst != NULL) {
+ (*public)[*psz] = '\0';
+ return(1);
+ } else
+ return(0);
}
static void
@@ -2035,28 +2042,24 @@ dbadd_mlink_name(const struct mlink *mlink)
* Also, handle escape sequences at the last possible moment.
*/
static void
-dbadd(struct mpage *mpage, struct mchars *mc)
+dbadd(struct mpage *mpage)
{
struct mlink *mlink;
struct str *key;
+ char *cp;
size_t i;
unsigned int slot;
+ int mustfree;
mlink = mpage->mlinks;
if (nodb) {
for (key = ohash_first(&names, &slot); NULL != key;
- key = ohash_next(&names, &slot)) {
- if (key->rendered != key->key)
- free(key->rendered);
+ key = ohash_next(&names, &slot))
free(key);
- }
for (key = ohash_first(&strings, &slot); NULL != key;
- key = ohash_next(&strings, &slot)) {
- if (key->rendered != key->key)
- free(key->rendered);
+ key = ohash_next(&strings, &slot))
free(key);
- }
if (0 == debug)
return;
while (NULL != mlink) {
@@ -2085,21 +2088,17 @@ dbadd(struct mpage *mpage, struct mchars *mc)
if (debug)
say(mlink->file, "Adding to database");
- i = strlen(mpage->desc) + 1;
- key = mandoc_calloc(1, sizeof(struct str) + i);
- memcpy(key->key, mpage->desc, i);
- render_key(mc, key);
-
+ cp = mpage->desc;
+ i = strlen(cp);
+ mustfree = render_string(&cp, &i);
i = 1;
- SQL_BIND_TEXT(stmts[STMT_INSERT_PAGE], i, key->rendered);
+ SQL_BIND_TEXT(stmts[STMT_INSERT_PAGE], i, cp);
SQL_BIND_INT(stmts[STMT_INSERT_PAGE], i, mpage->form);
SQL_STEP(stmts[STMT_INSERT_PAGE]);
mpage->pageid = sqlite3_last_insert_rowid(db);
sqlite3_reset(stmts[STMT_INSERT_PAGE]);
-
- if (key->rendered != key->key)
- free(key->rendered);
- free(key);
+ if (mustfree)
+ free(cp);
while (NULL != mlink) {
dbadd_mlink(mlink);
@@ -2110,31 +2109,23 @@ dbadd(struct mpage *mpage, struct mchars *mc)
for (key = ohash_first(&names, &slot); NULL != key;
key = ohash_next(&names, &slot)) {
assert(key->mpage == mpage);
- if (NULL == key->rendered)
- render_key(mc, key);
i = 1;
SQL_BIND_INT64(stmts[STMT_INSERT_NAME], i, key->mask);
- SQL_BIND_TEXT(stmts[STMT_INSERT_NAME], i, key->rendered);
+ SQL_BIND_TEXT(stmts[STMT_INSERT_NAME], i, key->key);
SQL_BIND_INT64(stmts[STMT_INSERT_NAME], i, mpage->pageid);
SQL_STEP(stmts[STMT_INSERT_NAME]);
sqlite3_reset(stmts[STMT_INSERT_NAME]);
- if (key->rendered != key->key)
- free(key->rendered);
free(key);
}
for (key = ohash_first(&strings, &slot); NULL != key;
key = ohash_next(&strings, &slot)) {
assert(key->mpage == mpage);
- if (NULL == key->rendered)
- render_key(mc, key);
i = 1;
SQL_BIND_INT64(stmts[STMT_INSERT_KEY], i, key->mask);
- SQL_BIND_TEXT(stmts[STMT_INSERT_KEY], i, key->rendered);
+ SQL_BIND_TEXT(stmts[STMT_INSERT_KEY], i, key->key);
SQL_BIND_INT64(stmts[STMT_INSERT_KEY], i, mpage->pageid);
SQL_STEP(stmts[STMT_INSERT_KEY]);
sqlite3_reset(stmts[STMT_INSERT_KEY]);
- if (key->rendered != key->key)
- free(key->rendered);
free(key);
}
}