Distinguish between escape sequences that produce no output

whatsoever (for example \fR) and escape sequences that produce invisible zero-width output (for example \&). No, I'm not joking: groff does make that distinction, and it has consequences in some situations, for example for vertical spacing in no-fill mode. Heirloom and Plan 9 behaviour is subtly different, but in case of doubt, we want to follow groff. While this fixes the behaviour for the majority of escape sequences, in particular for those most likely to occur in practice, it is not perfect yet because some of the more exotic ESCAPE_IGNORE sequences are actually of the "no output whatsoever" type but are treated as "invisible zero-width" for now. With the new ASCII_NBRZW mechanism in place, switching them over one by one when the need arises will no longer be very difficult.
author: Ingo Schwarze <schwarze@openbsd.org> 2022-08-15 18:12:30 +0000
committer: Ingo Schwarze <schwarze@openbsd.org> 2022-08-15 18:12:30 +0000
commit: 6ecd9c5caba944d910343802c058940234492e5a (patch)
tree: 6fe94640b4da8a1edda86aa430d5214b2eb33c59 /term.c
parent: ffa876e5886e09f94c3fe78fbef3cc7ebcd8a611 (diff)
download: mandoc-6ecd9c5caba944d910343802c058940234492e5a.tar.gz
1 files changed, 16 insertions, 6 deletions
diff --git a/term.c b/term.c
index d08a21b5..9823bb92 100644
--- a/term.c
+++ b/term.c
@@ -208,7 +208,6 @@ term_flushln(struct termp *p)
 			return;
 
 		endline(p);
-		p->viscol = 0;
 
 		/*
 		 * Normally, start the next line at the same indentation
@@ -314,6 +313,8 @@ term_fill(struct termp *p, size_t *nbr, size_t *vbr, size_t vtarget)
 				vis = term_tab_next(vis);
 				vis -= p->tcol->taboff;
 				break;
+			case ASCII_NBRZW:  /* Non-breakable zero-width. */
+				break;
 			case ASCII_NBRSP:  /* Non-breakable space. */
 				p->tcol->buf[ic] = ' ';
 				/* FALLTHROUGH */
@@ -365,6 +366,7 @@ term_field(struct termp *p, size_t vbl, size_t nbr)
 		switch (p->tcol->buf[ic]) {
 		case '\n':
 		case ASCII_BREAK:
+		case ASCII_NBRZW:
 			continue;
 		case '\t':
 		case ' ':
@@ -571,18 +573,23 @@ term_word(struct termp *p, const char *word)
 			break;
 		case ESCAPE_NUMBERED:
 			uc = mchars_num2char(seq, sz);
-			if (uc < 0)
-				continue;
-			break;
+			if (uc >= 0)
+				break;
+			bufferc(p, ASCII_NBRZW);
+			continue;
 		case ESCAPE_SPECIAL:
 			if (p->enc == TERMENC_ASCII) {
 				cp = mchars_spec2str(seq, sz, &ssz);
 				if (cp != NULL)
 					encode(p, cp, ssz);
+				else
+					bufferc(p, ASCII_NBRZW);
 			} else {
 				uc = mchars_spec2cp(seq, sz);
 				if (uc > 0)
 					encode1(p, uc);
+				else
+					bufferc(p, ASCII_NBRZW);
 			}
 			continue;
 		case ESCAPE_UNDEF:
@@ -744,6 +751,9 @@ term_word(struct termp *p, const char *word)
 			if (p->col > p->tcol->lastcol)
 				p->col = p->tcol->lastcol;
 			continue;
+		case ESCAPE_IGNORE:
+			bufferc(p, ASCII_NBRZW);
+			continue;
 		default:
 			continue;
 		}
@@ -935,8 +945,8 @@ term_strlen(const struct termp *p, const char *cp)
 	int		 ssz, skip, uc;
 	const char	*seq, *rhs;
 	enum mandoc_esc	 esc;
-	static const char rej[] = { '\\', ASCII_NBRSP, ASCII_HYPH,
-			ASCII_BREAK, '\0' };
+	static const char rej[] = { '\\', ASCII_NBRSP, ASCII_NBRZW,
+		ASCII_BREAK, ASCII_HYPH, '\0' };
 
 	/*
 	 * Account for escaped sequences within string length
author	Ingo Schwarze <schwarze@openbsd.org>	2022-08-15 18:12:30 +0000
committer	Ingo Schwarze <schwarze@openbsd.org>	2022-08-15 18:12:30 +0000
commit	6ecd9c5caba944d910343802c058940234492e5a (patch)
tree	6fe94640b4da8a1edda86aa430d5214b2eb33c59 /term.c
parent	ffa876e5886e09f94c3fe78fbef3cc7ebcd8a611 (diff)
download	mandoc-6ecd9c5caba944d910343802c058940234492e5a.tar.gz