From 61fcdf77510d89258927a8cfc957096701a22191 Mon Sep 17 00:00:00 2001
From: Ingo Schwarze <schwarze@openbsd.org>
Date: Mon, 13 Oct 2014 17:17:45 +0000
Subject: Stricter syntax checking of Unicode character names: Require exactly
 4, 5 or 6 hex digits and allow nothing else. This avoids mishandling stuff
 like \[ua] and \C'uA' as Unicode and also fixes underlining in eqn(7) -Thtml
 output which uses \[ul]. Problem found and semantics suggested by kristaps@.

---
 mandoc.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

(limited to 'mandoc.c')

diff --git a/mandoc.c b/mandoc.c
index f454a40e..db7db0c4 100644
--- a/mandoc.c
+++ b/mandoc.c
@@ -79,24 +79,13 @@ mandoc_escape(const char **end, const char **start, int *sz)
 		break;
 	case '[':
 		gly = ESCAPE_SPECIAL;
-		/*
-		 * Unicode escapes are defined in groff as \[uXXXX] to
-		 * \[u10FFFF], where the contained value must be a valid
-		 * Unicode codepoint.  Here, however, only check whether
-		 * it's not a zero-width escape.
-		 */
-		if ('u' == (*start)[0] && ']' != (*start)[1])
-			gly = ESCAPE_UNICODE;
 		term = ']';
 		break;
 	case 'C':
 		if ('\'' != **start)
 			return(ESCAPE_ERROR);
 		*start = ++*end;
-		if ('u' == (*start)[0] && '\'' != (*start)[1])
-			gly = ESCAPE_UNICODE;
-		else
-			gly = ESCAPE_SPECIAL;
+		gly = ESCAPE_SPECIAL;
 		term = '\'';
 		break;
 
@@ -344,6 +333,16 @@ mandoc_escape(const char **end, const char **start, int *sz)
 	case ESCAPE_SPECIAL:
 		if (1 == *sz && 'c' == **start)
 			gly = ESCAPE_NOSPACE;
+		/*
+		 * Unicode escapes are defined in groff as \[uXXXX]
+		 * to \[u10FFFF], where the contained value must be
+		 * a valid Unicode codepoint.  Here, however, only
+		 * check the length and the validity of all digits.
+		 */
+		else if (*sz > 4 && *sz < 8 && **start == 'u' &&
+		    (int)strspn(*start + 1, "0123456789ABCDEFabcdef")
+		    + 1 == *sz)
+			gly = ESCAPE_UNICODE;
 		break;
 	default:
 		break;
-- 
cgit