From: schwarze <schwarze@openbsd.org>
Date: Fri, 19 Dec 2014 04:57:11 +0000 (+0000)
Subject: Rewrite the low-level UTF-8 parser from scratch.
X-Git-Url: http://artulab.com/gitweb/?a=commitdiff_plain;h=52a7f4662432db837ecf4c838b4be59349e5f106;p=openbsd

Rewrite the low-level UTF-8 parser from scratch.
The old parser accepted invalid byte sequences like 0xc080-c1bf,
0xe08080-e09fbf, 0xeda080-edbfbf, and 0xf0808080-f08fbfbf, produced
valid roff Unicode escape sequences from them, and its algorithm
contained strong defenses against any attempt to fix it.

This cures an assertion failure in the terminal formatter caused
by sneaking in ASCII 0x08 (backspace) by "encoding" it as an (invalid)
multibyte UTF-8 sequence, found by jsg@ with afl.

As a bonus, the new algorithm also reduces the code in the function
by about 20%.
---

diff --git a/regress/usr.bin/mandoc/char/unicode/Makefile b/regress/usr.bin/mandoc/char/unicode/Makefile
index 5fd17875c22..b75ba6dbf07 100644
--- a/regress/usr.bin/mandoc/char/unicode/Makefile
+++ b/regress/usr.bin/mandoc/char/unicode/Makefile
@@ -1,13 +1,13 @@
-# $OpenBSD: Makefile,v 1.3 2014/10/29 03:39:26 schwarze Exp $
+# $OpenBSD: Makefile,v 1.4 2014/12/19 04:57:11 schwarze Exp $
 
-REGRESS_TARGETS  = ascii invalid latin1 latin1diff
+REGRESS_TARGETS  = ascii input invalid latin1 latin1diff
 REGRESS_TARGETS += man mdoc named namediff nogroff
 SKIP_ASCII 	 = man mdoc
 UTF8_TARGETS	 = ${REGRESS_TARGETS}
 HTML_TARGETS	 = ascii invalid latin1 latin1diff named namediff nogroff
-LINT_TARGETS	 = invalid
+LINT_TARGETS	 = input invalid
 
-SKIP_GROFF	 = nogroff
+SKIP_GROFF	 = input nogroff
 SKIP_GROFF_ASCII = latin1diff namediff
 
 .include <bsd.regress.mk>
diff --git a/regress/usr.bin/mandoc/char/unicode/input.in b/regress/usr.bin/mandoc/char/unicode/input.in
new file mode 100644
index 00000000000..15a59f5bfd6
Binary files /dev/null and b/regress/usr.bin/mandoc/char/unicode/input.in differ
diff --git a/regress/usr.bin/mandoc/char/unicode/input.out_ascii b/regress/usr.bin/mandoc/char/unicode/input.out_ascii
new file mode 100644
index 00000000000..a9946d1b528
--- /dev/null
+++ b/regress/usr.bin/mandoc/char/unicode/input.out_ascii
@@ -0,0 +1,67 @@
+CHAR-UNICODE-INPUT(1)       General Commands Manual      CHAR-UNICODE-INPUT(1)
+
+
+
+NNAAMMEE
+       char-unicode-input - Unicode characters in the input file
+
+DDEESSCCRRIIPPTTIIOONN
+       lowest valid: <80>
+
+   OOnnee--bbyyttee rraannggee
+
+       U+0000   0x00   <NUL>?   lowest ASCII
+       U+001f   0x1f   <US>?    highest ASCII control character
+       U+007f   0x7f   <DEL>?   highest ASCII
+                0x80   ?        leading lowest continuation
+                0xbf   ?        leading highest continuation
+
+   TTwwoo--bbyyttee rraannggee
+
+       U+0000   0xc080     ??         lowest obfuscated ASCII
+       U+007f   0xc1bf     ??         highest obfuscated ASCII
+                0xc278     ?x         ASCII continuation
+       U+0080   0xc280     <80><80>   lowest two-byte
+                0xc2c380   ?`A       high continuation
+       U+07FF   0xdfbf     <?><?>     highest two-byte
+
+   TThhrreeee--bbyyttee rraannggee
+
+       U+0000   0xe08080   ???      lowest obfuscated ASCII
+       U+007f   0xe081bf   ???      highest obfuscated ASCII
+       U+0080   0xe08280   ???      lowest obfuscated two-byte
+       U+07FF   0xe09fbf   ???      highest obfuscated two-byte
+       U+0800   0xe0a080   <?><?>   lowest three-byte
+       U+0FFF   0xe0bfbf   <?><?>   end of first middle byte
+       U+1000   0xe18080   <?><?>   begin of second middle byte
+       U+CFFF   0xecbfbf   <?><?>   end of last normal middle byte
+       U+D000   0xed8080   <?><?>   begin of strange middle byte
+       U+D7FF   0xed9fbf   <?><?>   highest public three-byte
+       U+D800   0xeda080   <?>???   lowest surrogate
+       U+DFFF   0xedbfbf   <?>???   highest surrogate
+       U+E000   0xee8080   <?><?>   lowest private use
+       U+FFFF   0xefbfbf   <?><?>   highest three-byte
+
+   FFoouurr--bbyyttee rraannggee
+
+       U+0000     0xf0808080     ????     lowest obfuscated ASCII
+       U+007f     0xf08081bf     ????     highest obfuscated ASCII
+       U+0080     0xf0808280     ????     lowest obfuscated two-byte
+       U+07FF     0xf0809fbf     ????     highest obfuscated two-byte
+       U+0800     0xf080a080     ????     lowest obfuscated three-byte
+       U+FFFF     0xf08fbfbf     ????     highest obfuscated three-byte
+       U+10000    0xf0908080     <?><?>   lowest four-byte
+       U+3FFFF    0xf0bfbfbf     <?><?>   end of first middle byte
+       U+40000    0xf1808080     <?><?>   second middle byte
+       U+FFFFF    0xf3bfbfbf     <?><?>   last normal middle byte
+       U+100000   0xf4808080     <?><?>   strange middle byte
+       U+10FFFF   0xf48fbfbf     <?><?>   last valid four-byte
+       U+110000   0xf4908080     ????     lowest beyond Unicode
+       U+13FFFF   0xf4bfbfbf     ????     end of strange middle byte
+       U+140000   0xf5808080     ????     lowest invalid middle byte
+       U+1FFFFF   0xf7bfbfbf     ????     highest four-byte
+       U+200000   0xf888808080   ?????    lowest five-byte
+
+
+
+OpenBSD                        December 19, 2014         CHAR-UNICODE-INPUT(1)
diff --git a/regress/usr.bin/mandoc/char/unicode/input.out_lint b/regress/usr.bin/mandoc/char/unicode/input.out_lint
new file mode 100644
index 00000000000..77b6161cbab
--- /dev/null
+++ b/regress/usr.bin/mandoc/char/unicode/input.out_lint
@@ -0,0 +1,79 @@
+mandoc: input.in:10:21: ERROR: skipping bad character: 0x0
+mandoc: input.in:11:21: ERROR: skipping bad character: 0x1f
+mandoc: input.in:12:21: ERROR: skipping bad character: 0x7f
+mandoc: input.in:13:7: ERROR: skipping bad character: 0x80
+mandoc: input.in:14:7: ERROR: skipping bad character: 0xbf
+mandoc: input.in:20:15: ERROR: skipping bad character: 0xc0
+mandoc: input.in:20:16: ERROR: skipping bad character: 0x80
+mandoc: input.in:21:15: ERROR: skipping bad character: 0xc1
+mandoc: input.in:21:16: ERROR: skipping bad character: 0xbf
+mandoc: input.in:22:9: ERROR: skipping bad character: 0xc2
+mandoc: input.in:24:11: ERROR: skipping bad character: 0xc2
+mandoc: input.in:31:17: ERROR: skipping bad character: 0xe0
+mandoc: input.in:31:18: ERROR: skipping bad character: 0x80
+mandoc: input.in:31:19: ERROR: skipping bad character: 0x80
+mandoc: input.in:32:17: ERROR: skipping bad character: 0xe0
+mandoc: input.in:32:18: ERROR: skipping bad character: 0x81
+mandoc: input.in:32:19: ERROR: skipping bad character: 0xbf
+mandoc: input.in:33:17: ERROR: skipping bad character: 0xe0
+mandoc: input.in:33:18: ERROR: skipping bad character: 0x82
+mandoc: input.in:33:19: ERROR: skipping bad character: 0x80
+mandoc: input.in:34:17: ERROR: skipping bad character: 0xe0
+mandoc: input.in:34:18: ERROR: skipping bad character: 0x9f
+mandoc: input.in:34:19: ERROR: skipping bad character: 0xbf
+mandoc: input.in:41:25: ERROR: skipping bad character: 0xed
+mandoc: input.in:41:26: ERROR: skipping bad character: 0xa0
+mandoc: input.in:41:27: ERROR: skipping bad character: 0x80
+mandoc: input.in:42:25: ERROR: skipping bad character: 0xed
+mandoc: input.in:42:26: ERROR: skipping bad character: 0xbf
+mandoc: input.in:42:27: ERROR: skipping bad character: 0xbf
+mandoc: input.in:50:19: ERROR: skipping bad character: 0xf0
+mandoc: input.in:50:20: ERROR: skipping bad character: 0x80
+mandoc: input.in:50:21: ERROR: skipping bad character: 0x80
+mandoc: input.in:50:22: ERROR: skipping bad character: 0x80
+mandoc: input.in:51:19: ERROR: skipping bad character: 0xf0
+mandoc: input.in:51:20: ERROR: skipping bad character: 0x80
+mandoc: input.in:51:21: ERROR: skipping bad character: 0x81
+mandoc: input.in:51:22: ERROR: skipping bad character: 0xbf
+mandoc: input.in:52:19: ERROR: skipping bad character: 0xf0
+mandoc: input.in:52:20: ERROR: skipping bad character: 0x80
+mandoc: input.in:52:21: ERROR: skipping bad character: 0x82
+mandoc: input.in:52:22: ERROR: skipping bad character: 0x80
+mandoc: input.in:53:19: ERROR: skipping bad character: 0xf0
+mandoc: input.in:53:20: ERROR: skipping bad character: 0x80
+mandoc: input.in:53:21: ERROR: skipping bad character: 0x9f
+mandoc: input.in:53:22: ERROR: skipping bad character: 0xbf
+mandoc: input.in:54:19: ERROR: skipping bad character: 0xf0
+mandoc: input.in:54:20: ERROR: skipping bad character: 0x80
+mandoc: input.in:54:21: ERROR: skipping bad character: 0xa0
+mandoc: input.in:54:22: ERROR: skipping bad character: 0x80
+mandoc: input.in:55:19: ERROR: skipping bad character: 0xf0
+mandoc: input.in:55:20: ERROR: skipping bad character: 0x8f
+mandoc: input.in:55:21: ERROR: skipping bad character: 0xbf
+mandoc: input.in:55:22: ERROR: skipping bad character: 0xbf
+mandoc: input.in:62:31: ERROR: skipping bad character: 0xf4
+mandoc: input.in:62:32: ERROR: skipping bad character: 0x90
+mandoc: input.in:62:33: ERROR: skipping bad character: 0x80
+mandoc: input.in:62:34: ERROR: skipping bad character: 0x80
+mandoc: input.in:62:21: WARNING: invalid escape sequence: \[u110000]
+mandoc: input.in:63:31: ERROR: skipping bad character: 0xf4
+mandoc: input.in:63:32: ERROR: skipping bad character: 0xbf
+mandoc: input.in:63:33: ERROR: skipping bad character: 0xbf
+mandoc: input.in:63:34: ERROR: skipping bad character: 0xbf
+mandoc: input.in:63:21: WARNING: invalid escape sequence: \[u13FFFF]
+mandoc: input.in:64:31: ERROR: skipping bad character: 0xf5
+mandoc: input.in:64:32: ERROR: skipping bad character: 0x80
+mandoc: input.in:64:33: ERROR: skipping bad character: 0x80
+mandoc: input.in:64:34: ERROR: skipping bad character: 0x80
+mandoc: input.in:64:21: WARNING: invalid escape sequence: \[u140000]
+mandoc: input.in:65:31: ERROR: skipping bad character: 0xf7
+mandoc: input.in:65:32: ERROR: skipping bad character: 0xbf
+mandoc: input.in:65:33: ERROR: skipping bad character: 0xbf
+mandoc: input.in:65:34: ERROR: skipping bad character: 0xbf
+mandoc: input.in:65:21: WARNING: invalid escape sequence: \[u1FFFFF]
+mandoc: input.in:66:33: ERROR: skipping bad character: 0xf8
+mandoc: input.in:66:34: ERROR: skipping bad character: 0x88
+mandoc: input.in:66:35: ERROR: skipping bad character: 0x80
+mandoc: input.in:66:36: ERROR: skipping bad character: 0x80
+mandoc: input.in:66:37: ERROR: skipping bad character: 0x80
+mandoc: input.in:66:23: WARNING: invalid escape sequence: \[u200000]
diff --git a/regress/usr.bin/mandoc/char/unicode/input.out_utf8 b/regress/usr.bin/mandoc/char/unicode/input.out_utf8
new file mode 100644
index 00000000000..fc7a59e1092
--- /dev/null
+++ b/regress/usr.bin/mandoc/char/unicode/input.out_utf8
@@ -0,0 +1,67 @@
+CHAR-UNICODE-INPUT(1)       General Commands Manual      CHAR-UNICODE-INPUT(1)
+
+
+
+NNAAMMEE
+       char-unicode-input - Unicode characters in the input file
+
+DDEESSCCRRIIPPTTIIOONN
+       lowest valid: ï¿½
+
+   OOnnee--bbyyttee rraannggee
+
+       U+0000   0x00   ï¿½?   lowest ASCII
+       U+001f   0x1f   ï¿½?   highest ASCII control character
+       U+007f   0x7f   ï¿½?   highest ASCII
+                0x80   ?    leading lowest continuation
+                0xbf   ?    leading highest continuation
+
+   TTwwoo--bbyyttee rraannggee
+
+       U+0000   0xc080     ??   lowest obfuscated ASCII
+       U+007f   0xc1bf     ??   highest obfuscated ASCII
+                0xc278     ?x   ASCII continuation
+       U+0080   0xc280     ï¿½ï¿½   lowest two-byte
+                0xc2c380   ?Ã   high continuation
+       U+07FF   0xdfbf     ß¿ß¿     highest two-byte
+
+   TThhrreeee--bbyyttee rraannggee
+
+       U+0000   0xe08080   ???    lowest obfuscated ASCII
+       U+007f   0xe081bf   ???    highest obfuscated ASCII
+       U+0080   0xe08280   ???    lowest obfuscated two-byte
+       U+07FF   0xe09fbf   ???    highest obfuscated two-byte
+       U+0800   0xe0a080   à à        lowest three-byte
+       U+0FFF   0xe0bfbf   à¿¿à¿¿       end of first middle byte
+       U+1000   0xe18080   áá     begin of second middle byte
+       U+CFFF   0xecbfbf   ì¿¿ì¿¿   end of last normal middle byte
+       U+D000   0xed8080   íí   begin of strange middle byte
+       U+D7FF   0xed9fbf   í¿í¿       highest public three-byte
+       U+D800   0xeda080   í ???   lowest surrogate
+       U+DFFF   0xedbfbf   í¿¿???   highest surrogate
+       U+E000   0xee8080   îî     lowest private use
+       U+FFFF   0xefbfbf   ï¿¿ï¿¿       highest three-byte
+
+   FFoouurr--bbyyttee rraannggee
+
+       U+0000     0xf0808080     ????    lowest obfuscated ASCII
+       U+007f     0xf08081bf     ????    highest obfuscated ASCII
+       U+0080     0xf0808280     ????    lowest obfuscated two-byte
+       U+07FF     0xf0809fbf     ????    highest obfuscated two-byte
+       U+0800     0xf080a080     ????    lowest obfuscated three-byte
+       U+FFFF     0xf08fbfbf     ????    highest obfuscated three-byte
+       U+10000    0xf0908080     ðð        lowest four-byte
+       U+3FFFF    0xf0bfbfbf     ð¿¿¿ð¿¿¿        end of first middle byte
+       U+40000    0xf1808080     ññ        second middle byte
+       U+FFFFF    0xf3bfbfbf     ó¿¿¿ó¿¿¿        last normal middle byte
+       U+100000   0xf4808080     ôô      strange middle byte
+       U+10FFFF   0xf48fbfbf     ô¿¿ô¿¿        last valid four-byte
+       U+110000   0xf4908080     ????    lowest beyond Unicode
+       U+13FFFF   0xf4bfbfbf     ????    end of strange middle byte
+       U+140000   0xf5808080     ????    lowest invalid middle byte
+       U+1FFFFF   0xf7bfbfbf     ????    highest four-byte
+       U+200000   0xf888808080   ?????   lowest five-byte
+
+
+
+OpenBSD                        December 19, 2014         CHAR-UNICODE-INPUT(1)
diff --git a/usr.bin/mandoc/preconv.c b/usr.bin/mandoc/preconv.c
index 8e4a1739f76..3d5a30655f6 100644
--- a/usr.bin/mandoc/preconv.c
+++ b/usr.bin/mandoc/preconv.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: preconv.c,v 1.4 2014/11/28 19:25:03 schwarze Exp $ */
+/*	$OpenBSD: preconv.c,v 1.5 2014/12/19 04:57:11 schwarze Exp $ */
 /*
  * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
  * Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org>
@@ -17,6 +17,7 @@
  */
 #include <sys/types.h>
 
+#include <assert.h>
 #include <stdio.h>
 #include <string.h>
 #include "mandoc.h"
@@ -26,88 +27,70 @@ int
 preconv_encode(struct buf *ib, size_t *ii, struct buf *ob, size_t *oi,
     int *filenc)
 {
-	size_t		 i;
-	int		 state;
+	unsigned char	*cu;
+	int		 nby;
 	unsigned int	 accum;
-	unsigned char	 cu;
+
+	cu = ib->buf + *ii;
+	assert(*cu & 0x80);
 
 	if ( ! (*filenc & MPARSE_UTF8))
 		goto latin;
 
-	state = 0;
-	accum = 0U;
-
-	for (i = *ii; i < ib->sz; i++) {
-		cu = ib->buf[i];
-		if (state) {
-			if ( ! (cu & 128) || (cu & 64)) {
-				/* Bad sequence header. */
-				break;
-			}
-
-			/* Accept only legitimate bit patterns. */
-
-			if (cu > 191 || cu < 128) {
-				/* Bad in-sequence bits. */
-				break;
-			}
-
-			accum |= (cu & 63) << --state * 6;
-
-			if (state)
-				continue;
-
-			if (accum < 0x80)
-				ob->buf[(*oi)++] = accum;
-			else
-				*oi += snprintf(ob->buf + *oi,
-				    11, "\\[u%.4X]", accum);
-			*ii = i + 1;
-			*filenc &= ~MPARSE_LATIN1;
-			return(1);
-		} else {
-			/*
-			 * Entering a UTF-8 state:  if we encounter a
-			 * UTF-8 bitmask, calculate the expected UTF-8
-			 * state from it.
-			 */
-			for (state = 0; state < 7; state++)
-				if ( ! (cu & (1 << (7 - state))))
-					break;
-
-			/* Accept only legitimate bit patterns. */
-
-			switch (state--) {
-			case (4):
-				if (cu <= 244 && cu >= 240) {
-					accum = (cu & 7) << 18;
-					continue;
-				}
-				/* Bad 4-sequence start bits. */
-				break;
-			case (3):
-				if (cu <= 239 && cu >= 224) {
-					accum = (cu & 15) << 12;
-					continue;
-				}
-				/* Bad 3-sequence start bits. */
-				break;
-			case (2):
-				if (cu <= 223 && cu >= 194) {
-					accum = (cu & 31) << 6;
-					continue;
-				}
-				/* Bad 2-sequence start bits. */
-				break;
-			default:
-				/* Bad sequence bit mask. */
-				break;
-			}
-			break;
-		}
+	nby = 1;
+	while (nby < 5 && *cu & (1 << (7 - nby)))
+		nby++;
+
+	switch (nby) {
+	case 2:
+		accum = *cu & 0x1f;
+		if (accum < 0x02)  /* Obfuscated ASCII. */
+			goto latin;
+		break;
+	case 3:
+		accum = *cu & 0x0f;
+		break;
+	case 4:
+		accum = *cu & 0x07;
+		if (accum > 0x04) /* Beyond Unicode. */
+			goto latin;
+		break;
+	default:  /* Bad sequence header. */
+		goto latin;
+	}
+
+	cu++;
+	switch (nby) {
+	case 3:
+		if ((accum == 0x00 && ! (*cu & 0x20)) ||  /* Use 2-byte. */
+		    (accum == 0x0d && *cu & 0x20))  /* Surrogates. */
+			goto latin;
+		break;
+	case 4:
+		if ((accum == 0x00 && ! (*cu & 0x30)) ||  /* Use 3-byte. */
+		    (accum == 0x04 && *cu & 0x30))  /* Beyond Unicode. */
+			goto latin;
+		break;
+	default:
+		break;
+	}
+
+	while (--nby) {
+		if ((*cu & 0xc0) != 0x80)  /* Invalid continuation. */
+			goto latin;
+		accum <<= 6;
+		accum += *cu & 0x3f;
+		cu++;
 	}
 
-	/* FALLTHROUGH: Invalid or incomplete UTF-8 sequence. */
+	assert(accum > 0x7f);
+	assert(accum < 0x110000);
+	assert(accum < 0xd800 || accum > 0xdfff);
+
+	*oi += snprintf(ob->buf + *oi, 11, "\\[u%.4X]", accum);
+	*ii = (char *)cu - ib->buf;
+	*filenc &= ~MPARSE_LATIN1;
+	return(1);
 
 latin:
 	if ( ! (*filenc & MPARSE_LATIN1))