From: schwarze Date: Fri, 19 Dec 2014 04:57:11 +0000 (+0000) Subject: Rewrite the low-level UTF-8 parser from scratch. X-Git-Url: http://artulab.com/gitweb/?a=commitdiff_plain;h=52a7f4662432db837ecf4c838b4be59349e5f106;p=openbsd Rewrite the low-level UTF-8 parser from scratch. It accepted invalid byte sequences like 0xc080-c1bf, 0xe08080-e09fbf, 0xeda080-edbfbf, and 0xf0808080-f08fbfbf, produced valid roff Unicode escape sequences from them, and the algorithm contained strong defenses against any attempt to fix it. This cures an assertion failure in the terminal formatter caused by sneaking in ASCII 0x08 (backspace) by "encoding" it as an (invalid) multibyte UTF-8 sequence, found by jsg@ with afl. As a bonus, the new algorithm also reduces the code in the function by about 20%. --- diff --git a/regress/usr.bin/mandoc/char/unicode/Makefile b/regress/usr.bin/mandoc/char/unicode/Makefile index 5fd17875c22..b75ba6dbf07 100644 --- a/regress/usr.bin/mandoc/char/unicode/Makefile +++ b/regress/usr.bin/mandoc/char/unicode/Makefile @@ -1,13 +1,13 @@ -# $OpenBSD: Makefile,v 1.3 2014/10/29 03:39:26 schwarze Exp $ +# $OpenBSD: Makefile,v 1.4 2014/12/19 04:57:11 schwarze Exp $ -REGRESS_TARGETS = ascii invalid latin1 latin1diff +REGRESS_TARGETS = ascii input invalid latin1 latin1diff REGRESS_TARGETS += man mdoc named namediff nogroff SKIP_ASCII = man mdoc UTF8_TARGETS = ${REGRESS_TARGETS} HTML_TARGETS = ascii invalid latin1 latin1diff named namediff nogroff -LINT_TARGETS = invalid +LINT_TARGETS = input invalid -SKIP_GROFF = nogroff +SKIP_GROFF = input nogroff SKIP_GROFF_ASCII = latin1diff namediff .include diff --git a/regress/usr.bin/mandoc/char/unicode/input.in b/regress/usr.bin/mandoc/char/unicode/input.in new file mode 100644 index 00000000000..15a59f5bfd6 Binary files /dev/null and b/regress/usr.bin/mandoc/char/unicode/input.in differ diff --git a/regress/usr.bin/mandoc/char/unicode/input.out_ascii b/regress/usr.bin/mandoc/char/unicode/input.out_ascii new file mode 100644 index 00000000000..a9946d1b528 --- /dev/null +++ b/regress/usr.bin/mandoc/char/unicode/input.out_ascii @@ -0,0 +1,67 @@ +CHAR-UNICODE-INPUT(1) General Commands Manual CHAR-UNICODE-INPUT(1) + + + +NNAAMMEE + char-unicode-input - Unicode characters in the input file + +DDEESSCCRRIIPPTTIIOONN + lowest valid: <80> + + OOnnee--bbyyttee rraannggee + + U+0000 0x00 ? lowest ASCII + U+001f 0x1f ? highest ASCII control character + U+007f 0x7f ? highest ASCII + 0x80 ? leading lowest continuation + 0xbf ? leading highest continuation + + TTwwoo--bbyyttee rraannggee + + U+0000 0xc080 ?? lowest obfuscated ASCII + U+007f 0xc1bf ?? highest obfuscated ASCII + 0xc278 ?x ASCII continuation + U+0080 0xc280 <80><80> lowest two-byte + 0xc2c380 ?`A high continuation + U+07FF 0xdfbf highest two-byte + + TThhrreeee--bbyyttee rraannggee + + U+0000 0xe08080 ??? lowest obfuscated ASCII + U+007f 0xe081bf ??? highest obfuscated ASCII + U+0080 0xe08280 ??? lowest obfuscated two-byte + U+07FF 0xe09fbf ??? highest obfuscated two-byte + U+0800 0xe0a080 lowest three-byte + U+0FFF 0xe0bfbf end of first middle byte + U+1000 0xe18080 begin of second middle byte + U+CFFF 0xecbfbf end of last normal middle byte + U+D000 0xed8080 begin of strange middle byte + U+D7FF 0xed9fbf highest public three-byte + U+D800 0xeda080 ??? lowest surrogate + U+DFFF 0xedbfbf ??? highest surrogate + U+E000 0xee8080 lowest private use + U+FFFF 0xefbfbf highest three-byte + + FFoouurr--bbyyttee rraannggee + + U+0000 0xf0808080 ???? lowest obfuscated ASCII + U+007f 0xf08081bf ???? highest obfuscated ASCII + U+0080 0xf0808280 ???? lowest obfuscated two-byte + U+07FF 0xf0809fbf ???? highest obfuscated two-byte + U+0800 0xf080a080 ???? lowest obfuscated three-byte + U+FFFF 0xf08fbfbf ???? highest obfuscated three-byte + U+10000 0xf0908080 lowest four-byte + U+3FFFF 0xf0bfbfbf end of first middle byte + U+40000 0xf1808080 second middle byte + U+FFFFF 0xf3bfbfbf last normal middle byte + U+100000 0xf4808080 strange middle byte + U+10FFFF 0xf48fbfbf last valid four-byte + U+110000 0xf4908080 ???? lowest beyond Unicode + U+13FFFF 0xf4bfbfbf ???? end of strange middle byte + U+140000 0xf5808080 ???? lowest invalid middle byte + U+1FFFFF 0xf7bfbfbf ???? highest four-byte + U+200000 0xf888808080 ????? lowest five-byte + + + +OpenBSD December 19, 2014 CHAR-UNICODE-INPUT(1) diff --git a/regress/usr.bin/mandoc/char/unicode/input.out_lint b/regress/usr.bin/mandoc/char/unicode/input.out_lint new file mode 100644 index 00000000000..77b6161cbab --- /dev/null +++ b/regress/usr.bin/mandoc/char/unicode/input.out_lint @@ -0,0 +1,79 @@ +mandoc: input.in:10:21: ERROR: skipping bad character: 0x0 +mandoc: input.in:11:21: ERROR: skipping bad character: 0x1f +mandoc: input.in:12:21: ERROR: skipping bad character: 0x7f +mandoc: input.in:13:7: ERROR: skipping bad character: 0x80 +mandoc: input.in:14:7: ERROR: skipping bad character: 0xbf +mandoc: input.in:20:15: ERROR: skipping bad character: 0xc0 +mandoc: input.in:20:16: ERROR: skipping bad character: 0x80 +mandoc: input.in:21:15: ERROR: skipping bad character: 0xc1 +mandoc: input.in:21:16: ERROR: skipping bad character: 0xbf +mandoc: input.in:22:9: ERROR: skipping bad character: 0xc2 +mandoc: input.in:24:11: ERROR: skipping bad character: 0xc2 +mandoc: input.in:31:17: ERROR: skipping bad character: 0xc0 +mandoc: input.in:31:18: ERROR: skipping bad character: 0x80 +mandoc: input.in:31:19: ERROR: skipping bad character: 0x80 +mandoc: input.in:32:17: ERROR: skipping bad character: 0xe0 +mandoc: input.in:32:18: ERROR: skipping bad character: 0x81 +mandoc: input.in:32:19: ERROR: skipping bad character: 0xbf +mandoc: input.in:33:17: ERROR: skipping bad character: 0xe0 +mandoc: input.in:33:18: ERROR: skipping bad character: 0x82 +mandoc: input.in:33:19: ERROR: skipping bad character: 0x80 +mandoc: input.in:34:17: ERROR: skipping bad character: 0xe0 +mandoc: input.in:34:18: ERROR: skipping bad character: 0x9f +mandoc: input.in:34:19: ERROR: skipping bad character: 0xbf +mandoc: input.in:41:25: ERROR: skipping bad character: 0xed +mandoc: input.in:41:26: ERROR: skipping bad character: 0xa0 +mandoc: input.in:41:27: ERROR: skipping bad character: 0x80 +mandoc: input.in:42:25: ERROR: skipping bad character: 0xed +mandoc: input.in:42:26: ERROR: skipping bad character: 0xbf +mandoc: input.in:42:27: ERROR: skipping bad character: 0xbf +mandoc: input.in:50:19: ERROR: skipping bad character: 0xf0 +mandoc: input.in:50:20: ERROR: skipping bad character: 0x80 +mandoc: input.in:50:21: ERROR: skipping bad character: 0x80 +mandoc: input.in:50:22: ERROR: skipping bad character: 0x80 +mandoc: input.in:51:19: ERROR: skipping bad character: 0xf0 +mandoc: input.in:51:20: ERROR: skipping bad character: 0x80 +mandoc: input.in:51:21: ERROR: skipping bad character: 0x81 +mandoc: input.in:51:22: ERROR: skipping bad character: 0xbf +mandoc: input.in:52:19: ERROR: skipping bad character: 0xf0 +mandoc: input.in:52:20: ERROR: skipping bad character: 0x80 +mandoc: input.in:52:21: ERROR: skipping bad character: 0x82 +mandoc: input.in:52:22: ERROR: skipping bad character: 0x80 +mandoc: input.in:53:19: ERROR: skipping bad character: 0xf0 +mandoc: input.in:53:20: ERROR: skipping bad character: 0x80 +mandoc: input.in:53:21: ERROR: skipping bad character: 0x9f +mandoc: input.in:53:22: ERROR: skipping bad character: 0xbf +mandoc: input.in:54:19: ERROR: skipping bad character: 0xf0 +mandoc: input.in:54:20: ERROR: skipping bad character: 0x80 +mandoc: input.in:54:21: ERROR: skipping bad character: 0xa0 +mandoc: input.in:54:22: ERROR: skipping bad character: 0x80 +mandoc: input.in:55:19: ERROR: skipping bad character: 0xf0 +mandoc: input.in:55:20: ERROR: skipping bad character: 0x8f +mandoc: input.in:55:21: ERROR: skipping bad character: 0xbf +mandoc: input.in:55:22: ERROR: skipping bad character: 0xbf +mandoc: input.in:62:31: ERROR: skipping bad character: 0xf4 +mandoc: input.in:62:32: ERROR: skipping bad character: 0x90 +mandoc: input.in:62:33: ERROR: skipping bad character: 0x80 +mandoc: input.in:62:34: ERROR: skipping bad character: 0x80 +mandoc: input.in:62:21: WARNING: invalid escape sequence: \[u110000] +mandoc: input.in:63:31: ERROR: skipping bad character: 0xf4 +mandoc: input.in:63:32: ERROR: skipping bad character: 0xbf +mandoc: input.in:63:33: ERROR: skipping bad character: 0xbf +mandoc: input.in:63:34: ERROR: skipping bad character: 0xbf +mandoc: input.in:63:21: WARNING: invalid escape sequence: \[u13FFFF] +mandoc: input.in:64:31: ERROR: skipping bad character: 0xf5 +mandoc: input.in:64:32: ERROR: skipping bad character: 0x80 +mandoc: input.in:64:33: ERROR: skipping bad character: 0x80 +mandoc: input.in:64:34: ERROR: skipping bad character: 0x80 +mandoc: input.in:64:21: WARNING: invalid escape sequence: \[u140000] +mandoc: input.in:65:31: ERROR: skipping bad character: 0xf7 +mandoc: input.in:65:32: ERROR: skipping bad character: 0xbf +mandoc: input.in:65:33: ERROR: skipping bad character: 0xbf +mandoc: input.in:65:34: ERROR: skipping bad character: 0xbf +mandoc: input.in:65:21: WARNING: invalid escape sequence: \[u1FFFFF] +mandoc: input.in:66:33: ERROR: skipping bad character: 0xf8 +mandoc: input.in:66:34: ERROR: skipping bad character: 0x88 +mandoc: input.in:66:35: ERROR: skipping bad character: 0x80 +mandoc: input.in:66:36: ERROR: skipping bad character: 0x80 +mandoc: input.in:66:37: ERROR: skipping bad character: 0x80 +mandoc: input.in:66:23: WARNING: invalid escape sequence: \[u200000] diff --git a/regress/usr.bin/mandoc/char/unicode/input.out_utf8 b/regress/usr.bin/mandoc/char/unicode/input.out_utf8 new file mode 100644 index 00000000000..fc7a59e1092 --- /dev/null +++ b/regress/usr.bin/mandoc/char/unicode/input.out_utf8 @@ -0,0 +1,67 @@ +CHAR-UNICODE-INPUT(1) General Commands Manual CHAR-UNICODE-INPUT(1) + + + +NNAAMMEE + char-unicode-input - Unicode characters in the input file + +DDEESSCCRRIIPPTTIIOONN + lowest valid: � + + OOnnee--bbyyttee rraannggee + + U+0000 0x00 �? lowest ASCII + U+001f 0x1f �? highest ASCII control character + U+007f 0x7f �? highest ASCII + 0x80 ? leading lowest continuation + 0xbf ? leading highest continuation + + TTwwoo--bbyyttee rraannggee + + U+0000 0xc080 ?? lowest obfuscated ASCII + U+007f 0xc1bf ?? highest obfuscated ASCII + 0xc278 ?x ASCII continuation + U+0080 0xc280 �� lowest two-byte + 0xc2c380 ?À high continuation + U+07FF 0xdfbf ß¿ß¿ highest two-byte + + TThhrreeee--bbyyttee rraannggee + + U+0000 0xe08080 ??? lowest obfuscated ASCII + U+007f 0xe081bf ??? highest obfuscated ASCII + U+0080 0xe08280 ??? lowest obfuscated two-byte + U+07FF 0xe09fbf ??? highest obfuscated two-byte + U+0800 0xe0a080 ࠀࠀ lowest three-byte + U+0FFF 0xe0bfbf à¿¿à¿¿ end of first middle byte + U+1000 0xe18080 ကက begin of second middle byte + U+CFFF 0xecbfbf ì¿¿ì¿¿ end of last normal middle byte + U+D000 0xed8080 퀀퀀 begin of strange middle byte + U+D7FF 0xed9fbf ퟿퟿ highest public three-byte + U+D800 0xeda080 í €??? lowest surrogate + U+DFFF 0xedbfbf í¿¿??? highest surrogate + U+E000 0xee8080  lowest private use + U+FFFF 0xefbfbf ï¿¿ï¿¿ highest three-byte + + FFoouurr--bbyyttee rraannggee + + U+0000 0xf0808080 ???? lowest obfuscated ASCII + U+007f 0xf08081bf ???? highest obfuscated ASCII + U+0080 0xf0808280 ???? lowest obfuscated two-byte + U+07FF 0xf0809fbf ???? highest obfuscated two-byte + U+0800 0xf080a080 ???? lowest obfuscated three-byte + U+FFFF 0xf08fbfbf ???? highest obfuscated three-byte + U+10000 0xf0908080 𐀀𐀀 lowest four-byte + U+3FFFF 0xf0bfbfbf ð¿¿¿ð¿¿¿ end of first middle byte + U+40000 0xf1808080 ñ€€€ñ€€€ second middle byte + U+FFFFF 0xf3bfbfbf ó¿¿¿ó¿¿¿ last normal middle byte + U+100000 0xf4808080 ô€€€ô€€€ strange middle byte + U+10FFFF 0xf48fbfbf ô¿¿ô¿¿ last valid four-byte + U+110000 0xf4908080 ???? lowest beyond Unicode + U+13FFFF 0xf4bfbfbf ???? end of strange middle byte + U+140000 0xf5808080 ???? lowest invalid middle byte + U+1FFFFF 0xf7bfbfbf ???? highest four-byte + U+200000 0xf888808080 ????? lowest five-byte + + + +OpenBSD December 19, 2014 CHAR-UNICODE-INPUT(1) diff --git a/usr.bin/mandoc/preconv.c b/usr.bin/mandoc/preconv.c index 8e4a1739f76..3d5a30655f6 100644 --- a/usr.bin/mandoc/preconv.c +++ b/usr.bin/mandoc/preconv.c @@ -1,4 +1,4 @@ -/* $OpenBSD: preconv.c,v 1.4 2014/11/28 19:25:03 schwarze Exp $ */ +/* $OpenBSD: preconv.c,v 1.5 2014/12/19 04:57:11 schwarze Exp $ */ /* * Copyright (c) 2011 Kristaps Dzonsons * Copyright (c) 2014 Ingo Schwarze @@ -17,6 +17,7 @@ */ #include +#include #include #include #include "mandoc.h" @@ -26,88 +27,70 @@ int preconv_encode(struct buf *ib, size_t *ii, struct buf *ob, size_t *oi, int *filenc) { - size_t i; - int state; + unsigned char *cu; + int nby; unsigned int accum; - unsigned char cu; + + cu = ib->buf + *ii; + assert(*cu & 0x80); if ( ! (*filenc & MPARSE_UTF8)) goto latin; - state = 0; - accum = 0U; - - for (i = *ii; i < ib->sz; i++) { - cu = ib->buf[i]; - if (state) { - if ( ! (cu & 128) || (cu & 64)) { - /* Bad sequence header. */ - break; - } - - /* Accept only legitimate bit patterns. */ - - if (cu > 191 || cu < 128) { - /* Bad in-sequence bits. */ - break; - } - - accum |= (cu & 63) << --state * 6; - - if (state) - continue; - - if (accum < 0x80) - ob->buf[(*oi)++] = accum; - else - *oi += snprintf(ob->buf + *oi, - 11, "\\[u%.4X]", accum); - *ii = i + 1; - *filenc &= ~MPARSE_LATIN1; - return(1); - } else { - /* - * Entering a UTF-8 state: if we encounter a - * UTF-8 bitmask, calculate the expected UTF-8 - * state from it. - */ - for (state = 0; state < 7; state++) - if ( ! (cu & (1 << (7 - state)))) - break; - - /* Accept only legitimate bit patterns. */ - - switch (state--) { - case (4): - if (cu <= 244 && cu >= 240) { - accum = (cu & 7) << 18; - continue; - } - /* Bad 4-sequence start bits. */ - break; - case (3): - if (cu <= 239 && cu >= 224) { - accum = (cu & 15) << 12; - continue; - } - /* Bad 3-sequence start bits. */ - break; - case (2): - if (cu <= 223 && cu >= 194) { - accum = (cu & 31) << 6; - continue; - } - /* Bad 2-sequence start bits. */ - break; - default: - /* Bad sequence bit mask. */ - break; - } - break; - } + nby = 1; + while (nby < 5 && *cu & (1 << (7 - nby))) + nby++; + + switch (nby) { + case 2: + accum = *cu & 0x1f; + if (accum < 0x02) /* Obfuscated ASCII. */ + goto latin; + break; + case 3: + accum = *cu & 0x0f; + break; + case 4: + accum = *cu & 0x07; + if (accum > 0x04) /* Beyond Unicode. */ + goto latin; + break; + default: /* Bad sequence header. */ + goto latin; + } + + cu++; + switch (nby) { + case 3: + if ((accum == 0x00 && ! (*cu & 0x20)) || /* Use 2-byte. */ + (accum == 0x0d && *cu & 0x20)) /* Surrogates. */ + goto latin; + break; + case 4: + if ((accum == 0x00 && ! (*cu & 0x30)) || /* Use 3-byte. */ + (accum == 0x04 && *cu & 0x30)) /* Beyond Unicode. */ + goto latin; + break; + default: + break; + } + + while (--nby) { + if ((*cu & 0xc0) != 0x80) /* Invalid continuation. */ + goto latin; + accum <<= 6; + accum += *cu & 0x3f; + cu++; } - /* FALLTHROUGH: Invalid or incomplete UTF-8 sequence. */ + assert(accum > 0x7f); + assert(accum < 0x110000); + assert(accum < 0xd800 || accum > 0xdfff); + + *oi += snprintf(ob->buf + *oi, 11, "\\[u%.4X]", accum); + *ii = (char *)cu - ib->buf; + *filenc &= ~MPARSE_LATIN1; + return(1); latin: if ( ! (*filenc & MPARSE_LATIN1))