Rewrite the low-level UTF-8 parser from scratch.

author schwarze <schwarze@openbsd.org>

Fri, 19 Dec 2014 04:57:11 +0000 (04:57 +0000)

committer schwarze <schwarze@openbsd.org>

Fri, 19 Dec 2014 04:57:11 +0000 (04:57 +0000)
author schwarze <schwarze@openbsd.org>
Fri, 19 Dec 2014 04:57:11 +0000 (04:57 +0000)
committer schwarze <schwarze@openbsd.org>
Fri, 19 Dec 2014 04:57:11 +0000 (04:57 +0000)
diff --git a/regress/usr.bin/mandoc/char/unicode/Makefile b/regress/usr.bin/mandoc/char/unicode/Makefile

index 5fd1787..b75ba6d 100644 (file)
--- a/regress/usr.bin/mandoc/char/unicode/Makefile
+++ b/regress/usr.bin/mandoc/char/unicode/Makefile
@@ -1,13 +1,13 @@
-# $OpenBSD: Makefile,v 1.3 2014/10/29 03:39:26 schwarze Exp $
+# $OpenBSD: Makefile,v 1.4 2014/12/19 04:57:11 schwarze Exp $
  
-REGRESS_TARGETS  = ascii invalid latin1 latin1diff
+REGRESS_TARGETS  = ascii input invalid latin1 latin1diff
  REGRESS_TARGETS += man mdoc named namediff nogroff
  SKIP_ASCII      = man mdoc
  UTF8_TARGETS    = ${REGRESS_TARGETS}
  HTML_TARGETS    = ascii invalid latin1 latin1diff named namediff nogroff
-LINT_TARGETS    = invalid
+LINT_TARGETS    = input invalid
  
-SKIP_GROFF      = nogroff
+SKIP_GROFF      = input nogroff
  SKIP_GROFF_ASCII = latin1diff namediff
  
  .include <bsd.regress.mk>
diff --git a/regress/usr.bin/mandoc/char/unicode/input.in b/regress/usr.bin/mandoc/char/unicode/input.in

new file mode 100644 (file)

index 0000000..15a59f5

Binary files /dev/null and b/regress/usr.bin/mandoc/char/unicode/input.in differ
diff --git a/regress/usr.bin/mandoc/char/unicode/input.out_ascii b/regress/usr.bin/mandoc/char/unicode/input.out_ascii

new file mode 100644 (file)

index 0000000..a9946d1
--- /dev/null
+++ b/regress/usr.bin/mandoc/char/unicode/input.out_ascii
@@ -0,0 +1,67 @@
+CHAR-UNICODE-INPUT(1)       General Commands Manual      CHAR-UNICODE-INPUT(1)
+
+
+
+N\bNA\bAM\bME\bE
+       char-unicode-input - Unicode characters in the input file
+
+D\bDE\bES\bSC\bCR\bRI\bIP\bPT\bTI\bIO\bON\bN
+       lowest valid: <80>
+
+   O\bOn\bne\be-\b-b\bby\byt\bte\be r\bra\ban\bng\bge\be
+
+       U+0000   0x00   <NUL>?   lowest ASCII
+       U+001f   0x1f   <US>?    highest ASCII control character
+       U+007f   0x7f   <DEL>?   highest ASCII
+                0x80   ?        leading lowest continuation
+                0xbf   ?        leading highest continuation
+
+   T\bTw\bwo\bo-\b-b\bby\byt\bte\be r\bra\ban\bng\bge\be
+
+       U+0000   0xc080     ??         lowest obfuscated ASCII
+       U+007f   0xc1bf     ??         highest obfuscated ASCII
+                0xc278     ?x         ASCII continuation
+       U+0080   0xc280     <80><80>   lowest two-byte
+                0xc2c380   ?`\bA       high continuation
+       U+07FF   0xdfbf     <?><?>     highest two-byte
+
+   T\bTh\bhr\bre\bee\be-\b-b\bby\byt\bte\be r\bra\ban\bng\bge\be
+
+       U+0000   0xe08080   ???      lowest obfuscated ASCII
+       U+007f   0xe081bf   ???      highest obfuscated ASCII
+       U+0080   0xe08280   ???      lowest obfuscated two-byte
+       U+07FF   0xe09fbf   ???      highest obfuscated two-byte
+       U+0800   0xe0a080   <?><?>   lowest three-byte
+       U+0FFF   0xe0bfbf   <?><?>   end of first middle byte
+       U+1000   0xe18080   <?><?>   begin of second middle byte
+       U+CFFF   0xecbfbf   <?><?>   end of last normal middle byte
+       U+D000   0xed8080   <?><?>   begin of strange middle byte
+       U+D7FF   0xed9fbf   <?><?>   highest public three-byte
+       U+D800   0xeda080   <?>???   lowest surrogate
+       U+DFFF   0xedbfbf   <?>???   highest surrogate
+       U+E000   0xee8080   <?><?>   lowest private use
+       U+FFFF   0xefbfbf   <?><?>   highest three-byte
+
+   F\bFo\bou\bur\br-\b-b\bby\byt\bte\be r\bra\ban\bng\bge\be
+
+       U+0000     0xf0808080     ????     lowest obfuscated ASCII
+       U+007f     0xf08081bf     ????     highest obfuscated ASCII
+       U+0080     0xf0808280     ????     lowest obfuscated two-byte
+       U+07FF     0xf0809fbf     ????     highest obfuscated two-byte
+       U+0800     0xf080a080     ????     lowest obfuscated three-byte
+       U+FFFF     0xf08fbfbf     ????     highest obfuscated three-byte
+       U+10000    0xf0908080     <?><?>   lowest four-byte
+       U+3FFFF    0xf0bfbfbf     <?><?>   end of first middle byte
+       U+40000    0xf1808080     <?><?>   second middle byte
+       U+FFFFF    0xf3bfbfbf     <?><?>   last normal middle byte
+       U+100000   0xf4808080     <?><?>   strange middle byte
+       U+10FFFF   0xf48fbfbf     <?><?>   last valid four-byte
+       U+110000   0xf4908080     ????     lowest beyond Unicode
+       U+13FFFF   0xf4bfbfbf     ????     end of strange middle byte
+       U+140000   0xf5808080     ????     lowest invalid middle byte
+       U+1FFFFF   0xf7bfbfbf     ????     highest four-byte
+       U+200000   0xf888808080   ?????    lowest five-byte
+
+
+
+OpenBSD                        December 19, 2014         CHAR-UNICODE-INPUT(1)
diff --git a/regress/usr.bin/mandoc/char/unicode/input.out_lint b/regress/usr.bin/mandoc/char/unicode/input.out_lint

new file mode 100644 (file)

index 0000000..77b6161
--- /dev/null
+++ b/regress/usr.bin/mandoc/char/unicode/input.out_lint
@@ -0,0 +1,79 @@
+mandoc: input.in:10:21: ERROR: skipping bad character: 0x0
+mandoc: input.in:11:21: ERROR: skipping bad character: 0x1f
+mandoc: input.in:12:21: ERROR: skipping bad character: 0x7f
+mandoc: input.in:13:7: ERROR: skipping bad character: 0x80
+mandoc: input.in:14:7: ERROR: skipping bad character: 0xbf
+mandoc: input.in:20:15: ERROR: skipping bad character: 0xc0
+mandoc: input.in:20:16: ERROR: skipping bad character: 0x80
+mandoc: input.in:21:15: ERROR: skipping bad character: 0xc1
+mandoc: input.in:21:16: ERROR: skipping bad character: 0xbf
+mandoc: input.in:22:9: ERROR: skipping bad character: 0xc2
+mandoc: input.in:24:11: ERROR: skipping bad character: 0xc2
+mandoc: input.in:31:17: ERROR: skipping bad character: 0xe0
+mandoc: input.in:31:18: ERROR: skipping bad character: 0x80
+mandoc: input.in:31:19: ERROR: skipping bad character: 0x80
+mandoc: input.in:32:17: ERROR: skipping bad character: 0xe0
+mandoc: input.in:32:18: ERROR: skipping bad character: 0x81
+mandoc: input.in:32:19: ERROR: skipping bad character: 0xbf
+mandoc: input.in:33:17: ERROR: skipping bad character: 0xe0
+mandoc: input.in:33:18: ERROR: skipping bad character: 0x82
+mandoc: input.in:33:19: ERROR: skipping bad character: 0x80
+mandoc: input.in:34:17: ERROR: skipping bad character: 0xe0
+mandoc: input.in:34:18: ERROR: skipping bad character: 0x9f
+mandoc: input.in:34:19: ERROR: skipping bad character: 0xbf
+mandoc: input.in:41:25: ERROR: skipping bad character: 0xed
+mandoc: input.in:41:26: ERROR: skipping bad character: 0xa0
+mandoc: input.in:41:27: ERROR: skipping bad character: 0x80
+mandoc: input.in:42:25: ERROR: skipping bad character: 0xed
+mandoc: input.in:42:26: ERROR: skipping bad character: 0xbf
+mandoc: input.in:42:27: ERROR: skipping bad character: 0xbf
+mandoc: input.in:50:19: ERROR: skipping bad character: 0xf0
+mandoc: input.in:50:20: ERROR: skipping bad character: 0x80
+mandoc: input.in:50:21: ERROR: skipping bad character: 0x80
+mandoc: input.in:50:22: ERROR: skipping bad character: 0x80
+mandoc: input.in:51:19: ERROR: skipping bad character: 0xf0
+mandoc: input.in:51:20: ERROR: skipping bad character: 0x80
+mandoc: input.in:51:21: ERROR: skipping bad character: 0x81
+mandoc: input.in:51:22: ERROR: skipping bad character: 0xbf
+mandoc: input.in:52:19: ERROR: skipping bad character: 0xf0
+mandoc: input.in:52:20: ERROR: skipping bad character: 0x80
+mandoc: input.in:52:21: ERROR: skipping bad character: 0x82
+mandoc: input.in:52:22: ERROR: skipping bad character: 0x80
+mandoc: input.in:53:19: ERROR: skipping bad character: 0xf0
+mandoc: input.in:53:20: ERROR: skipping bad character: 0x80
+mandoc: input.in:53:21: ERROR: skipping bad character: 0x9f
+mandoc: input.in:53:22: ERROR: skipping bad character: 0xbf
+mandoc: input.in:54:19: ERROR: skipping bad character: 0xf0
+mandoc: input.in:54:20: ERROR: skipping bad character: 0x80
+mandoc: input.in:54:21: ERROR: skipping bad character: 0xa0
+mandoc: input.in:54:22: ERROR: skipping bad character: 0x80
+mandoc: input.in:55:19: ERROR: skipping bad character: 0xf0
+mandoc: input.in:55:20: ERROR: skipping bad character: 0x8f
+mandoc: input.in:55:21: ERROR: skipping bad character: 0xbf
+mandoc: input.in:55:22: ERROR: skipping bad character: 0xbf
+mandoc: input.in:62:31: ERROR: skipping bad character: 0xf4
+mandoc: input.in:62:32: ERROR: skipping bad character: 0x90
+mandoc: input.in:62:33: ERROR: skipping bad character: 0x80
+mandoc: input.in:62:34: ERROR: skipping bad character: 0x80
+mandoc: input.in:62:21: WARNING: invalid escape sequence: \[u110000]
+mandoc: input.in:63:31: ERROR: skipping bad character: 0xf4
+mandoc: input.in:63:32: ERROR: skipping bad character: 0xbf
+mandoc: input.in:63:33: ERROR: skipping bad character: 0xbf
+mandoc: input.in:63:34: ERROR: skipping bad character: 0xbf
+mandoc: input.in:63:21: WARNING: invalid escape sequence: \[u13FFFF]
+mandoc: input.in:64:31: ERROR: skipping bad character: 0xf5
+mandoc: input.in:64:32: ERROR: skipping bad character: 0x80
+mandoc: input.in:64:33: ERROR: skipping bad character: 0x80
+mandoc: input.in:64:34: ERROR: skipping bad character: 0x80
+mandoc: input.in:64:21: WARNING: invalid escape sequence: \[u140000]
+mandoc: input.in:65:31: ERROR: skipping bad character: 0xf7
+mandoc: input.in:65:32: ERROR: skipping bad character: 0xbf
+mandoc: input.in:65:33: ERROR: skipping bad character: 0xbf
+mandoc: input.in:65:34: ERROR: skipping bad character: 0xbf
+mandoc: input.in:65:21: WARNING: invalid escape sequence: \[u1FFFFF]
+mandoc: input.in:66:33: ERROR: skipping bad character: 0xf8
+mandoc: input.in:66:34: ERROR: skipping bad character: 0x88
+mandoc: input.in:66:35: ERROR: skipping bad character: 0x80
+mandoc: input.in:66:36: ERROR: skipping bad character: 0x80
+mandoc: input.in:66:37: ERROR: skipping bad character: 0x80
+mandoc: input.in:66:23: WARNING: invalid escape sequence: \[u200000]
diff --git a/regress/usr.bin/mandoc/char/unicode/input.out_utf8 b/regress/usr.bin/mandoc/char/unicode/input.out_utf8

new file mode 100644 (file)

index 0000000..fc7a59e
--- /dev/null
+++ b/regress/usr.bin/mandoc/char/unicode/input.out_utf8
@@ -0,0 +1,67 @@
+CHAR-UNICODE-INPUT(1)       General Commands Manual      CHAR-UNICODE-INPUT(1)
+
+
+
+N\bNA\bAM\bME\bE
+       char-unicode-input - Unicode characters in the input file
+
+D\bDE\bES\bSC\bCR\bRI\bIP\bPT\bTI\bIO\bON\bN
+       lowest valid: �
+
+   O\bOn\bne\be-\b-b\bby\byt\bte\be r\bra\ban\bng\bge\be
+
+       U+0000   0x00   �?   lowest ASCII
+       U+001f   0x1f   �?   highest ASCII control character
+       U+007f   0x7f   �?   highest ASCII
+                0x80   ?    leading lowest continuation
+                0xbf   ?    leading highest continuation
+
+   T\bTw\bwo\bo-\b-b\bby\byt\bte\be r\bra\ban\bng\bge\be
+
+       U+0000   0xc080     ??   lowest obfuscated ASCII
+       U+007f   0xc1bf     ??   highest obfuscated ASCII
+                0xc278     ?x   ASCII continuation
+       U+0080   0xc280     ��   lowest two-byte
+                0xc2c380   ?À   high continuation
+       U+07FF   0xdfbf     ߿߿     highest two-byte
+
+   T\bTh\bhr\bre\bee\be-\b-b\bby\byt\bte\be r\bra\ban\bng\bge\be
+
+       U+0000   0xe08080   ???    lowest obfuscated ASCII
+       U+007f   0xe081bf   ???    highest obfuscated ASCII
+       U+0080   0xe08280   ???    lowest obfuscated two-byte
+       U+07FF   0xe09fbf   ???    highest obfuscated two-byte
+       U+0800   0xe0a080   ࠀࠀ       lowest three-byte
+       U+0FFF   0xe0bfbf   ࿿࿿       end of first middle byte
+       U+1000   0xe18080   ကက     begin of second middle byte
+       U+CFFF   0xecbfbf   쿿쿿   end of last normal middle byte
+       U+D000   0xed8080   퀀퀀   begin of strange middle byte
+       U+D7FF   0xed9fbf   ퟿퟿       highest public three-byte
+       U+D800   0xeda080   ���???   lowest surrogate
+       U+DFFF   0xedbfbf   ���???   highest surrogate
+       U+E000   0xee8080        lowest private use
+       U+FFFF   0xefbfbf          highest three-byte
+
+   F\bFo\bou\bur\br-\b-b\bby\byt\bte\be r\bra\ban\bng\bge\be
+
+       U+0000     0xf0808080     ????    lowest obfuscated ASCII
+       U+007f     0xf08081bf     ????    highest obfuscated ASCII
+       U+0080     0xf0808280     ????    lowest obfuscated two-byte
+       U+07FF     0xf0809fbf     ????    highest obfuscated two-byte
+       U+0800     0xf080a080     ????    lowest obfuscated three-byte
+       U+FFFF     0xf08fbfbf     ????    highest obfuscated three-byte
+       U+10000    0xf0908080     𐀀𐀀        lowest four-byte
+       U+3FFFF    0xf0bfbfbf     𿿿𿿿        end of first middle byte
+       U+40000    0xf1808080     񀀀񀀀        second middle byte
+       U+FFFFF    0xf3bfbfbf     󿿿󿿿        last normal middle byte
+       U+100000   0xf4808080     􀀀􀀀      strange middle byte
+       U+10FFFF   0xf48fbfbf     􏿿􏿿        last valid four-byte
+       U+110000   0xf4908080     ????    lowest beyond Unicode
+       U+13FFFF   0xf4bfbfbf     ????    end of strange middle byte
+       U+140000   0xf5808080     ????    lowest invalid middle byte
+       U+1FFFFF   0xf7bfbfbf     ????    highest four-byte
+       U+200000   0xf888808080   ?????   lowest five-byte
+
+
+
+OpenBSD                        December 19, 2014         CHAR-UNICODE-INPUT(1)
diff --git a/usr.bin/mandoc/preconv.c b/usr.bin/mandoc/preconv.c

index 8e4a173..3d5a306 100644 (file)
--- a/usr.bin/mandoc/preconv.c
+++ b/usr.bin/mandoc/preconv.c
@@ -1,4 +1,4 @@
-/*     $OpenBSD: preconv.c,v 1.4 2014/11/28 19:25:03 schwarze Exp $ */
+/*     $OpenBSD: preconv.c,v 1.5 2014/12/19 04:57:11 schwarze Exp $ */
  /*
   * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
   * Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org>
@@ -17,6 +17,7 @@
   */
  #include <sys/types.h>
  
+#include <assert.h>
  #include <stdio.h>
  #include <string.h>
  #include "mandoc.h"
@@ -26,88 +27,70 @@ int
  preconv_encode(struct buf *ib, size_t *ii, struct buf *ob, size_t *oi,
      int *filenc)
  {
-       size_t           i;
-       int              state;
+       unsigned char   *cu;
+       int              nby;
         unsigned int     accum;
-       unsigned char    cu;
+
+       cu = ib->buf + *ii;
+       assert(*cu & 0x80);
  
         if ( ! (*filenc & MPARSE_UTF8))
                 goto latin;
  
-       state = 0;
-       accum = 0U;
-
-       for (i = *ii; i < ib->sz; i++) {
-               cu = ib->buf[i];
-               if (state) {
-                       if ( ! (cu & 128) || (cu & 64)) {
-                               /* Bad sequence header. */
-                               break;
-                       }
-
-                       /* Accept only legitimate bit patterns. */
-
-                       if (cu > 191 || cu < 128) {
-                               /* Bad in-sequence bits. */
-                               break;
-                       }
-
-                       accum |= (cu & 63) << --state * 6;
-
-                       if (state)
-                               continue;
-
-                       if (accum < 0x80)
-                               ob->buf[(*oi)++] = accum;
-                       else
-                               *oi += snprintf(ob->buf + *oi,
-                                   11, "\\[u%.4X]", accum);
-                       *ii = i + 1;
-                       *filenc &= ~MPARSE_LATIN1;
-                       return(1);
-               } else {
-                       /*
-                        * Entering a UTF-8 state:  if we encounter a
-                        * UTF-8 bitmask, calculate the expected UTF-8
-                        * state from it.
-                        */
-                       for (state = 0; state < 7; state++)
-                               if ( ! (cu & (1 << (7 - state))))
-                                       break;
-
-                       /* Accept only legitimate bit patterns. */
-
-                       switch (state--) {
-                       case (4):
-                               if (cu <= 244 && cu >= 240) {
-                                       accum = (cu & 7) << 18;
-                                       continue;
-                               }
-                               /* Bad 4-sequence start bits. */
-                               break;
-                       case (3):
-                               if (cu <= 239 && cu >= 224) {
-                                       accum = (cu & 15) << 12;
-                                       continue;
-                               }
-                               /* Bad 3-sequence start bits. */
-                               break;
-                       case (2):
-                               if (cu <= 223 && cu >= 194) {
-                                       accum = (cu & 31) << 6;
-                                       continue;
-                               }
-                               /* Bad 2-sequence start bits. */
-                               break;
-                       default:
-                               /* Bad sequence bit mask. */
-                               break;
-                       }
-                       break;
-               }
+       nby = 1;
+       while (nby < 5 && *cu & (1 << (7 - nby)))
+               nby++;
+
+       switch (nby) {
+       case 2:
+               accum = *cu & 0x1f;
+               if (accum < 0x02)  /* Obfuscated ASCII. */
+                       goto latin;
+               break;
+       case 3:
+               accum = *cu & 0x0f;
+               break;
+       case 4:
+               accum = *cu & 0x07;
+               if (accum > 0x04) /* Beyond Unicode. */
+                       goto latin;
+               break;
+       default:  /* Bad sequence header. */
+               goto latin;
+       }
+
+       cu++;
+       switch (nby) {
+       case 3:
+               if ((accum == 0x00 && ! (*cu & 0x20)) ||  /* Use 2-byte. */
+                   (accum == 0x0d && *cu & 0x20))  /* Surrogates. */
+                       goto latin;
+               break;
+       case 4:
+               if ((accum == 0x00 && ! (*cu & 0x30)) ||  /* Use 3-byte. */
+                   (accum == 0x04 && *cu & 0x30))  /* Beyond Unicode. */
+                       goto latin;
+               break;
+       default:
+               break;
+       }
+
+       while (--nby) {
+               if ((*cu & 0xc0) != 0x80)  /* Invalid continuation. */
+                       goto latin;
+               accum <<= 6;
+               accum += *cu & 0x3f;
+               cu++;
         }
  
-       /* FALLTHROUGH: Invalid or incomplete UTF-8 sequence. */
+       assert(accum > 0x7f);
+       assert(accum < 0x110000);
+       assert(accum < 0xd800 || accum > 0xdfff);
+
+       *oi += snprintf(ob->buf + *oi, 11, "\\[u%.4X]", accum);
+       *ii = (char *)cu - ib->buf;
+       *filenc &= ~MPARSE_LATIN1;
+       return(1);
  
  latin:
         if ( ! (*filenc & MPARSE_LATIN1))
author	schwarze <schwarze@openbsd.org>
	Fri, 19 Dec 2014 04:57:11 +0000 (04:57 +0000)
committer	schwarze <schwarze@openbsd.org>
	Fri, 19 Dec 2014 04:57:11 +0000 (04:57 +0000)
regress/usr.bin/mandoc/char/unicode/Makefile		patch \| blob \| history
regress/usr.bin/mandoc/char/unicode/input.in	[new file with mode: 0644]	patch \| blob
regress/usr.bin/mandoc/char/unicode/input.out_ascii	[new file with mode: 0644]	patch \| blob
regress/usr.bin/mandoc/char/unicode/input.out_lint	[new file with mode: 0644]	patch \| blob
regress/usr.bin/mandoc/char/unicode/input.out_utf8	[new file with mode: 0644]	patch \| blob
usr.bin/mandoc/preconv.c		patch \| blob \| history