Improve coverage of edge cases for 3-byte UTF-8 sequences.
author schwarze <schwarze@openbsd.org>
Thu, 16 May 2024 20:32:24 +0000 (20:32 +0000)
committer schwarze <schwarze@openbsd.org>
Thu, 16 May 2024 20:32:24 +0000 (20:32 +0000)
Coverage for 2-byte and 4-byte sequences was already reasonable.

regress/usr.bin/mandoc/char/unicode/input.in
regress/usr.bin/mandoc/char/unicode/input.out_ascii
regress/usr.bin/mandoc/char/unicode/input.out_lint
regress/usr.bin/mandoc/char/unicode/input.out_utf8

index 276d7c5..845af5f 100644 (file)
Binary files a/regress/usr.bin/mandoc/char/unicode/input.in and b/regress/usr.bin/mandoc/char/unicode/input.in differ
index 2078cf5..d85072a 100644 (file)
@@ -31,12 +31,17 @@ D\bDE\bES\bSC\bCR\bRI\bIP\bPT\bTI\bIO\bON\bN
      U+1000   0xe18080   <?><?>   begin of second start byte
      U+CFFF   0xecbfbf   <?><?>   end of last normal start byte
      U+D000   0xed8080   <?><?>   begin of last start byte
+     U+D7FB   0xed9fbb   <?><?>   highest valid public three-byte
      U+D7FF   0xed9fbf   <?><?>   highest public three-byte
      U+D800   0xeda080   ???      lowest surrogate
      U+DFFF   0xedbfbf   ???      highest surrogate
      U+E000   0xee8080   <?><?>   lowest private use
      U+F8FF   0xefa3bf   <?><?>   highest private use
      U+F900   0xefa480   <?><?>   lowest post-private
+     U+FEFF   0xefbbbf   <?><?>   byte-order mark
+     U+FFFC   0xefbfbc   <?><?>   object replacement character
+     U+FFFD   0xefbfbd   <?><?>   replacement character
+     U+FFFE   0xefbfbe   <?><?>   reversed byte-order mark
      U+FFFF   0xefbfbf   <?><?>   highest three-byte
 
    F\bFo\bou\bur\br-\b-b\bby\byt\bte\be r\bra\ban\bng\bge\be
@@ -60,4 +65,4 @@ D\bDE\bES\bSC\bCR\bRI\bIP\bPT\bTI\bIO\bON\bN
      U+1FFFFF   0xf7bfbfbf     ????     highest invalid four-byte
      U+200000   0xf888808080   ?????    lowest five-byte
 
-OpenBSD                          June 2, 2021            CHAR-UNICODE-INPUT(1)
+OpenBSD                          May 16, 2024            CHAR-UNICODE-INPUT(1)
index fa36f87..70f10c2 100644 (file)
@@ -21,61 +21,61 @@ mandoc: input.in:34:19: ERROR: skipping bad character: 0x80
 mandoc: input.in:35:17: ERROR: skipping bad character: 0xe0
 mandoc: input.in:35:18: ERROR: skipping bad character: 0x9f
 mandoc: input.in:35:19: ERROR: skipping bad character: 0xbf
-mandoc: input.in:42:25: ERROR: skipping bad character: 0xed
-mandoc: input.in:42:26: ERROR: skipping bad character: 0xa0
-mandoc: input.in:42:27: ERROR: skipping bad character: 0x80
-mandoc: input.in:42:17: ERROR: invalid special character: \[uD800]
 mandoc: input.in:43:25: ERROR: skipping bad character: 0xed
-mandoc: input.in:43:26: ERROR: skipping bad character: 0xbf
-mandoc: input.in:43:27: ERROR: skipping bad character: 0xbf
-mandoc: input.in:43:17: ERROR: invalid special character: \[uDFFF]
-mandoc: input.in:53:19: ERROR: skipping bad character: 0xf0
-mandoc: input.in:53:20: ERROR: skipping bad character: 0x80
-mandoc: input.in:53:21: ERROR: skipping bad character: 0x80
-mandoc: input.in:53:22: ERROR: skipping bad character: 0x80
-mandoc: input.in:54:19: ERROR: skipping bad character: 0xf0
-mandoc: input.in:54:20: ERROR: skipping bad character: 0x80
-mandoc: input.in:54:21: ERROR: skipping bad character: 0x81
-mandoc: input.in:54:22: ERROR: skipping bad character: 0xbf
-mandoc: input.in:55:19: ERROR: skipping bad character: 0xf0
-mandoc: input.in:55:20: ERROR: skipping bad character: 0x80
-mandoc: input.in:55:21: ERROR: skipping bad character: 0x82
-mandoc: input.in:55:22: ERROR: skipping bad character: 0x80
-mandoc: input.in:56:19: ERROR: skipping bad character: 0xf0
-mandoc: input.in:56:20: ERROR: skipping bad character: 0x80
-mandoc: input.in:56:21: ERROR: skipping bad character: 0x9f
-mandoc: input.in:56:22: ERROR: skipping bad character: 0xbf
-mandoc: input.in:57:19: ERROR: skipping bad character: 0xf0
-mandoc: input.in:57:20: ERROR: skipping bad character: 0x80
-mandoc: input.in:57:21: ERROR: skipping bad character: 0xa0
-mandoc: input.in:57:22: ERROR: skipping bad character: 0x80
+mandoc: input.in:43:26: ERROR: skipping bad character: 0xa0
+mandoc: input.in:43:27: ERROR: skipping bad character: 0x80
+mandoc: input.in:43:17: ERROR: invalid special character: \[uD800]
+mandoc: input.in:44:25: ERROR: skipping bad character: 0xed
+mandoc: input.in:44:26: ERROR: skipping bad character: 0xbf
+mandoc: input.in:44:27: ERROR: skipping bad character: 0xbf
+mandoc: input.in:44:17: ERROR: invalid special character: \[uDFFF]
 mandoc: input.in:58:19: ERROR: skipping bad character: 0xf0
-mandoc: input.in:58:20: ERROR: skipping bad character: 0x8f
-mandoc: input.in:58:21: ERROR: skipping bad character: 0xbf
-mandoc: input.in:58:22: ERROR: skipping bad character: 0xbf
-mandoc: input.in:67:31: ERROR: skipping bad character: 0xf4
-mandoc: input.in:67:32: ERROR: skipping bad character: 0x90
-mandoc: input.in:67:33: ERROR: skipping bad character: 0x80
-mandoc: input.in:67:34: ERROR: skipping bad character: 0x80
-mandoc: input.in:67:21: ERROR: invalid special character: \[u110000]
-mandoc: input.in:68:31: ERROR: skipping bad character: 0xf4
-mandoc: input.in:68:32: ERROR: skipping bad character: 0xbf
-mandoc: input.in:68:33: ERROR: skipping bad character: 0xbf
-mandoc: input.in:68:34: ERROR: skipping bad character: 0xbf
-mandoc: input.in:68:21: ERROR: invalid special character: \[u13FFFF]
-mandoc: input.in:69:31: ERROR: skipping bad character: 0xf5
-mandoc: input.in:69:32: ERROR: skipping bad character: 0x80
-mandoc: input.in:69:33: ERROR: skipping bad character: 0x80
-mandoc: input.in:69:34: ERROR: skipping bad character: 0x80
-mandoc: input.in:69:21: ERROR: invalid special character: \[u140000]
-mandoc: input.in:70:31: ERROR: skipping bad character: 0xf7
-mandoc: input.in:70:32: ERROR: skipping bad character: 0xbf
-mandoc: input.in:70:33: ERROR: skipping bad character: 0xbf
-mandoc: input.in:70:34: ERROR: skipping bad character: 0xbf
-mandoc: input.in:70:21: ERROR: invalid special character: \[u1FFFFF]
-mandoc: input.in:71:33: ERROR: skipping bad character: 0xf8
-mandoc: input.in:71:34: ERROR: skipping bad character: 0x88
-mandoc: input.in:71:35: ERROR: skipping bad character: 0x80
-mandoc: input.in:71:36: ERROR: skipping bad character: 0x80
-mandoc: input.in:71:37: ERROR: skipping bad character: 0x80
-mandoc: input.in:71:23: ERROR: invalid special character: \[u200000]
+mandoc: input.in:58:20: ERROR: skipping bad character: 0x80
+mandoc: input.in:58:21: ERROR: skipping bad character: 0x80
+mandoc: input.in:58:22: ERROR: skipping bad character: 0x80
+mandoc: input.in:59:19: ERROR: skipping bad character: 0xf0
+mandoc: input.in:59:20: ERROR: skipping bad character: 0x80
+mandoc: input.in:59:21: ERROR: skipping bad character: 0x81
+mandoc: input.in:59:22: ERROR: skipping bad character: 0xbf
+mandoc: input.in:60:19: ERROR: skipping bad character: 0xf0
+mandoc: input.in:60:20: ERROR: skipping bad character: 0x80
+mandoc: input.in:60:21: ERROR: skipping bad character: 0x82
+mandoc: input.in:60:22: ERROR: skipping bad character: 0x80
+mandoc: input.in:61:19: ERROR: skipping bad character: 0xf0
+mandoc: input.in:61:20: ERROR: skipping bad character: 0x80
+mandoc: input.in:61:21: ERROR: skipping bad character: 0x9f
+mandoc: input.in:61:22: ERROR: skipping bad character: 0xbf
+mandoc: input.in:62:19: ERROR: skipping bad character: 0xf0
+mandoc: input.in:62:20: ERROR: skipping bad character: 0x80
+mandoc: input.in:62:21: ERROR: skipping bad character: 0xa0
+mandoc: input.in:62:22: ERROR: skipping bad character: 0x80
+mandoc: input.in:63:19: ERROR: skipping bad character: 0xf0
+mandoc: input.in:63:20: ERROR: skipping bad character: 0x8f
+mandoc: input.in:63:21: ERROR: skipping bad character: 0xbf
+mandoc: input.in:63:22: ERROR: skipping bad character: 0xbf
+mandoc: input.in:72:31: ERROR: skipping bad character: 0xf4
+mandoc: input.in:72:32: ERROR: skipping bad character: 0x90
+mandoc: input.in:72:33: ERROR: skipping bad character: 0x80
+mandoc: input.in:72:34: ERROR: skipping bad character: 0x80
+mandoc: input.in:72:21: ERROR: invalid special character: \[u110000]
+mandoc: input.in:73:31: ERROR: skipping bad character: 0xf4
+mandoc: input.in:73:32: ERROR: skipping bad character: 0xbf
+mandoc: input.in:73:33: ERROR: skipping bad character: 0xbf
+mandoc: input.in:73:34: ERROR: skipping bad character: 0xbf
+mandoc: input.in:73:21: ERROR: invalid special character: \[u13FFFF]
+mandoc: input.in:74:31: ERROR: skipping bad character: 0xf5
+mandoc: input.in:74:32: ERROR: skipping bad character: 0x80
+mandoc: input.in:74:33: ERROR: skipping bad character: 0x80
+mandoc: input.in:74:34: ERROR: skipping bad character: 0x80
+mandoc: input.in:74:21: ERROR: invalid special character: \[u140000]
+mandoc: input.in:75:31: ERROR: skipping bad character: 0xf7
+mandoc: input.in:75:32: ERROR: skipping bad character: 0xbf
+mandoc: input.in:75:33: ERROR: skipping bad character: 0xbf
+mandoc: input.in:75:34: ERROR: skipping bad character: 0xbf
+mandoc: input.in:75:21: ERROR: invalid special character: \[u1FFFFF]
+mandoc: input.in:76:33: ERROR: skipping bad character: 0xf8
+mandoc: input.in:76:34: ERROR: skipping bad character: 0x88
+mandoc: input.in:76:35: ERROR: skipping bad character: 0x80
+mandoc: input.in:76:36: ERROR: skipping bad character: 0x80
+mandoc: input.in:76:37: ERROR: skipping bad character: 0x80
+mandoc: input.in:76:23: ERROR: invalid special character: \[u200000]
index 429b427..348cb68 100644 (file)
@@ -31,12 +31,17 @@ D\bDE\bES\bSC\bCR\bRI\bIP\bPT\bTI\bIO\bON\bN
      U+1000   0xe18080   ကက     begin of second start byte
      U+CFFF   0xecbfbf   쿿쿿   end of last normal start byte
      U+D000   0xed8080   퀀퀀   begin of last start byte
+     U+D7FB   0xed9fbb   ퟻퟻ   highest valid public three-byte
      U+D7FF   0xed9fbf   ퟿퟿       highest public three-byte
      U+D800   0xeda080   ???    lowest surrogate
      U+DFFF   0xedbfbf   ???    highest surrogate
      U+E000   0xee8080        lowest private use
      U+F8FF   0xefa3bf        highest private use
      U+F900   0xefa480   豈豈   lowest post-private
+     U+FEFF   0xefbbbf          byte-order mark
+     U+FFFC   0xefbfbc        object replacement character
+     U+FFFD   0xefbfbd   ��     replacement character
+     U+FFFE   0xefbfbe   ￾￾       reversed byte-order mark
      U+FFFF   0xefbfbf   ￿￿       highest three-byte
 
    F\bFo\bou\bur\br-\b-b\bby\byt\bte\be r\bra\ban\bng\bge\be
@@ -60,4 +65,4 @@ D\bDE\bES\bSC\bCR\bRI\bIP\bPT\bTI\bIO\bON\bN
      U+1FFFFF   0xf7bfbfbf     ????    highest invalid four-byte
      U+200000   0xf888808080   ?????   lowest five-byte
 
-OpenBSD                          June 2, 2021            CHAR-UNICODE-INPUT(1)
+OpenBSD                          May 16, 2024            CHAR-UNICODE-INPUT(1)