From 39f98da69e764079e742d2bcabf459698793ca34 Mon Sep 17 00:00:00 2001 From: schwarze Date: Wed, 2 Jun 2021 16:35:25 +0000 Subject: [PATCH] Cleanup: 1. Move invalid two-byte sequences after valid ones and make their descriptions easier to understand. 2. Replace the wrong and confusing expression "middle byte" with the correct term "start byte". 3. Add test lines for U+EFFFF and U+F0000. 4. Replace the unhelpful word "strange" with more descriptive terms. Arguably, nothing about this (or maybe everything?) is strange. --- regress/usr.bin/mandoc/char/unicode/input.in | Bin 2427 -> 2588 bytes .../mandoc/char/unicode/input.out_ascii | 32 ++++++------ .../mandoc/char/unicode/input.out_lint | 46 +++++++++--------- .../mandoc/char/unicode/input.out_utf8 | 32 ++++++------ 4 files changed, 57 insertions(+), 53 deletions(-) diff --git a/regress/usr.bin/mandoc/char/unicode/input.in b/regress/usr.bin/mandoc/char/unicode/input.in index b0d9c7fc14e14f802ed597922e92e730b32da023..cc3123404cb62c77d021fa1dcacfb6e98009aed3 100644 GIT binary patch delta 471 zcmew@G)H8Dw5YLyk%5t+zJZy(fsultnU%4rm67R011%n}(!5j!BOS2x#_TJMlO5PN zC#NwfOkT#M6=`Kqk!)mcVRfj&$}!m4(^DZcuec;NF-0LiO(8iyuOu_CG_fQzKaUHn zFxl9`!0OQ92CL$d#G(>~q{@<1TpBi?XR>2t1*@AZ%why(d9#>8S#2!ZTseuwB?@3A zoA3UoyZL=o6Nu25}PHy{AIC(S4+4Fs)XqD#SIv7h$u z-*1(XnVyjfG`OHNDJL^oAvq(lC^5MtwFs!w%>W2MI)VNGL6AD9IC8f?US83%8)!h Z`8bCK)ZeU}hU}SnWr;bNDU(e(-2i#VpfUge delta 305 zcmbOu@>^(vw5XASk%6JPz5x)KC>WYpnHpOe8Ba9O5_CyTPR&h9EmAPF)B&k6*_d{P zQP#?!BH75?!s<|km1D58r>8=4eqKptUTI=UW`5peOC~LOu!>}33j?b|ha0RiGSf3q z)g&=#Z+^~X%gC0SnUa!|I$4v&2*ylcF@-T_v1s!ZmlP%DrKc)D6mCAxvYrVh-OR2o zSey#9I|VAMkW^VR*@{D1EGMzJL?JJ~C^s<&S*n6VRS4BQu#W8b^rhX diff --git a/regress/usr.bin/mandoc/char/unicode/input.out_ascii b/regress/usr.bin/mandoc/char/unicode/input.out_ascii index 410bdc85830..66e904de69b 100644 --- a/regress/usr.bin/mandoc/char/unicode/input.out_ascii +++ b/regress/usr.bin/mandoc/char/unicode/input.out_ascii @@ -20,10 +20,10 @@ DDEESSCCRRIIPPTTIIOONN U+0000 0xc080 ?? lowest obfuscated ASCII U+007f 0xc1bf ?? highest obfuscated ASCII - 0xc278 ?x ASCII continuation U+0080 0xc280 <80><80> lowest two-byte - 0xc2c380 ?`A high continuation U+07FF 0xdfbf highest two-byte + 0xc278 ?x ASCII instead of continuation + 0xc2c380 ?`A start byte instead of continuation TThhrreeee--bbyyttee rraannggee @@ -32,10 +32,10 @@ DDEESSCCRRIIPPTTIIOONN U+0080 0xe08280 ??? lowest obfuscated two-byte U+07FF 0xe09fbf ??? highest obfuscated two-byte U+0800 0xe0a080 lowest three-byte - U+0FFF 0xe0bfbf end of first middle byte - U+1000 0xe18080 begin of second middle byte - U+CFFF 0xecbfbf end of last normal middle byte - U+D000 0xed8080 begin of strange middle byte + U+0FFF 0xe0bfbf end of first start byte + U+1000 0xe18080 begin of second start byte + U+CFFF 0xecbfbf end of last normal start byte + U+D000 0xed8080 begin of last start byte U+D7FF 0xed9fbf highest public three-byte U+D800 0xeda080 ??? lowest surrogate U+DFFF 0xedbfbf ??? highest surrogate @@ -51,17 +51,19 @@ DDEESSCCRRIIPPTTIIOONN U+0800 0xf080a080 ???? lowest obfuscated three-byte U+FFFF 0xf08fbfbf ???? highest obfuscated three-byte U+10000 0xf0908080 lowest four-byte - U+3FFFF 0xf0bfbfbf end of first middle byte - U+40000 0xf1808080 second middle byte - U+FFFFF 0xf3bfbfbf last normal middle byte - U+100000 0xf4808080 strange middle byte - U+10FFFF 0xf48fbfbf last valid four-byte + U+3FFFF 0xf0bfbfbf end of first start byte + U+40000 0xf1808080 begin of second start byte + U+EFFFF 0xf2bfbfbf highest public character + U+F0000 0xf3808080 lowest plane 15 private use + U+FFFFF 0xf3bfbfbf highest plane 15 private use + U+100000 0xf4808080 lowest plane 16 private use + U+10FFFF 0xf48fbfbf highest valid four-byte U+110000 0xf4908080 ???? lowest beyond Unicode - U+13FFFF 0xf4bfbfbf ???? end of strange middle byte - U+140000 0xf5808080 ???? lowest invalid middle byte - U+1FFFFF 0xf7bfbfbf ???? highest four-byte + U+13FFFF 0xf4bfbfbf ???? end of last start byte + U+140000 0xf5808080 ???? lowest invalid start byte + U+1FFFFF 0xf7bfbfbf ???? highest invalid four-byte U+200000 0xf888808080 ????? lowest five-byte -OpenBSD December 19, 2014 CHAR-UNICODE-INPUT(1) +OpenBSD June 2, 2021 CHAR-UNICODE-INPUT(1) diff --git a/regress/usr.bin/mandoc/char/unicode/input.out_lint b/regress/usr.bin/mandoc/char/unicode/input.out_lint index e537b4fd463..fbd053b297d 100644 --- a/regress/usr.bin/mandoc/char/unicode/input.out_lint +++ b/regress/usr.bin/mandoc/char/unicode/input.out_lint @@ -7,8 +7,8 @@ mandoc: input.in:21:15: ERROR: skipping bad character: 0xc0 mandoc: input.in:21:16: ERROR: skipping bad character: 0x80 mandoc: input.in:22:15: ERROR: skipping bad character: 0xc1 mandoc: input.in:22:16: ERROR: skipping bad character: 0xbf -mandoc: input.in:23:9: ERROR: skipping bad character: 0xc2 -mandoc: input.in:25:11: ERROR: skipping bad character: 0xc2 +mandoc: input.in:25:9: ERROR: skipping bad character: 0xc2 +mandoc: input.in:26:11: ERROR: skipping bad character: 0xc2 mandoc: input.in:32:17: ERROR: skipping bad character: 0xc0 mandoc: input.in:32:18: ERROR: skipping bad character: 0x80 mandoc: input.in:32:19: ERROR: skipping bad character: 0x80 @@ -53,29 +53,29 @@ mandoc: input.in:56:19: ERROR: skipping bad character: 0xf0 mandoc: input.in:56:20: ERROR: skipping bad character: 0x8f mandoc: input.in:56:21: ERROR: skipping bad character: 0xbf mandoc: input.in:56:22: ERROR: skipping bad character: 0xbf -mandoc: input.in:63:31: ERROR: skipping bad character: 0xf4 -mandoc: input.in:63:32: ERROR: skipping bad character: 0x90 -mandoc: input.in:63:33: ERROR: skipping bad character: 0x80 -mandoc: input.in:63:34: ERROR: skipping bad character: 0x80 -mandoc: input.in:63:21: WARNING: invalid escape sequence: \[u110000] -mandoc: input.in:64:31: ERROR: skipping bad character: 0xf4 -mandoc: input.in:64:32: ERROR: skipping bad character: 0xbf -mandoc: input.in:64:33: ERROR: skipping bad character: 0xbf -mandoc: input.in:64:34: ERROR: skipping bad character: 0xbf -mandoc: input.in:64:21: WARNING: invalid escape sequence: \[u13FFFF] -mandoc: input.in:65:31: ERROR: skipping bad character: 0xf5 -mandoc: input.in:65:32: ERROR: skipping bad character: 0x80 +mandoc: input.in:65:31: ERROR: skipping bad character: 0xf4 +mandoc: input.in:65:32: ERROR: skipping bad character: 0x90 mandoc: input.in:65:33: ERROR: skipping bad character: 0x80 mandoc: input.in:65:34: ERROR: skipping bad character: 0x80 -mandoc: input.in:65:21: WARNING: invalid escape sequence: \[u140000] -mandoc: input.in:66:31: ERROR: skipping bad character: 0xf7 +mandoc: input.in:65:21: WARNING: invalid escape sequence: \[u110000] +mandoc: input.in:66:31: ERROR: skipping bad character: 0xf4 mandoc: input.in:66:32: ERROR: skipping bad character: 0xbf mandoc: input.in:66:33: ERROR: skipping bad character: 0xbf mandoc: input.in:66:34: ERROR: skipping bad character: 0xbf -mandoc: input.in:66:21: WARNING: invalid escape sequence: \[u1FFFFF] -mandoc: input.in:67:33: ERROR: skipping bad character: 0xf8 -mandoc: input.in:67:34: ERROR: skipping bad character: 0x88 -mandoc: input.in:67:35: ERROR: skipping bad character: 0x80 -mandoc: input.in:67:36: ERROR: skipping bad character: 0x80 -mandoc: input.in:67:37: ERROR: skipping bad character: 0x80 -mandoc: input.in:67:23: WARNING: invalid escape sequence: \[u200000] +mandoc: input.in:66:21: WARNING: invalid escape sequence: \[u13FFFF] +mandoc: input.in:67:31: ERROR: skipping bad character: 0xf5 +mandoc: input.in:67:32: ERROR: skipping bad character: 0x80 +mandoc: input.in:67:33: ERROR: skipping bad character: 0x80 +mandoc: input.in:67:34: ERROR: skipping bad character: 0x80 +mandoc: input.in:67:21: WARNING: invalid escape sequence: \[u140000] +mandoc: input.in:68:31: ERROR: skipping bad character: 0xf7 +mandoc: input.in:68:32: ERROR: skipping bad character: 0xbf +mandoc: input.in:68:33: ERROR: skipping bad character: 0xbf +mandoc: input.in:68:34: ERROR: skipping bad character: 0xbf +mandoc: input.in:68:21: WARNING: invalid escape sequence: \[u1FFFFF] +mandoc: input.in:69:33: ERROR: skipping bad character: 0xf8 +mandoc: input.in:69:34: ERROR: skipping bad character: 0x88 +mandoc: input.in:69:35: ERROR: skipping bad character: 0x80 +mandoc: input.in:69:36: ERROR: skipping bad character: 0x80 +mandoc: input.in:69:37: ERROR: skipping bad character: 0x80 +mandoc: input.in:69:23: WARNING: invalid escape sequence: \[u200000] diff --git a/regress/usr.bin/mandoc/char/unicode/input.out_utf8 b/regress/usr.bin/mandoc/char/unicode/input.out_utf8 index 882d14fd9d2..af4645bd8e7 100644 --- a/regress/usr.bin/mandoc/char/unicode/input.out_utf8 +++ b/regress/usr.bin/mandoc/char/unicode/input.out_utf8 @@ -20,10 +20,10 @@ DDEESSCCRRIIPPTTIIOONN U+0000 0xc080 ?? lowest obfuscated ASCII U+007f 0xc1bf ?? highest obfuscated ASCII - 0xc278 ?x ASCII continuation U+0080 0xc280 �� lowest two-byte - 0xc2c380 ?À high continuation U+07FF 0xdfbf ߿߿ highest two-byte + 0xc278 ?x ASCII instead of continuation + 0xc2c380 ?À start byte instead of continuation TThhrreeee--bbyyttee rraannggee @@ -32,10 +32,10 @@ DDEESSCCRRIIPPTTIIOONN U+0080 0xe08280 ??? lowest obfuscated two-byte U+07FF 0xe09fbf ??? highest obfuscated two-byte U+0800 0xe0a080 ࠀࠀ lowest three-byte - U+0FFF 0xe0bfbf ࿿࿿ end of first middle byte - U+1000 0xe18080 ကက begin of second middle byte - U+CFFF 0xecbfbf 쿿쿿 end of last normal middle byte - U+D000 0xed8080 퀀퀀 begin of strange middle byte + U+0FFF 0xe0bfbf ࿿࿿ end of first start byte + U+1000 0xe18080 ကက begin of second start byte + U+CFFF 0xecbfbf 쿿쿿 end of last normal start byte + U+D000 0xed8080 퀀퀀 begin of last start byte U+D7FF 0xed9fbf ퟿퟿ highest public three-byte U+D800 0xeda080 ??? lowest surrogate U+DFFF 0xedbfbf ??? highest surrogate @@ -51,17 +51,19 @@ DDEESSCCRRIIPPTTIIOONN U+0800 0xf080a080 ???? lowest obfuscated three-byte U+FFFF 0xf08fbfbf ???? highest obfuscated three-byte U+10000 0xf0908080 𐀀𐀀 lowest four-byte - U+3FFFF 0xf0bfbfbf 𿿿𿿿 end of first middle byte - U+40000 0xf1808080 񀀀񀀀 second middle byte - U+FFFFF 0xf3bfbfbf 󿿿󿿿 last normal middle byte - U+100000 0xf4808080 􀀀􀀀 strange middle byte - U+10FFFF 0xf48fbfbf 􏿿􏿿 last valid four-byte + U+3FFFF 0xf0bfbfbf 𿿿𿿿 end of first start byte + U+40000 0xf1808080 񀀀񀀀 begin of second start byte + U+EFFFF 0xf2bfbfbf 󯿿򿿿 highest public character + U+F0000 0xf3808080 󰀀󀀀 lowest plane 15 private use + U+FFFFF 0xf3bfbfbf 󿿿󿿿 highest plane 15 private use + U+100000 0xf4808080 􀀀􀀀 lowest plane 16 private use + U+10FFFF 0xf48fbfbf 􏿿􏿿 highest valid four-byte U+110000 0xf4908080 ???? lowest beyond Unicode - U+13FFFF 0xf4bfbfbf ???? end of strange middle byte - U+140000 0xf5808080 ???? lowest invalid middle byte - U+1FFFFF 0xf7bfbfbf ???? highest four-byte + U+13FFFF 0xf4bfbfbf ???? end of last start byte + U+140000 0xf5808080 ???? lowest invalid start byte + U+1FFFFF 0xf7bfbfbf ???? highest invalid four-byte U+200000 0xf888808080 ????? lowest five-byte -OpenBSD December 19, 2014 CHAR-UNICODE-INPUT(1) +OpenBSD June 2, 2021 CHAR-UNICODE-INPUT(1) -- 2.20.1