From a83ec1761e33fc9534627a36ca0f26bbd427f3ab Mon Sep 17 00:00:00 2001
From: schwarze
Date: Tue, 14 May 2024 21:12:44 +0000
Subject: [PATCH] Garbage collect dead code intended to write five- and
 six-byte UTF-8 sequences since the Unicode standard has been explicitly
 prohibiting the use of such sequences when encoding Unicode characters for
 more than 20 years now.

While here, also weed out UTF-16 surrogates and codepoints in the
invalid range 110000 to 1FFFFF if any are encountered.

I hoped to write "no functional change", but to my shame it turns
out there are unrelated bugs with \[uXXXX] parsing in roff_escape.c,
so this new anti-surrogate check is actually reachable until those
other bugs get fixed, and even after fixing those other bugs, it
will remain useful as a defense in depth.
---
 usr.bin/mandoc/mandocdb.c | 38 ++++++++++++--------------------------
 1 file changed, 12 insertions(+), 26 deletions(-)

diff --git a/usr.bin/mandoc/mandocdb.c b/usr.bin/mandoc/mandocdb.c
index 4343a895fbe..65fbe058cfc 100644
--- a/usr.bin/mandoc/mandocdb.c
+++ b/usr.bin/mandoc/mandocdb.c
@@ -1,6 +1,6 @@
-/* $OpenBSD: mandocdb.c,v 1.220 2024/05/14 18:38:13 schwarze Exp $ */
+/* $OpenBSD: mandocdb.c,v 1.221 2024/05/14 21:12:44 schwarze Exp $ */
 /*
- * Copyright (c) 2011-2020 Ingo Schwarze
+ * Copyright (c) 2011-2021, 2024 Ingo Schwarze
  * Copyright (c) 2011, 2012 Kristaps Dzonsons
  * Copyright (c) 2016 Ed Maste
  *
@@ -155,7 +155,7 @@ static void say(const char *, const char *, ...)
 			__attribute__((__format__ (__printf__, 2, 3)));
 static int set_basedir(const char *, int);
 static int treescan(void);
-static size_t utf8(unsigned int, char [7]);
+static size_t utf8(unsigned int, char[5]);

 static int nodb; /* no database changes */
 static int mparse_options; /* abort the parse early */
@@ -1863,49 +1863,35 @@ putkeys(const struct mpage *mpage, char *cp, size_t sz, uint64_t v)
  * Take a Unicode codepoint and produce its UTF-8 encoding.
  * This isn't the best way to do this, but it works.
  * The magic numbers are from the UTF-8 packaging.
- * They're not as scary as they seem: read the UTF-8 spec for details.
+ * Read the UTF-8 spec or the utf8(7) manual page for details.
  */
 static size_t
-utf8(unsigned int cp, char out[7])
+utf8(unsigned int cp, char out[5])
 {
 	size_t rc;

-	rc = 0;
-	if (cp <= 0x0000007F) {
+	if (cp <= 0x7f) {
 		rc = 1;
 		out[0] = (char)cp;
-	} else if (cp <= 0x000007FF) {
+	} else if (cp <= 0x7ff) {
 		rc = 2;
 		out[0] = (cp >> 6 & 31) | 192;
 		out[1] = (cp & 63) | 128;
-	} else if (cp <= 0x0000FFFF) {
+	} else if (cp >= 0xd800 && cp <= 0xdfff) {
+		rc = 0; /* reject UTF-16 surrogate */
+	} else if (cp <= 0xffff) {
 		rc = 3;
 		out[0] = (cp >> 12 & 15) | 224;
 		out[1] = (cp >> 6 & 63) | 128;
 		out[2] = (cp & 63) | 128;
-	} else if (cp <= 0x001FFFFF) {
+	} else if (cp <= 0x10ffff) {
 		rc = 4;
 		out[0] = (cp >> 18 & 7) | 240;
 		out[1] = (cp >> 12 & 63) | 128;
 		out[2] = (cp >> 6 & 63) | 128;
 		out[3] = (cp & 63) | 128;
-	} else if (cp <= 0x03FFFFFF) {
-		rc = 5;
-		out[0] = (cp >> 24 & 3) | 248;
-		out[1] = (cp >> 18 & 63) | 128;
-		out[2] = (cp >> 12 & 63) | 128;
-		out[3] = (cp >> 6 & 63) | 128;
-		out[4] = (cp & 63) | 128;
-	} else if (cp <= 0x7FFFFFFF) {
-		rc = 6;
-		out[0] = (cp >> 30 & 1) | 252;
-		out[1] = (cp >> 24 & 63) | 128;
-		out[2] = (cp >> 18 & 63) | 128;
-		out[3] = (cp >> 12 & 63) | 128;
-		out[4] = (cp >> 6 & 63) | 128;
-		out[5] = (cp & 63) | 128;
 	} else
-		return 0;
+		rc = 0;

 	out[rc] = '\0';
 	return rc;
-- 
2.20.1