-/* $OpenBSD: mandocdb.c,v 1.220 2024/05/14 18:38:13 schwarze Exp $ */
+/* $OpenBSD: mandocdb.c,v 1.221 2024/05/14 21:12:44 schwarze Exp $ */
/*
- * Copyright (c) 2011-2020 Ingo Schwarze <schwarze@openbsd.org>
+ * Copyright (c) 2011-2021, 2024 Ingo Schwarze <schwarze@openbsd.org>
* Copyright (c) 2011, 2012 Kristaps Dzonsons <kristaps@bsd.lv>
* Copyright (c) 2016 Ed Maste <emaste@freebsd.org>
*
__attribute__((__format__ (__printf__, 2, 3)));
static int set_basedir(const char *, int);
static int treescan(void);
-static size_t utf8(unsigned int, char [7]);
+static size_t utf8(unsigned int, char[5]);
static int nodb; /* no database changes */
static int mparse_options; /* abort the parse early */
* Take a Unicode codepoint and produce its UTF-8 encoding.
* This isn't the best way to do this, but it works.
* The magic numbers are from the UTF-8 packaging.
- * They're not as scary as they seem: read the UTF-8 spec for details.
+ * Read the UTF-8 spec or the utf8(7) manual page for details.
*/
static size_t
-utf8(unsigned int cp, char out[7])
+utf8(unsigned int cp, char out[5])
{
size_t rc;
- rc = 0;
- if (cp <= 0x0000007F) {
+ if (cp <= 0x7f) {
rc = 1;
out[0] = (char)cp;
- } else if (cp <= 0x000007FF) {
+ } else if (cp <= 0x7ff) {
rc = 2;
out[0] = (cp >> 6 & 31) | 192;
out[1] = (cp & 63) | 128;
- } else if (cp <= 0x0000FFFF) {
+ } else if (cp >= 0xd800 && cp <= 0xdfff) {
+ rc = 0; /* reject UTF-16 surrogate */
+ } else if (cp <= 0xffff) {
rc = 3;
out[0] = (cp >> 12 & 15) | 224;
out[1] = (cp >> 6 & 63) | 128;
out[2] = (cp & 63) | 128;
- } else if (cp <= 0x001FFFFF) {
+ } else if (cp <= 0x10ffff) {
rc = 4;
out[0] = (cp >> 18 & 7) | 240;
out[1] = (cp >> 12 & 63) | 128;
out[2] = (cp >> 6 & 63) | 128;
out[3] = (cp & 63) | 128;
- } else if (cp <= 0x03FFFFFF) {
- rc = 5;
- out[0] = (cp >> 24 & 3) | 248;
- out[1] = (cp >> 18 & 63) | 128;
- out[2] = (cp >> 12 & 63) | 128;
- out[3] = (cp >> 6 & 63) | 128;
- out[4] = (cp & 63) | 128;
- } else if (cp <= 0x7FFFFFFF) {
- rc = 6;
- out[0] = (cp >> 30 & 1) | 252;
- out[1] = (cp >> 24 & 63) | 128;
- out[2] = (cp >> 18 & 63) | 128;
- out[3] = (cp >> 12 & 63) | 128;
- out[4] = (cp >> 6 & 63) | 128;
- out[5] = (cp & 63) | 128;
} else
- return 0;
+ rc = 0;
out[rc] = '\0';
return rc;