From 702123639534b7d2ffe3eb692a2717af7205ae5c Mon Sep 17 00:00:00 2001 From: schwarze Date: Tue, 14 May 2024 18:38:13 +0000 Subject: [PATCH] The makewhatis(8) program already provided a "-T utf8" option to put UTF-8 strings into the database, but that only worked for input files containing the manually written, mnemonic roff(7) character escape sequences documented in mandoc_char(7). Even though mandoc(1), man(1), and man.cgi(8) have been able to properly handle UTF-8 and ISO-Latin-1 encoded input files for many years, makewhatis(8) unconditionally replaced all non-ASCII bytes in all input files with ASCII question marks ("?"). Improve this by changing two aspects of non-ASCII character handling in makewhatis(8) at the same time. 1. In the makewhatis(8) main program, when configuring the roff(7) parser, enable UTF-8 and ISO-Latin-1 autorecognition and translation to \[uXXXX] roff(7) Unicode character escape sequences. The man(1) and man.cgi(8) programs prove that this option has been working very reliably for many years, so there is no risk. 2. In the makewhatis(8) string rendering code, if "-T utf8" was requested, translate these escape sequences to UTF-8 strings, just like makewhatis(8) already did for ESCAPE_SPECIAL sequences. Otherwise, i.e. if an ASCII-only database is desired, replace all character escape sequences with ASCII transliterations, again as was already done for ESCAPE_SPECIAL sequences. With this change, giving UTF-8 command line arguments to apropos(1) allows searching in UTF-8 and ISO-Latin-1 encoded manual pages if the respective mandoc.db(5) has been built with makewhatis(8) -T utf8. Issue found while investigating a question from Valid-Amirali-Averiva at rambler dot ru, who is using mandoc on FreeBSD to process documents containing Cyrillic letters. 
--- usr.bin/mandoc/mandocdb.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/usr.bin/mandoc/mandocdb.c b/usr.bin/mandoc/mandocdb.c index f38d40994df..4343a895fbe 100644 --- a/usr.bin/mandoc/mandocdb.c +++ b/usr.bin/mandoc/mandocdb.c @@ -1,4 +1,4 @@ -/* $OpenBSD: mandocdb.c,v 1.219 2022/12/26 19:16:02 jmc Exp $ */ +/* $OpenBSD: mandocdb.c,v 1.220 2024/05/14 18:38:13 schwarze Exp $ */ /* * Copyright (c) 2011-2020 Ingo Schwarze * Copyright (c) 2011, 2012 Kristaps Dzonsons @@ -326,7 +326,7 @@ mandocdb(int argc, char *argv[]) goto usage; \ } while (/*CONSTCOND*/0) - mparse_options = MPARSE_VALIDATE; + mparse_options = MPARSE_UTF8 | MPARSE_LATIN1 | MPARSE_VALIDATE; path_arg = NULL; op = OP_DEFAULT; @@ -1987,7 +1987,21 @@ render_string(char **public, size_t *psz) */ scp++; - if (mandoc_escape(&scp, &seq, &seqlen) != ESCAPE_SPECIAL) + switch (mandoc_escape(&scp, &seq, &seqlen)) { + case ESCAPE_UNICODE: + unicode = mchars_num2uc(seq + 1, seqlen - 1); + break; + case ESCAPE_NUMBERED: + unicode = mchars_num2char(seq, seqlen); + break; + case ESCAPE_SPECIAL: + unicode = mchars_spec2cp(seq, seqlen); + break; + default: + unicode = -1; + break; + } + if (unicode <= 0) continue; /* @@ -1996,21 +2010,17 @@ render_string(char **public, size_t *psz) */ if (write_utf8) { - unicode = mchars_spec2cp(seq, seqlen); - if (unicode <= 0) - continue; addsz = utf8(unicode, utfbuf); if (addsz == 0) continue; addcp = utfbuf; } else { - addcp = mchars_spec2str(seq, seqlen, &addsz); + addcp = mchars_uc2str(unicode); if (addcp == NULL) continue; - if (*addcp == ASCII_NBRSP) { + if (*addcp == ASCII_NBRSP) addcp = " "; - addsz = 1; - } + addsz = strlen(addcp); } /* Copy the rendered glyph into the stream. */ -- 2.20.1