OK millert@.
Tested by naddy@ in a bulk and by matthieu@ in the new foot(1) port.
I originally wrote the code in 2022 at the prodding of espie@.
Using one improvement to a manual page from jmc@.
./usr/lib/crtendS.o
./usr/lib/gcrt0.o
./usr/lib/libagentx.so.1.1
-./usr/lib/libc.so.97.0
+./usr/lib/libc.so.97.1
./usr/lib/libcbor.so.2.0
./usr/lib/libcrypto.so.52.0
./usr/lib/libcurses.so.14.0
./usr/share/relink/kernel.tgz
./usr/share/relink/usr
./usr/share/relink/usr/lib
-./usr/share/relink/usr/lib/libc.so.97.0.a
+./usr/share/relink/usr/lib/libc.so.97.1.a
./usr/share/relink/usr/lib/libcrypto.so.52.0.a
./usr/share/relink/usr/libexec
./usr/share/relink/usr/libexec/ld.so.a
./usr/include/time.h
./usr/include/tls.h
./usr/include/ttyent.h
+./usr/include/uchar.h
./usr/include/ufs
./usr/include/ufs/ext2fs
./usr/include/ufs/ext2fs/ext2fs.h
./usr/share/man/man3/btowc.3
./usr/share/man/man3/btree.3
./usr/share/man/man3/bzero.3
+./usr/share/man/man3/c16rtomb.3
./usr/share/man/man3/cacos.3
./usr/share/man/man3/cacosh.3
./usr/share/man/man3/carg.3
./usr/share/man/man3/malloc.3
./usr/share/man/man3/mblen.3
./usr/share/man/man3/mbrlen.3
+./usr/share/man/man3/mbrtoc16.3
./usr/share/man/man3/mbrtowc.3
./usr/share/man/man3/mbsinit.3
./usr/share/man/man3/mbsrtowcs.3
-# $OpenBSD: Makefile,v 1.230 2022/08/30 18:50:06 krw Exp $
+# $OpenBSD: Makefile,v 1.231 2023/08/20 15:02:50 schwarze Exp $
# $NetBSD: Makefile,v 1.59 1996/05/15 21:36:43 jtc Exp $
# @(#)Makefile 5.45.1.1 (Berkeley) 5/6/91
signal.h siphash.h sndio.h spawn.h stdbool.h stddef.h \
stdio.h stdlib.h string.h strings.h sysexits.h \
tar.h tgmath.h tib.h time.h ttyent.h \
- unistd.h utime.h utmp.h uuid.h \
+ uchar.h unistd.h utime.h utmp.h uuid.h \
vis.h \
wchar.h wctype.h
--- /dev/null
+/* $OpenBSD: uchar.h,v 1.1 2023/08/20 15:02:50 schwarze Exp $ */
+/*
+ * Written by Ingo Schwarze <schwarze@openbsd.org>
+ * and placed in the public domain on March 19, 2022.
+ */
+
+#ifndef _UCHAR_H_
+#define _UCHAR_H_
+
+#include <sys/cdefs.h>
+#include <sys/_types.h>
+
+/* mbstate_t and size_t may already have been provided by another header. */
+#ifndef _MBSTATE_T_DEFINED_
+#define _MBSTATE_T_DEFINED_
+typedef __mbstate_t mbstate_t;
+#endif
+
+#ifndef _SIZE_T_DEFINED_
+#define _SIZE_T_DEFINED_
+typedef __size_t size_t;
+#endif
+
+/*
+ * Announce that char16_t and char32_t values are encoded
+ * as UTF-16 and UTF-32, respectively.
+ */
+#define __STDC_UTF_16__ 1
+#define __STDC_UTF_32__ 1
+
+/*
+ * Since C++11, char16_t and char32_t are keywords naming built-in types
+ * and must not be redeclared; only provide the typedefs for C and for
+ * older C++ dialects.
+ */
+#if !defined(__cplusplus) || __cplusplus < 201103L
+typedef __uint16_t char16_t;
+typedef __uint32_t char32_t;
+#endif
+
+__BEGIN_DECLS
+/* Restartable conversions between multibyte characters and UTF-16. */
+size_t mbrtoc16(char16_t * __restrict, const char * __restrict, size_t,
+ mbstate_t * __restrict);
+size_t c16rtomb(char * __restrict, char16_t, mbstate_t * __restrict);
+/* Restartable conversions between multibyte characters and UTF-32. */
+size_t mbrtoc32(char32_t * __restrict, const char * __restrict, size_t,
+ mbstate_t * __restrict);
+size_t c32rtomb(char * __restrict, char32_t, mbstate_t * __restrict);
+__END_DECLS
+
+#endif /* !_UCHAR_H_ */
/* locale */
__mb_cur_max
btowc
+c16rtomb
+c32rtomb
duplocale
freelocale
isalnum_l
mbsrtowcs
mbstowcs
newlocale
+mbrtoc16
+mbrtoc32
mbtowc
nl_langinfo
nl_langinfo_l
--- /dev/null
+/* $OpenBSD: uchar.h,v 1.1 2023/08/20 15:02:51 schwarze Exp $ */
+/*
+ * Written by Ingo Schwarze <schwarze@openbsd.org>
+ * and placed in the public domain on March 19, 2022.
+ */
+
+#ifndef _LIBC_UCHAR_H_
+#define _LIBC_UCHAR_H_
+
+/* Pull in the public declarations before annotating them. */
+#include_next <uchar.h>
+
+/*
+ * Apply the PROTO_STD_DEPRECATED() symbol annotation to each of the
+ * four <uchar.h> conversion functions.
+ * NOTE(review): the macro is defined outside this header; presumably it
+ * is the annotation for standard functions that libc itself never
+ * calls internally -- confirm against the libc namespace macros.
+ */
+PROTO_STD_DEPRECATED(c16rtomb);
+PROTO_STD_DEPRECATED(c32rtomb);
+PROTO_STD_DEPRECATED(mbrtoc16);
+PROTO_STD_DEPRECATED(mbrtoc32);
+
+#endif /* !_LIBC_UCHAR_H_ */
-# $OpenBSD: Makefile.inc,v 1.26 2022/07/27 20:00:11 guenther Exp $
+# $OpenBSD: Makefile.inc,v 1.27 2023/08/20 15:02:51 schwarze Exp $
# locale sources
.PATH: ${LIBCSRCDIR}/locale
-SRCS+= btowc.c _def_messages.c _def_monetary.c _def_numeric.c _def_time.c \
+SRCS+= _def_messages.c _def_monetary.c _def_numeric.c _def_time.c \
localeconv.c nl_langinfo.c nl_langinfo_l.c setlocale.c \
duplocale.c freelocale.c newlocale.c uselocale.c \
__mb_cur_max.c _CurrentRuneLocale.c _get_locname.c \
isctype_l.c iswctype.c iswctype_l.c wctype.c \
- mblen.c mbrlen.c mbstowcs.c mbtowc.c multibyte_citrus.c wcscoll.c \
+ mblen.c mbrlen.c mbrtoc16.c mbrtoc32.c mbstowcs.c mbtowc.c \
+ btowc.c c16rtomb.c c32rtomb.c multibyte_citrus.c wcscoll.c \
wcscoll_l.c \
wcstombs.c wctob.c wctomb.c wcstof.c wcstod.c wcstold.c wcstol.c \
wcstoul.c wcstoll.c wcstoull.c wcstoimax.c wcstoumax.c \
MAN+= nl_langinfo.3 setlocale.3 newlocale.3 uselocale.3 localeconv.3 \
iswalnum.3 towlower.3 \
- btowc.3 mblen.3 mbrlen.3 mbrtowc.3 mbsinit.3 mbsrtowcs.3 \
+ btowc.3 c16rtomb.3 mblen.3 mbrlen.3 mbrtoc16.3 mbrtowc.3 \
+ mbsinit.3 mbsrtowcs.3 \
mbstowcs.3 mbtowc.3 wcrtomb.3 wcscoll.3 wcsrtombs.3 wcstod.3 \
wcstol.3 wcstombs.3 wcsxfrm.3 wctob.3 wctomb.3 \
wctype.3 iswctype.3 wctrans.3 towctrans.3 wcwidth.3
--- /dev/null
+.\" $OpenBSD: c16rtomb.3,v 1.1 2023/08/20 15:02:51 schwarze Exp $
+.\"
+.\" Copyright (c) 2023 Ingo Schwarze <schwarze@openbsd.org>
+.\"
+.\" Permission to use, copy, modify, and distribute this software for any
+.\" purpose with or without fee is hereby granted, provided that the above
+.\" copyright notice and this permission notice appear in all copies.
+.\"
+.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+.\"
+.Dd $Mdocdate: August 20 2023 $
+.Dt C16RTOMB 3
+.Os
+.Sh NAME
+.Nm c16rtomb
+.Nd convert one UTF-16 encoded character to UTF-8
+.Sh SYNOPSIS
+.In uchar.h
+.Ft size_t
+.Fo c16rtomb
+.Fa "char * restrict s"
+.Fa "char16_t c16"
+.Fa "mbstate_t * restrict mbs"
+.Fc
+.Sh DESCRIPTION
+This function converts one UTF-16 encoded character to UTF-8.
+In some cases, it is necessary to call the function twice
+to convert a single character.
+.Pp
+First, call
+.Fn c16rtomb
+passing the first 16-bit code unit of the UTF-16 encoded character in
+.Fa c16 .
+If the return value is greater than 0, the character is part of the UCS-2
+range, the complete UTF-8 encoding consisting of at most
+.Dv MB_CUR_MAX
+bytes has been written to the storage starting at
+.Fa s ,
+and the function does not need to be called again.
+.Pp
+If the return value is 0, the first 16-bit code unit is a UTF-16
+high surrogate and the function needs to be called a second time,
+this time passing the second 16-bit code unit of the UTF-16 encoded
+character in
+.Fa c16
+and passing the same
+.Fa mbs
+again that was also passed to the first call.
+If the second 16-bit code unit is a UTF-16 low surrogate,
+the second call returns a value greater than 0,
+the surrogate pair represents a Unicode code point
+beyond the basic multilingual plane,
+and the complete UTF-8 encoding consisting of at most
+.Dv MB_CUR_MAX
+bytes is written to the storage starting at
+.Fa s .
+.Pp
+The output encoding that
+.Fn c16rtomb
+uses in
+.Fa s
+is determined by the
+.Dv LC_CTYPE
+category of the current locale.
+.Ox
+only supports UTF-8 and ASCII output,
+and this function is only useful for UTF-8.
+.Pp
+The following arguments cause special processing:
+.Bl -tag -width 012345678901
+.It Fa c16 No == 0
+A NUL byte is stored to
+.Pf * Fa s
+and the state object pointed to by
+.Fa mbs
+is reset to the initial state.
+On operating systems other than
+.Ox
+that support state-dependent multibyte encodings,
+a special byte sequence
+.Pq Dq shift sequence
+is written before the NUL byte to return to the initial state
+if that is required by the output encoding
+and by the current output encoding state.
+.It Fa mbs No == Dv NULL
+An internal
+.Vt mbstate_t
+object specific to the
+.Fn c16rtomb
+function is used instead of the
+.Fa mbs
+argument.
+This internal object is automatically initialized at program startup
+and never changed by any
+.Em libc
+function except
+.Fn c16rtomb .
+.It Fa s No == Dv NULL
+The object pointed to by
+.Fa mbs ,
+or the internal object if
+.Fa mbs
+is a
+.Dv NULL
+pointer, is reset to its initial state,
+.Fa c16
+is ignored, and 1 is returned.
+.El
+.Sh RETURN VALUES
+.Fn c16rtomb
+returns the number of bytes written to
+.Fa s
+on success or
+.Po Vt size_t Pc Ns \-1
+on failure, specifically:
+.Bl -tag -width 10n
+.It 0
+The first 16-bit code unit was successfully decoded
+as a UTF-16 high surrogate.
+Nothing was written to
+.Fa s
+yet.
+.It 1
+The first 16-bit code unit was successfully decoded
+as a character in the range U+0000 to U+007F, or
+.Fa s
+is
+.Dv NULL .
+.It 2
+The first 16-bit code unit was successfully decoded
+as a character in the range U+0080 to U+07FF.
+.It 3
+The first 16-bit code unit was successfully decoded
+as a character in the range U+0800 to U+D7FF or U+E000 to U+FFFF.
+.It 4
+The second 16-bit code unit was successfully decoded as a UTF-16 low
+surrogate, resulting in a character in the range U+10000 to U+10FFFF.
+.It greater
+Return values greater than 4 may occur on operating systems other than
+.Ox
+for output encodings other than UTF-8, in particular when a shift
+sequence was written.
+.It Po Vt size_t Pc Ns \-1
+UTF-16 input decoding or
+.Dv LC_CTYPE
+output encoding failed, or
+.Fa mbs
+is invalid.
+Nothing was written to
+.Fa s ,
+and
+.Va errno
+has been set.
+.El
+.Sh ERRORS
+.Fn c16rtomb
+causes an error in the following cases:
+.Bl -tag -width Er
+.It Bq Er EILSEQ
+UTF-16 input decoding failed because the first 16-bit code unit
+is neither a UCS-2 character nor a UTF-16 high surrogate,
+or because the second 16-bit code unit is not a UTF-16 low surrogate;
+or output encoding failed because the resulting character
+cannot be represented in the output encoding selected with
+.Dv LC_CTYPE .
+.It Bq Er EINVAL
+.Fa mbs
+points to an invalid or uninitialized
+.Vt mbstate_t
+object.
+.El
+.Sh SEE ALSO
+.Xr mbrtoc16 3 ,
+.Xr setlocale 3 ,
+.Xr wcrtomb 3
+.Sh STANDARDS
+.Fn c16rtomb
+conforms to
+.St -isoC-2011 .
+.Sh HISTORY
+.Fn c16rtomb
+has been available since
+.Ox 7.4 .
+.Sh CAVEATS
+The C11 standard only requires the
+.Fa c16
+argument to be interpreted according to UTF-16
+if the predefined environment macro
+.Dv __STDC_UTF_16__
+is defined with a value of 1.
+On
+.Ox ,
+.In uchar.h
+provides this definition.
+Other operating systems which do not define
+.Dv __STDC_UTF_16__
+could theoretically use a different,
+implementation-defined input encoding for
+.Fa c16
+instead of UTF-16.
+Using UTF-16 becomes mandatory in C23.
--- /dev/null
+/* $OpenBSD: c16rtomb.c,v 1.1 2023/08/20 15:02:51 schwarze Exp $ */
+/*
+ * Copyright (c) 2022 Ingo Schwarze <schwarze@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <errno.h>
+#include <string.h>
+#include <uchar.h>
+#include <wchar.h>
+
+/*
+ * Keep this structure compatible with
+ * struct _utf8_state in the file citrus/citrus_utf8.c.
+ * In particular, only use values for the "want" field
+ * that do not collide with values used by the UTF-8 code there.
+ * c16rtomb() overlays this structure on the caller's mbstate_t object.
+ */
+struct _utf16_state {
+ wchar_t ch; /* code point bits contributed by a pending high surrogate */
+ int want; /* set to -3 while a low surrogate is expected */
+};
+
+/*
+ * Convert one UTF-16 code unit to a multibyte character
+ * by handing the resulting code point to wcrtomb(3).
+ *
+ * s	output buffer, or NULL to merely reset the conversion state
+ * c16	the next UTF-16 code unit
+ * ps	conversion state, or NULL to use a state object
+ *	private to this function
+ *
+ * Returns 1 if s is NULL, 0 if c16 is a high surrogate that was only
+ * recorded in the conversion state, (size_t)-1 with errno set to EILSEQ
+ * if a pending high surrogate is not followed by a low surrogate,
+ * and the return value of wcrtomb(3) in all other cases.
+ */
+size_t
+c16rtomb(char *s, char16_t c16, mbstate_t *ps)
+{
+	static mbstate_t	 mbs;
+	struct _utf16_state	*us;
+	wchar_t			 wc;
+
+	if (ps == NULL)
+		ps = &mbs;
+
+	/*
+	 * Handle the special case of NULL output first
+	 * to avoid inspecting c16 and ps and possibly drawing
+	 * bogus conclusions from whatever those may contain.
+	 * Instead, just restore the initial conversion state.
+	 * The return value represents the length of the NUL byte
+	 * corresponding to the NUL wide character, even though
+	 * there is no place to write that NUL byte to.
+	 */
+	if (s == NULL) {
+		memset(ps, 0, sizeof(*ps));
+		return 1;
+	}
+
+	us = (struct _utf16_state *)ps;
+
+	/* The "want" field is an int, so compare it as an int. */
+	if (us->want == -3) {
+
+		/*
+		 * The previous call read a high surrogate,
+		 * so expect a low surrogate now.
+		 */
+		if ((c16 & 0xfc00) != 0xdc00) {
+			errno = EILSEQ;
+			return (size_t)-1;
+		}
+
+		/*
+		 * Assemble the full code point for processing
+		 * by wcrtomb(3).  Since we do not support
+		 * state-dependent encodings, our wcrtomb(3)
+		 * always expects the initial conversion state,
+		 * so clearing the state here is just fine.
+		 */
+		wc = us->ch + (c16 & 0x3ff);
+		us->ch = 0;
+		us->want = 0;
+
+	} else if ((c16 & 0xfc00) == 0xd800) {
+
+		/*
+		 * Got a high surrogate while being in the initial
+		 * conversion state.  Remember its contribution to
+		 * the codepoint and defer encoding to the next call.
+		 */
+		us->ch = 0x10000 + ((c16 & 0x3ff) << 10);
+		us->want = -3;
+
+		/* Nothing was written to *s just yet. */
+		return 0;
+
+	} else
+		wc = c16;
+
+	/*
+	 * The following correctly returns an error when a low
+	 * surrogate is encountered without a preceding high one.
+	 */
+	return wcrtomb(s, wc, ps);
+}
--- /dev/null
+/* $OpenBSD: c32rtomb.c,v 1.1 2023/08/20 15:02:51 schwarze Exp $ */
+/*
+ * Written by Ingo Schwarze <schwarze@openbsd.org>
+ * and placed in the public domain on March 19, 2022.
+ */
+
+#include <uchar.h>
+#include <wchar.h>
+
+/*
+ * Convert one UTF-32 code unit to a multibyte character.
+ * This is a thin wrapper around wcrtomb(3) that merely substitutes
+ * a state object of its own when the caller does not supply one.
+ */
+size_t
+c32rtomb(char *s, char32_t c32, mbstate_t *ps)
+{
+	/* Fallback conversion state private to this function. */
+	static mbstate_t	 fallback;
+
+	return wcrtomb(s, c32, ps != NULL ? ps : &fallback);
+}
--- /dev/null
+.\" $OpenBSD: mbrtoc16.3,v 1.1 2023/08/20 15:02:51 schwarze Exp $
+.\"
+.\" Copyright 2023 Ingo Schwarze <schwarze@openbsd.org>
+.\" Copyright 2010 Stefan Sperling <stsp@openbsd.org>
+.\"
+.\" Permission to use, copy, modify, and distribute this software for any
+.\" purpose with or without fee is hereby granted, provided that the above
+.\" copyright notice and this permission notice appear in all copies.
+.\"
+.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+.\"
+.Dd $Mdocdate: August 20 2023 $
+.Dt MBRTOC16 3
+.Os
+.Sh NAME
+.Nm mbrtoc16
+.Nd convert one UTF-8 encoded character to UTF-16
+.Sh SYNOPSIS
+.In uchar.h
+.Ft size_t
+.Fo mbrtoc16
+.Fa "char16_t * restrict pc16"
+.Fa "const char * restrict s"
+.Fa "size_t n"
+.Fa "mbstate_t * restrict mbs"
+.Fc
+.Sh DESCRIPTION
+The
+.Fn mbrtoc16
+function examines at most
+.Fa n
+bytes of the multibyte character byte string pointed to by
+.Fa s ,
+converts those bytes to a wide character,
+and encodes the wide character using UTF-16.
+In some cases, it is necessary to call this function
+twice to convert a single character.
+.Pp
+Conversion happens in accordance with the conversion state
+.Pf * Fa mbs ,
+which must be initialized to zero before the application's first call to
+.Fn mbrtoc16 .
+For this function,
+.Pf * Fa mbs
+stores information about both the state of the UTF-8 input encoding
+and the state of the UTF-16 output encoding.
+If the previous call did not return
+.Po Vt size_t Pc Ns \-1 ,
+.Fa mbs
+can safely be reused without reinitialization.
+.Pp
+The input encoding that
+.Fn mbrtoc16
+uses for
+.Fa s
+is determined by the
+.Dv LC_CTYPE
+category of the current locale.
+If the locale is changed without reinitialization of
+.Pf * Fa mbs ,
+the behaviour is undefined.
+.Pp
+Unlike
+.Xr mbtowc 3 ,
+.Fn mbrtoc16
+accepts an incomplete byte sequence pointed to by
+.Fa s
+which does not form a complete character but is potentially part of
+a valid character.
+In this case, the function consumes all such bytes.
+The conversion state saved in
+.Pf * Fa mbs
+will be used to restart the suspended conversion during the next call.
+.Pp
+On systems other than
+.Ox
+that support state-dependent encodings,
+.Fa s
+may point to a special sequence of bytes called a
+.Dq shift sequence ;
+see
+.Xr mbrtowc 3
+for details.
+.Pp
+The following arguments cause special processing:
+.Bl -tag -width 012345678901
+.It Fa pc16 No == Dv NULL
+The conversion from a multibyte character to a wide character is performed
+and the conversion state may be affected, but the resulting wide character
+is discarded.
+.It Fa s No == Dv NULL
+The arguments
+.Fa pc16
+and
+.Fa n
+are ignored and starting or continuing the conversion with an empty string
+is attempted, discarding the conversion result.
+.It Fa mbs No == Dv NULL
+An internal
+.Vt mbstate_t
+object specific to the
+.Fn mbrtoc16
+function is used instead of the
+.Fa mbs
+argument.
+This internal object is automatically initialized at program startup
+and never changed by any
+.Em libc
+function except
+.Fn mbrtoc16 .
+.Pp
+If
+.Fn mbrtoc16
+is called with a
+.Dv NULL
+.Fa mbs
+argument and that call returns
+.Po Vt size_t Pc Ns \-1 ,
+the internal conversion state of
+.Fn mbrtoc16
+becomes permanently undefined and there is no way
+to reset it to any defined state.
+Consequently, after such a mishap, it is not safe to call
+.Fn mbrtoc16
+with a
+.Dv NULL
+.Fa mbs
+argument ever again until the program is terminated.
+.El
+.Sh RETURN VALUES
+.Bl -tag -width 012345678901
+.It 0
+The bytes pointed to by
+.Fa s
+form a terminating NUL character.
+If
+.Fa pc16
+is not
+.Dv NULL ,
+a NUL wide character has been stored in
+.Pf * Fa pc16 .
+.It positive
+.Fa s
+points to a valid character, and the value returned is the number of
+bytes completing the character.
+If
+.Fa pc16
+is not
+.Dv NULL ,
+the first UTF-16 code unit of the corresponding wide character
+has been stored in
+.Pf * Fa pc16 .
+If it is a UTF-16 high surrogate, the function needs to be called
+again to retrieve a second UTF-16 code unit, the low surrogate.
+On
+.Ox ,
+this happens if and only if the return value is 4,
+but this equivalence does not hold on other operating systems
+that support input encodings other than UTF-8.
+.It Po Vt size_t Pc Ns \-1
+.Fa s
+points to an illegal byte sequence which does not form a valid multibyte
+character in the current locale, or
+.Fa mbs
+points to an invalid or uninitialized object.
+.Va errno
+is set to
+.Er EILSEQ
+or
+.Er EINVAL ,
+respectively.
+The conversion state object pointed to by
+.Fa mbs
+is left in an undefined state and must be reinitialized before being
+used again.
+.It Po Vt size_t Pc Ns \-2
+.Fa s
+points to an incomplete byte sequence of length
+.Fa n
+which has been consumed and contains part of a valid multibyte character.
+The character may be completed by calling the same function again with
+.Fa s
+pointing to one or more subsequent bytes of the multibyte character and
+.Fa mbs
+pointing to the conversion state object used during conversion of the
+incomplete byte sequence.
+.It Po Vt size_t Pc Ns \-3
+The second 16-bit code unit resulting from a previous call
+has been stored into
+.Pf * Fa pc16 ,
+without consuming any additional bytes from
+.Fa s .
+.El
+.Sh ERRORS
+.Fn mbrtoc16
+causes an error in the following cases:
+.Bl -tag -width Er
+.It Bq Er EILSEQ
+.Fa s
+points to an invalid multibyte character.
+.It Bq Er EINVAL
+.Fa mbs
+points to an invalid or uninitialized
+.Vt mbstate_t
+object.
+.El
+.Sh SEE ALSO
+.Xr c16rtomb 3 ,
+.Xr mbrtowc 3 ,
+.Xr setlocale 3
+.Sh STANDARDS
+.Fn mbrtoc16
+conforms to
+.St -isoC-2011 .
+.Sh HISTORY
+.Fn mbrtoc16
+has been available since
+.Ox 7.4 .
+.Sh CAVEATS
+On operating systems other than
+.Ox
+that support input encodings other than UTF-8, inspecting the return value
+is insufficient to tell whether the function needs to be called again.
+If the return value is positive, inspecting
+.Pf * Fa pc16
+is also required to make that decision.
+Consequently, passing a
+.Dv NULL
+pointer for the
+.Fa pc16
+argument is discouraged because it can result
+in a well-defined but unknown output encoding state.
+The simplest way to recover from such an unknown state is to
+reinitialize the object pointed to by
+.Fa mbs .
+.Pp
+The C11 standard only requires the
+.Fa pc16
+argument to be encoded according to UTF-16
+if the predefined environment macro
+.Dv __STDC_UTF_16__
+is defined with a value of 1.
+On
+.Ox ,
+.In uchar.h
+provides this definition.
+Other operating systems which do not define
+.Dv __STDC_UTF_16__
+could theoretically use a different,
+implementation-defined output encoding for
+.Fa pc16
+instead of UTF-16.
+Writing portable code for an arbitrary output encoding is impossible
+because the rules when and how often the function needs to be called
+again depend on the output encoding; the rules explained above are
+specific to UTF-16.
+Using UTF-16 as the output encoding of
+.Fn mbrtoc16
+becomes mandatory in C23.
--- /dev/null
+/* $OpenBSD: mbrtoc16.c,v 1.1 2023/08/20 15:02:51 schwarze Exp $ */
+/*
+ * Copyright (c) 2022 Ingo Schwarze <schwarze@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <stdint.h>
+#include <uchar.h>
+#include <wchar.h>
+
+/*
+ * Keep this structure compatible with
+ * struct _utf8_state in the file citrus/citrus_utf8.c.
+ * In particular, only use values for the "want" field
+ * that do not collide with values used by the function
+ * _citrus_utf8_ctype_mbrtowc().
+ * mbrtoc16() overlays this structure on the caller's mbstate_t object.
+ */
+struct _utf16_state {
+ wchar_t ch; /* code point whose low surrogate is still pending */
+ int want; /* set to -3 while a low surrogate is pending */
+};
+
+/*
+ * Convert one multibyte character to UTF-16 by means of mbrtowc(3).
+ *
+ * A code point that needs a surrogate pair takes two calls:
+ * the first call consumes the input bytes and stores the high surrogate,
+ * the second call stores the low surrogate and returns (size_t)-3
+ * without looking at any input.
+ */
+size_t
+mbrtoc16(char16_t *pc16, const char *s, size_t n, mbstate_t *ps)
+{
+	static mbstate_t	 fallback;
+	struct _utf16_state	*st;
+	wchar_t			 cp;
+	size_t			 len;
+
+	/*
+	 * Use a state object private to this function rather than
+	 * the one inside mbrtowc(3), such that programs mixing calls
+	 * to both functions on different strings keep working.
+	 */
+	if (ps == NULL)
+		ps = &fallback;
+	st = (struct _utf16_state *)ps;
+
+	/*
+	 * NULL input means an empty string with the result discarded.
+	 * Arrange that before looking for a pending low surrogate,
+	 * such that the latter cannot be stored through pc16.
+	 */
+	if (s == NULL) {
+		pc16 = NULL;
+		s = "";
+		n = 1;
+	}
+
+	/*
+	 * A high surrogate was handed out by the previous call:
+	 * deliver the matching low surrogate and consume no input.
+	 */
+	if (st->want == -3) {
+		if (pc16 != NULL)
+			*pc16 = 0xdc00 + (st->ch & 0x3ff);
+		st->ch = 0;
+		st->want = 0;
+		return (size_t)-3;
+	}
+
+	/*
+	 * Let mbrtowc(3) do the decoding.  That covers continuing
+	 * an incomplete character started earlier, a NUL character,
+	 * a valid complete character, an incomplete character
+	 * to be continued later, and decoding errors.
+	 */
+	len = mbrtowc(&cp, s, n, ps);
+
+	/* Incomplete or invalid input: there is nothing to store. */
+	if (len >= (size_t)-2)
+		return len;
+
+	/* A basic multilingual plane codepoint fits into one code unit. */
+	if (cp <= UINT16_MAX) {
+		if (pc16 != NULL)
+			*pc16 = cp;
+		return len;
+	}
+
+	/*
+	 * Hand out the high surrogate now
+	 * and remember that the low surrogate is still pending.
+	 */
+	if (pc16 != NULL)
+		*pc16 = 0xd7c0 + (cp >> 10);
+	st->ch = cp;
+	st->want = -3;
+	return len;
+}
--- /dev/null
+/* $OpenBSD: mbrtoc32.c,v 1.1 2023/08/20 15:02:51 schwarze Exp $ */
+/*
+ * Written by Ingo Schwarze <schwarze@openbsd.org>
+ * and placed in the public domain on March 19, 2022.
+ */
+
+#include <uchar.h>
+#include <wchar.h>
+
+/*
+ * Convert one multibyte character to UTF-32.
+ * This is a thin wrapper around mbrtowc(3) that merely substitutes
+ * a state object of its own when the caller does not supply one,
+ * such that it never shares fallback state with mbrtowc(3).
+ */
+size_t
+mbrtoc32(char32_t *pc32, const char *s, size_t n, mbstate_t *ps)
+{
+	static mbstate_t	 mbs;
+
+	if (ps == NULL)
+		ps = &mbs;
+
+	/*
+	 * char32_t is an unsigned 32-bit type, so a pointer to it does
+	 * not convert implicitly to the wchar_t pointer that mbrtowc(3)
+	 * expects.  The cast relies on wchar_t having the same size and
+	 * representation as char32_t, just like the other <uchar.h>
+	 * functions that store full code points in wchar_t objects.
+	 */
+	return mbrtowc((wchar_t *)pc32, s, n, ps);
+}
-.\" $OpenBSD: mbrtowc.3,v 1.5 2016/02/08 09:56:16 schwarze Exp $
+.\" $OpenBSD: mbrtowc.3,v 1.6 2023/08/20 15:02:51 schwarze Exp $
.\" $NetBSD: mbrtowc.3,v 1.5 2003/09/08 17:54:31 wiz Exp $
.\"
+.\" Copyright (c)2023 Ingo Schwarze <schwarze@openbsd.org>
+.\" Copyright (c)2010 Stefan Sperling <stsp@openbsd.org>
.\" Copyright (c)2002 Citrus Project,
.\" All rights reserved.
.\"
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
-.Dd $Mdocdate: February 8 2016 $
+.Dd $Mdocdate: August 20 2023 $
.Dt MBRTOWC 3
.Os
.Sh NAME
-.Nm mbrtowc
-.Nd converts a multibyte character to a wide character (restartable)
+.Nm mbrtowc ,
+.Nm mbrtoc32
+.Nd convert a multibyte character to a wide character (restartable)
.Sh SYNOPSIS
.In wchar.h
.Ft size_t
-.Fn mbrtowc "wchar_t * restrict wc" "const char * restrict s" "size_t n" \
-"mbstate_t * restrict mbs"
+.Fo mbrtowc
+.Fa "wchar_t * restrict wc"
+.Fa "const char * restrict s"
+.Fa "size_t n"
+.Fa "mbstate_t * restrict mbs"
+.Fc
+.In uchar.h
+.Ft size_t
+.Fo mbrtoc32
+.Fa "char32_t * restrict wc"
+.Fa "const char * restrict s"
+.Fa "size_t n"
+.Fa "mbstate_t * restrict mbs"
+.Fc
.Sh DESCRIPTION
The
.Fn mbrtowc
-function examines at most
+and
+.Fn mbrtoc32
+functions examine at most
.Fa n
bytes of the multibyte character byte string pointed to by
.Fa s ,
-converts those bytes to a wide character, and stores the wide character
-in the wchar_t object pointed to by
-.Fa wc
+convert those bytes to a wide character, and store the wide character into
+.Pf * Fa wc
if
.Fa wc
is not
.Fa s
points to a valid character.
.Pp
-Conversion happens in accordance with the conversion state described
-by the mbstate_t object pointed to by
-.Fa mbs .
-The mbstate_t object must be initialized to zero before the application's
-first call to
-.Fn mbrtowc .
-If the previous call to
+Conversion happens in accordance with the conversion state
+.Pf * Fa mbs ,
+which must be initialized to zero before the application's first call to
.Fn mbrtowc
-did not return (size_t)-1, the mbstate_t object can safely be reused
-without reinitialization.
+or
+.Fn mbrtoc32 .
+If the previous call did not return
+.Po Vt size_t Pc Ns \-1 ,
+.Fa mbs
+can safely be reused without reinitialization.
.Pp
-The behaviour of
+The input encoding that
.Fn mbrtowc
-is affected by the
+and
+.Fn mbrtoc32
+use for
+.Fa s
+is determined by the
.Dv LC_CTYPE
category of the current locale.
-If the locale is changed without reinitialization of the mbstate_t object
-pointed to by
-.Fa mbs ,
-the behaviour of
-.Fn mbrtowc
-is undefined.
+If the locale is changed without reinitialization of
+.Pf * Fa mbs ,
+the behaviour is undefined.
.Pp
Unlike
.Xr mbtowc 3 ,
.Fn mbrtowc
-will accept an incomplete byte sequence pointed to by
+and
+.Fn mbrtoc32
+accept an incomplete byte sequence pointed to by
.Fa s
which does not form a complete character but is potentially part of
a valid character.
-In this case,
-.Fn mbrtowc
-consumes all such bytes.
-The conversion state saved in the mbstate_t object pointed to by
-.Fa mbs
-will be used to restart the suspended conversion during the next
-call to
-.Fn mbrtowc .
+In this case, both functions consume all such bytes.
+The conversion state saved in
+.Pf * Fa mbs
+will be used to restart the suspended conversion during the next call.
.Pp
-In state-dependent encodings,
+On systems other than
+.Ox
+that support state-dependent encodings,
.Fa s
may point to a special sequence of bytes called a
.Dq shift sequence .
JIS X 0208 (which uses two bytes per character).
Shift sequence bytes correspond to no individual wide character, so
.Fn mbrtowc
-treats them as if they were part of the subsequent multibyte character.
+and
+.Fn mbrtoc32
+treat them as if they were part of the subsequent multibyte character.
Therefore they do contribute to the number of bytes in the multibyte character.
.Pp
-Special cases in interpretation of arguments are as follows:
+The following arguments cause special processing:
.Bl -tag -width 012345678901
-.It "wc == NULL "
+.It Fa wc No == Dv NULL
The conversion from a multibyte character to a wide character is performed
and the conversion state may be affected, but the resulting wide character
is discarded.
-.Pp
This can be used to find out how many bytes are contained in the
multibyte character pointed to by
.Fa s .
-.It "s == NULL "
-.Fn mbrtowc
-ignores
+.It Fa s No == Dv NULL
+The arguments
.Fa wc
and
-.Fa n ,
-and behaves equivalent to
-.Bd -literal -offset indent
-mbrtowc(NULL, "", 1, mbs);
-.Ed
-.Pp
-which attempts to use the mbstate_t object pointed to by
-.Fa mbs
-to start or continue conversion using the empty string as input,
-and discards the conversion result.
-.Pp
+.Fa n
+are ignored and starting or continuing the conversion with an empty string
+is attempted, discarding the conversion result.
If conversion succeeds, this call always returns zero.
Unlike
.Xr mbtowc 3 ,
the value returned does not indicate whether the current encoding of
the locale is state-dependent, i.e. uses shift sequences.
-.It "mbs == NULL "
+.It Fa mbs No == Dv NULL
.Fn mbrtowc
-uses its own internal state object to keep the conversion state,
-instead of an mbstate_t object pointed to by
-.Fa mbs .
-This internal conversion state is initialized once at program startup.
-It is not safe to call
+and
+.Fn mbrtoc32
+each use their own internal state object instead of the
+.Fa mbs
+argument.
+Both internal state objects are initialized at startup time of the program,
+and no other libc function ever changes either of them.
+.Pp
+If
.Fn mbrtowc
-again with a
+or
+.Fn mbrtoc32
+is called with a
.Dv NULL
.Fa mbs
-argument if
-.Fn mbrtowc
-returned (size_t)-1 because at this point the internal conversion state
-is undefined.
-.Pp
-Calling any other functions in
-.Em libc
-never changes the internal
-conversion state object of
-.Fn mbrtowc .
+argument and that call returns
+.Po Vt size_t Pc Ns \-1 ,
+the internal conversion state of the respective function becomes
+permanently undefined and there is no way to reset it to any defined state.
+Consequently, after such a mishap, it is not safe
+to call the same function with a
+.Dv NULL
+.Fa mbs
+argument ever again until the program is terminated.
.El
.Sh RETURN VALUES
.Bl -tag -width 012345678901
the corresponding wide character has been stored in the wchar_t object
pointed to by
.Fa wc .
-.It (size_t)-1
+.It Po Vt size_t Pc Ns \-1
.Fa s
points to an illegal byte sequence which does not form a valid multibyte
-character in the current locale.
-.Fn mbrtowc
-sets
+character in the current locale, or
+.Fa mbs
+points to an invalid or uninitialized object.
.Va errno
-to EILSEQ.
+is set to
+.Er EILSEQ
+or
+.Er EINVAL ,
+respectively.
The conversion state object pointed to by
.Fa mbs
is left in an undefined state and must be reinitialized before being
.Pp
Because applications using
.Fn mbrtowc
+or
+.Fn mbrtoc32
are shielded from the specifics of the multibyte character encoding scheme,
it is impossible to repair byte sequences containing encoding errors.
Such byte sequences must be treated as invalid and potentially malicious input.
.Fa s
and either discard any wide characters already converted, or cope with
truncated input.
-.It (size_t)-2
+.It Po Vt size_t Pc Ns \-2
.Fa s
points to an incomplete byte sequence of length
.Fa n
which has been consumed and contains part of a valid multibyte character.
-The character may be completed by calling
-.Fn mbrtowc
-again with
+The character may be completed by calling the same function again with
.Fa s
pointing to one or more subsequent bytes of the multibyte character and
.Fa mbs
pointing to the conversion state object used during conversion of the
incomplete byte sequence.
+.It Po Vt size_t Pc Ns \-3
+The next character resulting from a previous call has been stored into
+.Fa wc ,
+without consuming any additional bytes from
+.Fa s .
+This never happens for
+.Fn mbrtowc ,
+and on
+.Ox ,
+it never happens for
+.Fn mbrtoc32
+either.
.El
.Sh ERRORS
-The
.Fn mbrtowc
-function may cause an error in the following cases:
+and
+.Fn mbrtoc32
+cause an error in the following cases:
.Bl -tag -width Er
.It Bq Er EILSEQ
.Fa s
points to an invalid multibyte character.
.It Bq Er EINVAL
.Fa mbs
-points to an invalid or uninitialized mbstate_t object.
+points to an invalid or uninitialized
+.Vt mbstate_t
+object.
.El
.Sh SEE ALSO
.Xr mbrlen 3 ,
.Xr mbtowc 3 ,
-.Xr setlocale 3
+.Xr setlocale 3 ,
+.Xr wcrtomb 3
.Sh STANDARDS
-The
.Fn mbrtowc
-function conforms to
-.\" .St -isoC-amd1 .
-ISO/IEC 9899/AMD1:1995
-.Pq Dq ISO C90, Amendment 1 .
-The restrict qualifier is added at
-.\" .St -isoC99 .
-ISO/IEC 9899:1999
-.Pq Dq ISO C99 .
+conforms to
+.St -isoC-amd1 .
+The restrict qualifier was added at
+.St -isoC-99 .
+.Pp
+.Fn mbrtoc32
+conforms to
+.St -isoC-2011 .
+.Sh HISTORY
+.Fn mbrtowc
+has been available since
+.Ox 3.8
+and has provided support for UTF-8 since
+.Ox 4.8 .
+.Pp
+.Fn mbrtoc32
+has been available since
+.Ox 7.4 .
.Sh CAVEATS
.Fn mbrtowc
-is not suitable for programs that care about internals of the character
+and
+.Fn mbrtoc32
+are not suitable for programs that care about internals of the character
encoding scheme used by the byte string pointed to by
.Fa s .
.Pp
-It is possible that
-.Fn mbrtowc
-fails because of locale configuration errors.
+It is possible that these functions
+fail because of locale configuration errors.
An
.Dq invalid
character sequence may simply be encoded in a different encoding than that
of the current locale.
.Pp
The special cases for
-.Fa s
-== NULL and
-.Fa mbs
-== NULL do not make any sense.
+.Fa s No == Dv NULL
+and
+.Fa mbs No == Dv NULL
+do not make any sense.
Instead of passing
.Dv NULL
for
-.\" $OpenBSD: wcrtomb.3,v 1.10 2015/03/22 18:02:11 stsp Exp $
+.\" $OpenBSD: wcrtomb.3,v 1.11 2023/08/20 15:02:51 schwarze Exp $
.\" $NetBSD: wcrtomb.3,v 1.4 2003/09/08 17:54:31 wiz Exp $
.\"
+.\" Copyright (c)2023 Ingo Schwarze <schwarze@openbsd.org>
.\" Copyright (c)2002 Citrus Project,
.\" All rights reserved.
.\"
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
-.Dd $Mdocdate: March 22 2015 $
+.Dd $Mdocdate: August 20 2023 $
.Dt WCRTOMB 3
.Os
-.\" ----------------------------------------------------------------------
.Sh NAME
-.Nm wcrtomb
-.Nd converts a wide character to a multibyte character (restartable)
-.\" ----------------------------------------------------------------------
+.Nm wcrtomb ,
+.Nm c32rtomb
+.Nd convert a wide character to a multibyte character
.Sh SYNOPSIS
.In wchar.h
.Ft size_t
-.Fn wcrtomb "const char * restrict s" "wchar_t wc" "mbstate_t * restrict ps"
-.\" ----------------------------------------------------------------------
+.Fo wcrtomb
+.Fa "char * restrict s"
+.Fa "wchar_t wc"
+.Fa "mbstate_t * restrict mbs"
+.Fc
+.In uchar.h
+.Ft size_t
+.Fo c32rtomb
+.Fa "char * restrict s"
+.Fa "char32_t wc"
+.Fa "mbstate_t * restrict mbs"
+.Fc
.Sh DESCRIPTION
.Fn wcrtomb
-converts the wide character given by
+and
+.Fn c32rtomb
+convert the wide character
.Fa wc
-to the corresponding multibyte character, and stores up to
+to the corresponding multibyte character, and store up to
.Dv MB_CUR_MAX
bytes in the array pointed to by
.Fa s
if
.Fa s
-is not a null pointer.
+is not a
+.Dv NULL
+pointer.
+The interpretation of
+.Fa wc
+is implementation-defined.
+On
+.Ox ,
+.Vt wchar_t
+and
+.Vt char32_t
+are of the same width and both are always interpreted as Unicode codepoints.
.Pp
-The behaviour of
+The output encoding that
.Fn wcrtomb
-is affected by the
+and
+.Fn c32rtomb
+use in
+.Fa s
+is determined by the
.Dv LC_CTYPE
category of the current locale.
+.Ox
+only supports UTF-8 and ASCII output,
+and these functions are only useful for UTF-8.
.Pp
-These are the special cases:
+The following arguments cause special processing:
.Bl -tag -width 012345678901
-.It "wc == 0"
-For state-dependent encodings,
-.Fn wcrtomb
-stores a null byte preceded by a special byte sequence (if any)
-to return to an initial state to the array pointed by
-.Fa s ,
-and the state object pointed by
-.Fa ps
-also returned to an initial state.
-.It "s == NULL"
-.Fn wcrtomb
-just places
-.Fa ps
-into an initial state.
-It is equivalent to the following call:
-.Bd -literal -offset indent
-wcrtomb(buf, L'\e0', ps);
-.Ed
-.Pp
-Here,
-.Fa buf
-is a dummy buffer.
-In this case,
-.Fa wc
-is ignored.
-.It "ps == NULL"
+.It Fa wc No == 0
+A NUL byte is stored to
+.Pf * Fa s
+and the state object pointed to by
+.Fa mbs
+is reset to the initial state.
+On operating systems other than
+.Ox
+that support state-dependent multibyte encodings, a special byte sequence
+.Pq Dq shift sequence
+is written before the NUL byte to return to the initial state
+if that is required by the output encoding
+and by the current output encoding state.
+.It Fa mbs No == Dv NULL
-.Fn mbrtowc
+.Fn wcrtomb
-uses its own internal state object to keep the conversion state,
-instead of
-.Fa ps
-mentioned in this manual page.
-.Pp
-Calling any other functions in
+and
+.Fn c32rtomb
+each use their own internal state object instead of the
+.Fa mbs
+argument.
+Both internal state objects are initialized at startup time of the program,
+and no other
.Em libc
-never change the internal
-state of
-.Fn mbrtowc ,
-which is initialized at startup time of the program.
+function ever changes either of them.
+.It Fa s No == Dv NULL
+The object pointed to by
+.Fa mbs ,
+or the internal object if
+.Fa mbs
+is a
+.Dv NULL
+pointer, is reset to the initial state,
+.Fa wc
+is ignored, and 1 is returned.
.El
-.\" ----------------------------------------------------------------------
.Sh RETURN VALUES
.Fn wcrtomb
-returns the number of bytes (including any shift sequences)
+and
+.Fn c32rtomb
+return the number of bytes (including any shift sequences)
which are stored in the array pointed to by
-.Fa s .
+.Fa s ,
+or 1 if
+.Fa s
+is
+.Dv NULL .
If
.Fa wc
-is not a valid wide character,
-.Fn wcrtomb
-returns (size_t)-1
-and sets
+is not a valid wide character
+or if it cannot be represented in the multibyte encoding selected with
+.Dv LC_CTYPE ,
+both functions return
+.Po Vt size_t Pc Ns \-1
+and set
.Va errno
-to indicate error.
-.\" ----------------------------------------------------------------------
+to indicate the error.
.Sh ERRORS
.Fn wcrtomb
-may cause an error in the following cases:
+and
+.Fn c32rtomb
+cause an error in the following cases:
.Bl -tag -width Er
.It Bq Er EILSEQ
.Fa wc
-is not a valid wide character.
+is not a valid wide character or cannot be represented in the
+multibyte encoding selected with
+.Dv LC_CTYPE .
.It Bq Er EINVAL
-.Fa ps
-points to an invalid or uninitialized mbstate_t object.
+.Fa mbs
+points to an invalid or uninitialized
+.Vt mbstate_t
+object.
.El
-.\" ----------------------------------------------------------------------
.Sh SEE ALSO
+.Xr mbrtowc 3 ,
.Xr setlocale 3 ,
.Xr wctomb 3
-.\" ----------------------------------------------------------------------
.Sh STANDARDS
-The
.Fn wcrtomb
-function conforms to
-.\" .St -isoC-amd1 .
-ISO/IEC 9899/AMD1:1995
-.Pq Dq ISO C90, Amendment 1 .
-The restrict qualifier is added at
-.\" .St -isoC99 .
-ISO/IEC 9899/1999
-.Pq Dq ISO C99 .
+conforms to
+.St -isoC-amd1 .
+The restrict qualifier was added at
+.St -isoC-99 .
+.Pp
+.Fn c32rtomb
+conforms to
+.St -isoC-2011 .
+.Sh HISTORY
+.Fn wcrtomb
+has been available since
+.Ox 3.8
+and has provided support for UTF-8 since
+.Ox 4.8 .
+.Pp
+.Fn c32rtomb
+has been available since
+.Ox 7.4 .
major=97
-minor=0
+minor=1
# note: If changes were made to include/thread_private.h or if system calls
# were added/changed then librthread/shlib_version must also be updated.