Replace old'n'wrong UTF-8 logic with a better one borrowed from Citrus;
authormiod <miod@openbsd.org>
Mon, 6 Mar 2023 17:14:44 +0000 (17:14 +0000)
committermiod <miod@openbsd.org>
Mon, 6 Mar 2023 17:14:44 +0000 (17:14 +0000)
issue reported by Crystal Kolipe on tech@

sys/dev/wscons/wsemul_subr.c
sys/dev/wscons/wsemul_sun.c
sys/dev/wscons/wsemul_vt100.c
sys/dev/wscons/wsemul_vt100var.h
sys/dev/wscons/wsemulvar.h

index 50d7b7a..95266a6 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: wsemul_subr.c,v 1.1 2013/10/18 22:06:41 miod Exp $    */
+/*     $OpenBSD: wsemul_subr.c,v 1.2 2023/03/06 17:14:44 miod Exp $    */
 
 /*
  * Copyright (c) 2007, 2013 Miodrag Vallat.
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
+/*
+ * Part of the UTF-8 state machine logic borrowed from citrus_utf8.c
+ * under the following licence:
+ */
+/*-
+ * Copyright (c) 2002-2004 Tim J. Robbins
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
@@ -38,29 +68,25 @@ int
 wsemul_getchar(const u_char **inbuf, u_int *inlen,
     struct wsemul_inputstate *state, int allow_utf8)
 {
-#ifndef HAVE_UTF8_SUPPORT
        u_int len = *inlen;
        const u_char *buf = *inbuf;
+#ifdef HAVE_UTF8_SUPPORT
+       int rc;
+       u_int32_t tmpchar, lbound;
+       u_int mbleft;
+#endif
 
        if (len == 0)
-               return (EAGAIN);
+               return EAGAIN;
 
+#ifndef HAVE_UTF8_SUPPORT
        state->inchar = *buf++;
        state->mbleft = 0;
        len--;
        *inlen = len;
        *inbuf = buf;
-       return (0);
+       return 0;
 #else
-       u_int len = *inlen;
-       const u_char *buf = *inbuf;
-       int rc = EAGAIN;
-       u_int32_t tmpchar;
-       u_int mbleft;
-
-       if (len == 0)
-               return (rc);
-
        /*
         * If we do not allow multibyte sequences, process as quickly
         * as possible.
@@ -71,10 +97,12 @@ wsemul_getchar(const u_char **inbuf, u_int *inlen,
                len--;
                *inlen = len;
                *inbuf = buf;
-               return (0);
+               return 0;
        }
 
+       rc = EAGAIN;
        tmpchar = state->inchar;
+       lbound = state->lbound;
        mbleft = state->mbleft;
 
        while (len != 0) {
@@ -87,19 +115,22 @@ wsemul_getchar(const u_char **inbuf, u_int *inlen,
                 */
 
                if (mbleft != 0) {
-                       if ((frag & 0xc0) != 0x80) {
-                               /* Abort the sequence and continue */
-                               mbleft = 0;
-                               tmpchar = 0;
-                               rc = EILSEQ;
-                       } else {
-                               tmpchar = (tmpchar << 6) | (frag & 0x3f);
-                               mbleft--;
-                               if (mbleft == 0) {
-                                       rc = 0;
-                                       break;
-                               }
+                       if ((frag & 0xc0) != 0x80)
+                               goto invalid;
+
+                       tmpchar = (tmpchar << 6) | (frag & 0x3f);
+                       mbleft--;
+                       if (mbleft == 0) {
+                               if (tmpchar < lbound)
+                                       goto invalid;
+                               if (tmpchar >= 0xd800 && tmpchar < 0xe000)
+                                       goto invalid;
+                               if (tmpchar >= 0x110000)
+                                       goto invalid;
+                               rc = 0;
+                               break;
                        }
+                       continue;
                }
 
                /*
@@ -113,41 +144,38 @@ wsemul_getchar(const u_char **inbuf, u_int *inlen,
                        break;
                }
 
-               if (frag == 0xfe || frag == 0xff || (frag & 0x40) == 0) {
-                       /* Abort the sequence and continue */
-                       mbleft = 0;
-                       tmpchar = 0;
-                       rc = EILSEQ;
-               } else {
-                       frag &= ~(0x80 | 0x40);
+               if ((frag & 0xe0) == 0xc0) {
+                       frag &= 0x1f;
                        mbleft = 1;
+                       lbound = 0x80;
+               } else if ((frag & 0xf0) == 0xe0) {
+                       frag &= 0x0f;
+                       mbleft = 2;
+                       lbound = 0x800;
+               } else if ((frag & 0xf8) == 0xf0) {
+                       frag &= 0x07;
+                       mbleft = 3;
+                       lbound = 0x10000;
+               } else {
+                       goto invalid;
+               }
 
-                       if (frag & 0x20) {
-                               frag &= ~0x20;
-                               mbleft++;
-                       }
-                       if (frag & 0x10) {
-                               frag &= ~0x10;
-                               mbleft++;
-                       }
-                       if (frag & 0x08) {
-                               frag &= ~0x08;
-                               mbleft++;
-                       }
-                       if (frag & 0x04) {
-                               frag &= ~0x04;
-                               mbleft++;
-                       }
+               tmpchar = frag;
+               state->lbound = lbound;
+               continue;
 
-                       tmpchar = frag;
-               }
+invalid:
+               /* Abort the ill-formed sequence and continue */
+               mbleft = 0;
+               tmpchar = 0;
+               rc = EILSEQ;
        }
 
        state->inchar = tmpchar;
        state->mbleft = mbleft;
        *inlen = len;
        *inbuf = buf;
-       return (rc);
+       return rc;
 #endif
 }
 
@@ -659,7 +687,7 @@ wsemul_local_translate(u_int32_t unisym, kbd_t layout, u_char *out)
 
 /*
  * Keysym to UTF-8 sequence translation function.
- * The out buffer is at least 6 characters long.
+ * The out buffer is at least 4 characters long.
  */
 int
 wsemul_utf8_translate(u_int32_t unisym, kbd_t layout, u_char *out,
@@ -671,28 +699,27 @@ wsemul_utf8_translate(u_int32_t unisym, kbd_t layout, u_char *out,
        u_int pos, length, headpat;
 
        if (!allow_utf8)
-               return (wsemul_local_translate(unisym, layout, out));
-
-       if (unisym >= 0x80000000) {
-               return (0);
-       } else if (unisym > 0x04000000) {
-               headpat = 0xfc;
-               length = 6;
-       } else if (unisym > 0x00200000) {
-               headpat = 0xf8;
-               length = 5;
-       } else if (unisym > 0x00010000) {
-               headpat = 0xf0;
-               length = 4;
-       } else if (unisym > 0x00000800) {
-               headpat = 0xe0;
-               length = 3;
-       } else if (unisym > 0x00000080) {
+               return wsemul_local_translate(unisym, layout, out);
+
+       if (unisym < 0x80) {
+               /* Fast path for plain ASCII characters. */
+               *out = (u_char)unisym;
+               return 1;
+       }
+
+       if (unisym < 0x800) {
                headpat = 0xc0;
                length = 2;
+       } else if (unisym < 0x10000) {
+               if (unisym >= 0xd800 && unisym < 0xe000)
+                       return 0;
+               headpat = 0xe0;
+               length = 3;
        } else {
-               headpat = 0x00;
-               length = 1;
+               if (unisym >= 0x110000)
+                       return 0;
+               headpat = 0xf0;
+               length = 4;
        }
 
        for (pos = length - 1; pos > 0; pos--) {
@@ -701,6 +728,6 @@ wsemul_utf8_translate(u_int32_t unisym, kbd_t layout, u_char *out,
        }
        out[0] = headpat | unisym;
 
-       return (length);
+       return length;
 #endif
 }
index f75986b..c866b11 100644 (file)
@@ -1,4 +1,4 @@
-/* $OpenBSD: wsemul_sun.c,v 1.34 2020/05/25 09:55:49 jsg Exp $ */
+/* $OpenBSD: wsemul_sun.c,v 1.35 2023/03/06 17:14:44 miod Exp $ */
 /* $NetBSD: wsemul_sun.c,v 1.11 2000/01/05 11:19:36 drochner Exp $ */
 
 /*
@@ -118,7 +118,7 @@ struct wsemul_sun_emuldata {
        struct wsemul_inputstate kstate;        /* kernel input state */
 
 #ifdef HAVE_UTF8_SUPPORT
-       u_char translatebuf[6];
+       u_char translatebuf[4];
 #else
        u_char translatebuf[1];
 #endif
@@ -185,8 +185,10 @@ wsemul_sun_reset(struct wsemul_sun_emuldata *edp)
        edp->bgcol = WSCOL_WHITE;
        edp->scrolldist = 1;
        edp->instate.inchar = 0;
+       edp->instate.lbound = 0;
        edp->instate.mbleft = 0;
        edp->kstate.inchar = 0;
+       edp->kstate.lbound = 0;
        edp->kstate.mbleft = 0;
 }
 
index 7796bab..b58f223 100644 (file)
@@ -1,4 +1,4 @@
-/* $OpenBSD: wsemul_vt100.c,v 1.43 2023/02/26 15:09:53 miod Exp $ */
+/* $OpenBSD: wsemul_vt100.c,v 1.44 2023/03/06 17:14:44 miod Exp $ */
 /* $NetBSD: wsemul_vt100.c,v 1.13 2000/04/28 21:56:16 mycroft Exp $ */
 
 /*
@@ -309,6 +309,12 @@ wsemul_vt100_reset(struct wsemul_vt100_emuldata *edp)
        edp->chartab0 = 0;
        edp->chartab1 = 2;
        edp->sschartab = 0;
+       edp->instate.inchar = 0;
+       edp->instate.lbound = 0;
+       edp->instate.mbleft = 0;
+       edp->kstate.inchar = 0;
+       edp->kstate.lbound = 0;
+       edp->kstate.mbleft = 0;
 }
 
 /*
index 6d52490..4b7c7a6 100644 (file)
@@ -1,4 +1,4 @@
-/* $OpenBSD: wsemul_vt100var.h,v 1.12 2023/01/12 20:39:37 nicm Exp $ */
+/* $OpenBSD: wsemul_vt100var.h,v 1.13 2023/03/06 17:14:44 miod Exp $ */
 /* $NetBSD: wsemul_vt100var.h,v 1.5 2000/04/28 21:56:17 mycroft Exp $ */
 
 /*
@@ -95,7 +95,7 @@ struct wsemul_vt100_emuldata {
        struct wsemul_inputstate kstate;        /* kernel input state */
 
 #ifdef HAVE_UTF8_SUPPORT
-       u_char translatebuf[6];
+       u_char translatebuf[4];
 #else
        u_char translatebuf[1];
 #endif
index 71dd307..a0a88e7 100644 (file)
@@ -1,4 +1,4 @@
-/* $OpenBSD: wsemulvar.h,v 1.18 2020/09/13 10:05:46 fcambus Exp $ */
+/* $OpenBSD: wsemulvar.h,v 1.19 2023/03/06 17:14:44 miod Exp $ */
 /* $NetBSD: wsemulvar.h,v 1.6 1999/01/17 15:46:15 drochner Exp $ */
 
 /*
@@ -78,7 +78,8 @@ struct wsemul_ops {
  * decoding.
  */
 struct wsemul_inputstate {
-       u_int32_t       inchar; /* character being reconstructed */
+       uint32_t        inchar; /* character being reconstructed */
+       uint32_t        lbound; /* lower bound of above */
        u_int           mbleft; /* multibyte bytes left until char complete */
 };