--- /dev/null
+/*
+ * Copyright (c) 2014 Philip Guenther <guenther@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * A mostly exhaustive test of the UTF-8 decoder (UTF8_getc()) and
+ * encoder (UTF8_putc())
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <err.h>
+
+#include <openssl/asn1.h>
+
+#define UNCHANGED 0xfedcba98 /* preset into value so unwanted writes can be detected */
+
+#define ASSERT(x) /* if x is false, report line and expression via errx(3), status 1 */ \
+ do { \
+ if (!(x)) \
+ errx(1, "test failed at line %d: %s", \
+ __LINE__, #x); \
+ } while (0) /* no trailing ';' here: the caller supplies it */
+
+int
+main(void)
+{
+ unsigned char testbuf[] = "012345"; /* 7-byte scratch buffer (6 chars + NUL) */
+ const unsigned char zerobuf[sizeof testbuf] = { 0 }; /* all-zero reference for memcmp() */
+ unsigned long value; /* output slot handed to UTF8_getc() */
+ int i, j, k, l, ret; /* i..l: candidate bytes / code points; ret: last return value */
+
+ /*
+ * First, verify UTF8_getc()
+ *
+ * A zero-length input is expected to return 0 and leave value
+ * untouched.
+ */
+ value = UNCHANGED;
+ ret = UTF8_getc(testbuf, 0, &value);
+ ASSERT(ret == 0);
+ ASSERT(value == UNCHANGED);
+
+ /*
+ * check all valid single-byte chars, both with an exact length of 1
+ * and with a longer length of 2; each is expected to return 1 and
+ * decode to itself
+ */
+ for (i = 0; i < 0x80; i++) {
+ testbuf[0] = i;
+ ret = UTF8_getc(testbuf, 1, &value);
+ ASSERT(ret == 1);
+ ASSERT(value == i);
+
+ ret = UTF8_getc(testbuf, 2, &value);
+ ASSERT(ret == 1);
+ ASSERT(value == i);
+ }
+
+ /*
+ * Verify failure on all invalid initial bytes:
+ * 0x80 - 0xBF following bytes only
+ * 0xC0 - 0xC1 used to be in non-shortest forms
+ * 0xF5 - 0xFD used to be initial for 5 and 6 byte sequences
+ * 0xFE - 0xFF have never been valid in utf-8
+ *
+ * Each is expected to return -2 and leave value untouched.
+ */
+ for (i = 0x80; i < 0xC2; i++) {
+ value = UNCHANGED;
+ testbuf[0] = i;
+ ret = UTF8_getc(testbuf, 1, &value);
+ ASSERT(ret == -2);
+ ASSERT(value == UNCHANGED);
+ }
+ for (i = 0xF5; i < 0x100; i++) {
+ value = UNCHANGED;
+ testbuf[0] = i;
+ ret = UTF8_getc(testbuf, 1, &value);
+ ASSERT(ret == -2);
+ ASSERT(value == UNCHANGED);
+ }
+
+ /*
+ * Verify handling of all two-byte sequences
+ *
+ * With a length of 1 every call is expected to return -1 without
+ * touching value; with a length of 2 the expected result depends on
+ * the second byte.
+ */
+ for (i = 0xC2; i < 0xE0; i++) {
+ testbuf[0] = i;
+
+ for (j = 0; j < 0x100; j++) {
+ testbuf[1] = j;
+
+ value = UNCHANGED;
+ ret = UTF8_getc(testbuf, 1, &value);
+ ASSERT(ret == -1);
+ ASSERT(value == UNCHANGED);
+
+ ret = UTF8_getc(testbuf, 2, &value); /* value is still UNCHANGED going in */
+
+ /* outside range of trailing bytes */
+ if (j < 0x80 || j > 0xBF) {
+ ASSERT(ret == -3);
+ ASSERT(value == UNCHANGED);
+ continue;
+ }
+
+ /* valid */
+ ASSERT(ret == 2);
+ ASSERT((value & 0x3F) == (j & 0x3F)); /* low 6 bits come from the trailing byte */
+ ASSERT(value >> 6 == (i & 0x1F)); /* the rest comes from the lead byte's low 5 bits */
+ }
+ }
+
+#if 0 /* NOTE(review): 3- and 4-byte decoder checks are compiled out; k and l are used only in here */
+ /*
+ * Verify handling of all three-byte sequences
+ */
+ for (i = 0xE0; i < 0xF0; i++) {
+ testbuf[0] = i;
+
+ for (j = 0; j < 0x100; j++) {
+ testbuf[1] = j;
+
+ for (k = 0; k < 0x100; k++) {
+ testbuf[2] = k;
+
+ value = UNCHANGED;
+ ret = UTF8_getc(testbuf, 2, &value);
+ ASSERT(ret == -1);
+ ASSERT(value == UNCHANGED);
+
+ ret = UTF8_getc(testbuf, 3, &value);
+
+ /* outside range of trailing bytes */
+ if (j < 0x80 || j > 0xBF ||
+ k < 0x80 || k > 0xBF) {
+ ASSERT(ret == -3);
+ ASSERT(value == UNCHANGED);
+ continue;
+ }
+
+ /* non-shortest form */
+ if (i == 0xE0 && j < 0xA0) {
+ ASSERT(ret == -4);
+ ASSERT(value == UNCHANGED);
+ continue;
+ }
+
+ /* surrogate pair code point */
+ if (i == 0xED && j > 0x9F) {
+ ASSERT(ret == -2);
+ ASSERT(value == UNCHANGED);
+ continue;
+ }
+
+ ASSERT(ret == 3);
+ ASSERT((value & 0x3F) == (k & 0x3F));
+ ASSERT(((value >> 6) & 0x3F) == (j & 0x3F));
+ ASSERT(value >> 12 == (i & 0x0F));
+ }
+ }
+ }
+
+ /*
+ * Verify handling of all four-byte sequences
+ */
+ for (i = 0xF0; i < 0xF5; i++) {
+ testbuf[0] = i;
+
+ for (j = 0; j < 0x100; j++) {
+ testbuf[1] = j;
+
+ for (k = 0; k < 0x100; k++) {
+ testbuf[2] = k;
+
+ for (l = 0; l < 0x100; l++) {
+ testbuf[3] = l;
+
+ value = UNCHANGED;
+ ret = UTF8_getc(testbuf, 3, &value);
+ ASSERT(ret == -1);
+ ASSERT(value == UNCHANGED);
+
+ ret = UTF8_getc(testbuf, 4, &value);
+
+ /* outside range of trailing bytes */
+ if (j < 0x80 || j > 0xBF ||
+ k < 0x80 || k > 0xBF ||
+ l < 0x80 || l > 0xBF) {
+ ASSERT(ret == -3);
+ ASSERT(value == UNCHANGED);
+ continue;
+ }
+
+ /* non-shortest form */
+ if (i == 0xF0 && j < 0x90) {
+ ASSERT(ret == -4);
+ ASSERT(value == UNCHANGED);
+ continue;
+ }
+
+ /* beyond end of UCS range */
+ if (i == 0xF4 && j > 0x8F) {
+ ASSERT(ret == -2);
+ ASSERT(value == UNCHANGED);
+ continue;
+ }
+
+ ASSERT(ret == 4);
+ ASSERT((value & 0x3F) == (l & 0x3F));
+ ASSERT(((value >> 6) & 0x3F) ==
+ (k & 0x3F));
+ ASSERT(((value >> 12) & 0x3F) ==
+ (j & 0x3F));
+ ASSERT(value >> 18 == (i & 0x07));
+ }
+ }
+ }
+ }
+#endif /* 0 */
+
+
+ /*
+ * Next, verify UTF8_putc()
+ *
+ * For every code point the test expects three things: a NULL buffer
+ * with length 0 returns the encoded length; a buffer one byte too
+ * short returns -1 and is left untouched; a buffer of exactly the
+ * encoded length returns that length and writes nothing past it.
+ */
+ memset(testbuf, 0, sizeof testbuf); /* start from all zeros so stray writes show up against zerobuf */
+
+ /* single-byte sequences */
+ for (i = 0; i < 0x80; i++) {
+ ret = UTF8_putc(NULL, 0, i);
+ ASSERT(ret == 1);
+
+ testbuf[0] = 0; /* clear the byte written by the previous iteration */
+ ret = UTF8_putc(testbuf, 0, i);
+ ASSERT(ret == -1);
+ ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
+
+ ret = UTF8_putc(testbuf, 1, i);
+ ASSERT(ret == 1);
+ ASSERT(testbuf[0] == i);
+ ASSERT(memcmp(testbuf+1, zerobuf, sizeof(testbuf)-1) == 0);
+ }
+
+ /* two-byte sequences: encode, then decode again and compare */
+ for (i = 0x80; i < 0x800; i++) {
+ ret = UTF8_putc(NULL, 0, i);
+ ASSERT(ret == 2);
+
+ testbuf[0] = testbuf[1] = 0;
+ ret = UTF8_putc(testbuf, 1, i);
+ ASSERT(ret == -1);
+ ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
+
+ ret = UTF8_putc(testbuf, 2, i);
+ ASSERT(ret == 2);
+ ASSERT(memcmp(testbuf+2, zerobuf, sizeof(testbuf)-2) == 0);
+ ret = UTF8_getc(testbuf, 2, &value);
+ ASSERT(ret == 2);
+ ASSERT(value == i);
+ }
+
+ /* three-byte sequences: encode, then decode again and compare */
+ for (i = 0x800; i < 0x10000; i++) {
+ /* XXX skip surrogate pair code points */
+ if (i >= 0xD800 && i < 0xE000)
+ continue;
+
+ ret = UTF8_putc(NULL, 0, i);
+ ASSERT(ret == 3);
+
+ testbuf[0] = testbuf[1] = testbuf[2] = 0;
+ ret = UTF8_putc(testbuf, 2, i);
+ ASSERT(ret == -1);
+ ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
+
+ ret = UTF8_putc(testbuf, 3, i);
+ ASSERT(ret == 3);
+ ASSERT(memcmp(testbuf+3, zerobuf, sizeof(testbuf)-3) == 0);
+ ret = UTF8_getc(testbuf, 3, &value);
+ ASSERT(ret == 3);
+ ASSERT(value == i);
+ }
+
+ /* four-byte sequences: encode, then decode again and compare */
+ for (i = 0x10000; i < 0x110000; i++) {
+ ret = UTF8_putc(NULL, 0, i);
+ ASSERT(ret == 4);
+
+ testbuf[0] = testbuf[1] = testbuf[2] = testbuf[3] = 0;
+ ret = UTF8_putc(testbuf, 3, i);
+ ASSERT(ret == -1);
+ ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
+
+ ret = UTF8_putc(testbuf, 4, i);
+ ASSERT(ret == 4);
+ ASSERT(memcmp(testbuf+4, zerobuf, sizeof(testbuf)-4) == 0);
+ ret = UTF8_getc(testbuf, 4, &value);
+ ASSERT(ret == 4);
+ ASSERT(value == i);
+ }
+
+ /* XXX What should UTF8_putc() do with values > 0x10FFFF? */
+
+ return 0;
+}