From 391879418e04bd99b3ad19ccf3bc324e853ee274 Mon Sep 17 00:00:00 2001
From: guenther
Date: Sun, 18 May 2014 22:04:14 +0000
Subject: [PATCH] Add regression test for UTF8_{getc,putc}()

---
Reviewer notes (this section is commentary, not part of the change):

 * utf8test.c calls UTF8_getc() and UTF8_putc() directly.  Every check goes
   through ASSERT(), which calls errx(1, ...) with the line number and the
   failed expression; reaching "return 0" means every check passed.
 * UNCHANGED is a sentinel stored in "value" before calls that are expected
   to fail, so the test can assert that the output was not written.
 * The three- and four-byte UTF8_getc() loops are compiled out with "#if 0".
   The patch does not say why -- TODO confirm before enabling them.
 * The header names on the #include lines and the target of the Makefile's
   .include line were missing from the copy under review.  They are restored
   from the identifiers in use: errx() -> <err.h>, memset()/memcmp() ->
   <string.h>, UTF8_getc()/UTF8_putc() -> <openssl/asn1.h>.  <stdio.h> for
   the remaining slot and <bsd.regress.mk> for the Makefile are presumed --
   confirm both against the repository.

 regress/lib/libcrypto/Makefile        |   5 +-
 regress/lib/libcrypto/utf8/Makefile   |   7 +
 regress/lib/libcrypto/utf8/utf8test.c | 307 ++++++++++++++++++++++++++
 3 files changed, 317 insertions(+), 2 deletions(-)
 create mode 100644 regress/lib/libcrypto/utf8/Makefile
 create mode 100644 regress/lib/libcrypto/utf8/utf8test.c

diff --git a/regress/lib/libcrypto/Makefile b/regress/lib/libcrypto/Makefile
index 6cf7191cd70..54fcae7c91a 100644
--- a/regress/lib/libcrypto/Makefile
+++ b/regress/lib/libcrypto/Makefile
@@ -1,4 +1,4 @@
-#	$OpenBSD: Makefile,v 1.9 2014/05/14 14:46:35 jsing Exp $
+#	$OpenBSD: Makefile,v 1.10 2014/05/18 22:04:14 guenther Exp $
 
 SUBDIR= \
 	aeswrap \
@@ -31,7 +31,8 @@ SUBDIR= \
 	rmd \
 	sha \
 	sha1 \
-	sha2
+	sha2 \
+	utf8
 
 install:
 
diff --git a/regress/lib/libcrypto/utf8/Makefile b/regress/lib/libcrypto/utf8/Makefile
new file mode 100644
index 00000000000..4940e600503
--- /dev/null
+++ b/regress/lib/libcrypto/utf8/Makefile
@@ -0,0 +1,7 @@
+#	$OpenBSD: Makefile,v 1.1 2014/05/18 22:04:14 guenther Exp $
+
+PROG=	utf8test
+LDADD=	-lcrypto
+DPADD=	${LIBCRYPTO}
+
+.include <bsd.regress.mk>
diff --git a/regress/lib/libcrypto/utf8/utf8test.c b/regress/lib/libcrypto/utf8/utf8test.c
new file mode 100644
index 00000000000..5b737a52010
--- /dev/null
+++ b/regress/lib/libcrypto/utf8/utf8test.c
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2014 Philip Guenther
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * A mostly exhaustive test of UTF-8 decoder and encoder
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <err.h>
+
+#include <openssl/asn1.h>
+
+#define	UNCHANGED	0xfedcba98
+
+#define ASSERT(x)						\
+	do {							\
+		if (!(x))					\
+			errx(1, "test failed at line %d: %s",	\
+			    __LINE__, #x);			\
+	} while (0)
+
+int
+main(void)
+{
+	unsigned char testbuf[] = "012345";
+	const unsigned char zerobuf[sizeof testbuf] = { 0 };
+	unsigned long value;
+	int i, j, k, l, ret;
+
+	/*
+	 * First, verify UTF8_getc()
+	 */
+	value = UNCHANGED;
+	ret = UTF8_getc(testbuf, 0, &value);
+	ASSERT(ret == 0);
+	ASSERT(value == UNCHANGED);
+
+	/* check all valid single-byte chars */
+	for (i = 0; i < 0x80; i++) {
+		testbuf[0] = i;
+		ret = UTF8_getc(testbuf, 1, &value);
+		ASSERT(ret == 1);
+		ASSERT(value == i);
+
+		ret = UTF8_getc(testbuf, 2, &value);
+		ASSERT(ret == 1);
+		ASSERT(value == i);
+	}
+
+	/*
+	 * Verify failure on all invalid initial bytes:
+	 *	0x80 - 0xBF	following bytes only
+	 *	0xC0 - 0xC1	used to be in non-shortest forms
+	 *	0xF5 - 0xFD	used to be initial for 5 and 6 byte sequences
+	 *	0xFE - 0xFF	have never been valid in utf-8
+	 */
+	for (i = 0x80; i < 0xC2; i++) {
+		value = UNCHANGED;
+		testbuf[0] = i;
+		ret = UTF8_getc(testbuf, 1, &value);
+		ASSERT(ret == -2);
+		ASSERT(value == UNCHANGED);
+	}
+	for (i = 0xF5; i < 0x100; i++) {
+		value = UNCHANGED;
+		testbuf[0] = i;
+		ret = UTF8_getc(testbuf, 1, &value);
+		ASSERT(ret == -2);
+		ASSERT(value == UNCHANGED);
+	}
+
+	/*
+	 * Verify handling of all two-byte sequences
+	 */
+	for (i = 0xC2; i < 0xE0; i++) {
+		testbuf[0] = i;
+
+		for (j = 0; j < 0x100; j++) {
+			testbuf[1] = j;
+
+			value = UNCHANGED;
+			ret = UTF8_getc(testbuf, 1, &value);
+			ASSERT(ret == -1);
+			ASSERT(value == UNCHANGED);
+
+			ret = UTF8_getc(testbuf, 2, &value);
+
+			/* outside range of trailing bytes */
+			if (j < 0x80 || j > 0xBF) {
+				ASSERT(ret == -3);
+				ASSERT(value == UNCHANGED);
+				continue;
+			}
+
+			/* valid */
+			ASSERT(ret == 2);
+			ASSERT((value & 0x3F) == (j & 0x3F));
+			ASSERT(value >> 6 == (i & 0x1F));
+		}
+	}
+
+#if 0
+	/*
+	 * Verify handling of all three-byte sequences
+	 */
+	for (i = 0xE0; i < 0xF0; i++) {
+		testbuf[0] = i;
+
+		for (j = 0; j < 0x100; j++) {
+			testbuf[1] = j;
+
+			for (k = 0; k < 0x100; k++) {
+				testbuf[2] = k;
+
+				value = UNCHANGED;
+				ret = UTF8_getc(testbuf, 2, &value);
+				ASSERT(ret == -1);
+				ASSERT(value == UNCHANGED);
+
+				ret = UTF8_getc(testbuf, 3, &value);
+
+				/* outside range of trailing bytes */
+				if (j < 0x80 || j > 0xBF ||
+				    k < 0x80 || k > 0xBF) {
+					ASSERT(ret == -3);
+					ASSERT(value == UNCHANGED);
+					continue;
+				}
+
+				/* non-shortest form */
+				if (i == 0xE0 && j < 0xA0) {
+					ASSERT(ret == -4);
+					ASSERT(value == UNCHANGED);
+					continue;
+				}
+
+				/* surrogate pair code point */
+				if (i == 0xED && j > 0x9F) {
+					ASSERT(ret == -2);
+					ASSERT(value == UNCHANGED);
+					continue;
+				}
+
+				ASSERT(ret == 3);
+				ASSERT((value & 0x3F) == (k & 0x3F));
+				ASSERT(((value >> 6) & 0x3F) == (j & 0x3F));
+				ASSERT(value >> 12 == (i & 0x0F));
+			}
+		}
+	}
+
+	/*
+	 * Verify handling of all four-byte sequences
+	 */
+	for (i = 0xF0; i < 0xF5; i++) {
+		testbuf[0] = i;
+
+		for (j = 0; j < 0x100; j++) {
+			testbuf[1] = j;
+
+			for (k = 0; k < 0x100; k++) {
+				testbuf[2] = k;
+
+				for (l = 0; l < 0x100; l++) {
+					testbuf[3] = l;
+
+					value = UNCHANGED;
+					ret = UTF8_getc(testbuf, 3, &value);
+					ASSERT(ret == -1);
+					ASSERT(value == UNCHANGED);
+
+					ret = UTF8_getc(testbuf, 4, &value);
+
+					/* outside range of trailing bytes */
+					if (j < 0x80 || j > 0xBF ||
+					    k < 0x80 || k > 0xBF ||
+					    l < 0x80 || l > 0xBF) {
+						ASSERT(ret == -3);
+						ASSERT(value == UNCHANGED);
+						continue;
+					}
+
+					/* non-shortest form */
+					if (i == 0xF0 && j < 0x90) {
+						ASSERT(ret == -4);
+						ASSERT(value == UNCHANGED);
+						continue;
+					}
+
+					/* beyond end of UCS range */
+					if (i == 0xF4 && j > 0x8F) {
+						ASSERT(ret == -2);
+						ASSERT(value == UNCHANGED);
+						continue;
+					}
+
+					ASSERT(ret == 4);
+					ASSERT((value & 0x3F) == (l & 0x3F));
+					ASSERT(((value >> 6) & 0x3F) ==
+					    (k & 0x3F));
+					ASSERT(((value >> 12) & 0x3F) ==
+					    (j & 0x3F));
+					ASSERT(value >> 18 == (i & 0x07));
+				}
+			}
+		}
+	}
+#endif
+
+
+	/*
+	 * Next, verify UTF8_putc()
+	 */
+	memset(testbuf, 0, sizeof testbuf);
+
+	/* single-byte sequences */
+	for (i = 0; i < 0x80; i++) {
+		ret = UTF8_putc(NULL, 0, i);
+		ASSERT(ret == 1);
+
+		testbuf[0] = 0;
+		ret = UTF8_putc(testbuf, 0, i);
+		ASSERT(ret == -1);
+		ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
+
+		ret = UTF8_putc(testbuf, 1, i);
+		ASSERT(ret == 1);
+		ASSERT(testbuf[0] == i);
+		ASSERT(memcmp(testbuf+1, zerobuf, sizeof(testbuf)-1) == 0);
+	}
+
+	/* two-byte sequences */
+	for (i = 0x80; i < 0x800; i++) {
+		ret = UTF8_putc(NULL, 0, i);
+		ASSERT(ret == 2);
+
+		testbuf[0] = testbuf[1] = 0;
+		ret = UTF8_putc(testbuf, 1, i);
+		ASSERT(ret == -1);
+		ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
+
+		ret = UTF8_putc(testbuf, 2, i);
+		ASSERT(ret == 2);
+		ASSERT(memcmp(testbuf+2, zerobuf, sizeof(testbuf)-2) == 0);
+		ret = UTF8_getc(testbuf, 2, &value);
+		ASSERT(ret == 2);
+		ASSERT(value == i);
+	}
+
+	/* three-byte sequences */
+	for (i = 0x800; i < 0x10000; i++) {
+		/* XXX skip surrogate pair code points */
+		if (i >= 0xD800 && i < 0xE000)
+			continue;
+
+		ret = UTF8_putc(NULL, 0, i);
+		ASSERT(ret == 3);
+
+		testbuf[0] = testbuf[1] = testbuf[2] = 0;
+		ret = UTF8_putc(testbuf, 2, i);
+		ASSERT(ret == -1);
+		ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
+
+		ret = UTF8_putc(testbuf, 3, i);
+		ASSERT(ret == 3);
+		ASSERT(memcmp(testbuf+3, zerobuf, sizeof(testbuf)-3) == 0);
+		ret = UTF8_getc(testbuf, 3, &value);
+		ASSERT(ret == 3);
+		ASSERT(value == i);
+	}
+
+	/* four-byte sequences */
+	for (i = 0x10000; i < 0x110000; i++) {
+		ret = UTF8_putc(NULL, 0, i);
+		ASSERT(ret == 4);
+
+		testbuf[0] = testbuf[1] = testbuf[2] = testbuf[3] = 0;
+		ret = UTF8_putc(testbuf, 3, i);
+		ASSERT(ret == -1);
+		ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
+
+		ret = UTF8_putc(testbuf, 4, i);
+		ASSERT(ret == 4);
+		ASSERT(memcmp(testbuf+4, zerobuf, sizeof(testbuf)-4) == 0);
+		ret = UTF8_getc(testbuf, 4, &value);
+		ASSERT(ret == 4);
+		ASSERT(value == i);
+	}
+
+	/* XXX What should UTF8_putc() do with values > 0x10FFFF */
+
+	return 0;
+}
-- 
2.20.1