From 35a004d4772e4d72a0b4704882f06c5b22b38028 Mon Sep 17 00:00:00 2001 From: millert Date: Fri, 6 Oct 2023 22:29:24 +0000 Subject: [PATCH] Update awk to Sep 24, 2023 version. fnematch and getrune have been overhauled to solve issues around Unicode FS and RS. Also fixed a gsub null-match issue with Unicode. Big thanks to Arnold Robbins. --- usr.bin/awk/FIXES | 5 ++ usr.bin/awk/b.c | 118 +++++++++++++++++++++++++++------------------ usr.bin/awk/lib.c | 3 +- usr.bin/awk/main.c | 4 +- usr.bin/awk/run.c | 7 ++- 5 files changed, 86 insertions(+), 51 deletions(-) diff --git a/usr.bin/awk/FIXES b/usr.bin/awk/FIXES index 6c2f1505e96..e3dedacf485 100644 --- a/usr.bin/awk/FIXES +++ b/usr.bin/awk/FIXES @@ -25,6 +25,11 @@ THIS SOFTWARE. This file lists all bug fixes, changes, etc., made since the second edition of the AWK book was published in September 2023. +Sep 24, 2023: + fnematch and getrune have been overhauled to solve issues around + unicode FS and RS. also fixed gsub null match issue with unicode. + big thanks to Arnold Robbins. + Sep 12, 2023: Fixed a length error in u8_byte2char that set RSTART to incorrect (cannot happen) value for EOL match(str, /$/). 
diff --git a/usr.bin/awk/b.c b/usr.bin/awk/b.c index 5cf54abb8b7..2370c661f6d 100644 --- a/usr.bin/awk/b.c +++ b/usr.bin/awk/b.c @@ -1,4 +1,4 @@ -/* $OpenBSD: b.c,v 1.42 2023/09/21 17:19:06 millert Exp $ */ +/* $OpenBSD: b.c,v 1.43 2023/10/06 22:29:24 millert Exp $ */ /**************************************************************** Copyright (C) Lucent Technologies 1997 All Rights Reserved @@ -81,6 +81,8 @@ int patlen; fa *fatab[NFA]; int nfatab = 0; /* entries in fatab */ +extern int u8_nextlen(const char *s); + /* utf-8 mechanism: @@ -760,33 +762,59 @@ int nematch(fa *f, const char *p0) /* non-empty match, for sub */ return (0); } -static int getrune(FILE *fp, char **pbuf, int *pbufsize, int quantum, - int *curpos, int *lastpos) + +#define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long + +// Read one rune at a time from the given FILE*. Return both +// the bytes and the actual rune. + +struct runedata { + int rune; + size_t len; + char bytes[6]; +}; + +struct runedata getrune(FILE *fp) { - int c = 0; - char *buf = *pbuf; - static const int max_bytes = 4; // max multiple bytes in UTF-8 is 4 - int i, rune; - uschar private_buf[max_bytes + 1]; - - for (i = 0; i <= max_bytes; i++) { - if (++*curpos == *lastpos) { - if (*lastpos == *pbufsize) - if (!adjbuf((char **) pbuf, pbufsize, *pbufsize+1, quantum, 0, "getrune")) - FATAL("stream '%.30s...' too long", buf); - buf[(*lastpos)++] = (c = getc(fp)) != EOF ? 
c : 0; - private_buf[i] = c; - } - if (c == 0 || c < 128 || (c >> 6) == 4) { // 10xxxxxx starts a new character - ungetc(c, fp); - private_buf[i] = 0; + struct runedata result; + int c, i, next; + + memset(&result, 0, sizeof(result)); + + c = getc(fp); + if (c == EOF) + return result; // result.rune == 0 --> EOF + else if (c < 128 || awk_mb_cur_max == 1) { + result.bytes[0] = c; + result.len = 1; + result.rune = c; + + return result; + } + + // need to get bytes and fill things in + result.bytes[0] = c; + result.len = 1; + + next = 1; + for (i = 1; i < MAX_UTF_BYTES; i++) { + c = getc(fp); + if (c == EOF) break; - } + result.bytes[next++] = c; + result.len++; } - u8_rune(& rune, private_buf); + // put back any extra input bytes + int actual_len = u8_nextlen(result.bytes); + while (result.len > actual_len) { + ungetc(result.bytes[--result.len], fp); + } + + result.bytes[result.len] = '\0'; + (void) u8_rune(& result.rune, (uschar *) result.bytes); - return rune; + return result; } @@ -809,8 +837,8 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) { char *buf = *pbuf; int bufsize = *pbufsize; - int c, i, j, k, ns, s; - int rune; + int i, j, k, ns, s; + struct runedata r; s = pfa->initstat; patlen = 0; @@ -819,42 +847,38 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) * All indices relative to buf. * i <= j <= k <= bufsize * - * i: origin of active substring - * j: current character + * i: origin of active substring (first byte of first character) + * j: current character (last byte of current character) * k: destination of next getc() */ i = -1, k = 0; do { j = i++; do { - if (++j == k) { - if (k == bufsize) + r = getrune(f); + if ((++j + r.len) >= k) { + if (k >= bufsize) if (!adjbuf(&buf, &bufsize, bufsize+1, quantum, 0, "fnematch")) FATAL("stream '%.30s...' too long", buf); - buf[k++] = (c = getc(f)) != EOF ? 
c : 0; - } - c = (uschar)buf[j]; - if (c < 128 || awk_mb_cur_max == 1) - rune = c; - else { - j--; - k--; - ungetc(c, f); - rune = getrune(f, &buf, &bufsize, quantum, &j, &k); } + memcpy(buf + k, r.bytes, r.len); + j += r.len - 1; // incremented next time around the loop + k += r.len; - if ((ns = get_gototab(pfa, s, rune)) != 0) + if ((ns = get_gototab(pfa, s, r.rune)) != 0) s = ns; else - s = cgoto(pfa, s, rune); + s = cgoto(pfa, s, r.rune); if (pfa->out[s]) { /* final state */ patlen = j - i + 1; - if (c == 0) /* don't count $ */ + if (r.rune == 0) /* don't count $ */ patlen--; } } while (buf[j] && s != 1); s = 2; + if (r.len > 1) + i += r.len - 1; // i incremented around the loop } while (buf[i] && !patlen); /* adjbuf() may have relocated a resized buffer. Inform the world. */ @@ -874,10 +898,12 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) * (except for EOF's nullbyte, if present) and null * terminate the buffer. */ - do - if (buf[--k] && ungetc(buf[k], f) == EOF) - FATAL("unable to ungetc '%c'", buf[k]); - while (k > i + patlen); + do { + int ii; + for (ii = r.len; ii > 0; ii--) + if (buf[--k] && ungetc(buf[k], f) == EOF) + FATAL("unable to ungetc '%c'", buf[k]); + } while (k > i + patlen); buf[k] = '\0'; return true; } diff --git a/usr.bin/awk/lib.c b/usr.bin/awk/lib.c index 5749a9d5fb2..9534c324690 100644 --- a/usr.bin/awk/lib.c +++ b/usr.bin/awk/lib.c @@ -1,4 +1,4 @@ -/* $OpenBSD: lib.c,v 1.51 2023/09/17 14:49:44 millert Exp $ */ +/* $OpenBSD: lib.c,v 1.52 2023/10/06 22:29:24 millert Exp $ */ /**************************************************************** Copyright (C) Lucent Technologies 1997 All Rights Reserved @@ -234,6 +234,7 @@ int readrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag) /* read one rec } else if (*rs && rs[1]) { bool found; + memset(buf, 0, bufsize); fa *pfa = makedfa(rs, 1); if (newflag) found = fnematch(pfa, inf, &buf, &bufsize, recsize); diff --git a/usr.bin/awk/main.c b/usr.bin/awk/main.c index 
2ec0bc7d9e9..55794c6ff15 100644 --- a/usr.bin/awk/main.c +++ b/usr.bin/awk/main.c @@ -1,4 +1,4 @@ -/* $OpenBSD: main.c,v 1.62 2023/09/20 16:57:12 millert Exp $ */ +/* $OpenBSD: main.c,v 1.63 2023/10/06 22:29:24 millert Exp $ */ /**************************************************************** Copyright (C) Lucent Technologies 1997 All Rights Reserved @@ -23,7 +23,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ****************************************************************/ -const char *version = "version 20230913"; +const char *version = "version 20231001"; #define DEBUG #include diff --git a/usr.bin/awk/run.c b/usr.bin/awk/run.c index 436b6c75290..1c1b72c6dd8 100644 --- a/usr.bin/awk/run.c +++ b/usr.bin/awk/run.c @@ -1,4 +1,4 @@ -/* $OpenBSD: run.c,v 1.78 2023/09/20 16:49:13 millert Exp $ */ +/* $OpenBSD: run.c,v 1.79 2023/10/06 22:29:24 millert Exp $ */ /**************************************************************** Copyright (C) Lucent Technologies 1997 All Rights Reserved @@ -2587,6 +2587,7 @@ Cell *gsub(Node **a, int nnn) /* global substitute */ fa *pfa; int mflag, tempstat, num; int bufsz = recsize; + int charlen = 0; if ((buf = (char *) malloc(bufsz)) == NULL) FATAL("out of memory in gsub"); @@ -2628,7 +2629,9 @@ Cell *gsub(Node **a, int nnn) /* global substitute */ if (*t == '\0') /* at end */ goto done; adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gsub"); - *pb++ = *t++; + charlen = u8_nextlen(t); + while (charlen-- > 0) + *pb++ = *t++; if (pb > buf + bufsz) /* BUG: not sure of this test */ FATAL("gsub result0 %.30s too big; can't happen", buf); mflag = 0; -- 2.20.1