From c83d5272b39480cf40aae5f75fdcabadd1efd2eb Mon Sep 17 00:00:00 2001 From: millert Date: Sat, 25 Nov 2023 16:31:33 +0000 Subject: [PATCH] Update awk to the Nov 24, 2023 version. --- usr.bin/awk/FIXES | 16 ++++-- usr.bin/awk/awk.h | 13 +++-- usr.bin/awk/b.c | 130 ++++++++++++++++++++++++++++++++++----------- usr.bin/awk/lex.c | 16 +++--- usr.bin/awk/main.c | 4 +- usr.bin/awk/run.c | 5 +- 6 files changed, 137 insertions(+), 47 deletions(-) diff --git a/usr.bin/awk/FIXES b/usr.bin/awk/FIXES index 5d2b4595980..52f49e3eed0 100644 --- a/usr.bin/awk/FIXES +++ b/usr.bin/awk/FIXES @@ -25,13 +25,24 @@ THIS SOFTWARE. This file lists all bug fixes, changes, etc., made since the second edition of the AWK book was published in September 2023. -Nov 20, 2023 +Nov 24, 2023: + Fix issue #199: gototab improvements to dynamically resize the + table, qsort and bsearch to improve the lookup speed as the + table gets larger for multibyte input. thanks to Arnold Robbins. + +Nov 23, 2023: + Fix Issue #169, related to escape sequences in strings. + Thanks to Github user rajeevvp. + Fix Issue #147, reported by Github user drawkula, and fixed + by Miguel Pineiro Jr. + +Nov 20, 2023: rewrite of fnematch to fix a number of issues, including extraneous output, out-of-bounds access, number of bytes to push back after a failed match etc. thanks to Miguel Pineiro Jr. -Nov 15, 2023 +Nov 15, 2023: Man page edit, regression test fixes. thanks to Arnold Robbins consolidation of sub and gsub into dosub, removing duplicate code. thanks to Miguel Pineiro Jr. @@ -44,7 +55,6 @@ Oct 30, 2023: systems. also fixed an out-of-bounds read for empty CCL. fixed a buffer overflow in substr with utf-8 strings. many thanks to Todd C Miller. - Sep 24, 2023: fnematch and getrune have been overhauled to solve issues around diff --git a/usr.bin/awk/awk.h b/usr.bin/awk/awk.h index a57e27eaffd..3b5c67b60c3 100644 --- a/usr.bin/awk/awk.h +++ b/usr.bin/awk/awk.h @@ -1,4 +1,4 @@ -/* $OpenBSD: awk.h,v 1.30 2023/09/18 19:32:19 millert Exp $ */ +/* $OpenBSD: awk.h,v 1.31 2023/11/25 16:31:33 millert Exp $ */ /**************************************************************** Copyright (C) Lucent Technologies 1997 All Rights Reserved @@ -257,14 +257,19 @@ typedef struct rrow { int *lfollow; } rrow; -typedef struct gtt { /* gototab entry */ +typedef struct gtte { /* gototab entry */ unsigned int ch; unsigned int state; +} gtte; + +typedef struct gtt { /* gototab */ + size_t allocated; + size_t inuse; + gtte *entries; } gtt; typedef struct fa { - gtt **gototab; - int gototab_len; + gtt *gototab; uschar *out; uschar *restr; int **posns; diff --git a/usr.bin/awk/b.c b/usr.bin/awk/b.c index 09548d7e5f4..523e3ee195c 100644 --- a/usr.bin/awk/b.c +++ b/usr.bin/awk/b.c @@ -1,4 +1,4 @@ -/* $OpenBSD: b.c,v 1.48 2023/11/22 01:01:21 millert Exp $ */ +/* $OpenBSD: b.c,v 1.49 2023/11/25 16:31:33 millert Exp $ */ /**************************************************************** Copyright (C) Lucent Technologies 1997 All Rights Reserved @@ -97,9 +97,8 @@ extern int u8_nextlen(const char *s); mechanism of the goto table used 8-bit byte indices into the gototab entries to compute the next state. Unicode is a lot bigger, so the gototab entries are now structs with a character - and a next state, and there is a linear search of the characters - to find the state. (Yes, this is slower, by a significant - amount. Tough.) + and a next state. These are sorted by code point and binary + searched. Throughout the RE mechanism in b.c, utf-8 characters are converted to their utf-32 value. This mostly shows up in @@ -114,9 +113,10 @@ extern int u8_nextlen(const char *s); */ +static int entry_cmp(const void *l, const void *r); static int get_gototab(fa*, int, int); static int set_gototab(fa*, int, int, int); -static void reset_gototab(fa*, int); +static void clear_gototab(fa*, int); extern int u8_rune(int *, const uschar *); static int * @@ -151,7 +151,7 @@ resizesetvec(const char *f) static void resize_state(fa *f, int state) { - gtt **p; + gtt *p; uschar *p2; int **p3; int i, new_count; @@ -161,7 +161,7 @@ resize_state(fa *f, int state) new_count = state + 10; /* needs to be tuned */ - p = (gtt **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0])); + p = (gtt *) reallocarray(f->gototab, new_count, sizeof(gtt)); if (p == NULL) goto out; f->gototab = p; @@ -177,13 +177,14 @@ resize_state(fa *f, int state) f->posns = p3; for (i = f->state_count; i < new_count; ++i) { - f->gototab[i] = (gtt *) calloc(NCHARS, sizeof(**f->gototab)); - if (f->gototab[i] == NULL) + f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte)); + if (f->gototab[i].entries == NULL) goto out; - f->out[i] = 0; + f->gototab[i].allocated = NCHARS; + f->gototab[i].inuse = 0; + f->out[i] = 0; f->posns[i] = NULL; } - f->gototab_len = NCHARS; /* should be variable, growable */ f->state_count = new_count; return; out: @@ -277,7 +278,7 @@ int makeinit(fa *f, bool anchor) } if ((f->posns[2])[1] == f->accept) f->out[2] = 1; - reset_gototab(f, 2); + clear_gototab(f, 2); f->curstat = cgoto(f, 2, HAT); if (anchor) { *f->posns[2] = k-1; /* leave out position 0 */ @@ -601,37 +602,104 @@ int member(int c, int *sarg) /* is c in s? */ return(0); } +static void resize_gototab(fa *f, int state) +{ + size_t new_size = f->gototab[state].allocated * 2; + gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte)); + if (p == NULL) + overflo(__func__); + + // need to initialized the new memory to zero + size_t orig_size = f->gototab[state].allocated; // 2nd half of new mem is this size + memset(p + orig_size, 0, orig_size * sizeof(gtte)); // clean it out + + f->gototab[state].allocated = new_size; // update gotottab info + f->gototab[state].entries = p; +} + static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation */ { - int i; - for (i = 0; i < f->gototab_len; i++) { - if (f->gototab[state][i].ch == 0) - break; - if (f->gototab[state][i].ch == ch) - return f->gototab[state][i].state; - } - return 0; + gtte key; + gtte *item; + + key.ch = ch; + key.state = 0; /* irrelevant */ + item = bsearch(& key, f->gototab[state].entries, + f->gototab[state].inuse, sizeof(gtte), + entry_cmp); + + if (item == NULL) + return 0; + else + return item->state; } -static void reset_gototab(fa *f, int state) /* hide gototab inplementation */ +static int entry_cmp(const void *l, const void *r) { - memset(f->gototab[state], 0, f->gototab_len * sizeof(**f->gototab)); + const gtte *left, *right; + + left = (const gtte *) l; + right = (const gtte *) r; + + return left->ch - right->ch; } static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplementation */ { - int i; - for (i = 0; i < f->gototab_len; i++) { - if (f->gototab[state][i].ch == 0 || f->gototab[state][i].ch == ch) { - f->gototab[state][i].ch = ch; - f->gototab[state][i].state = val; - return val; + if (f->gototab[state].inuse == 0) { + f->gototab[state].entries[0].ch = ch; + f->gototab[state].entries[0].state = val; + f->gototab[state].inuse++; + return val; + } else if (ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) { + // not seen yet, insert and return + gtt *tab = & f->gototab[state]; + if (tab->inuse + 1 >= tab->allocated) + resize_gototab(f, state); + + f->gototab[state].entries[f->gototab[state].inuse-1].ch = ch; + f->gototab[state].entries[f->gototab[state].inuse-1].state = val; + f->gototab[state].inuse++; + return val; + } else { + // maybe we have it, maybe we don't + gtte key; + gtte *item; + + key.ch = ch; + key.state = 0; /* irrelevant */ + item = bsearch(& key, f->gototab[state].entries, + f->gototab[state].inuse, sizeof(gtte), + entry_cmp); + + if (item != NULL) { + // we have it, update state and return + item->state = val; + return item->state; } + // otherwise, fall through to insert and reallocate. } - overflo(__func__); + + gtt *tab = & f->gototab[state]; + if (tab->inuse + 1 >= tab->allocated) + resize_gototab(f, state); + ++tab->inuse; + f->gototab[state].entries[tab->inuse].ch = ch; + f->gototab[state].entries[tab->inuse].state = val; + + qsort(f->gototab[state].entries, + f->gototab[state].inuse, sizeof(gtte), entry_cmp); + return val; /* not used anywhere at the moment */ } +static void clear_gototab(fa *f, int state) +{ + memset(f->gototab[state].entries, 0, + f->gototab[state].allocated * sizeof(gtte)); + f->gototab[state].inuse = 0; +} + int match(fa *f, const char *p0) /* shortest match ? */ { int s, ns; @@ -1460,6 +1528,7 @@ int cgoto(fa *f, int s, int c) /* add tmpset to current set of states */ ++(f->curstat); resize_state(f, f->curstat); + clear_gototab(f, f->curstat); xfree(f->posns[f->curstat]); p = intalloc(setcnt + 1, __func__); @@ -1483,7 +1552,8 @@ void freefa(fa *f) /* free a finite automaton */ if (f == NULL) return; for (i = 0; i < f->state_count; i++) - xfree(f->gototab[i]) + xfree(f->gototab[i].entries); + xfree(f->gototab); for (i = 0; i <= f->curstat; i++) xfree(f->posns[i]); for (i = 0; i <= f->accept; i++) { diff --git a/usr.bin/awk/lex.c b/usr.bin/awk/lex.c index b1f422d4093..4c7371de3bb 100644 --- a/usr.bin/awk/lex.c +++ b/usr.bin/awk/lex.c @@ -1,4 +1,4 @@ -/* $OpenBSD: lex.c,v 1.31 2023/09/17 14:49:44 millert Exp $ */ +/* $OpenBSD: lex.c,v 1.32 2023/11/25 16:31:33 millert Exp $ */ /**************************************************************** Copyright (C) Lucent Technologies 1997 All Rights Reserved @@ -432,8 +432,12 @@ int string(void) { int i; + if (!isxdigit(peek())) { + unput(c); + break; + } n = 0; - for (i = 1; i <= 2; i++) { + for (i = 0; i < 2; i++) { c = input(); if (c == 0) break; @@ -444,13 +448,13 @@ int string(void) n += (c - '0'); else n += 10 + (c - 'a'); - } else + } else { + unput(c); break; + } } - if (n) + if (i) *bp++ = n; - else - unput(c); break; } diff --git a/usr.bin/awk/main.c b/usr.bin/awk/main.c index ddec8e820a1..a2f53635351 100644 --- a/usr.bin/awk/main.c +++ b/usr.bin/awk/main.c @@ -1,4 +1,4 @@ -/* $OpenBSD: main.c,v 1.65 2023/11/22 01:01:21 millert Exp $ */ +/* $OpenBSD: main.c,v 1.66 2023/11/25 16:31:33 millert Exp $ */ /**************************************************************** Copyright (C) Lucent Technologies 1997 All Rights Reserved @@ -23,7 +23,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ****************************************************************/ -const char *version = "version 20231120"; +const char *version = "version 20231124"; #define DEBUG #include diff --git a/usr.bin/awk/run.c b/usr.bin/awk/run.c index ba9469d69f5..6114768edf1 100644 --- a/usr.bin/awk/run.c +++ b/usr.bin/awk/run.c @@ -1,4 +1,4 @@ -/* $OpenBSD: run.c,v 1.81 2023/11/22 01:01:21 millert Exp $ */ +/* $OpenBSD: run.c,v 1.82 2023/11/25 16:31:33 millert Exp $ */ /**************************************************************** Copyright (C) Lucent Technologies 1997 All Rights Reserved @@ -1541,8 +1541,9 @@ Cell *assign(Node **a, int n) /* a[0] = a[1], a[0] += a[1], etc. */ if (x == y && !(x->tval & (FLD|REC)) && x != nfloc) ; /* self-assignment: leave alone unless it's a field or NF */ else if ((y->tval & (STR|NUM)) == (STR|NUM)) { + yf = getfval(y); setsval(x, getsval(y)); - x->fval = getfval(y); + x->fval = yf; x->tval |= NUM; } else if (isstr(y)) -- 2.20.1