Update awk to the Nov 20, 2023 version.
author millert <millert@openbsd.org>
Wed, 22 Nov 2023 01:01:21 +0000 (01:01 +0000)
committer millert <millert@openbsd.org>
Wed, 22 Nov 2023 01:01:21 +0000 (01:01 +0000)
This includes a rewrite of the fnematch() function as well as a
refactoring of the sub and gsub implementation.

usr.bin/awk/FIXES
usr.bin/awk/b.c
usr.bin/awk/main.c
usr.bin/awk/maketab.c
usr.bin/awk/proto.h
usr.bin/awk/run.c

index a13ca50..5d2b459 100644 (file)
@@ -25,6 +25,18 @@ THIS SOFTWARE.
 This file lists all bug fixes, changes, etc., made since the 
 second edition of the AWK book was published in September 2023.
 
+Nov 20, 2023
+       rewrite of fnematch to fix a number of issues, including
+       extraneous output, out-of-bounds access, number of bytes
+       to push back after a failed match etc.
+       thanks to Miguel Pineiro Jr.
+
+Nov 15, 2023
+       Man page edit, regression test fixes. thanks to Arnold Robbins
+       consolidation of sub and gsub into dosub, removing duplicate
+       code. thanks to Miguel Pineiro Jr.
+       gcc replaced with cc everywhere.
+
 Oct 30, 2023:
        multiple fixes and a minor code cleanup.
        disabled utf-8 for non-multibyte locales, such as C or POSIX.
index 543fbf7..09548d7 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: b.c,v 1.47 2023/11/15 18:56:53 millert Exp $  */
+/*     $OpenBSD: b.c,v 1.48 2023/11/22 01:01:21 millert Exp $  */
 /****************************************************************
 Copyright (C) Lucent Technologies 1997
 All Rights Reserved
@@ -770,59 +770,6 @@ int nematch(fa *f, const char *p0) /* non-empty match, for sub */
 
 #define MAX_UTF_BYTES  4       // UTF-8 is up to 4 bytes long
 
-// Read one rune at a time from the given FILE*. Return both
-// the bytes and the actual rune.
-
-struct runedata {
-       int rune;
-       size_t len;
-       char bytes[6];
-};
-
-struct runedata getrune(FILE *fp)
-{
-       struct runedata result;
-       int c, i, next;
-
-       memset(&result, 0, sizeof(result));
-
-       c = getc(fp);
-       if (c == EOF)
-               return result;  // result.rune == 0 --> EOF
-       else if (c < 128 || awk_mb_cur_max == 1) {
-               result.bytes[0] = c;
-               result.len = 1;
-               result.rune = c;
-
-               return result;
-       }
-
-       // need to get bytes and fill things in
-       result.bytes[0] = c;
-       result.len = 1;
-
-       next = 1;
-       for (i = 1; i < MAX_UTF_BYTES; i++) {
-               c = getc(fp);
-               if (c == EOF)
-                       break;
-               result.bytes[next++] = c;
-               result.len++;
-       }
-
-       // put back any extra input bytes
-       int actual_len = u8_nextlen(result.bytes);
-       while (result.len > actual_len) {
-               ungetc(result.bytes[--result.len], fp);
-       }
-
-       result.bytes[result.len] = '\0';
-       (void) u8_rune(& result.rune, (uschar *) result.bytes);
-
-       return result;
-}
-
-
 /*
  * NAME
  *     fnematch
@@ -840,60 +787,76 @@ struct runedata getrune(FILE *fp)
 
 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
 {
-       char *buf = *pbuf;
+       char *i, *j, *k, *buf = *pbuf;
        int bufsize = *pbufsize;
-       int i, j, k, ns, s;
-       struct runedata r;
+       int c, n, ns, s;
 
        s = pfa->initstat;
        patlen = 0;
 
        /*
-        * All indices relative to buf.
-        * i <= j <= k <= bufsize
+        * buf <= i <= j <= k <= buf+bufsize
         *
-        * i: origin of active substring (first byte of first character)
-        * j: current character         (last byte of current character)
-        * k: destination of next getc()
+        * i: origin of active substring
+        * j: current character
+        * k: destination of the next getc
         */
-       i = -1, k = 0;
-        do {
-               j = i++;
-               do {
-                       r = getrune(f);
-                       if (r.len == 0) {
-                               r.len = 1;      // store NUL byte for EOF
+
+       i = j = k = buf;
+
+       do {
+               /*
+                * Call u8_rune with at least MAX_UTF_BYTES ahead in
+                * the buffer until EOF interferes.
+                */
+               if (k - j < MAX_UTF_BYTES) {
+                       if (k + MAX_UTF_BYTES > buf + bufsize) {
+                               adjbuf(&buf, &bufsize,
+                                   bufsize + MAX_UTF_BYTES,
+                                   quantum, 0, "fnematch");
                        }
-                       j += r.len;
-                       if (j >= bufsize) {
-                               if (!adjbuf(&buf, &bufsize, j+1, quantum, 0, "fnematch"))
-                                       FATAL("stream '%.30s...' too long", buf);
+                       for (n = MAX_UTF_BYTES ; n > 0; n--) {
+                               *k++ = (c = getc(f)) != EOF ? c : 0;
+                               if (c == EOF) {
+                                       if (ferror(f))
+                                               FATAL("fnematch: getc error");
+                                       break;
+                               }
                        }
-                       memcpy(buf + k, r.bytes, r.len);
-                       k += r.len;
+               }
 
-                       if ((ns = get_gototab(pfa, s, r.rune)) != 0)
-                               s = ns;
-                       else
-                               s = cgoto(pfa, s, r.rune);
+               j += u8_rune(&c, (uschar *)j);
 
-                       if (pfa->out[s]) {      /* final state */
-                               patlen = j - i + 1;
-                               if (r.rune == 0)        /* don't count $ */
-                                       patlen--;
-                       }
-               } while (buf[j] && s != 1);
+               if ((ns = get_gototab(pfa, s, c)) != 0)
+                       s = ns;
+               else
+                       s = cgoto(pfa, s, c);
+
+               if (pfa->out[s]) {      /* final state */
+                       patbeg = i;
+                       patlen = j - i;
+                       if (c == 0)     /* don't count $ */
+                               patlen--;
+               }
+
+               if (c && s != 1)
+                       continue;  /* origin i still viable, next j */
+               if (patlen)
+                       break;     /* best match found */
+
+               /* no match at origin i, next i and start over */
+               i += u8_rune(&c, (uschar *)i);
+               if (c == 0)
+                       break;    /* no match */
+               j = i;
                s = 2;
-               if (r.len > 1)
-                       i += r.len - 1; // i incremented around the loop
-       } while (buf[i] && !patlen);
+       } while (1);
 
        /* adjbuf() may have relocated a resized buffer. Inform the world. */
        *pbuf = buf;
        *pbufsize = bufsize;
 
        if (patlen) {
-               patbeg = buf + i;
                /*
                 * Under no circumstances is the last character fed to
                 * the automaton part of the match. It is EOF's nullbyte,
@@ -905,10 +868,11 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
                 * (except for EOF's nullbyte, if present) and null
                 * terminate the buffer.
                 */
-               for (; r.len > 0; r.len--)
-                       if (buf[--k] && ungetc(buf[k], f) == EOF)
-                               FATAL("unable to ungetc '%c'", buf[k]);
-               buf[k-patlen] = '\0';
+               do
+                       if (*--k && ungetc(*k, f) == EOF)
+                               FATAL("unable to ungetc '%c'", *k);
+               while (k > patbeg + patlen);
+               *k = '\0';
                return true;
        }
        else
index d5acca8..ddec8e8 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: main.c,v 1.64 2023/10/31 01:08:51 millert Exp $       */
+/*     $OpenBSD: main.c,v 1.65 2023/11/22 01:01:21 millert Exp $       */
 /****************************************************************
 Copyright (C) Lucent Technologies 1997
 All Rights Reserved
@@ -23,7 +23,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
 THIS SOFTWARE.
 ****************************************************************/
 
-const char     *version = "version 20231030";
+const char     *version = "version 20231120";
 
 #define DEBUG
 #include <stdio.h>
index 2c4adf7..4f2756b 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: maketab.c,v 1.21 2023/10/30 17:52:54 millert Exp $    */
+/*     $OpenBSD: maketab.c,v 1.22 2023/11/22 01:01:21 millert Exp $    */
 /****************************************************************
 Copyright (C) Lucent Technologies 1997
 All Rights Reserved
@@ -53,8 +53,8 @@ struct xx
        { ARRAY, "array", NULL },
        { INDIRECT, "indirect", "$(" },
        { SUBSTR, "substr", "substr" },
-       { SUB, "sub", "sub" },
-       { GSUB, "gsub", "gsub" },
+       { SUB, "dosub", "sub" },
+       { GSUB, "dosub", "gsub" },
        { INDEX, "sindex", "sindex" },
        { SPRINTF, "awksprintf", "sprintf " },
        { ADD, "arith", " + " },
index 7d9aa3c..4c2fafd 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: proto.h,v 1.22 2023/09/17 14:49:44 millert Exp $      */
+/*     $OpenBSD: proto.h,v 1.23 2023/11/22 01:01:21 millert Exp $      */
 /****************************************************************
 Copyright (C) Lucent Technologies 1997
 All Rights Reserved
@@ -199,8 +199,7 @@ extern      FILE    *openfile(int, const char *, bool *);
 extern const char      *filename(FILE *);
 extern Cell    *closefile(Node **, int);
 extern void    closeall(void);
-extern Cell    *sub(Node **, int);
-extern Cell    *gsub(Node **, int);
+extern Cell    *dosub(Node **, int);
 extern Cell    *gensub(Node **, int);
 
 extern FILE    *popen(const char *, const char *);
index 6e72ec1..ba9469d 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: run.c,v 1.80 2023/10/28 22:38:22 millert Exp $        */
+/*     $OpenBSD: run.c,v 1.81 2023/11/22 01:01:21 millert Exp $        */
 /****************************************************************
 Copyright (C) Lucent Technologies 1997
 All Rights Reserved
@@ -2518,169 +2518,143 @@ static void flush_all(void)
 
 void backsub(char **pb_ptr, const char **sptr_ptr);
 
-Cell *sub(Node **a, int nnn)   /* substitute command */
+Cell *dosub(Node **a, int subop)        /* sub and gsub */
 {
-       const char *sptr, *q;
-       Cell *x, *y, *result;
-       char *t, *buf, *pb;
        fa *pfa;
+       int tempstat;
+       char *repl;
+       Cell *x;
+
+       char *buf = NULL;
+       char *pb = NULL;
        int bufsz = recsize;
 
-       if ((buf = (char *) malloc(bufsz)) == NULL)
-               FATAL("out of memory in sub");
-       x = execute(a[3]);      /* target string */
-       t = getsval(x);
-       if (a[0] == NULL)       /* 0 => a[1] is already-compiled regexpr */
-               pfa = (fa *) a[1];      /* regular expression */
-       else {
-               y = execute(a[1]);
-               pfa = makedfa(getsval(y), 1);
-               tempfree(y);
+       const char *r, *s;
+       const char *start;
+       const char *noempty = NULL;      /* empty match disallowed here */
+       size_t m = 0;                    /* match count */
+       size_t whichm;                   /* which match to select, 0 = global */
+       int mtype;                       /* match type */
+
+       if (a[0] == NULL) {     /* 0 => a[1] is already-compiled regexpr */
+               pfa = (fa *) a[1];
+       } else {
+               x = execute(a[1]);
+               pfa = makedfa(getsval(x), 1);
+               tempfree(x);
        }
-       y = execute(a[2]);      /* replacement string */
-       result = False;
-       if (pmatch(pfa, t)) {
-               sptr = t;
-               adjbuf(&buf, &bufsz, 1+patbeg-sptr, recsize, 0, "sub");
-               pb = buf;
-               while (sptr < patbeg)
-                       *pb++ = *sptr++;
-               sptr = getsval(y);
-               while (*sptr != '\0') {
-                       adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "sub");
-                       if (*sptr == '\\') {
-                               backsub(&pb, &sptr);
-                       } else if (*sptr == '&') {
-                               sptr++;
-                               adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "sub");
-                               for (q = patbeg; q < patbeg+patlen; )
-                                       *pb++ = *q++;
-                       } else
-                               *pb++ = *sptr++;
+
+       x = execute(a[2]);      /* replacement string */
+       repl = tostring(getsval(x));
+       tempfree(x);
+
+       switch (subop) {
+       case SUB:
+               whichm = 1;
+               x = execute(a[3]);    /* source string */
+               break;
+       case GSUB:
+               whichm = 0;
+               x = execute(a[3]);    /* source string */
+               break;
+       default:
+               FATAL("dosub: unrecognized subop: %d", subop);
+       }
+
+       start = getsval(x);
+       while (pmatch(pfa, start)) {
+               if (buf == NULL) {
+                       if ((pb = buf = malloc(bufsz)) == NULL)
+                               FATAL("out of memory in dosub");
+                       tempstat = pfa->initstat;
+                       pfa->initstat = 2;
                }
-               *pb = '\0';
-               if (pb > buf + bufsz)
-                       FATAL("sub result1 %.30s too big; can't happen", buf);
-               sptr = patbeg + patlen;
-               if ((patlen == 0 && *patbeg) || (patlen && *(sptr-1))) {
-                       adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "sub");
-                       while ((*pb++ = *sptr++) != '\0')
-                               continue;
+
+               /* match types */
+               #define MT_IGNORE  0  /* unselected or invalid */
+               #define MT_INSERT  1  /* selected, empty */
+               #define MT_REPLACE 2  /* selected, not empty */
+
+               /* an empty match just after replacement is invalid */
+
+               if (patbeg == noempty && patlen == 0) {
+                       mtype = MT_IGNORE;    /* invalid, not counted */
+               } else if (whichm == ++m || whichm == 0) {
+                       mtype = patlen ? MT_REPLACE : MT_INSERT;
+               } else {
+                       mtype = MT_IGNORE;    /* unselected, but counted */
                }
-               if (pb > buf + bufsz)
-                       FATAL("sub result2 %.30s too big; can't happen", buf);
-               setsval(x, buf);        /* BUG: should be able to avoid copy */
-               result = True;
-       }
-       tempfree(x);
-       tempfree(y);
-       free(buf);
-       return result;
-}
 
-Cell *gsub(Node **a, int nnn)  /* global substitute */
-{
-       Cell *x, *y;
-       char *rptr, *pb;
-       const char *q, *t, *sptr;
-       char *buf;
-       fa *pfa;
-       int mflag, tempstat, num;
-       int bufsz = recsize;
-       int charlen = 0;
+               /* leading text: */
+               if (patbeg > start) {
+                       adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - start),
+                               recsize, &pb, "dosub");
+                       s = start;
+                       while (s < patbeg)
+                               *pb++ = *s++;
+               }
 
-       if ((buf = (char *) malloc(bufsz)) == NULL)
-               FATAL("out of memory in gsub");
-       mflag = 0;      /* if mflag == 0, can replace empty string */
-       num = 0;
-       x = execute(a[3]);      /* target string */
-       t = getsval(x);
-       if (a[0] == NULL)       /* 0 => a[1] is already-compiled regexpr */
-               pfa = (fa *) a[1];      /* regular expression */
-       else {
-               y = execute(a[1]);
-               pfa = makedfa(getsval(y), 1);
-               tempfree(y);
-       }
-       y = execute(a[2]);      /* replacement string */
-       if (pmatch(pfa, t)) {
-               tempstat = pfa->initstat;
-               pfa->initstat = 2;
-               pb = buf;
-               rptr = getsval(y);
-               do {
-                       if (patlen == 0 && *patbeg != '\0') {   /* matched empty string */
-                               if (mflag == 0) {       /* can replace empty */
-                                       num++;
-                                       sptr = rptr;
-                                       while (*sptr != '\0') {
-                                               adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub");
-                                               if (*sptr == '\\') {
-                                                       backsub(&pb, &sptr);
-                                               } else if (*sptr == '&') {
-                                                       sptr++;
-                                                       adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub");
-                                                       for (q = patbeg; q < patbeg+patlen; )
-                                                               *pb++ = *q++;
-                                               } else
-                                                       *pb++ = *sptr++;
-                                       }
-                               }
-                               if (*t == '\0') /* at end */
-                                       goto done;
-                               adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gsub");
-                               charlen = u8_nextlen(t);
-                               while (charlen-- > 0)
-                                       *pb++ = *t++;
-                               if (pb > buf + bufsz)   /* BUG: not sure of this test */
-                                       FATAL("gsub result0 %.30s too big; can't happen", buf);
-                               mflag = 0;
-                       }
-                       else {  /* matched nonempty string */
-                               num++;
-                               sptr = t;
-                               adjbuf(&buf, &bufsz, 1+(patbeg-sptr)+pb-buf, recsize, &pb, "gsub");
-                               while (sptr < patbeg)
-                                       *pb++ = *sptr++;
-                               sptr = rptr;
-                               while (*sptr != '\0') {
-                                       adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub");
-                                       if (*sptr == '\\') {
-                                               backsub(&pb, &sptr);
-                                       } else if (*sptr == '&') {
-                                               sptr++;
-                                               adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub");
-                                               for (q = patbeg; q < patbeg+patlen; )
-                                                       *pb++ = *q++;
-                                       } else
-                                               *pb++ = *sptr++;
-                               }
-                               t = patbeg + patlen;
-                               if (patlen == 0 || *t == '\0' || *(t-1) == '\0')
-                                       goto done;
-                               if (pb > buf + bufsz)
-                                       FATAL("gsub result1 %.30s too big; can't happen", buf);
-                               mflag = 1;
+               if (mtype == MT_IGNORE)
+                       goto matching_text;  /* skip replacement text */
+
+               r = repl;
+               while (*r != 0) {
+                       adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "dosub");
+                       if (*r == '\\') {
+                               backsub(&pb, &r);
+                       } else if (*r == '&') {
+                               r++;
+                               adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize,
+                                       &pb, "dosub");
+                               for (s = patbeg; s < patbeg+patlen; )
+                                       *pb++ = *s++;
+                       } else {
+                               *pb++ = *r++;
                        }
-               } while (pmatch(pfa,t));
-               sptr = t;
-               adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "gsub");
-               while ((*pb++ = *sptr++) != '\0')
-                       continue;
-       done:   if (pb < buf + bufsz)
-                       *pb = '\0';
-               else if (*(pb-1) != '\0')
-                       FATAL("gsub result2 %.30s truncated; can't happen", buf);
-               setsval(x, buf);        /* BUG: should be able to avoid copy + free */
+               }
+
+matching_text:
+               if (mtype == MT_REPLACE || *patbeg == '\0')
+                       goto next_search;  /* skip matching text */
+               
+               if (patlen == 0)
+                       patlen = u8_nextlen(patbeg);
+               adjbuf(&buf, &bufsz, (pb-buf) + patlen, recsize, &pb, "dosub");
+               s = patbeg;
+               while (s < patbeg + patlen)
+                       *pb++ = *s++;
+
+next_search:
+               start = patbeg + patlen;
+               if (m == whichm || *patbeg == '\0')
+                       break;
+               if (mtype == MT_REPLACE)
+                       noempty = start;
+
+               #undef MT_IGNORE
+               #undef MT_INSERT
+               #undef MT_REPLACE
+       }
+
+       xfree(repl);
+
+       if (buf != NULL) {
                pfa->initstat = tempstat;
+
+               /* trailing text */
+               adjbuf(&buf, &bufsz, 1+strlen(start)+pb-buf, 0, &pb, "dosub");
+               while ((*pb++ = *start++) != '\0')
+                       ;
+
+               setsval(x, buf);
+               free(buf);
        }
+
        tempfree(x);
-       tempfree(y);
        x = gettemp();
        x->tval = NUM;
-       x->fval = num;
-       free(buf);
-       return(x);
+       x->fval = m;
+       return x;
 }
 
 Cell *gensub(Node **a, int nnn)        /* global selective substitute */