-/* $OpenBSD: b.c,v 1.47 2023/11/15 18:56:53 millert Exp $ */
+/* $OpenBSD: b.c,v 1.48 2023/11/22 01:01:21 millert Exp $ */
/****************************************************************
Copyright (C) Lucent Technologies 1997
All Rights Reserved
#define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long
-// Read one rune at a time from the given FILE*. Return both
-// the bytes and the actual rune.
-
-struct runedata {
- int rune;
- size_t len;
- char bytes[6];
-};
-
-struct runedata getrune(FILE *fp)
-{
- struct runedata result;
- int c, i, next;
-
- memset(&result, 0, sizeof(result));
-
- c = getc(fp);
- if (c == EOF)
- return result; // result.rune == 0 --> EOF
- else if (c < 128 || awk_mb_cur_max == 1) {
- result.bytes[0] = c;
- result.len = 1;
- result.rune = c;
-
- return result;
- }
-
- // need to get bytes and fill things in
- result.bytes[0] = c;
- result.len = 1;
-
- next = 1;
- for (i = 1; i < MAX_UTF_BYTES; i++) {
- c = getc(fp);
- if (c == EOF)
- break;
- result.bytes[next++] = c;
- result.len++;
- }
-
- // put back any extra input bytes
- int actual_len = u8_nextlen(result.bytes);
- while (result.len > actual_len) {
- ungetc(result.bytes[--result.len], fp);
- }
-
- result.bytes[result.len] = '\0';
- (void) u8_rune(& result.rune, (uschar *) result.bytes);
-
- return result;
-}
-
-
/*
* NAME
* fnematch
bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
{
- char *buf = *pbuf;
+ char *i, *j, *k, *buf = *pbuf;
int bufsize = *pbufsize;
- int i, j, k, ns, s;
- struct runedata r;
+ int c, n, ns, s;
s = pfa->initstat;
patlen = 0;
/*
- * All indices relative to buf.
- * i <= j <= k <= bufsize
+ * buf <= i <= j <= k <= buf+bufsize
*
- * i: origin of active substring (first byte of first character)
- * j: current character (last byte of current character)
- * k: destination of next getc()
+ * i: origin of active substring
+ * j: current character
+ * k: destination of the next getc
*/
- i = -1, k = 0;
- do {
- j = i++;
- do {
- r = getrune(f);
- if (r.len == 0) {
- r.len = 1; // store NUL byte for EOF
+
+ i = j = k = buf;
+
+ do {
+ /*
+ * Call u8_rune with at least MAX_UTF_BYTES ahead in
+ * the buffer until EOF interferes.
+ */
+ if (k - j < MAX_UTF_BYTES) {
+ if (k + MAX_UTF_BYTES > buf + bufsize) {
+ /*
+ * adjbuf() may move the buffer when it
+ * grows. i, j and k (and patbeg, once a
+ * match has been recorded) all point into
+ * buf, so remember their offsets first and
+ * rebase them afterwards; otherwise they
+ * would dangle into the old allocation.
+ */
+ size_t ioff = i - buf;
+ size_t joff = j - buf;
+ size_t koff = k - buf;
+ size_t poff = patlen ? patbeg - buf : 0;
+
+ adjbuf(&buf, &bufsize,
+ bufsize + MAX_UTF_BYTES,
+ quantum, 0, "fnematch");
+ i = buf + ioff;
+ j = buf + joff;
+ k = buf + koff;
+ if (patlen)
+ patbeg = buf + poff;
}
- j += r.len;
- if (j >= bufsize) {
- if (!adjbuf(&buf, &bufsize, j+1, quantum, 0, "fnematch"))
- FATAL("stream '%.30s...' too long", buf);
+ /* read ahead; EOF is stored in the buffer as a NUL byte */
+ for (n = MAX_UTF_BYTES ; n > 0; n--) {
+ *k++ = (c = getc(f)) != EOF ? c : 0;
+ if (c == EOF) {
+ if (ferror(f))
+ FATAL("fnematch: getc error");
+ break;
+ }
}
- memcpy(buf + k, r.bytes, r.len);
- k += r.len;
+ }
- if ((ns = get_gototab(pfa, s, r.rune)) != 0)
- s = ns;
- else
- s = cgoto(pfa, s, r.rune);
+ j += u8_rune(&c, (uschar *)j);
- if (pfa->out[s]) { /* final state */
- patlen = j - i + 1;
- if (r.rune == 0) /* don't count $ */
- patlen--;
- }
- } while (buf[j] && s != 1);
+ if ((ns = get_gototab(pfa, s, c)) != 0)
+ s = ns;
+ else
+ s = cgoto(pfa, s, c);
+
+ if (pfa->out[s]) { /* final state */
+ patbeg = i;
+ patlen = j - i;
+ if (c == 0) /* don't count $ */
+ patlen--;
+ }
+
+ if (c && s != 1)
+ continue; /* origin i still viable, next j */
+ if (patlen)
+ break; /* best match found */
+
+ /* no match at origin i, next i and start over */
+ i += u8_rune(&c, (uschar *)i);
+ if (c == 0)
+ break; /* no match */
+ j = i;
s = 2;
- if (r.len > 1)
- i += r.len - 1; // i incremented around the loop
- } while (buf[i] && !patlen);
+ } while (1);
/* adjbuf() may have relocated a resized buffer. Inform the world. */
*pbuf = buf;
*pbufsize = bufsize;
if (patlen) {
- patbeg = buf + i;
/*
* Under no circumstances is the last character fed to
* the automaton part of the match. It is EOF's nullbyte,
* (except for EOF's nullbyte, if present) and null
* terminate the buffer.
*/
- for (; r.len > 0; r.len--)
- if (buf[--k] && ungetc(buf[k], f) == EOF)
- FATAL("unable to ungetc '%c'", buf[k]);
- buf[k-patlen] = '\0';
+ do
+ if (*--k && ungetc(*k, f) == EOF)
+ FATAL("unable to ungetc '%c'", *k);
+ while (k > patbeg + patlen);
+ *k = '\0';
return true;
}
else
-/* $OpenBSD: run.c,v 1.80 2023/10/28 22:38:22 millert Exp $ */
+/* $OpenBSD: run.c,v 1.81 2023/11/22 01:01:21 millert Exp $ */
/****************************************************************
Copyright (C) Lucent Technologies 1997
All Rights Reserved
void backsub(char **pb_ptr, const char **sptr_ptr);
-Cell *sub(Node **a, int nnn) /* substitute command */
+/*
+ * dosub: common implementation of sub() and gsub().
+ *
+ * a[0] == NULL means a[1] is an already-compiled fa; otherwise a[1] is
+ * evaluated and compiled with makedfa(). a[2] is the replacement
+ * string and a[3] the source string. subop selects which matches are
+ * replaced: SUB sets whichm = 1 (first match only), GSUB sets
+ * whichm = 0 (every match).
+ *
+ * The result is assembled in a malloc'ed buffer that is allocated only
+ * once the first match is found. If there was at least one match, the
+ * rebuilt string is stored back into the a[3] cell with setsval().
+ * Returns a temporary NUM cell whose value is the match count m.
+ */
+Cell *dosub(Node **a, int subop) /* sub and gsub */
{
- const char *sptr, *q;
- Cell *x, *y, *result;
- char *t, *buf, *pb;
fa *pfa;
+ int tempstat;
+ char *repl;
+ Cell *x;
+
+ char *buf = NULL;
+ char *pb = NULL;
int bufsz = recsize;
- if ((buf = (char *) malloc(bufsz)) == NULL)
- FATAL("out of memory in sub");
- x = execute(a[3]); /* target string */
- t = getsval(x);
- if (a[0] == NULL) /* 0 => a[1] is already-compiled regexpr */
- pfa = (fa *) a[1]; /* regular expression */
- else {
- y = execute(a[1]);
- pfa = makedfa(getsval(y), 1);
- tempfree(y);
+ const char *r, *s;
+ const char *start;
+ const char *noempty = NULL; /* empty match disallowed here */
+ size_t m = 0; /* match count */
+ size_t whichm; /* which match to select, 0 = global */
+ int mtype; /* match type */
+
+ if (a[0] == NULL) { /* 0 => a[1] is already-compiled regexpr */
+ pfa = (fa *) a[1];
+ } else {
+ x = execute(a[1]);
+ pfa = makedfa(getsval(x), 1);
+ tempfree(x);
}
- y = execute(a[2]); /* replacement string */
- result = False;
- if (pmatch(pfa, t)) {
- sptr = t;
- adjbuf(&buf, &bufsz, 1+patbeg-sptr, recsize, 0, "sub");
- pb = buf;
- while (sptr < patbeg)
- *pb++ = *sptr++;
- sptr = getsval(y);
- while (*sptr != '\0') {
- adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "sub");
- if (*sptr == '\\') {
- backsub(&pb, &sptr);
- } else if (*sptr == '&') {
- sptr++;
- adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "sub");
- for (q = patbeg; q < patbeg+patlen; )
- *pb++ = *q++;
- } else
- *pb++ = *sptr++;
+
+ /* keep a private copy of the replacement; its cell is freed right away */
+ x = execute(a[2]); /* replacement string */
+ repl = tostring(getsval(x));
+ tempfree(x);
+
+ switch (subop) {
+ case SUB:
+ whichm = 1;
+ x = execute(a[3]); /* source string */
+ break;
+ case GSUB:
+ whichm = 0;
+ x = execute(a[3]); /* source string */
+ break;
+ default:
+ FATAL("dosub: unrecognized subop: %d", subop);
+ }
+
+ start = getsval(x);
+ while (pmatch(pfa, start)) {
+ if (buf == NULL) {
+ if ((pb = buf = malloc(bufsz)) == NULL)
+ FATAL("out of memory in dosub");
+ /*
+ * First match: save initstat and force it to 2 for the
+ * remaining searches; it is restored after the loop.
+ * NOTE(review): state 2 presumably means "not at start
+ * of string" -- confirm against makedfa/pmatch.
+ */
+ tempstat = pfa->initstat;
+ pfa->initstat = 2;
}
- *pb = '\0';
- if (pb > buf + bufsz)
- FATAL("sub result1 %.30s too big; can't happen", buf);
- sptr = patbeg + patlen;
- if ((patlen == 0 && *patbeg) || (patlen && *(sptr-1))) {
- adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "sub");
- while ((*pb++ = *sptr++) != '\0')
- continue;
+
+ /* match types */
+ #define MT_IGNORE 0 /* unselected or invalid */
+ #define MT_INSERT 1 /* selected, empty */
+ #define MT_REPLACE 2 /* selected, not empty */
+
+ /* an empty match just after replacement is invalid */
+
+ if (patbeg == noempty && patlen == 0) {
+ mtype = MT_IGNORE; /* invalid, not counted */
+ } else if (whichm == ++m || whichm == 0) {
+ mtype = patlen ? MT_REPLACE : MT_INSERT;
+ } else {
+ mtype = MT_IGNORE; /* unselected, but counted */
}
- if (pb > buf + bufsz)
- FATAL("sub result2 %.30s too big; can't happen", buf);
- setsval(x, buf); /* BUG: should be able to avoid copy */
- result = True;
- }
- tempfree(x);
- tempfree(y);
- free(buf);
- return result;
-}
-Cell *gsub(Node **a, int nnn) /* global substitute */
-{
- Cell *x, *y;
- char *rptr, *pb;
- const char *q, *t, *sptr;
- char *buf;
- fa *pfa;
- int mflag, tempstat, num;
- int bufsz = recsize;
- int charlen = 0;
+ /* leading text: */
+ if (patbeg > start) {
+ adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - start),
+ recsize, &pb, "dosub");
+ s = start;
+ while (s < patbeg)
+ *pb++ = *s++;
+ }
- if ((buf = (char *) malloc(bufsz)) == NULL)
- FATAL("out of memory in gsub");
- mflag = 0; /* if mflag == 0, can replace empty string */
- num = 0;
- x = execute(a[3]); /* target string */
- t = getsval(x);
- if (a[0] == NULL) /* 0 => a[1] is already-compiled regexpr */
- pfa = (fa *) a[1]; /* regular expression */
- else {
- y = execute(a[1]);
- pfa = makedfa(getsval(y), 1);
- tempfree(y);
- }
- y = execute(a[2]); /* replacement string */
- if (pmatch(pfa, t)) {
- tempstat = pfa->initstat;
- pfa->initstat = 2;
- pb = buf;
- rptr = getsval(y);
- do {
- if (patlen == 0 && *patbeg != '\0') { /* matched empty string */
- if (mflag == 0) { /* can replace empty */
- num++;
- sptr = rptr;
- while (*sptr != '\0') {
- adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub");
- if (*sptr == '\\') {
- backsub(&pb, &sptr);
- } else if (*sptr == '&') {
- sptr++;
- adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub");
- for (q = patbeg; q < patbeg+patlen; )
- *pb++ = *q++;
- } else
- *pb++ = *sptr++;
- }
- }
- if (*t == '\0') /* at end */
- goto done;
- adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gsub");
- charlen = u8_nextlen(t);
- while (charlen-- > 0)
- *pb++ = *t++;
- if (pb > buf + bufsz) /* BUG: not sure of this test */
- FATAL("gsub result0 %.30s too big; can't happen", buf);
- mflag = 0;
- }
- else { /* matched nonempty string */
- num++;
- sptr = t;
- adjbuf(&buf, &bufsz, 1+(patbeg-sptr)+pb-buf, recsize, &pb, "gsub");
- while (sptr < patbeg)
- *pb++ = *sptr++;
- sptr = rptr;
- while (*sptr != '\0') {
- adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub");
- if (*sptr == '\\') {
- backsub(&pb, &sptr);
- } else if (*sptr == '&') {
- sptr++;
- adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub");
- for (q = patbeg; q < patbeg+patlen; )
- *pb++ = *q++;
- } else
- *pb++ = *sptr++;
- }
- t = patbeg + patlen;
- if (patlen == 0 || *t == '\0' || *(t-1) == '\0')
- goto done;
- if (pb > buf + bufsz)
- FATAL("gsub result1 %.30s too big; can't happen", buf);
- mflag = 1;
+ if (mtype == MT_IGNORE)
+ goto matching_text; /* skip replacement text */
+
+ /* replacement text: '\\' is handed to backsub(), '&' copies the match */
+ r = repl;
+ while (*r != 0) {
+ adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "dosub");
+ if (*r == '\\') {
+ backsub(&pb, &r);
+ } else if (*r == '&') {
+ r++;
+ adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize,
+ &pb, "dosub");
+ for (s = patbeg; s < patbeg+patlen; )
+ *pb++ = *s++;
+ } else {
+ *pb++ = *r++;
}
- } while (pmatch(pfa,t));
- sptr = t;
- adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "gsub");
- while ((*pb++ = *sptr++) != '\0')
- continue;
- done: if (pb < buf + bufsz)
- *pb = '\0';
- else if (*(pb-1) != '\0')
- FATAL("gsub result2 %.30s truncated; can't happen", buf);
- setsval(x, buf); /* BUG: should be able to avoid copy + free */
+ }
+
+matching_text:
+ if (mtype == MT_REPLACE || *patbeg == '\0')
+ goto next_search; /* skip matching text */
+
+ /*
+ * The matched text is kept (insert or ignored match). For an
+ * empty match, widen patlen to u8_nextlen(patbeg) so that text
+ * is copied through and the next search starts beyond it.
+ * NOTE(review): u8_nextlen presumably returns the byte length
+ * of the UTF-8 character at patbeg -- confirm.
+ */
+ if (patlen == 0)
+ patlen = u8_nextlen(patbeg);
+ adjbuf(&buf, &bufsz, (pb-buf) + patlen, recsize, &pb, "dosub");
+ s = patbeg;
+ while (s < patbeg + patlen)
+ *pb++ = *s++;
+
+next_search:
+ start = patbeg + patlen;
+ /* stop once the selected match is done or the match began at the NUL */
+ if (m == whichm || *patbeg == '\0')
+ break;
+ if (mtype == MT_REPLACE)
+ noempty = start;
+
+ #undef MT_IGNORE
+ #undef MT_INSERT
+ #undef MT_REPLACE
+ }
+
+ xfree(repl);
+
+ if (buf != NULL) {
pfa->initstat = tempstat;
+
+ /* trailing text */
+ adjbuf(&buf, &bufsz, 1+strlen(start)+pb-buf, 0, &pb, "dosub");
+ while ((*pb++ = *start++) != '\0')
+ ;
+
+ setsval(x, buf);
+ free(buf);
}
+
tempfree(x);
- tempfree(y);
x = gettemp();
x->tval = NUM;
- x->fval = num;
- free(buf);
- return(x);
+ x->fval = m;
+ return x;
}
Cell *gensub(Node **a, int nnn) /* global selective substitute */