From: cheloha Date: Fri, 2 Sep 2022 15:21:40 +0000 (+0000) Subject: wc(1): accelerate word counting X-Git-Url: http://artulab.com/gitweb/?a=commitdiff_plain;h=3a7fc93ec35e245ad13b0c5ca5bf7b5df572713a;p=openbsd wc(1): accelerate word counting wc(1) counts a word whenever a whitespace byte is followed by a non-whitespace byte. Because the state machine transition occurs within the space of a single byte we don't need to use getline(3). Counting words in a big buffer with read(2) is much faster. The overhead varies with the length of a line, but for files with 60-100 byte lines, word counting is about twice as fast when we avoid getline(3). In the pathological case where each line is a single byte, word counting is about ten times as fast when we avoid getline(3). Link1: https://marc.info/?l=openbsd-tech&m=163715995626532&w=2 Link2: https://marc.info/?l=openbsd-tech&m=165956826103639&w=2 "Seems reasonable." deraadt@ --- diff --git a/usr.bin/wc/wc.c b/usr.bin/wc/wc.c index 009933ff802..0e1e4856827 100644 --- a/usr.bin/wc/wc.c +++ b/usr.bin/wc/wc.c @@ -1,4 +1,4 @@ -/* $OpenBSD: wc.c,v 1.29 2021/11/28 19:28:42 deraadt Exp $ */ +/* $OpenBSD: wc.c,v 1.30 2022/09/02 15:21:40 cheloha Exp $ */ /* * Copyright (c) 1980, 1987, 1991, 1993 @@ -145,16 +145,42 @@ cnt(const char *path) fd = STDIN_FILENO; } - if (!doword && !multibyte) { + if (!multibyte) { if (bufsz < _MAXBSIZE && (buf = realloc(buf, _MAXBSIZE)) == NULL) err(1, NULL); + + /* + * According to POSIX, a word is a "maximal string of + * characters delimited by whitespace." Nothing is said + * about a character being printing or non-printing. + */ + if (doword) { + gotsp = 1; + while ((len = read(fd, buf, _MAXBSIZE)) > 0) { + charct += len; + for (C = buf; len--; ++C) { + if (isspace((unsigned char)*C)) { + gotsp = 1; + if (*C == '\n') + ++linect; + } else if (gotsp) { + gotsp = 0; + ++wordct; + } + } + } + if (len == -1) { + warn("%s", file); + rval = 1; + } + } /* * Line counting is split out because it's a lot * faster to get lines than to get words, since * the word count requires some logic. */ - if (doline) { + else if (doline) { while ((len = read(fd, buf, _MAXBSIZE)) > 0) { charct += len; for (C = buf; len--; ++C) @@ -204,46 +230,26 @@ cnt(const char *path) return; } - /* - * Do it the hard way. - * According to POSIX, a word is a "maximal string of - * characters delimited by whitespace." Nothing is said - * about a character being printing or non-printing. - */ gotsp = 1; while ((len = getline(&buf, &bufsz, stream)) > 0) { - if (multibyte) { - const char *end = buf + len; - for (C = buf; C < end; C += len) { - ++charct; - len = mbtowc(&wc, C, MB_CUR_MAX); - if (len == -1) { - mbtowc(NULL, NULL, - MB_CUR_MAX); - len = 1; - wc = L'?'; - } else if (len == 0) - len = 1; - if (iswspace(wc)) { - gotsp = 1; - if (wc == L'\n') - ++linect; - } else if (gotsp) { - gotsp = 0; - ++wordct; - } - } - } else { - charct += len; - for (C = buf; len--; ++C) { - if (isspace((unsigned char)*C)) { - gotsp = 1; - if (*C == '\n') - ++linect; - } else if (gotsp) { - gotsp = 0; - ++wordct; - } + const char *end = buf + len; + for (C = buf; C < end; C += len) { + ++charct; + len = mbtowc(&wc, C, MB_CUR_MAX); + if (len == -1) { + mbtowc(NULL, NULL, + MB_CUR_MAX); + len = 1; + wc = L'?'; + } else if (len == 0) + len = 1; + if (iswspace(wc)) { + gotsp = 1; + if (wc == L'\n') + ++linect; + } else if (gotsp) { + gotsp = 0; + ++wordct; } } }