wc(1): accelerate word counting
author: cheloha <cheloha@openbsd.org>
Fri, 2 Sep 2022 15:21:40 +0000 (15:21 +0000)
committer: cheloha <cheloha@openbsd.org>
Fri, 2 Sep 2022 15:21:40 +0000 (15:21 +0000)
wc(1) counts a word whenever a whitespace byte is followed by a
non-whitespace byte.  Because the state machine transition occurs
within the space of a single byte we don't need to use getline(3).

Counting words in a big buffer with read(2) is much faster.  The
overhead varies with the length of a line, but for files with 60-100
byte lines, word counting is about twice as fast when we avoid
getline(3).  In the pathological case where each line is a single
byte, word counting is about ten times as fast when we avoid
getline(3).

Link1: https://marc.info/?l=openbsd-tech&m=163715995626532&w=2
Link2: https://marc.info/?l=openbsd-tech&m=165956826103639&w=2

"Seems reasonable." deraadt@

usr.bin/wc/wc.c

index 009933f..0e1e485 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: wc.c,v 1.29 2021/11/28 19:28:42 deraadt Exp $ */
+/*     $OpenBSD: wc.c,v 1.30 2022/09/02 15:21:40 cheloha Exp $ */
 
 /*
  * Copyright (c) 1980, 1987, 1991, 1993
@@ -145,16 +145,42 @@ cnt(const char *path)
                fd = STDIN_FILENO;
        }
 
-       if (!doword && !multibyte) {
+       if (!multibyte) {
                if (bufsz < _MAXBSIZE &&
                    (buf = realloc(buf, _MAXBSIZE)) == NULL)
                        err(1, NULL);
+
+               /*
+                * According to POSIX, a word is a "maximal string of
+                * characters delimited by whitespace."  Nothing is said
+                * about a character being printing or non-printing.
+                */
+               if (doword) {
+                       gotsp = 1;
+                       while ((len = read(fd, buf, _MAXBSIZE)) > 0) {
+                               charct += len;
+                               for (C = buf; len--; ++C) {
+                                       if (isspace((unsigned char)*C)) {
+                                               gotsp = 1;
+                                               if (*C == '\n')
+                                                       ++linect;
+                                       } else if (gotsp) {
+                                               gotsp = 0;
+                                               ++wordct;
+                                       }
+                               }
+                       }
+                       if (len == -1) {
+                               warn("%s", file);
+                               rval = 1;
+                       }
+               }
                /*
                 * Line counting is split out because it's a lot
                 * faster to get lines than to get words, since
                 * the word count requires some logic.
                 */
-               if (doline) {
+               else if (doline) {
                        while ((len = read(fd, buf, _MAXBSIZE)) > 0) {
                                charct += len;
                                for (C = buf; len--; ++C)
@@ -204,46 +230,26 @@ cnt(const char *path)
                        return;
                }
 
-               /*
-                * Do it the hard way.
-                * According to POSIX, a word is a "maximal string of
-                * characters delimited by whitespace."  Nothing is said
-                * about a character being printing or non-printing.
-                */
                gotsp = 1;
                while ((len = getline(&buf, &bufsz, stream)) > 0) {
-                       if (multibyte) {
-                               const char *end = buf + len;
-                               for (C = buf; C < end; C += len) {
-                                       ++charct;
-                                       len = mbtowc(&wc, C, MB_CUR_MAX);
-                                       if (len == -1) {
-                                               mbtowc(NULL, NULL,
-                                                   MB_CUR_MAX);
-                                               len = 1;
-                                               wc = L'?';
-                                       } else if (len == 0)
-                                               len = 1;
-                                       if (iswspace(wc)) {
-                                               gotsp = 1;
-                                               if (wc == L'\n')
-                                                       ++linect;
-                                       } else if (gotsp) {
-                                               gotsp = 0;
-                                               ++wordct;
-                                       }
-                               }
-                       } else {
-                               charct += len;
-                               for (C = buf; len--; ++C) {
-                                       if (isspace((unsigned char)*C)) {
-                                               gotsp = 1;
-                                               if (*C == '\n')
-                                                       ++linect;
-                                       } else if (gotsp) {
-                                               gotsp = 0;
-                                               ++wordct;
-                                       }
+                       const char *end = buf + len;
+                       for (C = buf; C < end; C += len) {
+                               ++charct;
+                               len = mbtowc(&wc, C, MB_CUR_MAX);
+                               if (len == -1) {
+                                       mbtowc(NULL, NULL,
+                                           MB_CUR_MAX);
+                                       len = 1;
+                                       wc = L'?';
+                               } else if (len == 0)
+                                       len = 1;
+                               if (iswspace(wc)) {
+                                       gotsp = 1;
+                                       if (wc == L'\n')
+                                               ++linect;
+                               } else if (gotsp) {
+                                       gotsp = 0;
+                                       ++wordct;
                                }
                        }
                }