From: cheloha <cheloha@openbsd.org>
Date: Fri, 2 Sep 2022 15:21:40 +0000 (+0000)
Subject: wc(1): accelerate word counting
X-Git-Url: http://artulab.com/gitweb/?a=commitdiff_plain;h=3a7fc93ec35e245ad13b0c5ca5bf7b5df572713a;p=openbsd

wc(1): accelerate word counting

wc(1) counts a word whenever a whitespace byte (or the start of the
input) is followed by a non-whitespace byte.  Because the state machine
transition occurs within the space of a single byte, we don't need to
use getline(3).

Counting words in a big buffer with read(2) is much faster.  The
overhead varies with the length of a line, but for files with 60-100
byte lines, word counting is about twice as fast when we avoid
getline(3).  In the pathological case where each line is a single
byte, word counting is about ten times as fast when we avoid
getline(3).

Link1: https://marc.info/?l=openbsd-tech&m=163715995626532&w=2
Link2: https://marc.info/?l=openbsd-tech&m=165956826103639&w=2

"Seems reasonable." deraadt@
---

diff --git a/usr.bin/wc/wc.c b/usr.bin/wc/wc.c
index 009933ff802..0e1e4856827 100644
--- a/usr.bin/wc/wc.c
+++ b/usr.bin/wc/wc.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: wc.c,v 1.29 2021/11/28 19:28:42 deraadt Exp $	*/
+/*	$OpenBSD: wc.c,v 1.30 2022/09/02 15:21:40 cheloha Exp $	*/
 
 /*
  * Copyright (c) 1980, 1987, 1991, 1993
@@ -145,16 +145,42 @@ cnt(const char *path)
 		fd = STDIN_FILENO;
 	}
 
-	if (!doword && !multibyte) {
+	if (!multibyte) {
 		if (bufsz < _MAXBSIZE &&
 		    (buf = realloc(buf, _MAXBSIZE)) == NULL)
 			err(1, NULL);
+
+		/*
+		 * According to POSIX, a word is a "maximal string of
+		 * characters delimited by whitespace."  Nothing is said
+		 * about a character being printing or non-printing.
+		 */
+		if (doword) {
+			gotsp = 1;
+			while ((len = read(fd, buf, _MAXBSIZE)) > 0) {
+				charct += len;
+				for (C = buf; len--; ++C) {
+					if (isspace((unsigned char)*C)) {
+						gotsp = 1;
+						if (*C == '\n')
+							++linect;
+					} else if (gotsp) {
+						gotsp = 0;
+						++wordct;
+					}
+				}
+			}
+			if (len == -1) {
+				warn("%s", file);
+				rval = 1;
+			}
+		}
 		/*
 		 * Line counting is split out because it's a lot
 		 * faster to get lines than to get words, since
 		 * the word count requires some logic.
 		 */
-		if (doline) {
+		else if (doline) {
 			while ((len = read(fd, buf, _MAXBSIZE)) > 0) {
 				charct += len;
 				for (C = buf; len--; ++C)
@@ -204,46 +230,26 @@ cnt(const char *path)
 			return;
 		}
 
-		/*
-		 * Do it the hard way.
-		 * According to POSIX, a word is a "maximal string of
-		 * characters delimited by whitespace."  Nothing is said
-		 * about a character being printing or non-printing.
-		 */
 		gotsp = 1;
 		while ((len = getline(&buf, &bufsz, stream)) > 0) {
-			if (multibyte) {
-				const char *end = buf + len;
-				for (C = buf; C < end; C += len) {
-					++charct;
-					len = mbtowc(&wc, C, MB_CUR_MAX);
-					if (len == -1) {
-						mbtowc(NULL, NULL,
-						    MB_CUR_MAX);
-						len = 1;
-						wc = L'?';
-					} else if (len == 0)
-						len = 1;
-					if (iswspace(wc)) {
-						gotsp = 1;
-						if (wc == L'\n')
-							++linect;
-					} else if (gotsp) {
-						gotsp = 0;
-						++wordct;
-					}
-				}
-			} else {
-				charct += len;
-				for (C = buf; len--; ++C) {
-					if (isspace((unsigned char)*C)) {
-						gotsp = 1;
-						if (*C == '\n')
-							++linect;
-					} else if (gotsp) {
-						gotsp = 0;
-						++wordct;
-					}
+			const char *end = buf + len;
+			for (C = buf; C < end; C += len) {
+				++charct;
+				len = mbtowc(&wc, C, MB_CUR_MAX);
+				if (len == -1) {
+					mbtowc(NULL, NULL,
+					    MB_CUR_MAX);
+					len = 1;
+					wc = L'?';
+				} else if (len == 0)
+					len = 1;
+				if (iswspace(wc)) {
+					gotsp = 1;
+					if (wc == L'\n')
+						++linect;
+				} else if (gotsp) {
+					gotsp = 0;
+					++wordct;
 				}
 			}
 		}