From: schwarze Date: Sun, 29 Jul 2018 11:27:14 +0000 (+0000) Subject: UTF-8 support: use wcwidth(3) when calculating column widths; X-Git-Url: http://artulab.com/gitweb/?a=commitdiff_plain;h=94b41d461e620318723ee4083327eebf272139d8;p=openbsd UTF-8 support: use wcwidth(3) when calculating column widths; written during g218; no objection when shown on tech@ --- diff --git a/usr.bin/lam/Makefile b/usr.bin/lam/Makefile index 956087cd7f2..74a2ab00bfa 100644 --- a/usr.bin/lam/Makefile +++ b/usr.bin/lam/Makefile @@ -1,5 +1,6 @@ -# $OpenBSD: Makefile,v 1.3 1997/09/21 11:49:24 deraadt Exp $ +# $OpenBSD: Makefile,v 1.4 2018/07/29 11:27:14 schwarze Exp $ PROG= lam +SRCS= lam.c utf8.c .include diff --git a/usr.bin/lam/lam.1 b/usr.bin/lam/lam.1 index fd9d5d03934..bbdeb1e10b4 100644 --- a/usr.bin/lam/lam.1 +++ b/usr.bin/lam/lam.1 @@ -1,4 +1,4 @@ -.\" $OpenBSD: lam.1,v 1.9 2016/01/04 23:21:28 schwarze Exp $ +.\" $OpenBSD: lam.1,v 1.10 2018/07/29 11:27:14 schwarze Exp $ .\" $NetBSD: lam.1,v 1.4 2002/02/08 01:36:25 ross Exp $ .\" .\" Copyright (c) 1993 @@ -30,7 +30,7 @@ .\" .\" @(#)lam.1 8.1 (Berkeley) 6/6/93 .\" -.Dd $Mdocdate: January 4 2016 $ +.Dd $Mdocdate: July 29 2018 $ .Dt LAM 1 .Os .Sh NAME @@ -74,8 +74,8 @@ is the minimum field width and the maximum field width. If .Ar min -begins with a zero, zeros will be added to make up the field width, -and if it begins with a +begins with a zero, zeros will be prepended to make up the field width +instead of blanks, and if it begins with a .Sq \&- , the fragment will be left-adjusted within the field. @@ -98,6 +98,22 @@ The newline normally appended to each output line is omitted. .Pp To print files simultaneously for easy viewing use .Xr pr 1 . +.Sh ENVIRONMENT +.Bl -tag -width LC_CTYPE +.It Ev LC_CTYPE +The character encoding +.Xr locale 1 . +It determines the display widths of characters used by the +.Fl f +and +.Fl p +options. 
+If unset or set to +.Qq C , +.Qq POSIX , +or an unsupported value, each byte is regarded as a character +of display width 1. +.El .Sh EXAMPLES Join four files together along each line: .Pp diff --git a/usr.bin/lam/lam.c b/usr.bin/lam/lam.c index 1e169eff265..9b84b17924f 100644 --- a/usr.bin/lam/lam.c +++ b/usr.bin/lam/lam.c @@ -1,4 +1,4 @@ -/* $OpenBSD: lam.c,v 1.21 2018/07/11 11:42:17 schwarze Exp $ */ +/* $OpenBSD: lam.c,v 1.22 2018/07/29 11:27:14 schwarze Exp $ */ /* $NetBSD: lam.c,v 1.2 1994/11/14 20:27:42 jtc Exp $ */ /*- @@ -39,6 +39,7 @@ #include #include +#include #include #include #include @@ -48,11 +49,13 @@ struct openfile { /* open file structure */ FILE *fp; /* file pointer */ + int minwidth; /* pad this column to this width */ + int maxwidth; /* truncate this column */ short eof; /* eof flag */ short pad; /* pad flag for missing columns */ char eol; /* end of line character */ + char align; /* '0' for zero fill, '-' for left align */ char *sepstring; /* string to print before each line */ - char *format; /* printf(3) style string spec. */ } input[NOFILE_MAX + 1]; /* last one is for the last -s arg. */ #define INPUTSIZE sizeof(input) / sizeof(*input) @@ -61,6 +64,8 @@ int nofinalnl; /* normally append \n to each output line */ char line[BIGBUFSIZ]; char *linep; +int mbswidth_truncate(char *, int); /* utf8.c */ + void usage(void); char *gatherline(struct openfile *); void getargs(int, char *[]); @@ -71,6 +76,8 @@ main(int argc, char *argv[]) { int i; + setlocale(LC_CTYPE, ""); + if (pledge("stdio rpath", NULL) == -1) err(1, "pledge"); @@ -106,9 +113,9 @@ void getargs(int argc, char *argv[]) { struct openfile *ip = input; - char *p; + const char *errstr; + char *p, *q; int ch, P, S, F, T; - size_t siz; P = S = F = T = 0; /* capitalized options */ while (optind < argc) { @@ -120,17 +127,28 @@ getargs(int argc, char *argv[]) case 'F': case 'f': F = (ch == 'F'); /* Validate format string argument. 
*/ - for (p = optarg; *p != '\0'; p++) - if (!isdigit((unsigned char)*p) && - *p != '.' && *p != '-') - errx(1, "%s: invalid width specified", - optarg); - /* '%' + width + 's' + '\0' */ - siz = p - optarg + 3; - if ((p = realloc(ip->format, siz)) == NULL) - err(1, NULL); - snprintf(p, siz, "%%%ss", optarg); - ip->format = p; + p = optarg; + if (*p == '0' || *p == '-') + ip->align = *p++; + else + ip->align = ' '; + if ((q = strchr(p, '.')) != NULL) + *q++ = '\0'; + if (*p != '\0') { + ip->minwidth = strtonum(p, 1, INT_MAX, + &errstr); + if (errstr != NULL) + errx(1, "minimum width is %s: %s", + errstr, p); + } + if (q != NULL) { + ip->maxwidth = strtonum(q, 1, INT_MAX, + &errstr); + if (errstr != NULL) + errx(1, "maximum width is %s: %s", + errstr, q); + } else + ip->maxwidth = INT_MAX; break; case 'S': case 's': S = (ch == 'S'); @@ -157,10 +175,16 @@ getargs(int argc, char *argv[]) ip->pad = P; if (ip->sepstring == NULL) ip->sepstring = S ? (ip-1)->sepstring : ""; - if (ip->format == NULL) - ip->format = (P || F) ? (ip-1)->format : "%s"; if (ip->eol == '\0') ip->eol = T ? (ip-1)->eol : '\n'; + if (ip->align == '\0') { + if (F || P) { + ip->align = (ip-1)->align; + ip->minwidth = (ip-1)->minwidth; + ip->maxwidth = (ip-1)->maxwidth; + } else + ip->maxwidth = INT_MAX; + } ip++; optind++; break; @@ -179,14 +203,14 @@ pad(struct openfile *ip) { size_t n; char *lp = linep; + int i = 0; n = strlcpy(lp, ip->sepstring, line + sizeof(line) - lp); lp += (n < line + sizeof(line) - lp) ? n : strlen(lp); - if (ip->pad) { - n = snprintf(lp, line + sizeof(line) - lp, ip->format, ""); - if (n > 0) - lp += (n < line + sizeof(line) - lp) ? 
n : strlen(lp); - } + if (ip->pad) + while (i++ < ip->minwidth && lp + 1 < line + sizeof(line)) + *lp++ = ' '; + *lp = '\0'; return (lp); } @@ -202,7 +226,7 @@ gatherline(struct openfile *ip) char *p; char *lp = linep; char *end = s + BUFSIZ - 1; - int c; + int c, width; if (ip->eof) return (pad(ip)); @@ -220,9 +244,16 @@ gatherline(struct openfile *ip) numfiles++; n = strlcpy(lp, ip->sepstring, line + sizeof(line) - lp); lp += (n < line + sizeof(line) - lp) ? n : strlen(lp); - n = snprintf(lp, line + sizeof(line) - lp, ip->format, s); - if (n > 0) - lp += (n < line + sizeof(line) - lp) ? n : strlen(lp); + width = mbswidth_truncate(s, ip->maxwidth); + if (ip->align != '-') + while (width++ < ip->minwidth && lp + 1 < line + sizeof(line)) + *lp++ = ip->align; + n = strlcpy(lp, s, line + sizeof(line) - lp); + lp += (n < line + sizeof(line) - lp) ? n : strlen(lp); + if (ip->align == '-') + while (width++ < ip->minwidth && lp + 1 < line + sizeof(line)) + *lp++ = ' '; + *lp = '\0'; return (lp); } diff --git a/usr.bin/lam/utf8.c b/usr.bin/lam/utf8.c new file mode 100644 index 00000000000..0f6892466a3 --- /dev/null +++ b/usr.bin/lam/utf8.c @@ -0,0 +1,47 @@ +/* $OpenBSD: utf8.c,v 1.1 2018/07/29 11:27:15 schwarze Exp $ */ +/* + * Copyright (c) 2018 Ingo Schwarze + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
/*
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <stdlib.h>
#include <wchar.h>

/*
 * Measure the display width of the multibyte string.
 * Treat invalid bytes and non-printable characters as width 1.
 * Truncate the string to a display width of maxwidth.
 * Return the total width, possibly after truncation.
 *
 * mbs is modified in place: when the next character would not fit,
 * a NUL byte is written at the start of that character.
 * Decoding uses the current LC_CTYPE locale via mbtowc(3).
 */
int
mbswidth_truncate(char *mbs, int maxwidth)
{
	wchar_t	 wc;
	int	 len, width, sum;

	sum = 0;
	while (*mbs != '\0') {
		if ((len = mbtowc(&wc, mbs, MB_CUR_MAX)) == -1) {
			/*
			 * Encoding error: return to the initial shift state
			 * so that the bytes after this one are decoded
			 * independently, then count the bad byte as one
			 * column.
			 */
			(void)mbtowc(NULL, NULL, MB_CUR_MAX);
			len = width = 1;
		} else if ((width = wcwidth(wc)) < 0)
			width = 1;	/* non-printable character */
		/*
		 * Same test as "sum + width > maxwidth", but phrased as a
		 * subtraction so it cannot overflow when maxwidth is INT_MAX.
		 */
		if (width > maxwidth - sum) {
			*mbs = '\0';
			break;
		}
		sum += width;
		mbs += len;
	}
	return sum;
}