From 0a3faa2e7b5822ce788c345a7c75cd9b9b9d28e4 Mon Sep 17 00:00:00 2001 From: martijn Date: Mon, 11 Dec 2017 13:25:57 +0000 Subject: [PATCH] Fix and change y command in the following ways: - When 'n' is used as a delimiter escaping 'n' will remain a newline instead of becoming a 'n' character. This is how POSIX specifies how this should work. Other implementations tested also do this wrong. - '[' and maybe other characters are not special during the parsing of the y command and don't need to be matched or treated special in any way. - POSIX specifies that a backslash followed by anything other than the delimiter, 'n', and another backslash, as well as repeating characters in string1 are unspecified. Since the various implementations handle these situations in opposing ways choose to error out on them to prevent people falling into the pitfall of expecting identical behaviour on various implementations. Inspired by the sed.1 patch by kshe59 zoho eu Feedback and OK millert@ Manpage bits OK jmc@ --- usr.bin/sed/compile.c | 102 ++++++++++++++++++++++++------------------ usr.bin/sed/sed.1 | 25 ++++++++--- 2 files changed, 78 insertions(+), 49 deletions(-) diff --git a/usr.bin/sed/compile.c b/usr.bin/sed/compile.c index f994049ac1b..c7a69f8248c 100644 --- a/usr.bin/sed/compile.c +++ b/usr.bin/sed/compile.c @@ -1,4 +1,4 @@ -/* $OpenBSD: compile.c,v 1.43 2017/12/08 18:41:59 martijn Exp $ */ +/* $OpenBSD: compile.c,v 1.44 2017/12/11 13:25:57 martijn Exp $ */ /*- * Copyright (c) 1992 Diomidis Spinellis. @@ -59,7 +59,7 @@ static struct labhash { static char *compile_addr(char *, struct s_addr *); static char *compile_ccl(char **, char *); -static char *compile_delimited(char *, char *, int); +static char *compile_delimited(char *, char *); static char *compile_flags(char *, struct s_subst *); static char *compile_re(char *, regex_t **); static char *compile_subst(char *, struct s_subst *); @@ -351,7 +351,7 @@ nonsel: /* Now parse the command */ * with the processed string. 
*/ static char * -compile_delimited(char *p, char *d, int is_tr) +compile_delimited(char *p, char *d) { char c; @@ -376,10 +376,7 @@ compile_delimited(char *p, char *d, int is_tr) p += 2; continue; } else if (*p == '\\' && p[1] == '\\') { - if (is_tr) - p++; - else - *d++ = *p++; + *d++ = *p++; } else if (*p == c) { *d = '\0'; return (p + 1); @@ -436,7 +433,7 @@ compile_re(char *p, regex_t **repp) char *re; re = xmalloc(strlen(p) + 1); /* strlen(re) <= strlen(p) */ - p = compile_delimited(p, re, 0); + p = compile_delimited(p, re); if (p && strlen(re) == 0) { *repp = NULL; free(re); @@ -603,46 +600,63 @@ compile_flags(char *p, struct s_subst *s) * Compile a translation set of strings into a lookup table. */ static char * -compile_tr(char *p, char **transtab) +compile_tr(char *old, char **transtab) { int i; - char *lt, *op, *np; - char *old = NULL, *new = NULL; - - if (*p == '\0' || *p == '\\') - error(COMPILE, -"transform pattern can not be delimited by newline or backslash"); - old = xmalloc(strlen(p) + 1); - p = compile_delimited(p, old, 1); - if (p == NULL) { - error(COMPILE, "unterminated transform source string"); - goto bad; - } - new = xmalloc(strlen(p) + 1); - p = compile_delimited(--p, new, 1); - if (p == NULL) { - error(COMPILE, "unterminated transform target string"); - goto bad; - } - EATSPACE(); - if (strlen(new) != strlen(old)) { - error(COMPILE, "transform strings are not the same length"); - goto bad; - } + char delimiter, check[UCHAR_MAX + 1]; /* one flag per u_char value */ + char *new, *end; + + memset(check, 0, sizeof(check)); + delimiter = *old; + if (delimiter == '\\') + error(COMPILE, "\\ can not be used as a string delimiter"); + else if (delimiter == '\n' || delimiter == '\0') + error(COMPILE, "newline can not be used as a string delimiter"); + + new = old++; + do { + if ((new = strchr(new + 1, delimiter)) == NULL) + error(COMPILE, "unterminated transform source string"); + } while (*(new - 1) == '\\' && *(new -2) != '\\'); + *new = '\0'; + end = new++; + do { + if ((end = 
strchr(end + 1, delimiter)) == NULL) + error(COMPILE, "unterminated transform target string"); + } while (*(end -1) == '\\' && *(end -2) != '\\'); + *end = '\0'; + /* We assume characters are 8 bits */ - lt = xmalloc(UCHAR_MAX + 1); + *transtab = xmalloc(UCHAR_MAX + 1); for (i = 0; i <= UCHAR_MAX; i++) - lt[i] = (char)i; - for (op = old, np = new; *op; op++, np++) - lt[(u_char)*op] = *np; - *transtab = lt; - free(old); - free(new); - return (p); -bad: - free(old); - free(new); - return (NULL); + (*transtab)[i] = (char)i; + + while (*old != '\0' && *new != '\0') { + if (*old == '\\') { + old++; + if (*old == 'n') + *old = '\n'; + else if (*old != delimiter && *old != '\\') + error(COMPILE, "Unexpected character after " + "backslash"); + + } + if (*new == '\\') { + new++; + if (*new == 'n') + *new = '\n'; + else if (*new != delimiter && *new != '\\') + error(COMPILE, "Unexpected character after " + "backslash"); + } + if (check[(u_char)*old] == 1) /* cast: plain char may be negative */ + error(COMPILE, "Repeated character in source string"); + check[(u_char)*old] = 1; + (*transtab)[(u_char)*old++] = *new++; + } + if (*old != '\0' || *new != '\0') + error(COMPILE, "transform strings are not the same length"); + return end + 1; } /* diff --git a/usr.bin/sed/sed.1 b/usr.bin/sed/sed.1 index 5e91484f217..fa257d8ae95 100644 --- a/usr.bin/sed/sed.1 +++ b/usr.bin/sed/sed.1 @@ -1,4 +1,4 @@ -.\" $OpenBSD: sed.1,v 1.52 2017/12/08 18:41:59 martijn Exp $ +.\" $OpenBSD: sed.1,v 1.53 2017/12/11 13:25:57 martijn Exp $ .\" .\" Copyright (c) 1992, 1993 .\" The Regents of the University of California. All rights reserved. @@ -32,7 +32,7 @@ .\" .\" from: @(#)sed.1 8.2 (Berkeley) 12/30/93 .\" -.Dd $Mdocdate: December 8 2017 $ +.Dd $Mdocdate: December 11 2017 $ .Dt SED 1 .Os .Sh NAME @@ -482,14 +482,29 @@ in the pattern space with the corresponding characters from .Ar string2 . Any character other than a backslash or newline can be used instead of a slash to delimit the strings. 
+.Pp Within .Ar string1 and .Ar string2 , -a backslash followed by any character other than a newline is that literal -character, and a backslash followed by an +a backslash followed by another backslash +is replaced by a single backslash, +a backslash followed by an +.Sq n +is replaced by a newline character, +and a backslash followed by the delimiting character +is replaced by that character, +causing it to be treated literally, +with the exception of the .Sq n -is replaced by a newline character. +character, +which will still be treated like a newline character. +It is an error for a backslash to not be followed by another backslash, +.Sq n , +or the delimiting character, +or for +.Ar string1 +to contain repeating characters. .It [0addr] Ns Ic \&: Ns Ar label This function does nothing; it bears a .Ar label -- 2.20.1