From 7138ccd3b23521296a2a22c60c27a7bf927c5815 Mon Sep 17 00:00:00 2001
From: kn <kn@openbsd.org>
Date: Sat, 6 Nov 2021 14:27:45 +0000
Subject: [PATCH] Stop URL encoding the tilde character

RFC 1738 Uniform Resource Locators (URL) lists tilde as an unsafe character.
RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax updates it to

	The tilde "~" character was added to those in the "unreserved" set,
	since it is extensively used on the Internet in spite of the
	difficulty to transcribe it with some keyboards.

In theory, this shouldn't make a difference, but some servers do not decode
"%7e" and thus erroneously serve a 404.

RFC 2396 2.4.2. When to Escape and Unescape says:

	In some cases, data that could be represented by an unreserved
	character may appear escaped; for example, some of the unreserved
	"mark" characters are automatically escaped by some systems.  If the
	given URI scheme defines a canonicalization algorithm, then
	unreserved characters may be unescaped according to that algorithm.
	For example, "%7e" is sometimes used instead of "~" in an http URL
	path, but the two are equivalent for an http URL.


Update ftp(1) to RFC 2396 by no longer treating "~" as an unsafe character.
This is effectively a one-character diff; update comments accordingly as
well as the order of characters to ease code-to-standard comparison.

This matches curl(1) and wget(1) behaviour wrt. encoding of "~".

OK sthen
---
 usr.bin/ftp/fetch.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/usr.bin/ftp/fetch.c b/usr.bin/ftp/fetch.c
index c83ee02ae36..cfc68b08b02 100644
--- a/usr.bin/ftp/fetch.c
+++ b/usr.bin/ftp/fetch.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: fetch.c,v 1.205 2021/08/31 09:51:25 claudio Exp $	*/
+/*	$OpenBSD: fetch.c,v 1.206 2021/11/06 14:27:45 kn Exp $	*/
 /*	$NetBSD: fetch.c,v 1.14 1997/08/18 10:20:20 lukem Exp $	*/
 
 /*-
@@ -106,14 +106,17 @@ static int	redirect_loop;
 static int	retried;
 
 /*
- * Determine whether the character needs encoding, per RFC1738:
- *	- No corresponding graphic US-ASCII.
- *	- Unsafe characters.
+ * Determine whether the character needs encoding, per RFC2396.
  */
 static int
-unsafe_char(const char *c0)
+to_encode(const char *c0)
 {
-	const char *unsafe_chars = " <>\"#{}|\\^~[]`";
+	/* 2.4.3. Excluded US-ASCII Characters */
+	const char *excluded_chars =
+	    " "		/* space */
+	    "<>#\""	/* delims (modulo "%", see below) */
+	    "{}|\\^[]`"	/* unwise */
+	    ;
 	const unsigned char *c = (const unsigned char *)c0;
 
 	/*
@@ -123,16 +126,15 @@ unsafe_char(const char *c0)
 	return (iscntrl(*c) || !isascii(*c) ||
 
 	    /*
-	     * Unsafe characters.
-	     * '%' is also unsafe, if is not followed by two
+	     * '%' is also reserved, if it is not followed by two
 	     * hexadecimal digits.
 	     */
-	    strchr(unsafe_chars, *c) != NULL ||
+	    strchr(excluded_chars, *c) != NULL ||
 	    (*c == '%' && (!isxdigit(c[1]) || !isxdigit(c[2]))));
 }
 
 /*
- * Encode given URL, per RFC1738.
+ * Encode given URL, per RFC2396.
  * Allocate and return string to the caller.
  */
 static char *
@@ -145,11 +147,10 @@ url_encode(const char *path)
 
 	/*
 	 * First pass:
-	 * Count unsafe characters, and determine length of the
-	 * final URL.
+	 * Count characters to encode and determine length of the final URL.
 	 */
 	for (i = 0; i < length; i++)
-		if (unsafe_char(path + i))
+		if (to_encode(path + i))
 			new_length += 2;
 
 	epath = epathp = malloc(new_length + 1);	/* One more for '\0'. */
@@ -161,7 +162,7 @@ url_encode(const char *path)
 	 * Encode, and copy final URL.
 	 */
 	for (i = 0; i < length; i++)
-		if (unsafe_char(path + i)) {
+		if (to_encode(path + i)) {
 			snprintf(epathp, 4, "%%" "%02x",
 			    (unsigned char)path[i]);
 			epathp += 3;
-- 
2.20.1