GopherProxy

	rework URI handling - hurl - Gopher/HTTP/HTTPS file grabber
	git clone git://git.codemadness.org/hurl
	Log
	Files
	Refs
	README
	LICENSE
	---
	commit e8e1e1a7d09c614b57fac5070eb5c28822c948ba
	parent 5a9951db80a5e9b9f2d5ad7ca1c6efebbd00e11f
	Author: Hiltjo Posthuma <[email protected]>
	Date: Fri, 12 Mar 2021 22:22:13 +0100

	rework URI handling

	- Parse the URI in a more correct way following the Gopher URI RFC 4266 and
	General URI RFC 3986 - Uniform Resource Identifier (URI): Generic Syntax.
	- A URI fragment is no longer sent to the server.
	- A gopher type is now optional for an empty path, for example:
	"gopher://codemadness.org".

	Also, the use of strlcat() is removed, so the code should now be more portable.

	Diffstat:
	M hurl.c \| 175 ++++++++++++++++++++---------…

	1 file changed, 116 insertions(+), 59 deletions(-)
	---
	diff --git a/hurl.c b/hurl.c
	@@ -28,12 +28,15 @@
	#define TLS_CA_CERT_FILE "/etc/ssl/cert.pem"
	#endif

	-/* uri */
	+/* URI */
	struct uri {
	- char proto[48];
	+ char proto[48]; /* scheme including ":" or "://" */
	+ char userinfo[256]; /* username [:password] */
	char host[256];
	- char path[2048];
	- char port[6]; /* numeric port */
	+ char port[6]; /* numeric port */
	+ char path[1024];
	+ char query[1024];
	+ char fragment[1024];
	};

	char *argv0;
	@@ -61,70 +64,115 @@ sighandler(int signo)
	}

	int
	-parseuri(const char s, struct uri u)
	+uri_parse(const char s, struct uri u)
	{
	- const char p = s, b;
	- char *endptr = NULL;
	+ const char *p = s;
	+ char *endptr;
	size_t i;
	- unsigned long l;
	+ long l;

	- u->proto[0] = u->host[0] = u->path[0] = u->port[0] = '\0';
	- if (!*p)
	- return 0;
	+ u->proto[0] = u->userinfo[0] = u->host[0] = u->port[0] = '\0';
	+ u->path[0] = u->query[0] = u->fragment[0] = '\0';

	- /* protocol part */
	- for (p = s; p && (isalpha((unsigned char)p) \|\| isdigit((unsigned cha…
	- p == '+' \|\| p == '-' \|\| *p == '.'); p++)
	+ /* protocol-relative */
	+ if (p == '/' && (p + 1) == '/') {
	+ p += 2; /* skip "//" */
	+ goto parseauth;
	+ }
	+
	+ /* scheme / protocol part */
	+ for (; isalpha((unsigned char)p) \|\| isdigit((unsigned char)p) \|\|
	+ p == '+' \|\| p == '-' \|\| *p == '.'; p++)
	;
	- if (!strncmp(p, "://", 3)) {
	+ /* scheme, except if empty and starts with ":" then it is a path */
	+ if (*p == ':' && p != s) {
	+ if ((p + 1) == '/' && (p + 2) == '/')
	+ p += 3; /* skip "://" */
	+ else
	+ p++; /* skip ":" */
	+
	if ((size_t)(p - s) >= sizeof(u->proto))
	return -1; /* protocol too long */
	memcpy(u->proto, s, p - s);
	u->proto[p - s] = '\0';
	- p += 3; /* skip "://" */
	+
	+ if (*(p - 1) != '/')
	+ goto parsepath;
	} else {
	- return -1; /* no protocol specified */
	+ p = s; /* no scheme format, reset to start */
	+ goto parsepath;
	+ }
	+
	+parseauth:
	+ /* userinfo (username:password) */
	+ i = strcspn(p, "@/?#");
	+ if (p[i] == '@') {
	+ if (i >= sizeof(u->userinfo))
	+ return -1; /* userinfo too long */
	+ memcpy(u->userinfo, p, i);
	+ u->userinfo[i] = '\0';
	+ p += i + 1;
	}

	/* IPv6 address */
	if (*p == '[') {
	- /* bracket not found or host too long */
	- if (!(b = strchr(p, ']')) \|\| (size_t)(b - p) >= (ssize_t)sizeo…
	+ /* bracket not found, host too short or too long */
	+ i = strcspn(p, "]");
	+ if (p[i] != ']' \|\| i < 3)
	return -1;
	- memcpy(u->host, p + 1, b - p - 1);
	- u->host[b - p - 1] = '\0';
	- p = b + 1;
	+ i++; /* including "]" */
	} else {
	/* domain / host part, skip until port, path or end. */
	- if ((i = strcspn(p, ":/")) >= sizeof(u->host))
	- return -1; /* host too long */
	- memcpy(u->host, p, i);
	- u->host[i] = '\0';
	- p = &p[i];
	+ i = strcspn(p, ":/?#");
	}
	+ if (i >= sizeof(u->host))
	+ return -1; /* host too long */
	+ memcpy(u->host, p, i);
	+ u->host[i] = '\0';
	+ p += i;
	+
	/* port */
	if (*p == ':') {
	- if ((i = strcspn(++p, "/")) >= sizeof(u->port))
	+ p++;
	+ if ((i = strcspn(p, "/?#")) >= sizeof(u->port))
	return -1; /* port too long */
	memcpy(u->port, p, i);
	u->port[i] = '\0';
	- /* check for valid port: range 1 - 65535 */
	+ /* check for valid port: range 1 - 65535, may be empty */
	errno = 0;
	- l = strtoul(u->port, &endptr, 10);
	- if (errno \|\| u->port[0] == '\0' \|\| *endptr \|\|
	- !l \|\| l > 65535)
	+ l = strtol(u->port, &endptr, 10);
	+ if (i && (errno \|\| *endptr \|\| l <= 0 \|\| l > 65535))
	return -1;
	- p = &p[i];
	+ p += i;
	}
	- if (u->host[0]) {
	- p = &p[strspn(p, "/")];
	- memcpy(u->path, "/", 2);
	- } else {
	- return -1;
	+
	+parsepath:
	+ /* path */
	+ if ((i = strcspn(p, "?#")) >= sizeof(u->path))
	+ return -1; /* path too long */
	+ memcpy(u->path, p, i);
	+ u->path[i] = '\0';
	+ p += i;
	+
	+ /* query */
	+ if (*p == '?') {
	+ p++;
	+ if ((i = strcspn(p, "#")) >= sizeof(u->query))
	+ return -1; /* query too long */
	+ memcpy(u->query, p, i);
	+ u->query[i] = '\0';
	+ p += i;
	}
	- /* treat truncation as an error */
	- if (strlcat(u->path, p, sizeof(u->path)) >= sizeof(u->path))
	- return -1;
	+
	+ /* fragment */
	+ if (*p == '#') {
	+ p++;
	+ if ((i = strlen(p)) >= sizeof(u->fragment))
	+ return -1; /* fragment too long */
	+ memcpy(u->fragment, p, i);
	+ u->fragment[i] = '\0';
	+ }
	+
	return 0;
	}

	@@ -206,11 +254,14 @@ https_request(void)

	/* create and send HTTP header */
	r = snprintf(buf, sizeof(buf),
	- "GET %s HTTP/1.0\r\n"
	+ "GET %s%s%s HTTP/1.0\r\n"
	"Host: %s%s%s\r\n"
	"Connection: close\r\n"
	"%s%s"
	- "\r\n", u.path, u.host,
	+ "\r\n",
	+ u.path[0] ? u.path : "/",
	+ u.query[0] ? "?" : "", u.query,
	+ u.host,
	stdport ? "" : ":",
	stdport ? "" : u.port,
	config_headers, config_headers[0] ? "\r\n" : "");
	@@ -334,11 +385,14 @@ http_request(void)

	/* create and send HTTP header */
	r = snprintf(buf, sizeof(buf),
	- "GET %s HTTP/1.0\r\n"
	+ "GET %s%s%s HTTP/1.0\r\n"
	"Host: %s%s%s\r\n"
	"Connection: close\r\n"
	"%s%s"
	- "\r\n", u.path, u.host,
	+ "\r\n",
	+ u.path[0] ? u.path : "/",
	+ u.query[0] ? "?" : "", u.query,
	+ u.host,
	stdport ? "" : ":",
	stdport ? "" : u.port,
	config_headers, config_headers[0] ? "\r\n" : "");
	@@ -427,7 +481,7 @@ int
	gopher_request(void)
	{
	char buf[READ_BUF_SIZ], *p;
	- const char *errstr;
	+ const char errstr, path;
	size_t len = 0;
	ssize_t r;
	int fd = -1, ret = 1;
	@@ -440,8 +494,13 @@ gopher_request(void)
	if (pledge("stdio", NULL) == -1)
	err(1, "pledge");

	- /* create and send path, skip type part */
	- r = snprintf(buf, sizeof(buf), "%s\r\n", u.path + 2);
	+ /* create and send path, skip type part, empty path is allowed,
	+ see RFC 4266 The gopher URI Scheme - section 2.1 */
	+ path = u.path;
	+ if (path == '/' && path++)
	+ path++;
	+ r = snprintf(buf, sizeof(buf), "%s%s%s\r\n",
	+ path, u.query[0] ? "?" : "", u.query);
	if (r < 0 \|\| (size_t)r >= sizeof(buf)) {
	fprintf(stderr, "not writing header because it is truncated");
	goto err;
	@@ -623,8 +682,10 @@ main(int argc, char **argv)
	usage();

	url = argv[0];
	- if (parseuri(url, &u) == -1)
	- errx(1, "invalid url: %s", url);
	+ if (uri_parse(url, &u) == -1)
	+ errx(1, "invalid URL: %s", url);
	+ if (u.userinfo[0])
	+ errx(1, "userinfo field not supported in the URL: %s", url);

	if (config_timeout > 0) {
	signal(SIGALRM, sighandler);
	@@ -632,7 +693,7 @@ main(int argc, char **argv)
	err(1, "alarm");
	}

	- if (!strcmp(u.proto, "https")) {
	+ if (!strcmp(u.proto, "https://")) {
	if (tls_init())
	errx(1, "tls_init failed");
	if (!(tls_config = tls_config_new()))
	@@ -643,22 +704,18 @@ main(int argc, char **argv)
	errx(1, "tls set ciphers failed: %s",
	tls_config_error(tls_config));
	}
	- if (!u.port[0] && !strcmp(u.proto, "https"))
	+ if (!u.port[0])
	memcpy(u.port, "443", 4);
	statuscode = https_request();
	- } else if (!strcmp(u.proto, "http")) {
	+ } else if (!strcmp(u.proto, "http://")) {
	if (!u.port[0])
	memcpy(u.port, "80", 3);
	statuscode = http_request();
	- } else if (!strcmp(u.proto, "gopher")) {
	+ } else if (!strcmp(u.proto, "gopher://")) {
	if (!u.port[0])
	memcpy(u.port, "70", 3);
	-
	- if (u.path[0] != '/' \|\| u.path[1] == '\0')
	- errx(1, "must specify type");
	-
	statuscode = gopher_request();
	- } else if (!strcmp(u.proto, "gophers")) {
	+ } else if (!strcmp(u.proto, "gophers://")) {
	if (tls_init())
	errx(1, "tls_init failed");
	if (!(tls_config = tls_config_new()))