GopherProxy

	sync URL parsing code - gopherproxy-c - Gopher HTTP proxy in C (CGI)
	git clone git://git.codemadness.org/gopherproxy-c
	Log
	Files
	Refs
	README
	LICENSE
	---
	commit e9b0ad3f6eaef101ec93e70846460f9a4127e129
	parent ee13891f6be12921f48b361b571de30442b0f87b
	Author: Hiltjo Posthuma <[email protected]>
	Date: Sat, 19 Mar 2022 11:31:12 +0100

	sync URL parsing code

	Diffstat:
	M gopherproxy.c | 155 ++++++++++++++++++++++++-----…

	1 file changed, 119 insertions(+), 36 deletions(-)
	---
	diff --git a/gopherproxy.c b/gopherproxy.c
	@@ -18,10 +18,15 @@
	#define pledge(a,b) 0
	#endif

	+/* URI */
	struct uri {
	+ char proto[48]; /* scheme including ":" or "://" */
	+ char userinfo[256]; /* username [:password] */
	char host[256];
	- char port[8];
	+ char port[6]; /* numeric port */
	char path[1024];
	+ char query[1024];
	+ char fragment[1024];
	};

	struct visited {
	@@ -447,53 +452,130 @@ checkparam(const char *s)
	return 1;
	}

	+/* check if string has a non-empty scheme / protocol part */
	int
	-parseuri(const char *str, struct uri *u)
	+uri_hasscheme(const char *s)
	{
	- const char *s, *e;
	+ const char *p = s;

	- memset(u, 0, sizeof(struct uri));
	+ for (; isalpha((unsigned char)*p) || isdigit((unsigned char)*p) ||
	+ *p == '+' || *p == '-' || *p == '.'; p++)
	+ ;
	+ /* scheme, except if empty and starts with ":" then it is a path */
	+ return (*p == ':' && p != s);
	+}

	- s = str;
	+int
	+uri_parse(const char *s, struct uri *u)
	+{
	+ const char *p = s;
	+ char *endptr;
	+ size_t i;
	+ long l;

	- /* IPv6 */
	- if (*s == '[') {
	- s++;
	- e = strchr(s, ']');
	- if (!e || e - s + 1 >= sizeof(u->host))
	- return 0;
	- memcpy(u->host, s, e - s);
	- u->host[e - s] = '\0';
	- e++;
	+ u->proto[0] = u->userinfo[0] = u->host[0] = u->port[0] = '\0';
	+ u->path[0] = u->query[0] = u->fragment[0] = '\0';
	+
	+ /* protocol-relative */
	+ if (*p == '/' && *(p + 1) == '/') {
	+ p += 2; /* skip "//" */
	+ goto parseauth;
	+ }
	+
	+ /* scheme / protocol part */
	+ for (; isalpha((unsigned char)*p) || isdigit((unsigned char)*p) ||
	+ *p == '+' || *p == '-' || *p == '.'; p++)
	+ ;
	+ /* scheme, except if empty and starts with ":" then it is a path */
	+ if (*p == ':' && p != s) {
	+ if (*(p + 1) == '/' && *(p + 2) == '/')
	+ p += 3; /* skip "://" */
	+ else
	+ p++; /* skip ":" */
	+
	+ if ((size_t)(p - s) >= sizeof(u->proto))
	+ return -1; /* protocol too long */
	+ memcpy(u->proto, s, p - s);
	+ u->proto[p - s] = '\0';
	+
	+ if (*(p - 1) != '/')
	+ goto parsepath;
	} else {
	- e = &s[strcspn(s, ":/")];
	- if (e - s + 1 >= sizeof(u->host))
	- return 0;
	- memcpy(u->host, s, e - s);
	- u->host[e - s] = '\0';
	+ p = s; /* no scheme format, reset to start */
	+ goto parsepath;
	}

	- if (*e == ':') {
	- s = e + 1;
	- e = &s[strcspn(s, "/")];
	+parseauth:
	+ /* userinfo (username:password) */
	+ i = strcspn(p, "@/?#");
	+ if (p[i] == '@') {
	+ if (i >= sizeof(u->userinfo))
	+ return -1; /* userinfo too long */
	+ memcpy(u->userinfo, p, i);
	+ u->userinfo[i] = '\0';
	+ p += i + 1;
	+ }

	- if (e - s + 1 >= sizeof(u->port))
	- return 0;
	- memcpy(u->port, s, e - s);
	- u->port[e - s] = '\0';
	+ /* IPv6 address */
	+ if (*p == '[') {
	+ /* bracket not found, host too short or too long */
	+ i = strcspn(p, "]");
	+ if (p[i] != ']' || i < 3)
	+ return -1;
	+ i++; /* including "]" */
	+ } else {
	+ /* domain / host part, skip until port, path or end. */
	+ i = strcspn(p, ":/?#");
	+ }
	+ if (i >= sizeof(u->host))
	+ return -1; /* host too long */
	+ memcpy(u->host, p, i);
	+ u->host[i] = '\0';
	+ p += i;
	+
	+ /* port */
	+ if (*p == ':') {
	+ p++;
	+ if ((i = strcspn(p, "/?#")) >= sizeof(u->port))
	+ return -1; /* port too long */
	+ memcpy(u->port, p, i);
	+ u->port[i] = '\0';
	+ /* check for valid port: range 1 - 65535, may be empty */
	+ errno = 0;
	+ l = strtol(u->port, &endptr, 10);
	+ if (i && (errno || *endptr || l <= 0 || l > 65535))
	+ return -1;
	+ p += i;
	}
	- if (*e && *e != '/')
	- return 0; /* invalid path */

	- s = e;
	- e = s + strlen(s);
	+parsepath:
	+ /* path */
	+ if ((i = strcspn(p, "?#")) >= sizeof(u->path))
	+ return -1; /* path too long */
	+ memcpy(u->path, p, i);
	+ u->path[i] = '\0';
	+ p += i;
	+
	+ /* query */
	+ if (*p == '?') {
	+ p++;
	+ if ((i = strcspn(p, "#")) >= sizeof(u->query))
	+ return -1; /* query too long */
	+ memcpy(u->query, p, i);
	+ u->query[i] = '\0';
	+ p += i;
	+ }

	- if (e - s + 1 >= sizeof(u->path))
	- return 0;
	- memcpy(u->path, s, e - s);
	- u->path[e - s] = '\0';
	+ /* fragment */
	+ if (*p == '#') {
	+ p++;
	+ if ((i = strlen(p)) >= sizeof(u->fragment))
	+ return -1; /* fragment too long */
	+ memcpy(u->fragment, p, i);
	+ u->fragment[i] = '\0';
	+ }

	- return 1;
	+ return 0;
	}

	int
	@@ -527,7 +609,8 @@ main(void)
	else
	uri = query;

	- if (!parseuri(uri, &u))
	+ if (!uri_hasscheme(uri) ||
	+ uri_parse(uri, &u) == -1)
	die(400, "Invalid uri: %s\n", uri);
	if (u.host[0] == '\0')
	die(400, "Invalid hostname\n");