sync URL parsing code - gopherproxy-c - Gopher HTTP proxy in C (CGI) | |
git clone git://git.codemadness.org/gopherproxy-c | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit e9b0ad3f6eaef101ec93e70846460f9a4127e129 | |
parent ee13891f6be12921f48b361b571de30442b0f87b | |
Author: Hiltjo Posthuma <[email protected]> | |
Date: Sat, 19 Mar 2022 11:31:12 +0100 | |
sync URL parsing code | |
Diffstat: | |
M gopherproxy.c | 155 ++++++++++++++++++++++++-----… | |
1 file changed, 119 insertions(+), 36 deletions(-) | |
--- | |
diff --git a/gopherproxy.c b/gopherproxy.c | |
@@ -18,10 +18,15 @@ | |
#define pledge(a,b) 0 | |
#endif | |
+/* URI */ | |
struct uri { | |
+ char proto[48]; /* scheme including ":" or "://" */ | |
+ char userinfo[256]; /* username [:password] */ | |
char host[256]; | |
- char port[8]; | |
+ char port[6]; /* numeric port */ | |
char path[1024]; | |
+ char query[1024]; | |
+ char fragment[1024]; | |
}; | |
struct visited { | |
@@ -447,53 +452,130 @@ checkparam(const char *s) | |
return 1; | |
} | |
+/* check if string has a non-empty scheme / protocol part */ | |
int | |
-parseuri(const char *str, struct uri *u) | |
+uri_hasscheme(const char *s) | |
{ | |
- const char *s, *e; | |
+ const char *p = s; | |
- memset(u, 0, sizeof(struct uri)); | |
+ for (; isalpha((unsigned char)*p) || isdigit((unsigned char)*p) || | |
+ *p == '+' || *p == '-' || *p == '.'; p++) | |
+ ; | |
+ /* scheme, except if empty and starts with ":" then it is a path */ | |
+ return (*p == ':' && p != s); | |
+} | |
- s = str; | |
+int | |
+uri_parse(const char *s, struct uri *u) | |
+{ | |
+ const char *p = s; | |
+ char *endptr; | |
+ size_t i; | |
+ long l; | |
- /* IPv6 */ | |
- if (*s == '[') { | |
- s++; | |
- e = strchr(s, ']'); | |
- if (!e || e - s + 1 >= sizeof(u->host)) | |
- return 0; | |
- memcpy(u->host, s, e - s); | |
- u->host[e - s] = '\0'; | |
- e++; | |
+ u->proto[0] = u->userinfo[0] = u->host[0] = u->port[0] = '\0'; | |
+ u->path[0] = u->query[0] = u->fragment[0] = '\0'; | |
+ | |
+ /* protocol-relative */ | |
+ if (*p == '/' && *(p + 1) == '/') { | |
+ p += 2; /* skip "//" */ | |
+ goto parseauth; | |
+ } | |
+ | |
+ /* scheme / protocol part */ | |
+ for (; isalpha((unsigned char)*p) || isdigit((unsigned char)*p) || | |
+ *p == '+' || *p == '-' || *p == '.'; p++) | |
+ ; | |
+ /* scheme, except if empty and starts with ":" then it is a path */ | |
+ if (*p == ':' && p != s) { | |
+ if (*(p + 1) == '/' && *(p + 2) == '/') | |
+ p += 3; /* skip "://" */ | |
+ else | |
+ p++; /* skip ":" */ | |
+ | |
+ if ((size_t)(p - s) >= sizeof(u->proto)) | |
+ return -1; /* protocol too long */ | |
+ memcpy(u->proto, s, p - s); | |
+ u->proto[p - s] = '\0'; | |
+ | |
+ if (*(p - 1) != '/') | |
+ goto parsepath; | |
} else { | |
- e = &s[strcspn(s, ":/")]; | |
- if (e - s + 1 >= sizeof(u->host)) | |
- return 0; | |
- memcpy(u->host, s, e - s); | |
- u->host[e - s] = '\0'; | |
+ p = s; /* no scheme format, reset to start */ | |
+ goto parsepath; | |
} | |
- if (*e == ':') { | |
- s = e + 1; | |
- e = &s[strcspn(s, "/")]; | |
+parseauth: | |
+ /* userinfo (username:password) */ | |
+ i = strcspn(p, "@/?#"); | |
+ if (p[i] == '@') { | |
+ if (i >= sizeof(u->userinfo)) | |
+ return -1; /* userinfo too long */ | |
+ memcpy(u->userinfo, p, i); | |
+ u->userinfo[i] = '\0'; | |
+ p += i + 1; | |
+ } | |
- if (e - s + 1 >= sizeof(u->port)) | |
- return 0; | |
- memcpy(u->port, s, e - s); | |
- u->port[e - s] = '\0'; | |
+ /* IPv6 address */ | |
+ if (*p == '[') { | |
+ /* bracket not found, host too short or too long */ | |
+ i = strcspn(p, "]"); | |
+ if (p[i] != ']' || i < 3) | |
+ return -1; | |
+ i++; /* including "]" */ | |
+ } else { | |
+ /* domain / host part, skip until port, path or end. */ | |
+ i = strcspn(p, ":/?#"); | |
+ } | |
+ if (i >= sizeof(u->host)) | |
+ return -1; /* host too long */ | |
+ memcpy(u->host, p, i); | |
+ u->host[i] = '\0'; | |
+ p += i; | |
+ | |
+ /* port */ | |
+ if (*p == ':') { | |
+ p++; | |
+ if ((i = strcspn(p, "/?#")) >= sizeof(u->port)) | |
+ return -1; /* port too long */ | |
+ memcpy(u->port, p, i); | |
+ u->port[i] = '\0'; | |
+ /* check for valid port: range 1 - 65535, may be empty */ | |
+ errno = 0; | |
+ l = strtol(u->port, &endptr, 10); | |
+ if (i && (errno || *endptr || l <= 0 || l > 65535)) | |
+ return -1; | |
+ p += i; | |
} | |
- if (*e && *e != '/') | |
- return 0; /* invalid path */ | |
- s = e; | |
- e = s + strlen(s); | |
+parsepath: | |
+ /* path */ | |
+ if ((i = strcspn(p, "?#")) >= sizeof(u->path)) | |
+ return -1; /* path too long */ | |
+ memcpy(u->path, p, i); | |
+ u->path[i] = '\0'; | |
+ p += i; | |
+ | |
+ /* query */ | |
+ if (*p == '?') { | |
+ p++; | |
+ if ((i = strcspn(p, "#")) >= sizeof(u->query)) | |
+ return -1; /* query too long */ | |
+ memcpy(u->query, p, i); | |
+ u->query[i] = '\0'; | |
+ p += i; | |
+ } | |
- if (e - s + 1 >= sizeof(u->path)) | |
- return 0; | |
- memcpy(u->path, s, e - s); | |
- u->path[e - s] = '\0'; | |
+ /* fragment */ | |
+ if (*p == '#') { | |
+ p++; | |
+ if ((i = strlen(p)) >= sizeof(u->fragment)) | |
+ return -1; /* fragment too long */ | |
+ memcpy(u->fragment, p, i); | |
+ u->fragment[i] = '\0'; | |
+ } | |
- return 1; | |
+ return 0; | |
} | |
int | |
@@ -527,7 +609,8 @@ main(void) | |
else | |
uri = query; | |
- if (!parseuri(uri, &u)) | |
+ if (!uri_hasscheme(uri) || | |
+ uri_parse(uri, &u) == -1) | |
die(400, "Invalid uri: %s\n", uri); | |
if (u.host[0] == '\0') | |
die(400, "Invalid hostname\n"); |