improve base URL and <base href /> handling - webdump - HTML to plain-text conv… | |
git clone git://git.codemadness.org/webdump | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit 0705fb754f00c7866b2cc8cee0739a88a584a2e1 | |
parent 7d4723febabeb679e1980c12b5dfd3b656475b4f | |
Author: Hiltjo Posthuma <[email protected]> | |
Date: Fri, 8 Sep 2023 13:09:37 +0200 | |
improve base URL and <base href /> handling | |
- Parse the base URI once and reuse the structure (optimization). | |
- Once it is parsed it cannot be overwritten again. This matches browser | |
behavior more closely. | |
Diffstat: | |
M webdump.1 | 1 + | |
M webdump.c | 30 ++++++++++++++++-------------- | |
2 files changed, 17 insertions(+), 14 deletions(-) | |
--- | |
diff --git a/webdump.1 b/webdump.1 | |
@@ -27,6 +27,7 @@ Toggle ANSI escape codes usage, by default it is not enabled. | |
.It Fl b Ar baseurl | |
Base URL of links. | |
This is used to make links absolute. | |
+The specified URL is always preferred over the value in a <base/> tag. | |
.It Fl i | |
Toggle if link reference numbers are displayed inline or not, by default it is | |
not enabled. | |
diff --git a/webdump.c b/webdump.c | |
@@ -148,6 +148,8 @@ static const char *str_ruler = "-"; | |
/* base href, to make URLs absolute */ | |
static char *basehref = ""; | |
static char basehrefdoc[4096]; /* base href in document, if any */ | |
+static int basehrefset = 0; /* base href set and can be used? */ | |
+static struct uri base; | |
/* buffers for some attributes of the current tag */ | |
String attr_alt; /* alt attribute */ | |
@@ -1311,14 +1313,13 @@ addlinkref(const char *url, const char *_type, int ishi… | |
links_cur->ishidden = ishidden; | |
} | |
-/* TODO: make parsed base URL global and overwrite it once. */ | |
static void | |
handleinlinelink(void) | |
{ | |
- struct uri base, newuri, olduri; | |
+ struct uri newuri, olduri; | |
struct node *cur; | |
char buf[4096], *url; | |
- int b, r; | |
+ int r; | |
if (!showrefbottom && !showrefinline && !showurlinline && !resources) | |
return; /* there is no need to collect the reference */ | |
@@ -1332,15 +1333,9 @@ handleinlinelink(void) | |
else | |
url = attr_href.data; | |
- b = -1; | |
- if (uri_hasscheme(url)) | |
- ; /* already absolute: nothing to do */ | |
- else if (basehref[0]) /* prefer -b option over <base> */ | |
- b = uri_parse(basehref, &base); | |
- else if (basehrefdoc[0]) | |
- b = uri_parse(basehrefdoc, &base); | |
- | |
- if (b != -1 && | |
+ /* Not an absolute URL yet: try to make it absolute. | |
+ If that is not possible, use the relative URL. */ | |
+ if (!uri_hasscheme(url) && basehrefset && | |
uri_parse(url, &olduri) != -1 && | |
uri_makeabs(&newuri, &olduri, &base) != -1 && | |
newuri.proto[0]) { | |
@@ -1948,7 +1943,7 @@ xmlattr(XMLParser *p, const char *tag, size_t taglen, con… | |
string_append(&attr_id, value, valuelen); | |
/* <base href="..." /> */ | |
- if (!attrcmp(name, "href") && !tagcmp(tag, "base")) | |
+ if (!basehrefset && !attrcmp(name, "href") && !tagcmp(tag, "base")) | |
strlcat(basehrefdoc, value, sizeof(basehrefdoc)); | |
/* hide tags with attribute aria-hidden or hidden */ | |
@@ -1992,6 +1987,10 @@ static void | |
xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n, | |
size_t nl) | |
{ | |
+ /* set base URL, if it is set it cannot be overwritten again */ | |
+ if (!basehrefset && basehrefdoc[0] && | |
+ !attrcmp(n, "href") && !tagcmp(t, "base")) | |
+ basehrefset = uri_parse(basehrefdoc, &base) != -1 ? 1 : 0; | |
} | |
static void | |
@@ -2013,7 +2012,7 @@ xmlattrstart(XMLParser *p, const char *t, size_t tl, cons… | |
else if (!attrcmp(n, "value")) | |
string_clear(&attr_value); | |
- if (!attrcmp(n, "href") && !tagcmp(t, "base")) | |
+ if (basehrefdoc[0] && !attrcmp(n, "href") && !tagcmp(t, "base")) | |
basehrefdoc[0] = '\0'; | |
} | |
@@ -2040,6 +2039,9 @@ main(int argc, char **argv) | |
break; | |
case 'b': | |
basehref = EARGF(usage()); | |
+ if (uri_parse(basehref, &base) == -1) | |
+ usage(); | |
+ basehrefset = 1; | |
break; | |
case 'i': | |
showrefinline = !showrefinline; |