add option for unique link references (-d) - webdump - HTML to plain-text conve… | |
git clone git://git.codemadness.org/webdump | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit 91d236dab89449465eb123d756a450a17eb4195a | |
parent 790402682bab675461f2a12879408dd5ad30c90f | |
Author: Hiltjo Posthuma <[email protected]> | |
Date: Tue, 12 Sep 2023 20:02:57 +0200 | |
add option for unique link references (-d) | |
... also make link type "a" consistently "link" (also at the bottom | |
references). | |
... also flush inline link only if needed | |
Diffstat: | |
M webdump.1 | 7 +++++-- | |
M webdump.c | 87 ++++++++++++++++++++++-------… | |
2 files changed, 67 insertions(+), 27 deletions(-) | |
--- | |
diff --git a/webdump.1 b/webdump.1 | |
@@ -1,4 +1,4 @@ | |
-.Dd September 8, 2023 | |
+.Dd September 12, 2023 | |
.Dt WEBDUMP 1 | |
.Os | |
.Sh NAME | |
@@ -6,7 +6,7 @@ | |
.Nd convert HTML to plain-text | |
.Sh SYNOPSIS | |
.Nm | |
-.Op Fl 8aiIlrx | |
+.Op Fl 8adiIlrx | |
.Op Fl b Ar baseurl | |
.Op Fl s Ar selector | |
.Op Fl u Ar selector | |
@@ -28,6 +28,9 @@ Toggle ANSI escape codes usage, by default it is not enabled. | |
Base URL of links. | |
This is used to make links absolute. | |
The specified URL is always preferred over the value in a <base/> tag. | |
+.It Fl d | |
+Deduplicate link references. | |
+When a duplicate link reference is found reuse the same link reference number. | |
.It Fl i | |
Toggle if link reference numbers are displayed inline or not, by default it is | |
not enabled. | |
diff --git a/webdump.c b/webdump.c | |
@@ -51,12 +51,14 @@ static int showurlinline = 0; /* show full link reference … | |
static int linewrap = 0; /* line-wrapping */ | |
static int termwidth = 77; /* terminal width */ | |
static int resources = 0; /* write resources line-by-line to fd 3? */ | |
+static int uniqrefs = 0; /* number unique references */ | |
/* linked-list of link references */ | |
struct linkref { | |
char *type; | |
char *url; | |
int ishidden; | |
+ size_t linknr; | |
struct linkref *next; | |
}; | |
@@ -628,6 +630,20 @@ uri_format(char *buf, size_t bufsiz, struct uri *u) | |
u->fragment); | |
} | |
+/* compare tag name (case-insensitive) */ | |
+int | |
+tagcmp(const char *s1, const char *s2) | |
+{ | |
+ return strcasecmp(s1, s2); | |
+} | |
+ | |
+/* compare attribute name (case-insensitive) */ | |
+int | |
+attrcmp(const char *s1, const char *s2) | |
+{ | |
+ return strcasecmp(s1, s2); | |
+} | |
+ | |
static void | |
rindent(void) | |
{ | |
@@ -1325,9 +1341,26 @@ handleinlinealt(void) | |
} | |
} | |
-static void | |
-addlinkref(const char *url, const char *_type, int ishidden) | |
+/* slow linear lookup of link references | |
+ TODO: optimize it, maybe using tree.h RB_TREE? */ | |
+static struct linkref * | |
+findlinkref(const char *url) | |
{ | |
+ struct linkref *cur; | |
+ | |
+ for (cur = links_head; cur; cur = cur->next) { | |
+ if (!strcmp(url, cur->url)) | |
+ return cur; | |
+ } | |
+ return NULL; | |
+} | |
+ | |
+static struct linkref * | |
+addlinkref(const char *url, const char *_type, int ishidden, int linknr) | |
+{ | |
+ if (!tagcmp(_type, "a")) | |
+ _type = "link"; | |
+ | |
/* add to linked list */ | |
if (!links_head) | |
links_cur = links_head = ecalloc(1, sizeof(*links_head)); | |
@@ -1336,6 +1369,9 @@ addlinkref(const char *url, const char *_type, int ishidd… | |
links_cur->url = estrdup(url); | |
links_cur->type = estrdup(_type); | |
links_cur->ishidden = ishidden; | |
+ links_cur->linknr = linknr; | |
+ | |
+ return links_cur; | |
} | |
static void | |
@@ -1382,7 +1418,7 @@ handleinlinelink(void) | |
/* add hidden links directly to the reference, | |
the order doesn't matter */ | |
if (cur->tag.displaytype & DisplayNone) | |
- addlinkref(url, cur->tag.name, 1); | |
+ addlinkref(url, cur->tag.name, 1, 0); | |
} | |
void | |
@@ -1407,12 +1443,13 @@ printlinkrefs(void) | |
hashiddenrefs = 1; | |
continue; | |
} | |
- printf(" %zu. %s (%s)\n", i, links_cur->url, links_cur->type); | |
+ printf(" %zu. %s (%s)\n", links_cur->linknr, links_cur->url, l… | |
i++; | |
} | |
if (hashiddenrefs) | |
printf("\n\nHidden references\n\n"); | |
+ /* hidden links don't have a link number, just count them */ | |
for (links_cur = links_head; links_cur; links_cur = links_cur->next) { | |
if (!links_cur->ishidden) | |
continue; | |
@@ -1507,20 +1544,6 @@ xmlcdata(XMLParser *p, const char *data, size_t datalen) | |
xmldata(p, data, datalen); /* treat CDATA as data */ | |
} | |
-/* compare tag name (case-insensitive) */ | |
-int | |
-tagcmp(const char *s1, const char *s2) | |
-{ | |
- return strcasecmp(s1, s2); | |
-} | |
- | |
-/* compare attribute name (case-insensitive) */ | |
-int | |
-attrcmp(const char *s1, const char *s2) | |
-{ | |
- return strcasecmp(s1, s2); | |
-} | |
- | |
/* lookup function to compare tag name (case-insensitive) for sort functions */ | |
int | |
findtagcmp(const void *v1, const void *v2) | |
@@ -1582,6 +1605,7 @@ handleendtag(struct tag *tag) | |
static void | |
endnode(struct node *cur) | |
{ | |
+ struct linkref *ref; | |
int i, ishidden; | |
/* set a flag indicating the element and its parent containers have da… | |
@@ -1597,14 +1621,24 @@ endnode(struct node *cur) | |
/* add link and show the link number in the visible order */ | |
if (!ishidden && nodes_links[curnode].len > 0) { | |
- addlinkref(nodes_links[curnode].data, cur->tag.name, ishidden); | |
+ if (uniqrefs) | |
+ ref = findlinkref(nodes_links[curnode].data); | |
+ else | |
+ ref = NULL; | |
+ | |
+ /* new link: add it */ | |
+ if (!ref) { | |
+ linkcount++; | |
+ ref = addlinkref(nodes_links[curnode].data, | |
+ cur->tag.name, ishidden, linkcount); | |
+ } | |
+ | |
if (showrefinline) | |
- hprintf("[%zu]", ++linkcount); | |
+ hprintf("[%zu]", ref->linknr); | |
if (showurlinline) | |
- hprintf(" [%s: %s]", | |
- !tagcmp(cur->tag.name, "a") ? "link" : cur->ta… | |
- nodes_links[curnode].data); | |
- hflush(); | |
+ hprintf(" [%s: %s]", ref->type, ref->url); | |
+ if (showrefinline || showurlinline) | |
+ hflush(); | |
} | |
handleendtag(&(cur->tag)); | |
@@ -2110,7 +2144,7 @@ xmlattrstart(XMLParser *p, const char *t, size_t tl, cons… | |
void | |
usage(void) | |
{ | |
- fprintf(stderr, "%s [-8aiIlrx] [-b basehref] [-s selector] [-u selecto… | |
+ fprintf(stderr, "%s [-8adiIlrx] [-b basehref] [-s selector] [-u select… | |
exit(1); | |
} | |
@@ -2134,6 +2168,9 @@ main(int argc, char **argv) | |
usage(); | |
basehrefset = 1; | |
break; | |
+ case 'd': | |
+ uniqrefs = !uniqrefs; | |
+ break; | |
case 'i': | |
showrefinline = !showrefinline; | |
break; |