Introduction
Introduction Statistics Contact Development Disclaimer Help
add option for unique link references (-d) - webdump - HTML to plain-text converter
git clone git://git.codemadness.org/webdump
Log
Files
Refs
README
LICENSE
---
commit 91d236dab89449465eb123d756a450a17eb4195a
parent 790402682bab675461f2a12879408dd5ad30c90f
Author: Hiltjo Posthuma <[email protected]>
Date: Tue, 12 Sep 2023 20:02:57 +0200
add option for unique link references (-d)
... also make link type "a" consistently "link" (also at the bottom
references).
... also flush inline link only if needed
Diffstat:
M webdump.1 | 7 +++++--
M webdump.c | 87 ++++++++++++++++++++++-------…
2 files changed, 67 insertions(+), 27 deletions(-)
---
diff --git a/webdump.1 b/webdump.1
@@ -1,4 +1,4 @@
-.Dd September 8, 2023
+.Dd September 12, 2023
.Dt WEBDUMP 1
.Os
.Sh NAME
@@ -6,7 +6,7 @@
.Nd convert HTML to plain-text
.Sh SYNOPSIS
.Nm
-.Op Fl 8aiIlrx
+.Op Fl 8adiIlrx
.Op Fl b Ar baseurl
.Op Fl s Ar selector
.Op Fl u Ar selector
@@ -28,6 +28,9 @@ Toggle ANSI escape codes usage, by default it is not enabled.
Base URL of links.
This is used to make links absolute.
The specified URL is always preferred over the value in a <base/> tag.
+.It Fl d
+Deduplicate link references.
+When a duplicate link reference is found, reuse the same link reference number.
.It Fl i
Toggle if link reference numbers are displayed inline or not, by default it is
not enabled.
diff --git a/webdump.c b/webdump.c
@@ -51,12 +51,14 @@ static int showurlinline = 0; /* show full link reference …
static int linewrap = 0; /* line-wrapping */
static int termwidth = 77; /* terminal width */
static int resources = 0; /* write resources line-by-line to fd 3? */
+static int uniqrefs = 0; /* number unique references */
/* linked-list of link references */
struct linkref {
char *type;
char *url;
int ishidden;
+ size_t linknr;
struct linkref *next;
};
@@ -628,6 +630,20 @@ uri_format(char *buf, size_t bufsiz, struct uri *u)
u->fragment);
}
+/* compare tag name (case-insensitive) */
+int
+tagcmp(const char *s1, const char *s2)
+{
+ return strcasecmp(s1, s2);
+}
+
+/* compare attribute name (case-insensitive) */
+int
+attrcmp(const char *s1, const char *s2)
+{
+ return strcasecmp(s1, s2);
+}
+
static void
rindent(void)
{
@@ -1325,9 +1341,26 @@ handleinlinealt(void)
}
}
-static void
-addlinkref(const char *url, const char *_type, int ishidden)
+/* slow linear lookup of link references
+ TODO: optimize it, maybe using tree.h RB_TREE? */
+static struct linkref *
+findlinkref(const char *url)
{
+ struct linkref *cur;
+
+ for (cur = links_head; cur; cur = cur->next) {
+ if (!strcmp(url, cur->url))
+ return cur;
+ }
+ return NULL;
+}
+
+static struct linkref *
+addlinkref(const char *url, const char *_type, int ishidden, int linknr)
+{
+ if (!tagcmp(_type, "a"))
+ _type = "link";
+
/* add to linked list */
if (!links_head)
links_cur = links_head = ecalloc(1, sizeof(*links_head));
@@ -1336,6 +1369,9 @@ addlinkref(const char *url, const char *_type, int ishidd…
links_cur->url = estrdup(url);
links_cur->type = estrdup(_type);
links_cur->ishidden = ishidden;
+ links_cur->linknr = linknr;
+
+ return links_cur;
}
static void
@@ -1382,7 +1418,7 @@ handleinlinelink(void)
/* add hidden links directly to the reference,
the order doesn't matter */
if (cur->tag.displaytype & DisplayNone)
- addlinkref(url, cur->tag.name, 1);
+ addlinkref(url, cur->tag.name, 1, 0);
}
void
@@ -1407,12 +1443,13 @@ printlinkrefs(void)
hashiddenrefs = 1;
continue;
}
- printf(" %zu. %s (%s)\n", i, links_cur->url, links_cur->type);
+ printf(" %zu. %s (%s)\n", links_cur->linknr, links_cur->url, l…
i++;
}
if (hashiddenrefs)
printf("\n\nHidden references\n\n");
+ /* hidden links don't have a link number, just count them */
for (links_cur = links_head; links_cur; links_cur = links_cur->next) {
if (!links_cur->ishidden)
continue;
@@ -1507,20 +1544,6 @@ xmlcdata(XMLParser *p, const char *data, size_t datalen)
xmldata(p, data, datalen); /* treat CDATA as data */
}
-/* compare tag name (case-insensitive) */
-int
-tagcmp(const char *s1, const char *s2)
-{
- return strcasecmp(s1, s2);
-}
-
-/* compare attribute name (case-insensitive) */
-int
-attrcmp(const char *s1, const char *s2)
-{
- return strcasecmp(s1, s2);
-}
-
/* lookup function to compare tag name (case-insensitive) for sort functions */
int
findtagcmp(const void *v1, const void *v2)
@@ -1582,6 +1605,7 @@ handleendtag(struct tag *tag)
static void
endnode(struct node *cur)
{
+ struct linkref *ref;
int i, ishidden;
/* set a flag indicating the element and its parent containers have da…
@@ -1597,14 +1621,24 @@ endnode(struct node *cur)
/* add link and show the link number in the visible order */
if (!ishidden && nodes_links[curnode].len > 0) {
- addlinkref(nodes_links[curnode].data, cur->tag.name, ishidden);
+ if (uniqrefs)
+ ref = findlinkref(nodes_links[curnode].data);
+ else
+ ref = NULL;
+
+ /* new link: add it */
+ if (!ref) {
+ linkcount++;
+ ref = addlinkref(nodes_links[curnode].data,
+ cur->tag.name, ishidden, linkcount);
+ }
+
if (showrefinline)
- hprintf("[%zu]", ++linkcount);
+ hprintf("[%zu]", ref->linknr);
if (showurlinline)
- hprintf(" [%s: %s]",
- !tagcmp(cur->tag.name, "a") ? "link" : cur->ta…
- nodes_links[curnode].data);
- hflush();
+ hprintf(" [%s: %s]", ref->type, ref->url);
+ if (showrefinline || showurlinline)
+ hflush();
}
handleendtag(&(cur->tag));
@@ -2110,7 +2144,7 @@ xmlattrstart(XMLParser *p, const char *t, size_t tl, cons…
void
usage(void)
{
- fprintf(stderr, "%s [-8aiIlrx] [-b basehref] [-s selector] [-u selecto…
+ fprintf(stderr, "%s [-8adiIlrx] [-b basehref] [-s selector] [-u select…
exit(1);
}
@@ -2134,6 +2168,9 @@ main(int argc, char **argv)
usage();
basehrefset = 1;
break;
+ case 'd':
+ uniqrefs = !uniqrefs;
+ break;
case 'i':
showrefinline = !showrefinline;
break;
You are viewing proxied material from codemadness.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.