Introduction
Introduction Statistics Contact Development Disclaimer Help
various improvements - webdump - HTML to plain-text converter for webpages
git clone git://git.codemadness.org/webdump
Log
Files
Refs
README
LICENSE
---
commit 89c9108dc27fe27e0f028f67508a1156ed242d2a
parent 62884d7b5684e791bb0cd6466f74367d6d71618d
Author: Hiltjo Posthuma <[email protected]>
Date: Thu, 14 Sep 2023 22:31:03 +0200
various improvements
- add a unique tagid number per tag. This allows checking by tag number.
- add support for the link reference <frame>, <iframe>, <embed src>.
- improve checking for open optional <p> tags when a block element (such as
<section>) is open.
- check if the base URI using the -b option is absolute.
Diffstat:
M webdump.1 | 3 ++-
M webdump.c | 430 +++++++++++++++++------------…
2 files changed, 245 insertions(+), 188 deletions(-)
---
diff --git a/webdump.1 b/webdump.1
@@ -1,4 +1,4 @@
-.Dd September 12, 2023
+.Dd September 14, 2023
.Dt WEBDUMP 1
.Os
.Sh NAME
@@ -18,6 +18,7 @@ It converts and writes the output as plain-text to stdout.
A
.Ar baseurl
can be specified if the links in the feed are relative URLs.
+This must be an absolute URI.
.Bl -tag -width Ds
.It Fl 8
Use UTF-8 symbols for certain items like bullet items and rulers to make the
diff --git a/webdump.c b/webdump.c
@@ -53,19 +53,6 @@ static int termwidth = 77; /* terminal width */
static int resources = 0; /* write resources line-by-line to fd 3? */
static int uniqrefs = 0; /* number unique references */
-/* linked-list of link references */
-struct linkref {
- char *type;
- char *url;
- int ishidden;
- size_t linknr;
- struct linkref *next;
-};
-
-static struct linkref *links_head;
-static struct linkref *links_cur;
-static int linkcount; /* visible link count */
-
enum DisplayType {
DisplayUnknown = 0,
DisplayInline = 1 << 0,
@@ -106,8 +93,22 @@ typedef struct string {
size_t bufsiz; /* allocated size */
} String;
+enum TagId { TagA = 1, TagAddress, TagArea, TagArticle, TagAside, TagAudio,
+TagB, TagBase, TagBlink, TagBlockquote, TagBody, TagBr, TagButton, TagCite,
+TagCol, TagColgroup, TagDatalist, TagDd, TagDel, TagDetails, TagDfn, TagDir,
+TagDiv, TagDl, TagDt, TagEm, TagEmbed, TagFieldset, TagFigcaption, TagFigure,
+TagFooter, TagForm, TagFrame, TagH1, TagH2, TagH3, TagH4, TagH5, TagH6,
+TagHead, TagHeader, TagHr, TagHtml, TagI, TagIframe, TagImg, TagInput, TagIns,
+TagLabel, TagLegend, TagLi, TagLink, TagMain, TagMark, TagMenu, TagMeta,
+TagNav, TagObject, TagOl, TagOption, TagP, TagParam, TagPre, TagS, TagScript,
+TagSearch, TagSection, TagSelect, TagSource, TagStrike, TagStrong, TagStyle,
+TagSummary, TagTable, TagTbody, TagTd, TagTemplate, TagTextarea, TagTfoot,
+TagTh, TagThead, TagTitle, TagTr, TagTrack, TagU, TagUl, TagVar, TagVideo,
+TagWbr, TagXmp };
+
struct tag {
const char *name;
+ enum TagId id;
enum DisplayType displaytype;
enum MarkupType markuptype; /* ANSI markup */
enum DisplayType parenttype; /* display type belonging to element */
@@ -150,6 +151,20 @@ struct selectors {
size_t count;
};
+/* linked-list of link references */
+struct linkref {
+ char *type;
+ enum TagId tagid;
+ char *url;
+ int ishidden;
+ size_t linknr;
+ struct linkref *next;
+};
+
+static struct linkref *links_head;
+static struct linkref *links_cur;
+static int linkcount; /* visible link count */
+
static const char *str_bullet_item = "* ";
static const char *str_checkbox_checked = "x";
static const char *str_ruler = "-";
@@ -212,96 +227,100 @@ static enum MarkupType curmarkup;
/* selector to match */
static struct selectors *sel_hide, *sel_show;
-/* tag displaytype markup parent …
+/* tags table: needs to be sorted like tagcmp(), alphabetically */
+
+/* tag id displaytype markup …
static struct tag tags[] = {
-{ "a", DisplayInline, MarkupUnderline, 0, …
-{ "address", DisplayBlock, 0, 0, …
-{ "area", DisplayInline, 0, 0, …
-{ "article", DisplayBlock, 0, 0, …
-{ "aside", DisplayBlock, 0, 0, …
-{ "audio", DisplayInline, MarkupUnderline, 0, …
-{ "b", DisplayInline, MarkupBold, 0, …
-{ "base", DisplayInline, 0, 0, …
-{ "blink", DisplayInline, MarkupBlink, 0, …
-{ "blockquote", DisplayBlock, 0, 0, …
-{ "body", DisplayBlock, 0, 0, …
-{ "br", 0, 0, 0, …
-{ "button", DisplayInline | DisplayButton, 0, 0, …
-{ "cite", DisplayInline, MarkupItalic, 0, …
-{ "col", DisplayInline, 0, 0, …
-{ "colgroup", DisplayInline, 0, 0, …
-{ "datalist", DisplayNone, 0, 0, …
-{ "dd", DisplayBlock, 0, 0, …
-{ "del", DisplayInline, MarkupStrike, 0, …
-{ "details", DisplayBlock, 0, 0, …
-{ "dfn", DisplayInline, MarkupItalic, 0, …
-{ "dir", DisplayList, 0, 0, …
-{ "div", DisplayBlock, 0, 0, …
-{ "dl", DisplayBlock | DisplayDl, 0, 0, …
-{ "dt", DisplayBlock, MarkupBold, 0, …
-{ "em", DisplayInline, MarkupItalic, 0, …
-{ "embed", DisplayInline, 0, 0, …
-{ "fieldset", DisplayBlock, 0, 0, …
-{ "figcaption", DisplayBlock, 0, 0, …
-{ "figure", DisplayBlock, 0, 0, …
-{ "footer", DisplayBlock, 0, 0, …
-{ "form", DisplayBlock, 0, 0, …
-{ "h1", DisplayHeader, MarkupBold, 0, …
-{ "h2", DisplayHeader, MarkupBold, 0, …
-{ "h3", DisplayHeader, MarkupBold, 0, …
-{ "h4", DisplayHeader, MarkupBold, 0, …
-{ "h5", DisplayHeader, MarkupBold, 0, …
-{ "h6", DisplayHeader, MarkupBold, 0, …
-{ "head", DisplayBlock, 0, 0, …
-{ "header", DisplayBlock, 0, 0, …
-{ "hr", DisplayBlock, 0, 0, …
-{ "html", DisplayBlock, 0, 0, …
-{ "i", DisplayInline, MarkupItalic, 0, …
-{ "img", DisplayInline, MarkupUnderline, 0, …
-{ "input", DisplayInput, 0, 0, …
-{ "ins", DisplayInline, MarkupUnderline, 0, …
-{ "label", DisplayInline, 0, 0, …
-{ "legend", DisplayBlock, 0, 0, …
-{ "li", DisplayListItem, 0, DisplayList…
-{ "link", DisplayInline, 0, 0, …
-{ "main", DisplayBlock, 0, 0, …
-{ "mark", DisplayInline, MarkupReverse, 0, …
-{ "menu", DisplayList, 0, 0, …
-{ "meta", DisplayInline, 0, 0, …
-{ "nav", DisplayBlock, 0, 0, …
-{ "object", DisplayInline, 0, 0, …
-{ "ol", DisplayList | DisplayListOrdered, 0, 0, …
-{ "option", DisplayInline | DisplayOption, 0, 0, …
-{ "p", DisplayBlock, 0, 0, …
-{ "param", DisplayInline, 0, 0, …
-{ "pre", DisplayPre, 0, 0, …
-{ "s", DisplayInline, MarkupStrike, 0, …
-{ "search", DisplayBlock, 0, 0, …
-{ "script", DisplayNone, 0, 0, …
-{ "section", DisplayBlock, 0, 0, …
-{ "select", DisplayInline | DisplaySelect, 0, 0, …
-{ "source", DisplayInline, 0, 0, …
-{ "strike", DisplayInline, MarkupStrike, 0, …
-{ "strong", DisplayInline, MarkupBold, 0, …
-{ "style", DisplayNone, 0, 0, …
-{ "summary", DisplayBlock, 0, 0, …
-{ "table", DisplayTable, 0, 0, …
-{ "tbody", DisplayInline, 0, DisplayTabl…
-{ "td", DisplayTableCell, 0, DisplayTabl…
-{ "template", DisplayNone, 0, 0, …
-{ "textarea", DisplayInline, 0, 0, …
-{ "tfoot", DisplayInline, 0, DisplayTabl…
-{ "th", DisplayTableCell, MarkupBold, DisplayTabl…
-{ "thead", DisplayInline, 0, DisplayTabl…
-{ "title", DisplayBlock, 0, 0, …
-{ "tr", DisplayTableRow, 0, DisplayTabl…
-{ "track", DisplayInline, 0, 0, …
-{ "u", DisplayInline, MarkupUnderline, 0, …
-{ "ul", DisplayList, 0, 0, …
-{ "var", DisplayInline, MarkupItalic, 0, …
-{ "video", DisplayInline, MarkupUnderline, 0, …
-{ "wbr", DisplayInline, 0, 0, …
-{ "xmp", DisplayPre, 0, 0, …
+{ "a", TagA, DisplayInline, MarkupUnderli…
+{ "address", TagAddress, DisplayBlock, 0, …
+{ "area", TagArea, DisplayInline, 0, …
+{ "article", TagArticle, DisplayBlock, 0, …
+{ "aside", TagAside, DisplayBlock, 0, …
+{ "audio", TagAudio, DisplayInline, MarkupUnderli…
+{ "b", TagB, DisplayInline, MarkupBold, …
+{ "base", TagBase, DisplayInline, 0, …
+{ "blink", TagBlink, DisplayInline, MarkupBlink, …
+{ "blockquote", TagBlockquote, DisplayBlock, 0, …
+{ "body", TagBody, DisplayBlock, 0, …
+{ "br", TagBr, 0, 0, …
+{ "button", TagButton, DisplayInline | DisplayButton, 0, …
+{ "cite", TagCite, DisplayInline, MarkupItalic,…
+{ "col", TagCol, DisplayInline, 0, …
+{ "colgroup", TagColgroup, DisplayInline, 0, …
+{ "datalist", TagDatalist, DisplayNone, 0, …
+{ "dd", TagDd, DisplayBlock, 0, …
+{ "del", TagDel, DisplayInline, MarkupStrike,…
+{ "details", TagDetails, DisplayBlock, 0, …
+{ "dfn", TagDfn, DisplayInline, MarkupItalic,…
+{ "dir", TagDir, DisplayList, 0, …
+{ "div", TagDiv, DisplayBlock, 0, …
+{ "dl", TagDl, DisplayBlock | DisplayDl, 0, …
+{ "dt", TagDt, DisplayBlock, MarkupBold, …
+{ "em", TagEm, DisplayInline, MarkupItalic,…
+{ "embed", TagEmbed, DisplayInline, 0, …
+{ "fieldset", TagFieldset, DisplayBlock, 0, …
+{ "figcaption", TagFigcaption, DisplayBlock, 0, …
+{ "figure", TagFigure, DisplayBlock, 0, …
+{ "footer", TagFooter, DisplayBlock, 0, …
+{ "form", TagForm, DisplayBlock, 0, …
+{ "frame", TagFrame, DisplayInline, 0, …
+{ "h1", TagH1, DisplayHeader, MarkupBold, …
+{ "h2", TagH2, DisplayHeader, MarkupBold, …
+{ "h3", TagH3, DisplayHeader, MarkupBold, …
+{ "h4", TagH4, DisplayHeader, MarkupBold, …
+{ "h5", TagH5, DisplayHeader, MarkupBold, …
+{ "h6", TagH6, DisplayHeader, MarkupBold, …
+{ "head", TagHead, DisplayBlock, 0, …
+{ "header", TagHeader, DisplayBlock, 0, …
+{ "hr", TagHr, DisplayBlock, 0, …
+{ "html", TagHtml, DisplayBlock, 0, …
+{ "i", TagI, DisplayInline, MarkupItalic,…
+{ "iframe", TagIframe, DisplayInline, 0, …
+{ "img", TagImg, DisplayInline, MarkupUnderli…
+{ "input", TagInput, DisplayInput, 0, …
+{ "ins", TagIns, DisplayInline, MarkupUnderli…
+{ "label", TagLabel, DisplayInline, 0, …
+{ "legend", TagLegend, DisplayBlock, 0, …
+{ "li", TagLi, DisplayListItem, 0, …
+{ "link", TagLink, DisplayInline, 0, …
+{ "main", TagMain, DisplayBlock, 0, …
+{ "mark", TagMark, DisplayInline, MarkupReverse…
+{ "menu", TagMenu, DisplayList, 0, …
+{ "meta", TagMeta, DisplayInline, 0, …
+{ "nav", TagNav, DisplayBlock, 0, …
+{ "object", TagObject, DisplayInline, 0, …
+{ "ol", TagOl, DisplayList | DisplayListOrdered, 0, …
+{ "option", TagOption, DisplayInline | DisplayOption, 0, …
+{ "p", TagP, DisplayBlock, 0, …
+{ "param", TagParam, DisplayInline, 0, …
+{ "pre", TagPre, DisplayPre, 0, …
+{ "s", TagS, DisplayInline, MarkupStrike,…
+{ "script", TagScript, DisplayNone, 0, …
+{ "search", TagSearch, DisplayBlock, 0, …
+{ "section", TagSection, DisplayBlock, 0, …
+{ "select", TagSelect, DisplayInline | DisplaySelect, 0, …
+{ "source", TagSource, DisplayInline, 0, …
+{ "strike", TagStrike, DisplayInline, MarkupStrike,…
+{ "strong", TagStrong, DisplayInline, MarkupBold, …
+{ "style", TagStyle, DisplayNone, 0, …
+{ "summary", TagSummary, DisplayBlock, 0, …
+{ "table", TagTable, DisplayTable, 0, …
+{ "tbody", TagTbody, DisplayInline, 0, …
+{ "td", TagTd, DisplayTableCell, 0, …
+{ "template", TagTemplate, DisplayNone, 0, …
+{ "textarea", TagTextarea, DisplayInline, 0, …
+{ "tfoot", TagTfoot, DisplayInline, 0, …
+{ "th", TagTh, DisplayTableCell, MarkupBold, …
+{ "thead", TagThead, DisplayInline, 0, …
+{ "title", TagTitle, DisplayBlock, 0, …
+{ "tr", TagTr, DisplayTableRow, 0, …
+{ "track", TagTrack, DisplayInline, 0, …
+{ "u", TagU, DisplayInline, MarkupUnderli…
+{ "ul", TagUl, DisplayList, 0, …
+{ "var", TagVar, DisplayInline, MarkupItalic,…
+{ "video", TagVideo, DisplayInline, MarkupUnderli…
+{ "wbr", TagWbr, DisplayInline, 0, …
+{ "xmp", TagXmp, DisplayPre, 0, …
};
/* hint for compilers and static analyzers that a function exits */
@@ -1374,9 +1393,10 @@ findlinkref(const char *url)
}
static struct linkref *
-addlinkref(const char *url, const char *_type, int ishidden, int linknr)
+addlinkref(const char *url, const char *_type, enum TagId tagid, int ishidden,
+ int linknr)
{
- if (!tagcmp(_type, "a"))
+ if (tagid == TagA)
_type = "link";
/* add to linked list */
@@ -1386,6 +1406,7 @@ addlinkref(const char *url, const char *_type, int ishidd…
links_cur = links_cur->next = ecalloc(1, sizeof(*links_head));
links_cur->url = estrdup(url);
links_cur->type = estrdup(_type);
+ links_cur->tagid = tagid;
links_cur->ishidden = ishidden;
links_cur->linknr = linknr;
@@ -1441,7 +1462,7 @@ handleinlinelink(void)
/* add hidden links directly to the reference,
the order doesn't matter */
if (cur->tag.displaytype & DisplayNone)
- addlinkref(url, cur->tag.name, 1, 0);
+ addlinkref(url, cur->tag.name, cur->tag.id, 1, 0);
}
void
@@ -1658,7 +1679,7 @@ endnode(struct node *cur)
if (!ref) {
linkcount++;
ref = addlinkref(nodes_links[curnode].data,
- cur->tag.name, ishidden, linkcount);
+ cur->tag.name, cur->tag.id, ishidden, linkcoun…
}
if (showrefinline || showurlinline) {
@@ -1669,7 +1690,7 @@ endnode(struct node *cur)
if (showrefinline)
hprintf("[%zu]", ref->linknr);
if (showurlinline) {
- if (!tagcmp("link", ref->type))
+ if (ref->tagid == TagA)
hprintf("[%s]", ref->url);
else
hprintf("[%s: %s]", ref->type, ref->url);
@@ -1687,7 +1708,7 @@ static void
xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
{
struct tag *found, *tag;
- char *child, *childs[16];
+ enum TagId child, childs[16];
size_t nchilds;
int i, j, k, nchildfound, parenttype;
@@ -1701,35 +1722,39 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int i…
in reality the optional tag rules are more complex, see:
https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */
- child = NULL;
+ child = 0;
nchilds = 0;
nchildfound = 0;
- parenttype = 0;
+ parenttype = 0; /* by default, seek until the root */
if (found && found->displaytype & DisplayPre) {
skipinitialws = 0; /* do not skip white-space, for margins */
} else if (found && found->displaytype & DisplayList) {
- childs[0] = "li";
+ childs[0] = TagLi;
nchilds = 1;
parenttype = DisplayList;
} else if (found && found->displaytype & DisplayTableRow) {
- childs[0] = "td";
+ childs[0] = TagTd;
nchilds = 1;
parenttype = DisplayTableRow;
} else if (found && found->displaytype & DisplayTable) {
- childs[0] = "td";
+ childs[0] = TagTd;
nchilds = 1;
parenttype = DisplayTable;
} else if (found && found->displaytype & DisplaySelect) {
- childs[0] = "option";
+ childs[0] = TagOption;
nchilds = 1;
parenttype = DisplaySelect;
} else if (found && found->displaytype & DisplayDl) {
- childs[0] = "p";
- childs[1] = "dd";
- childs[2] = "dt";
+ childs[0] = TagP;
+ childs[1] = TagDd;
+ childs[2] = TagDt;
nchilds = 3;
parenttype = DisplayDl;
+ } else if (found && found->displaytype & DisplayBlock) {
+ childs[0] = TagP;
+ nchilds = 1;
+ parenttype = 0; /* seek until the root */
}
if (nchilds > 0) {
@@ -1740,7 +1765,7 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int iss…
break;
for (j = 0; j < nchilds; j++) {
child = childs[j];
- if (!tagcmp(nodes[i].tag.name, child)) {
+ if (nodes[i].tag.id == child) {
/* fake closing the previous tags */
for (k = curnode; k >= i; k--)
endnode(&nodes[k]);
@@ -1794,7 +1819,8 @@ xmltagstart(XMLParser *p, const char *t, size_t tl)
{
struct tag *found;
struct node *cur;
- char *child, *childs[16];
+ enum TagId tagid;
+ enum TagId child, childs[16];
size_t nchilds;
char *s;
int i, j, k, nchildfound, parenttype;
@@ -1821,55 +1847,56 @@ xmltagstart(XMLParser *p, const char *t, size_t tl)
in reality the optional tag rules are more complex, see:
https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */
- child = NULL;
+ child = 0;
nchilds = 0;
nchildfound = 0;
- parenttype = 0;
+ parenttype = 0; /* by default, seek until the root */
/* if optional tag <p> is open and a list element is found, close </p>…
if (found && found->displaytype & DisplayList) {
/* not inside a list */
- childs[0] = "p";
+ childs[0] = TagP;
nchilds = 1;
parenttype = DisplayList;
} else if (found && found->isoptional) {
- if (!tagcmp(t, "li")) {
- childs[0] = "li";
+ tagid = found->id;
+ if (tagid == TagLi) {
+ childs[0] = TagLi;
nchilds = 1;
parenttype = DisplayList;
- } else if (!tagcmp(t, "td")) {
- childs[0] = "td";
+ } else if (tagid == TagTd) {
+ childs[0] = TagTd;
nchilds = 1;
parenttype = DisplayTableRow;
- } else if (!tagcmp(t, "tr")) {
- childs[0] = "tr";
+ } else if (tagid == TagTr) {
+ childs[0] = TagTr;
nchilds = 1;
parenttype = DisplayTable;
- } else if (!tagcmp(t, "p")) {
- childs[0] = "p";
+ } else if (tagid == TagP) {
+ childs[0] = TagP;
nchilds = 1;
parenttype = 0; /* seek until the root */
- } else if (!tagcmp(t, "option")) {
- childs[0] = "option";
+ } else if (tagid == TagOption) {
+ childs[0] = TagOption;
nchilds = 1;
parenttype = DisplaySelect;
- } else if (!tagcmp(t, "dt")) {
- childs[0] = "dd";
+ } else if (tagid == TagDt) {
+ childs[0] = TagDd;
nchilds = 1;
parenttype = DisplayDl;
- } else if (!tagcmp(t, "dd")) {
- childs[0] = "dd";
- childs[1] = "dt";
+ } else if (tagid == TagDd) {
+ childs[0] = TagDd;
+ childs[1] = TagDt;
nchilds = 2;
parenttype = DisplayDl;
- } else if (!tagcmp(t, cur->tag.name)) {
+ } else if (tagid == cur->tag.id) {
/* fake closing the previous tag if it is the same and…
xmltagend(p, t, tl, 0);
}
} else if (found && found->displaytype & DisplayBlock) {
/* check if we have an open "<p>" tag */
- childs[0] = "p";
- childs[1] = "dl";
+ childs[0] = TagP;
+ childs[1] = TagDl;
nchilds = 2;
parenttype = DisplayDl;
}
@@ -1882,7 +1909,7 @@ xmltagstart(XMLParser *p, const char *t, size_t tl)
break;
for (j = 0; j < nchilds; j++) {
child = childs[j];
- if (!tagcmp(nodes[i].tag.name, child)) {
+ if (nodes[i].tag.id == child) {
/* fake closing the previous tags */
for (k = curnode; k >= i; k--)
xmltagend(p, nodes[k].tag.name…
@@ -1917,19 +1944,26 @@ xmltagstart(XMLParser *p, const char *t, size_t tl)
static void
xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
{
+ struct tag *found;
+ enum TagId tagid;
struct node *cur, *parent;
int i, margintop;
+ /* match tag */
+ tagid = 0;
+ if ((found = findtag(t)))
+ tagid = found->id;
+
/* temporary replace the callback except the reader and end of tag
restore the context once we receive the same ignored tag in the
end tag handler */
- if (!tagcmp(t, "script")) {
+ if (tagid == TagScript) {
ignorestate = endtag = "</script>";
getnext = p->getnext; /* for restore */
p->getnext = getnext_ignore;
xmltagend(p, t, tl, 0); /* fake the call the tag was ended */
return;
- } else if (!tagcmp(t, "style")) {
+ } else if (tagid == TagStyle) {
ignorestate = endtag = "</style>";
getnext = p->getnext; /* for restore */
p->getnext = getnext_ignore;
@@ -2089,12 +2123,12 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t t…
the node */
cur->hasdata = 0;
- if (!tagcmp(t, "hr")) { /* ruler */
+ if (tagid == TagHr) { /* ruler */
i = termwidth - indent - defaultindent;
for (; i > 0; i--)
hprint(str_ruler);
cur->hasdata = 1; /* treat <hr/> as data */
- } else if (!tagcmp(t, "br")) {
+ } else if (tagid == TagBr) {
hflush();
hadnewline = 0; /* forced newline */
hputchar('\n');
@@ -2107,65 +2141,78 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t t…
}
static void
-xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name,
- size_t namelen, const char *value, size_t valuelen)
+xmlattr(XMLParser *p, const char *t, size_t tl, const char *n,
+ size_t nl, const char *v, size_t vl)
{
struct node *cur;
+ enum TagId tagid;
cur = &nodes[curnode];
-
- if (!attrcmp(name, "class"))
- string_append(&attr_class, value, valuelen);
- else if (!attrcmp(name, "id"))
- string_append(&attr_id, value, valuelen);
-
- /* <base href="..." /> */
- if (!basehrefset && !attrcmp(name, "href") && !tagcmp(tag, "base"))
- strlcat(basehrefdoc, value, sizeof(basehrefdoc));
+ tagid = cur->tag.id;
/* hide tags with attribute aria-hidden or hidden */
- if (!attrcmp(name, "aria-hidden") || !attrcmp(name, "hidden"))
+ if (!attrcmp(n, "aria-hidden") || !attrcmp(n, "hidden"))
cur->tag.displaytype |= DisplayNone;
- if (!tagcmp(tag, "select") && !attrcmp(name, "multiple"))
- cur->tag.displaytype |= DisplaySelectMulti;
+ if (!attrcmp(n, "class"))
+ string_append(&attr_class, v, vl);
+ else if (!attrcmp(n, "id"))
+ string_append(&attr_id, v, vl);
+ else if (!attrcmp(n, "type"))
+ string_append(&attr_type, v, vl);
+ else if (!attrcmp(n, "value"))
+ string_append(&attr_value, v, vl);
- if (!tagcmp(tag, "a") && !attrcmp(name, "href"))
- string_append(&attr_href, value, valuelen);
+ /* <base href="..." /> */
+ if (!basehrefset && tagid == TagBase && !attrcmp(n, "href"))
+ strlcat(basehrefdoc, v, sizeof(basehrefdoc));
- if (!tagcmp(tag, "object") && !attrcmp(name, "data"))
- string_append(&attr_data, value, valuelen);
+ if (tagid == TagA && !attrcmp(n, "href"))
+ string_append(&attr_href, v, vl);
- if ((!tagcmp(tag, "img") || !tagcmp(tag, "video") ||
- !tagcmp(tag, "source") || !tagcmp(tag, "track") ||
- !tagcmp(tag, "audio")) &&
- !attrcmp(name, "src") && valuelen)
- string_append(&attr_src, value, valuelen);
+ if (tagid == TagSelect && !attrcmp(n, "multiple"))
+ cur->tag.displaytype |= DisplaySelectMulti;
- /* show img alt attribute as text. */
- if (!tagcmp(tag, "img") && !attrcmp(name, "alt"))
- string_append(&attr_alt, value, valuelen);
+ if (tagid == TagObject && !attrcmp(n, "data"))
+ string_append(&attr_data, v, vl);
- if (!attrcmp(name, "checked"))
- string_append(&attr_checked, value, valuelen);
- else if (!attrcmp(name, "type"))
- string_append(&attr_type, value, valuelen);
- else if (!attrcmp(name, "value"))
- string_append(&attr_value, value, valuelen);
+ /* show img alt attribute as text. */
+ if (tagid == TagImg && !attrcmp(n, "alt"))
+ string_append(&attr_alt, v, vl);
+
+ if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked"))
+ string_append(&attr_checked, v, vl);
+
+ /* src attribute */
+ switch (tagid) {
+ case TagAudio:
+ case TagEmbed:
+ case TagFrame:
+ case TagIframe:
+ case TagImg:
+ case TagSource:
+ case TagTrack:
+ case TagVideo:
+ if (!attrcmp(n, "src"))
+ string_append(&attr_src, v, vl);
+ break;
+ default:
+ break;
+ }
}
static void
-xmlattrentity(XMLParser *p, const char *tag, size_t taglen, const char *name,
- size_t namelen, const char *value, size_t valuelen)
+xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n,
+ size_t nl, const char *v, size_t vl)
{
char buf[16];
- int n;
+ int len;
- n = xml_entitytostr(value, buf, sizeof(buf));
- if (n > 0)
- xmlattr(p, tag, taglen, name, namelen, buf, (size_t)n);
+ len = xml_entitytostr(v, buf, sizeof(buf));
+ if (len > 0)
+ xmlattr(p, t, tl, n, nl, buf, (size_t)len);
else
- xmlattr(p, tag, taglen, name, namelen, value, valuelen);
+ xmlattr(p, t, tl, n, nl, v, vl);
}
static void
@@ -2173,12 +2220,14 @@ xmlattrend(XMLParser *p, const char *t, size_t tl, cons…
size_t nl)
{
struct node *cur;
+ enum TagId tagid;
cur = &nodes[curnode];
+ tagid = cur->tag.id;
/* set base URL, if it is set it cannot be overwritten again */
if (!basehrefset && basehrefdoc[0] &&
- !attrcmp(n, "href") && !tagcmp(t, "base"))
+ tagid == TagBase && !attrcmp(n, "href"))
basehrefset = uri_parse(basehrefdoc, &base) != -1 ? 1 : 0;
/* if attribute checked is set but it has no value then set it to "che…
@@ -2190,6 +2239,12 @@ static void
xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n,
size_t nl)
{
+ struct node *cur;
+ enum TagId tagid;
+
+ cur = &nodes[curnode];
+ tagid = cur->tag.id;
+
if (!attrcmp(n, "alt"))
string_clear(&attr_alt);
else if (!attrcmp(n, "checked"))
@@ -2209,7 +2264,7 @@ xmlattrstart(XMLParser *p, const char *t, size_t tl, cons…
else if (!attrcmp(n, "value"))
string_clear(&attr_value);
- if (basehrefdoc[0] && !attrcmp(n, "href") && !tagcmp(t, "base"))
+ if (basehrefdoc[0] && tagid == TagBase && !attrcmp(n, "href"))
basehrefdoc[0] = '\0';
}
@@ -2236,7 +2291,8 @@ main(int argc, char **argv)
break;
case 'b':
basehref = EARGF(usage());
- if (uri_parse(basehref, &base) == -1)
+ if (uri_parse(basehref, &base) == -1 ||
+ !base.proto[0])
usage();
basehrefset = 1;
break;
You are viewing proxied material from codemadness.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.