various improvements - webdump - HTML to plain-text converter for webpages | |
git clone git://git.codemadness.org/webdump | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit 89c9108dc27fe27e0f028f67508a1156ed242d2a | |
parent 62884d7b5684e791bb0cd6466f74367d6d71618d | |
Author: Hiltjo Posthuma <[email protected]> | |
Date: Thu, 14 Sep 2023 22:31:03 +0200 | |
various improvements | |
- add a unique tagid number per tag. This allows checking by tag number. | |
- add support for the link reference <frame>, <iframe>, <embed src>. | |
- improve checking for open optional <p> tags when a block element (such as | |
<section>) is open. | |
- check if the base URI using the -b option is absolute. | |
Diffstat: | |
M webdump.1 | 3 ++- | |
M webdump.c | 430 +++++++++++++++++------------… | |
2 files changed, 245 insertions(+), 188 deletions(-) | |
--- | |
diff --git a/webdump.1 b/webdump.1 | |
@@ -1,4 +1,4 @@ | |
-.Dd September 12, 2023 | |
+.Dd September 14, 2023 | |
.Dt WEBDUMP 1 | |
.Os | |
.Sh NAME | |
@@ -18,6 +18,7 @@ It converts and writes the output as plain-text to stdout. | |
A | |
.Ar baseurl | |
can be specified if the links in the feed are relative URLs. | |
+This must be an absolute URI. | |
.Bl -tag -width Ds | |
.It Fl 8 | |
Use UTF-8 symbols for certain items like bullet items and rulers to make the | |
diff --git a/webdump.c b/webdump.c | |
@@ -53,19 +53,6 @@ static int termwidth = 77; /* terminal width */ | |
static int resources = 0; /* write resources line-by-line to fd 3? */ | |
static int uniqrefs = 0; /* number unique references */ | |
-/* linked-list of link references */ | |
-struct linkref { | |
- char *type; | |
- char *url; | |
- int ishidden; | |
- size_t linknr; | |
- struct linkref *next; | |
-}; | |
- | |
-static struct linkref *links_head; | |
-static struct linkref *links_cur; | |
-static int linkcount; /* visible link count */ | |
- | |
enum DisplayType { | |
DisplayUnknown = 0, | |
DisplayInline = 1 << 0, | |
@@ -106,8 +93,22 @@ typedef struct string { | |
size_t bufsiz; /* allocated size */ | |
} String; | |
+enum TagId { TagA = 1, TagAddress, TagArea, TagArticle, TagAside, TagAudio, | |
+TagB, TagBase, TagBlink, TagBlockquote, TagBody, TagBr, TagButton, TagCite, | |
+TagCol, TagColgroup, TagDatalist, TagDd, TagDel, TagDetails, TagDfn, TagDir, | |
+TagDiv, TagDl, TagDt, TagEm, TagEmbed, TagFieldset, TagFigcaption, TagFigure, | |
+TagFooter, TagForm, TagFrame, TagH1, TagH2, TagH3, TagH4, TagH5, TagH6, | |
+TagHead, TagHeader, TagHr, TagHtml, TagI, TagIframe, TagImg, TagInput, TagIns, | |
+TagLabel, TagLegend, TagLi, TagLink, TagMain, TagMark, TagMenu, TagMeta, | |
+TagNav, TagObject, TagOl, TagOption, TagP, TagParam, TagPre, TagS, TagScript, | |
+TagSearch, TagSection, TagSelect, TagSource, TagStrike, TagStrong, TagStyle, | |
+TagSummary, TagTable, TagTbody, TagTd, TagTemplate, TagTextarea, TagTfoot, | |
+TagTh, TagThead, TagTitle, TagTr, TagTrack, TagU, TagUl, TagVar, TagVideo, | |
+TagWbr, TagXmp }; | |
+ | |
struct tag { | |
const char *name; | |
+ enum TagId id; | |
enum DisplayType displaytype; | |
enum MarkupType markuptype; /* ANSI markup */ | |
enum DisplayType parenttype; /* display type belonging to element */ | |
@@ -150,6 +151,20 @@ struct selectors { | |
size_t count; | |
}; | |
+/* linked-list of link references */ | |
+struct linkref { | |
+ char *type; | |
+ enum TagId tagid; | |
+ char *url; | |
+ int ishidden; | |
+ size_t linknr; | |
+ struct linkref *next; | |
+}; | |
+ | |
+static struct linkref *links_head; | |
+static struct linkref *links_cur; | |
+static int linkcount; /* visible link count */ | |
+ | |
static const char *str_bullet_item = "* "; | |
static const char *str_checkbox_checked = "x"; | |
static const char *str_ruler = "-"; | |
@@ -212,96 +227,100 @@ static enum MarkupType curmarkup; | |
/* selector to match */ | |
static struct selectors *sel_hide, *sel_show; | |
-/* tag displaytype markup parent … | |
+/* tags table: needs to be sorted like tagcmp(), alphabetically */ | |
+ | |
+/* tag id displaytype markup … | |
static struct tag tags[] = { | |
-{ "a", DisplayInline, MarkupUnderline, 0, … | |
-{ "address", DisplayBlock, 0, 0, … | |
-{ "area", DisplayInline, 0, 0, … | |
-{ "article", DisplayBlock, 0, 0, … | |
-{ "aside", DisplayBlock, 0, 0, … | |
-{ "audio", DisplayInline, MarkupUnderline, 0, … | |
-{ "b", DisplayInline, MarkupBold, 0, … | |
-{ "base", DisplayInline, 0, 0, … | |
-{ "blink", DisplayInline, MarkupBlink, 0, … | |
-{ "blockquote", DisplayBlock, 0, 0, … | |
-{ "body", DisplayBlock, 0, 0, … | |
-{ "br", 0, 0, 0, … | |
-{ "button", DisplayInline | DisplayButton, 0, 0, … | |
-{ "cite", DisplayInline, MarkupItalic, 0, … | |
-{ "col", DisplayInline, 0, 0, … | |
-{ "colgroup", DisplayInline, 0, 0, … | |
-{ "datalist", DisplayNone, 0, 0, … | |
-{ "dd", DisplayBlock, 0, 0, … | |
-{ "del", DisplayInline, MarkupStrike, 0, … | |
-{ "details", DisplayBlock, 0, 0, … | |
-{ "dfn", DisplayInline, MarkupItalic, 0, … | |
-{ "dir", DisplayList, 0, 0, … | |
-{ "div", DisplayBlock, 0, 0, … | |
-{ "dl", DisplayBlock | DisplayDl, 0, 0, … | |
-{ "dt", DisplayBlock, MarkupBold, 0, … | |
-{ "em", DisplayInline, MarkupItalic, 0, … | |
-{ "embed", DisplayInline, 0, 0, … | |
-{ "fieldset", DisplayBlock, 0, 0, … | |
-{ "figcaption", DisplayBlock, 0, 0, … | |
-{ "figure", DisplayBlock, 0, 0, … | |
-{ "footer", DisplayBlock, 0, 0, … | |
-{ "form", DisplayBlock, 0, 0, … | |
-{ "h1", DisplayHeader, MarkupBold, 0, … | |
-{ "h2", DisplayHeader, MarkupBold, 0, … | |
-{ "h3", DisplayHeader, MarkupBold, 0, … | |
-{ "h4", DisplayHeader, MarkupBold, 0, … | |
-{ "h5", DisplayHeader, MarkupBold, 0, … | |
-{ "h6", DisplayHeader, MarkupBold, 0, … | |
-{ "head", DisplayBlock, 0, 0, … | |
-{ "header", DisplayBlock, 0, 0, … | |
-{ "hr", DisplayBlock, 0, 0, … | |
-{ "html", DisplayBlock, 0, 0, … | |
-{ "i", DisplayInline, MarkupItalic, 0, … | |
-{ "img", DisplayInline, MarkupUnderline, 0, … | |
-{ "input", DisplayInput, 0, 0, … | |
-{ "ins", DisplayInline, MarkupUnderline, 0, … | |
-{ "label", DisplayInline, 0, 0, … | |
-{ "legend", DisplayBlock, 0, 0, … | |
-{ "li", DisplayListItem, 0, DisplayList… | |
-{ "link", DisplayInline, 0, 0, … | |
-{ "main", DisplayBlock, 0, 0, … | |
-{ "mark", DisplayInline, MarkupReverse, 0, … | |
-{ "menu", DisplayList, 0, 0, … | |
-{ "meta", DisplayInline, 0, 0, … | |
-{ "nav", DisplayBlock, 0, 0, … | |
-{ "object", DisplayInline, 0, 0, … | |
-{ "ol", DisplayList | DisplayListOrdered, 0, 0, … | |
-{ "option", DisplayInline | DisplayOption, 0, 0, … | |
-{ "p", DisplayBlock, 0, 0, … | |
-{ "param", DisplayInline, 0, 0, … | |
-{ "pre", DisplayPre, 0, 0, … | |
-{ "s", DisplayInline, MarkupStrike, 0, … | |
-{ "search", DisplayBlock, 0, 0, … | |
-{ "script", DisplayNone, 0, 0, … | |
-{ "section", DisplayBlock, 0, 0, … | |
-{ "select", DisplayInline | DisplaySelect, 0, 0, … | |
-{ "source", DisplayInline, 0, 0, … | |
-{ "strike", DisplayInline, MarkupStrike, 0, … | |
-{ "strong", DisplayInline, MarkupBold, 0, … | |
-{ "style", DisplayNone, 0, 0, … | |
-{ "summary", DisplayBlock, 0, 0, … | |
-{ "table", DisplayTable, 0, 0, … | |
-{ "tbody", DisplayInline, 0, DisplayTabl… | |
-{ "td", DisplayTableCell, 0, DisplayTabl… | |
-{ "template", DisplayNone, 0, 0, … | |
-{ "textarea", DisplayInline, 0, 0, … | |
-{ "tfoot", DisplayInline, 0, DisplayTabl… | |
-{ "th", DisplayTableCell, MarkupBold, DisplayTabl… | |
-{ "thead", DisplayInline, 0, DisplayTabl… | |
-{ "title", DisplayBlock, 0, 0, … | |
-{ "tr", DisplayTableRow, 0, DisplayTabl… | |
-{ "track", DisplayInline, 0, 0, … | |
-{ "u", DisplayInline, MarkupUnderline, 0, … | |
-{ "ul", DisplayList, 0, 0, … | |
-{ "var", DisplayInline, MarkupItalic, 0, … | |
-{ "video", DisplayInline, MarkupUnderline, 0, … | |
-{ "wbr", DisplayInline, 0, 0, … | |
-{ "xmp", DisplayPre, 0, 0, … | |
+{ "a", TagA, DisplayInline, MarkupUnderli… | |
+{ "address", TagAddress, DisplayBlock, 0, … | |
+{ "area", TagArea, DisplayInline, 0, … | |
+{ "article", TagArticle, DisplayBlock, 0, … | |
+{ "aside", TagAside, DisplayBlock, 0, … | |
+{ "audio", TagAudio, DisplayInline, MarkupUnderli… | |
+{ "b", TagB, DisplayInline, MarkupBold, … | |
+{ "base", TagBase, DisplayInline, 0, … | |
+{ "blink", TagBlink, DisplayInline, MarkupBlink, … | |
+{ "blockquote", TagBlockquote, DisplayBlock, 0, … | |
+{ "body", TagBody, DisplayBlock, 0, … | |
+{ "br", TagBr, 0, 0, … | |
+{ "button", TagButton, DisplayInline | DisplayButton, 0, … | |
+{ "cite", TagCite, DisplayInline, MarkupItalic,… | |
+{ "col", TagCol, DisplayInline, 0, … | |
+{ "colgroup", TagColgroup, DisplayInline, 0, … | |
+{ "datalist", TagDatalist, DisplayNone, 0, … | |
+{ "dd", TagDd, DisplayBlock, 0, … | |
+{ "del", TagDel, DisplayInline, MarkupStrike,… | |
+{ "details", TagDetails, DisplayBlock, 0, … | |
+{ "dfn", TagDfn, DisplayInline, MarkupItalic,… | |
+{ "dir", TagDir, DisplayList, 0, … | |
+{ "div", TagDiv, DisplayBlock, 0, … | |
+{ "dl", TagDl, DisplayBlock | DisplayDl, 0, … | |
+{ "dt", TagDt, DisplayBlock, MarkupBold, … | |
+{ "em", TagEm, DisplayInline, MarkupItalic,… | |
+{ "embed", TagEmbed, DisplayInline, 0, … | |
+{ "fieldset", TagFieldset, DisplayBlock, 0, … | |
+{ "figcaption", TagFigcaption, DisplayBlock, 0, … | |
+{ "figure", TagFigure, DisplayBlock, 0, … | |
+{ "footer", TagFooter, DisplayBlock, 0, … | |
+{ "form", TagForm, DisplayBlock, 0, … | |
+{ "frame", TagFrame, DisplayInline, 0, … | |
+{ "h1", TagH1, DisplayHeader, MarkupBold, … | |
+{ "h2", TagH2, DisplayHeader, MarkupBold, … | |
+{ "h3", TagH3, DisplayHeader, MarkupBold, … | |
+{ "h4", TagH4, DisplayHeader, MarkupBold, … | |
+{ "h5", TagH5, DisplayHeader, MarkupBold, … | |
+{ "h6", TagH6, DisplayHeader, MarkupBold, … | |
+{ "head", TagHead, DisplayBlock, 0, … | |
+{ "header", TagHeader, DisplayBlock, 0, … | |
+{ "hr", TagHr, DisplayBlock, 0, … | |
+{ "html", TagHtml, DisplayBlock, 0, … | |
+{ "i", TagI, DisplayInline, MarkupItalic,… | |
+{ "iframe", TagIframe, DisplayInline, 0, … | |
+{ "img", TagImg, DisplayInline, MarkupUnderli… | |
+{ "input", TagInput, DisplayInput, 0, … | |
+{ "ins", TagIns, DisplayInline, MarkupUnderli… | |
+{ "label", TagLabel, DisplayInline, 0, … | |
+{ "legend", TagLegend, DisplayBlock, 0, … | |
+{ "li", TagLi, DisplayListItem, 0, … | |
+{ "link", TagLink, DisplayInline, 0, … | |
+{ "main", TagMain, DisplayBlock, 0, … | |
+{ "mark", TagMark, DisplayInline, MarkupReverse… | |
+{ "menu", TagMenu, DisplayList, 0, … | |
+{ "meta", TagMeta, DisplayInline, 0, … | |
+{ "nav", TagNav, DisplayBlock, 0, … | |
+{ "object", TagObject, DisplayInline, 0, … | |
+{ "ol", TagOl, DisplayList | DisplayListOrdered, 0, … | |
+{ "option", TagOption, DisplayInline | DisplayOption, 0, … | |
+{ "p", TagP, DisplayBlock, 0, … | |
+{ "param", TagParam, DisplayInline, 0, … | |
+{ "pre", TagPre, DisplayPre, 0, … | |
+{ "s", TagS, DisplayInline, MarkupStrike,… | |
+{ "script", TagScript, DisplayNone, 0, … | |
+{ "search", TagSearch, DisplayBlock, 0, … | |
+{ "section", TagSection, DisplayBlock, 0, … | |
+{ "select", TagSelect, DisplayInline | DisplaySelect, 0, … | |
+{ "source", TagSource, DisplayInline, 0, … | |
+{ "strike", TagStrike, DisplayInline, MarkupStrike,… | |
+{ "strong", TagStrong, DisplayInline, MarkupBold, … | |
+{ "style", TagStyle, DisplayNone, 0, … | |
+{ "summary", TagSummary, DisplayBlock, 0, … | |
+{ "table", TagTable, DisplayTable, 0, … | |
+{ "tbody", TagTbody, DisplayInline, 0, … | |
+{ "td", TagTd, DisplayTableCell, 0, … | |
+{ "template", TagTemplate, DisplayNone, 0, … | |
+{ "textarea", TagTextarea, DisplayInline, 0, … | |
+{ "tfoot", TagTfoot, DisplayInline, 0, … | |
+{ "th", TagTh, DisplayTableCell, MarkupBold, … | |
+{ "thead", TagThead, DisplayInline, 0, … | |
+{ "title", TagTitle, DisplayBlock, 0, … | |
+{ "tr", TagTr, DisplayTableRow, 0, … | |
+{ "track", TagTrack, DisplayInline, 0, … | |
+{ "u", TagU, DisplayInline, MarkupUnderli… | |
+{ "ul", TagUl, DisplayList, 0, … | |
+{ "var", TagVar, DisplayInline, MarkupItalic,… | |
+{ "video", TagVideo, DisplayInline, MarkupUnderli… | |
+{ "wbr", TagWbr, DisplayInline, 0, … | |
+{ "xmp", TagXmp, DisplayPre, 0, … | |
}; | |
/* hint for compilers and static analyzers that a function exits */ | |
@@ -1374,9 +1393,10 @@ findlinkref(const char *url) | |
} | |
static struct linkref * | |
-addlinkref(const char *url, const char *_type, int ishidden, int linknr) | |
+addlinkref(const char *url, const char *_type, enum TagId tagid, int ishidden, | |
+ int linknr) | |
{ | |
- if (!tagcmp(_type, "a")) | |
+ if (tagid == TagA) | |
_type = "link"; | |
/* add to linked list */ | |
@@ -1386,6 +1406,7 @@ addlinkref(const char *url, const char *_type, int ishidd… | |
links_cur = links_cur->next = ecalloc(1, sizeof(*links_head)); | |
links_cur->url = estrdup(url); | |
links_cur->type = estrdup(_type); | |
+ links_cur->tagid = tagid; | |
links_cur->ishidden = ishidden; | |
links_cur->linknr = linknr; | |
@@ -1441,7 +1462,7 @@ handleinlinelink(void) | |
/* add hidden links directly to the reference, | |
the order doesn't matter */ | |
if (cur->tag.displaytype & DisplayNone) | |
- addlinkref(url, cur->tag.name, 1, 0); | |
+ addlinkref(url, cur->tag.name, cur->tag.id, 1, 0); | |
} | |
void | |
@@ -1658,7 +1679,7 @@ endnode(struct node *cur) | |
if (!ref) { | |
linkcount++; | |
ref = addlinkref(nodes_links[curnode].data, | |
- cur->tag.name, ishidden, linkcount); | |
+ cur->tag.name, cur->tag.id, ishidden, linkcoun… | |
} | |
if (showrefinline || showurlinline) { | |
@@ -1669,7 +1690,7 @@ endnode(struct node *cur) | |
if (showrefinline) | |
hprintf("[%zu]", ref->linknr); | |
if (showurlinline) { | |
- if (!tagcmp("link", ref->type)) | |
+ if (ref->tagid == TagA) | |
hprintf("[%s]", ref->url); | |
else | |
hprintf("[%s: %s]", ref->type, ref->url); | |
@@ -1687,7 +1708,7 @@ static void | |
xmltagend(XMLParser *p, const char *t, size_t tl, int isshort) | |
{ | |
struct tag *found, *tag; | |
- char *child, *childs[16]; | |
+ enum TagId child, childs[16]; | |
size_t nchilds; | |
int i, j, k, nchildfound, parenttype; | |
@@ -1701,35 +1722,39 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int i… | |
in reality the optional tag rules are more complex, see: | |
https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */ | |
- child = NULL; | |
+ child = 0; | |
nchilds = 0; | |
nchildfound = 0; | |
- parenttype = 0; | |
+ parenttype = 0; /* by default, seek until the root */ | |
if (found && found->displaytype & DisplayPre) { | |
skipinitialws = 0; /* do not skip white-space, for margins */ | |
} else if (found && found->displaytype & DisplayList) { | |
- childs[0] = "li"; | |
+ childs[0] = TagLi; | |
nchilds = 1; | |
parenttype = DisplayList; | |
} else if (found && found->displaytype & DisplayTableRow) { | |
- childs[0] = "td"; | |
+ childs[0] = TagTd; | |
nchilds = 1; | |
parenttype = DisplayTableRow; | |
} else if (found && found->displaytype & DisplayTable) { | |
- childs[0] = "td"; | |
+ childs[0] = TagTd; | |
nchilds = 1; | |
parenttype = DisplayTable; | |
} else if (found && found->displaytype & DisplaySelect) { | |
- childs[0] = "option"; | |
+ childs[0] = TagOption; | |
nchilds = 1; | |
parenttype = DisplaySelect; | |
} else if (found && found->displaytype & DisplayDl) { | |
- childs[0] = "p"; | |
- childs[1] = "dd"; | |
- childs[2] = "dt"; | |
+ childs[0] = TagP; | |
+ childs[1] = TagDd; | |
+ childs[2] = TagDt; | |
nchilds = 3; | |
parenttype = DisplayDl; | |
+ } else if (found && found->displaytype & DisplayBlock) { | |
+ childs[0] = TagP; | |
+ nchilds = 1; | |
+ parenttype = 0; /* seek until the root */ | |
} | |
if (nchilds > 0) { | |
@@ -1740,7 +1765,7 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int iss… | |
break; | |
for (j = 0; j < nchilds; j++) { | |
child = childs[j]; | |
- if (!tagcmp(nodes[i].tag.name, child)) { | |
+ if (nodes[i].tag.id == child) { | |
/* fake closing the previous tags */ | |
for (k = curnode; k >= i; k--) | |
endnode(&nodes[k]); | |
@@ -1794,7 +1819,8 @@ xmltagstart(XMLParser *p, const char *t, size_t tl) | |
{ | |
struct tag *found; | |
struct node *cur; | |
- char *child, *childs[16]; | |
+ enum TagId tagid; | |
+ enum TagId child, childs[16]; | |
size_t nchilds; | |
char *s; | |
int i, j, k, nchildfound, parenttype; | |
@@ -1821,55 +1847,56 @@ xmltagstart(XMLParser *p, const char *t, size_t tl) | |
in reality the optional tag rules are more complex, see: | |
https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */ | |
- child = NULL; | |
+ child = 0; | |
nchilds = 0; | |
nchildfound = 0; | |
- parenttype = 0; | |
+ parenttype = 0; /* by default, seek until the root */ | |
/* if optional tag <p> is open and a list element is found, close </p>… | |
if (found && found->displaytype & DisplayList) { | |
/* not inside a list */ | |
- childs[0] = "p"; | |
+ childs[0] = TagP; | |
nchilds = 1; | |
parenttype = DisplayList; | |
} else if (found && found->isoptional) { | |
- if (!tagcmp(t, "li")) { | |
- childs[0] = "li"; | |
+ tagid = found->id; | |
+ if (tagid == TagLi) { | |
+ childs[0] = TagLi; | |
nchilds = 1; | |
parenttype = DisplayList; | |
- } else if (!tagcmp(t, "td")) { | |
- childs[0] = "td"; | |
+ } else if (tagid == TagTd) { | |
+ childs[0] = TagTd; | |
nchilds = 1; | |
parenttype = DisplayTableRow; | |
- } else if (!tagcmp(t, "tr")) { | |
- childs[0] = "tr"; | |
+ } else if (tagid == TagTr) { | |
+ childs[0] = TagTr; | |
nchilds = 1; | |
parenttype = DisplayTable; | |
- } else if (!tagcmp(t, "p")) { | |
- childs[0] = "p"; | |
+ } else if (tagid == TagP) { | |
+ childs[0] = TagP; | |
nchilds = 1; | |
parenttype = 0; /* seek until the root */ | |
- } else if (!tagcmp(t, "option")) { | |
- childs[0] = "option"; | |
+ } else if (tagid == TagOption) { | |
+ childs[0] = TagOption; | |
nchilds = 1; | |
parenttype = DisplaySelect; | |
- } else if (!tagcmp(t, "dt")) { | |
- childs[0] = "dd"; | |
+ } else if (tagid == TagDt) { | |
+ childs[0] = TagDd; | |
nchilds = 1; | |
parenttype = DisplayDl; | |
- } else if (!tagcmp(t, "dd")) { | |
- childs[0] = "dd"; | |
- childs[1] = "dt"; | |
+ } else if (tagid == TagDd) { | |
+ childs[0] = TagDd; | |
+ childs[1] = TagDt; | |
nchilds = 2; | |
parenttype = DisplayDl; | |
- } else if (!tagcmp(t, cur->tag.name)) { | |
+ } else if (tagid == cur->tag.id) { | |
/* fake closing the previous tag if it is the same and… | |
xmltagend(p, t, tl, 0); | |
} | |
} else if (found && found->displaytype & DisplayBlock) { | |
/* check if we have an open "<p>" tag */ | |
- childs[0] = "p"; | |
- childs[1] = "dl"; | |
+ childs[0] = TagP; | |
+ childs[1] = TagDl; | |
nchilds = 2; | |
parenttype = DisplayDl; | |
} | |
@@ -1882,7 +1909,7 @@ xmltagstart(XMLParser *p, const char *t, size_t tl) | |
break; | |
for (j = 0; j < nchilds; j++) { | |
child = childs[j]; | |
- if (!tagcmp(nodes[i].tag.name, child)) { | |
+ if (nodes[i].tag.id == child) { | |
/* fake closing the previous tags */ | |
for (k = curnode; k >= i; k--) | |
xmltagend(p, nodes[k].tag.name… | |
@@ -1917,19 +1944,26 @@ xmltagstart(XMLParser *p, const char *t, size_t tl) | |
static void | |
xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) | |
{ | |
+ struct tag *found; | |
+ enum TagId tagid; | |
struct node *cur, *parent; | |
int i, margintop; | |
+ /* match tag */ | |
+ tagid = 0; | |
+ if ((found = findtag(t))) | |
+ tagid = found->id; | |
+ | |
/* temporary replace the callback except the reader and end of tag | |
restore the context once we receive the same ignored tag in the | |
end tag handler */ | |
- if (!tagcmp(t, "script")) { | |
+ if (tagid == TagScript) { | |
ignorestate = endtag = "</script>"; | |
getnext = p->getnext; /* for restore */ | |
p->getnext = getnext_ignore; | |
xmltagend(p, t, tl, 0); /* fake the call the tag was ended */ | |
return; | |
- } else if (!tagcmp(t, "style")) { | |
+ } else if (tagid == TagStyle) { | |
ignorestate = endtag = "</style>"; | |
getnext = p->getnext; /* for restore */ | |
p->getnext = getnext_ignore; | |
@@ -2089,12 +2123,12 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t t… | |
the node */ | |
cur->hasdata = 0; | |
- if (!tagcmp(t, "hr")) { /* ruler */ | |
+ if (tagid == TagHr) { /* ruler */ | |
i = termwidth - indent - defaultindent; | |
for (; i > 0; i--) | |
hprint(str_ruler); | |
cur->hasdata = 1; /* treat <hr/> as data */ | |
- } else if (!tagcmp(t, "br")) { | |
+ } else if (tagid == TagBr) { | |
hflush(); | |
hadnewline = 0; /* forced newline */ | |
hputchar('\n'); | |
@@ -2107,65 +2141,78 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t t… | |
} | |
static void | |
-xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name, | |
- size_t namelen, const char *value, size_t valuelen) | |
+xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, | |
+ size_t nl, const char *v, size_t vl) | |
{ | |
struct node *cur; | |
+ enum TagId tagid; | |
cur = &nodes[curnode]; | |
- | |
- if (!attrcmp(name, "class")) | |
- string_append(&attr_class, value, valuelen); | |
- else if (!attrcmp(name, "id")) | |
- string_append(&attr_id, value, valuelen); | |
- | |
- /* <base href="..." /> */ | |
- if (!basehrefset && !attrcmp(name, "href") && !tagcmp(tag, "base")) | |
- strlcat(basehrefdoc, value, sizeof(basehrefdoc)); | |
+ tagid = cur->tag.id; | |
/* hide tags with attribute aria-hidden or hidden */ | |
- if (!attrcmp(name, "aria-hidden") || !attrcmp(name, "hidden")) | |
+ if (!attrcmp(n, "aria-hidden") || !attrcmp(n, "hidden")) | |
cur->tag.displaytype |= DisplayNone; | |
- if (!tagcmp(tag, "select") && !attrcmp(name, "multiple")) | |
- cur->tag.displaytype |= DisplaySelectMulti; | |
+ if (!attrcmp(n, "class")) | |
+ string_append(&attr_class, v, vl); | |
+ else if (!attrcmp(n, "id")) | |
+ string_append(&attr_id, v, vl); | |
+ else if (!attrcmp(n, "type")) | |
+ string_append(&attr_type, v, vl); | |
+ else if (!attrcmp(n, "value")) | |
+ string_append(&attr_value, v, vl); | |
- if (!tagcmp(tag, "a") && !attrcmp(name, "href")) | |
- string_append(&attr_href, value, valuelen); | |
+ /* <base href="..." /> */ | |
+ if (!basehrefset && tagid == TagBase && !attrcmp(n, "href")) | |
+ strlcat(basehrefdoc, v, sizeof(basehrefdoc)); | |
- if (!tagcmp(tag, "object") && !attrcmp(name, "data")) | |
- string_append(&attr_data, value, valuelen); | |
+ if (tagid == TagA && !attrcmp(n, "href")) | |
+ string_append(&attr_href, v, vl); | |
- if ((!tagcmp(tag, "img") || !tagcmp(tag, "video") || | |
- !tagcmp(tag, "source") || !tagcmp(tag, "track") || | |
- !tagcmp(tag, "audio")) && | |
- !attrcmp(name, "src") && valuelen) | |
- string_append(&attr_src, value, valuelen); | |
+ if (tagid == TagSelect && !attrcmp(n, "multiple")) | |
+ cur->tag.displaytype |= DisplaySelectMulti; | |
- /* show img alt attribute as text. */ | |
- if (!tagcmp(tag, "img") && !attrcmp(name, "alt")) | |
- string_append(&attr_alt, value, valuelen); | |
+ if (tagid == TagObject && !attrcmp(n, "data")) | |
+ string_append(&attr_data, v, vl); | |
- if (!attrcmp(name, "checked")) | |
- string_append(&attr_checked, value, valuelen); | |
- else if (!attrcmp(name, "type")) | |
- string_append(&attr_type, value, valuelen); | |
- else if (!attrcmp(name, "value")) | |
- string_append(&attr_value, value, valuelen); | |
+ /* show img alt attribute as text. */ | |
+ if (tagid == TagImg && !attrcmp(n, "alt")) | |
+ string_append(&attr_alt, v, vl); | |
+ | |
+ if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked")) | |
+ string_append(&attr_checked, v, vl); | |
+ | |
+ /* src attribute */ | |
+ switch (tagid) { | |
+ case TagAudio: | |
+ case TagEmbed: | |
+ case TagFrame: | |
+ case TagIframe: | |
+ case TagImg: | |
+ case TagSource: | |
+ case TagTrack: | |
+ case TagVideo: | |
+ if (!attrcmp(n, "src")) | |
+ string_append(&attr_src, v, vl); | |
+ break; | |
+ default: | |
+ break; | |
+ } | |
} | |
static void | |
-xmlattrentity(XMLParser *p, const char *tag, size_t taglen, const char *name, | |
- size_t namelen, const char *value, size_t valuelen) | |
+xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, | |
+ size_t nl, const char *v, size_t vl) | |
{ | |
char buf[16]; | |
- int n; | |
+ int len; | |
- n = xml_entitytostr(value, buf, sizeof(buf)); | |
- if (n > 0) | |
- xmlattr(p, tag, taglen, name, namelen, buf, (size_t)n); | |
+ len = xml_entitytostr(v, buf, sizeof(buf)); | |
+ if (len > 0) | |
+ xmlattr(p, t, tl, n, nl, buf, (size_t)len); | |
else | |
- xmlattr(p, tag, taglen, name, namelen, value, valuelen); | |
+ xmlattr(p, t, tl, n, nl, v, vl); | |
} | |
static void | |
@@ -2173,12 +2220,14 @@ xmlattrend(XMLParser *p, const char *t, size_t tl, cons… | |
size_t nl) | |
{ | |
struct node *cur; | |
+ enum TagId tagid; | |
cur = &nodes[curnode]; | |
+ tagid = cur->tag.id; | |
/* set base URL, if it is set it cannot be overwritten again */ | |
if (!basehrefset && basehrefdoc[0] && | |
- !attrcmp(n, "href") && !tagcmp(t, "base")) | |
+ tagid == TagBase && !attrcmp(n, "href")) | |
basehrefset = uri_parse(basehrefdoc, &base) != -1 ? 1 : 0; | |
/* if attribute checked is set but it has no value then set it to "che… | |
@@ -2190,6 +2239,12 @@ static void | |
xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, | |
size_t nl) | |
{ | |
+ struct node *cur; | |
+ enum TagId tagid; | |
+ | |
+ cur = &nodes[curnode]; | |
+ tagid = cur->tag.id; | |
+ | |
if (!attrcmp(n, "alt")) | |
string_clear(&attr_alt); | |
else if (!attrcmp(n, "checked")) | |
@@ -2209,7 +2264,7 @@ xmlattrstart(XMLParser *p, const char *t, size_t tl, cons… | |
else if (!attrcmp(n, "value")) | |
string_clear(&attr_value); | |
- if (basehrefdoc[0] && !attrcmp(n, "href") && !tagcmp(t, "base")) | |
+ if (basehrefdoc[0] && tagid == TagBase && !attrcmp(n, "href")) | |
basehrefdoc[0] = '\0'; | |
} | |
@@ -2236,7 +2291,8 @@ main(int argc, char **argv) | |
break; | |
case 'b': | |
basehref = EARGF(usage()); | |
- if (uri_parse(basehref, &base) == -1) | |
+ if (uri_parse(basehref, &base) == -1 || | |
+ !base.proto[0]) | |
usage(); | |
basehrefset = 1; | |
break; |