optional tag handling improvements - webdump - HTML to plain-text converter for… | |
git clone git://git.codemadness.org/webdump | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit 2e32abeb2743e5fce55bdfc1591bb66eedd63a45 | |
parent 9f4c3a0a47eb2bb127db5a270dfa27ad368deb6a | |
Author: Hiltjo Posthuma <[email protected]> | |
Date: Mon, 11 Sep 2023 19:03:25 +0200 | |
optional tag handling improvements | |
Much better handling for the optional tags: <p>, <dd>, <dt>, <dl>. | |
An example page: | |
https://www.openbsd.org/policy.html | |
Some tags to add: | |
- aside | |
- menu | |
- address | |
- details | |
Maybe: | |
- search | |
- hgroup | |
Diffstat: | |
M webdump.c | 105 +++++++++++++++++++++++------… | |
1 file changed, 78 insertions(+), 27 deletions(-) | |
--- | |
diff --git a/webdump.c b/webdump.c | |
@@ -78,7 +78,8 @@ enum DisplayType { | |
DisplayTable = 1 << 9, | |
DisplayTableRow = 1 << 10, | |
DisplayTableCell = 1 << 11, | |
- DisplayHeader = 1 << 12 | |
+ DisplayHeader = 1 << 12, | |
+ DisplayDl = 1 << 13 | |
}; | |
/* ANSI markup */ | |
@@ -222,7 +223,7 @@ static struct tag tags[] = { | |
{ "dd", DisplayBlock, 0, 0, … | |
{ "del", DisplayInline, MarkupStrike, 0, … | |
{ "div", DisplayBlock, 0, 0, … | |
-{ "dl", DisplayInline, 0, 0, … | |
+{ "dl", DisplayBlock|DisplayDl, 0, 0, … | |
{ "dt", DisplayBlock, MarkupBold, 0, … | |
{ "em", DisplayInline, MarkupItalic, 0, … | |
{ "embed", DisplayInline, 0, 0, … | |
@@ -1600,8 +1601,9 @@ static void | |
xmltagend(XMLParser *p, const char *t, size_t tl, int isshort) | |
{ | |
struct tag *found, *tag; | |
- const char *child; | |
- int i, j, parenttype; | |
+ char *child, *childs[16]; | |
+ size_t nchilds; | |
+ int i, j, k, nchildfound, parenttype; | |
/* ignore closing of void elements, like </br>, which is not allowed */ | |
if ((found = findtag(t))) { | |
@@ -1614,31 +1616,48 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int i… | |
https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */ | |
child = NULL; | |
+ nchilds = 0; | |
+ nchildfound = 0; | |
parenttype = 0; | |
if (found && found->displaytype & DisplayPre) { | |
skipinitialws = 0; /* do not skip white-space, for margins */ | |
} else if (found && found->displaytype & DisplayList) { | |
- child = "li"; | |
+ childs[0] = "li"; | |
+ nchilds = 1; | |
parenttype = DisplayList; | |
} else if (found && found->displaytype & DisplayTableRow) { | |
- child = "td"; | |
+ childs[0] = "td"; | |
+ nchilds = 1; | |
parenttype = DisplayTableRow; | |
} else if (found && found->displaytype & DisplayTable) { | |
- child = "td"; | |
+ childs[0] = "td"; | |
+ nchilds = 1; | |
parenttype = DisplayTable; | |
+ } else if (found && found->displaytype & DisplayDl) { | |
+ childs[0] = "p"; | |
+ childs[1] = "dd"; | |
+ childs[2] = "dt"; | |
+ nchilds = 3; | |
+ parenttype = DisplayDl; | |
} | |
- if (child && parenttype) { | |
+ if (nchilds > 0) { | |
for (i = curnode; i >= 0; i--) { | |
- if ((nodes[i].tag.displaytype & parenttype)) | |
+ if (nchildfound) | |
break; | |
- if (!tagcmp(nodes[i].tag.name, child)) { | |
- /* fake closing the previous tags */ | |
- for (j = curnode; j >= i; j--) | |
- endnode(&nodes[j]); | |
- curnode = j; | |
+ if ((nodes[i].tag.displaytype & parenttype)) | |
break; | |
+ for (j = 0; j < nchilds; j++) { | |
+ child = childs[j]; | |
+ if (!tagcmp(nodes[i].tag.name, child)) { | |
+ /* fake closing the previous tags */ | |
+ for (k = curnode; k >= i; k--) | |
+ endnode(&nodes[k]); | |
+ curnode = k; | |
+ nchildfound = 1; | |
+ break; | |
+ } | |
} | |
} | |
} | |
@@ -1685,9 +1704,10 @@ xmltagstart(XMLParser *p, const char *t, size_t tl) | |
{ | |
struct tag *found; | |
struct node *cur; | |
- const char *child; | |
+ char *child, *childs[16]; | |
+ size_t nchilds; | |
char *s; | |
- int i, j, parenttype; | |
+ int i, j, k, nchildfound, parenttype; | |
if (curnode >= MAX_DEPTH - 2) | |
errx(1, "max tag depth reached: %d\n", curnode); | |
@@ -1711,38 +1731,69 @@ xmltagstart(XMLParser *p, const char *t, size_t tl) | |
https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */ | |
child = NULL; | |
+ nchilds = 0; | |
+ nchildfound = 0; | |
parenttype = 0; | |
- /* if optional tag <p> is open and a block element is found, close </p… | |
+ /* if optional tag <p> is open and a list element is found, close </p>… | |
if (found && found->displaytype & DisplayList) { | |
/* not inside a list */ | |
- child = "p"; | |
+ childs[0] = "p"; | |
+ nchilds = 1; | |
parenttype = DisplayList; | |
} else if (found && found->isoptional) { | |
if (!tagcmp(t, "li")) { | |
- child = "li"; | |
+ childs[0] = "li"; | |
+ nchilds = 1; | |
parenttype = DisplayList; | |
} else if (!tagcmp(t, "td")) { | |
- child = "td"; | |
+ childs[0] = "td"; | |
+ nchilds = 1; | |
parenttype = DisplayTableRow; | |
} else if (!tagcmp(t, "tr")) { | |
- child = "tr"; | |
+ childs[0] = "tr"; | |
+ nchilds = 1; | |
parenttype = DisplayTable; | |
+ } else if (!tagcmp(t, "p")) { | |
+ childs[0] = "p"; | |
+ nchilds = 1; | |
+ parenttype = 0; /* seek until the root */ | |
+ } else if (!tagcmp(t, "dt")) { | |
+ childs[0] = "dd"; | |
+ nchilds = 1; | |
+ parenttype = 0; /* seek until the root */ | |
+ } else if (!tagcmp(t, "dd")) { | |
+ childs[0] = "dd"; | |
+ childs[1] = "dt"; | |
+ nchilds = 2; | |
+ parenttype = 0; /* seek until the root */ | |
} else if (!tagcmp(t, cur->tag.name)) { | |
/* fake closing the previous tag if it is the same and… | |
xmltagend(p, t, tl, 0); | |
} | |
+ } else if (found && found->displaytype & DisplayBlock) { | |
+ /* check if we have an open "<p>" tag */ | |
+ childs[0] = "p"; | |
+ childs[1] = "dl"; | |
+ nchilds = 2; | |
+ parenttype = 0; /* seek until the root */ | |
} | |
- if (child && parenttype) { | |
+ if (nchilds > 0) { | |
for (i = curnode; i >= 0; i--) { | |
- if ((nodes[i].tag.displaytype & parenttype)) | |
+ if (nchildfound) | |
break; | |
- if (!tagcmp(nodes[i].tag.name, child)) { | |
- /* fake closing the previous tags */ | |
- for (j = curnode; j >= i; j--) | |
- xmltagend(p, nodes[j].tag.name, strlen… | |
+ if ((nodes[i].tag.displaytype & parenttype)) | |
break; | |
+ for (j = 0; j < nchilds; j++) { | |
+ child = childs[j]; | |
+ if (!tagcmp(nodes[i].tag.name, child)) { | |
+ /* fake closing the previous tags */ | |
+ for (k = curnode; k >= i; k--) | |
+ xmltagend(p, nodes[k].tag.name… | |
+ nchildfound = 1; | |
+ break; | |
+ } | |
} | |
} | |
} |