improve forms a bit - webdump - HTML to plain-text converter for webpages | |
git clone git://git.codemadness.org/webdump | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit 630f76162a192327a3eecd4fc0adcb9b31cd4504 | |
parent 0705fb754f00c7866b2cc8cee0739a88a584a2e1 | |
Author: Hiltjo Posthuma <[email protected]> | |
Date: Fri, 8 Sep 2023 15:05:38 +0200 | |
improve forms a bit | |
- Treat fieldset and legend as block elements. | |
- Support more input types; a missing or unsupported type is treated as "text". | |
- Show the default checked state for radio buttons and checkboxes. | |
- Don't show hidden input types. | |
- Add a DisplayInput value to enum DisplayType so input tags can be checked faster. | |
Diffstat: | |
M webdump.c | 64 +++++++++++++++++++++--------… | |
1 file changed, 44 insertions(+), 20 deletions(-) | |
--- | |
diff --git a/webdump.c b/webdump.c | |
@@ -68,16 +68,17 @@ enum DisplayType { | |
DisplayUnknown = 0, | |
DisplayInline = 1 << 0, | |
DisplayInlineBlock = 1 << 1, /* unused for now */ | |
- DisplayBlock = 1 << 2, | |
- DisplayNone = 1 << 3, | |
- DisplayPre = 1 << 4, | |
- DisplayList = 1 << 5, | |
- DisplayListOrdered = 1 << 6, | |
- DisplayListItem = 1 << 7, | |
- DisplayTable = 1 << 8, | |
- DisplayTableRow = 1 << 9, | |
- DisplayTableCell = 1 << 10, | |
- DisplayHeader = 1 << 11 | |
+ DisplayInput = 1 << 2, | |
+ DisplayBlock = 1 << 3, | |
+ DisplayNone = 1 << 4, | |
+ DisplayPre = 1 << 5, | |
+ DisplayList = 1 << 6, | |
+ DisplayListOrdered = 1 << 7, | |
+ DisplayListItem = 1 << 8, | |
+ DisplayTable = 1 << 9, | |
+ DisplayTableRow = 1 << 10, | |
+ DisplayTableCell = 1 << 11, | |
+ DisplayHeader = 1 << 12 | |
}; | |
/* ANSI markup */ | |
@@ -143,7 +144,9 @@ struct selectors { | |
}; | |
static const char *str_bullet_item = "* "; | |
+static const char *str_checkbox_checked = "x"; | |
static const char *str_ruler = "-"; | |
+static const char *str_radio_checked = "*"; | |
/* base href, to make URLs absolute */ | |
static char *basehref = ""; | |
@@ -153,6 +156,7 @@ static struct uri base; | |
/* buffers for some attributes of the current tag */ | |
String attr_alt; /* alt attribute */ | |
+String attr_checked; /* checked attribute */ | |
String attr_class; /* class attribute */ | |
String attr_href; /* href attribute */ | |
String attr_id; /* id attribute */ | |
@@ -221,6 +225,7 @@ static struct tag tags[] = { | |
{ "dt", DisplayBlock, MarkupBold, 0, … | |
{ "em", DisplayInline, MarkupItalic, 0, … | |
{ "embed", DisplayInline, 0, 0, … | |
+{ "fieldset", DisplayBlock, 0, 0, … | |
{ "figcaption", DisplayBlock, 0, 0, … | |
{ "figure", DisplayBlock, 0, 0, … | |
{ "footer", DisplayBlock, 0, 0, … | |
@@ -236,8 +241,9 @@ static struct tag tags[] = { | |
{ "html", DisplayBlock, 0, 0, … | |
{ "i", DisplayInline, MarkupItalic, 0, … | |
{ "img", DisplayInline, MarkupUnderline, 0, … | |
-{ "input", DisplayInline, 0, 0, … | |
+{ "input", DisplayInput, 0, 0, … | |
{ "label", DisplayInline, MarkupBold, 0, … | |
+{ "legend", DisplayBlock, 0, 0, … | |
{ "li", DisplayListItem, 0, DisplayList… | |
{ "link", DisplayInline, 0, 0, … | |
{ "main", DisplayBlock, 0, 0, … | |
@@ -1684,6 +1690,7 @@ xmltagstart(XMLParser *p, const char *t, size_t tl) | |
cur = &nodes[curnode]; | |
string_clear(&attr_alt); | |
+ string_clear(&attr_checked); | |
string_clear(&attr_class); | |
string_clear(&attr_href); | |
string_clear(&attr_id); | |
@@ -1891,18 +1898,23 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t t… | |
if (!tagcmp(cur->tag.name, "input")) { | |
if (!attr_type.len) { | |
hprintf("[%-15s]", attr_value.len ? attr_value.data : … | |
- } else if (!strcasecmp(attr_type.data, "text")) { | |
- hprintf("[%-15s]", attr_value.len ? attr_value.data : … | |
- } else if (!strcasecmp(attr_type.data, "search")) { | |
- hprintf("[%-15s]", attr_value.len ? attr_value.data : … | |
- } else if (!strcasecmp(attr_type.data, "button")) { | |
- hprintf("[%s]", attr_value.len ? attr_value.data : ""); | |
- } else if (!strcasecmp(attr_type.data, "submit")) { | |
+ } else if (!strcasecmp(attr_type.data, "button") || | |
+ !strcasecmp(attr_type.data, "submit") || | |
+ !strcasecmp(attr_type.data, "reset")) { | |
hprintf("[%s]", attr_value.len ? attr_value.data : ""); | |
} else if (!strcasecmp(attr_type.data, "checkbox")) { | |
- hprint("[ ]"); /* TODO: show x or unicode checkmark wh… | |
+ hprintf("[%s]", | |
+ attr_checked.len && | |
+ !strcasecmp(attr_checked.data, "checked") ? st… | |
} else if (!strcasecmp(attr_type.data, "radio")) { | |
- hprint("( )"); /* TODO: show x or unicode checkmark wh… | |
+ hprintf("[%s]", | |
+ attr_checked.len && | |
+ !strcasecmp(attr_checked.data, "checked") ? st… | |
+ } else if (!strcasecmp(attr_type.data, "hidden")) { | |
+ cur->tag.displaytype |= DisplayNone; | |
+ } else { | |
+ /* unrecognized / default case is text */ | |
+ hprintf("[%-15s]", attr_value.len ? attr_value.data : … | |
} | |
} | |
@@ -1963,6 +1975,8 @@ xmlattr(XMLParser *p, const char *tag, size_t taglen, con… | |
if (!tagcmp(tag, "img") && !attrcmp(name, "alt")) | |
string_append(&attr_alt, value, valuelen); | |
+ if (!attrcmp(name, "checked")) | |
+ string_append(&attr_checked, value, valuelen); | |
if (!attrcmp(name, "type")) | |
string_append(&attr_type, value, valuelen); | |
if (!attrcmp(name, "value")) | |
@@ -1987,10 +2001,18 @@ static void | |
xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n, | |
size_t nl) | |
{ | |
+ struct node *cur; | |
+ | |
+ cur = &nodes[curnode]; | |
+ | |
/* set base URL, if it is set it cannot be overwritten again */ | |
if (!basehrefset && basehrefdoc[0] && | |
!attrcmp(n, "href") && !tagcmp(t, "base")) | |
basehrefset = uri_parse(basehrefdoc, &base) != -1 ? 1 : 0; | |
+ | |
+ /* if attribute checked is set but it has no value then set it to "che… | |
+ if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked") && !… | |
+ string_append(&attr_checked, "checked", sizeof("checked") - 1); | |
} | |
static void | |
@@ -1999,6 +2021,8 @@ xmlattrstart(XMLParser *p, const char *t, size_t tl, cons… | |
{ | |
if (!attrcmp(n, "alt")) | |
string_clear(&attr_alt); | |
+ else if (!attrcmp(n, "checked")) | |
+ string_clear(&attr_checked); | |
else if (!attrcmp(n, "class")) | |
string_clear(&attr_class); | |
else if (!attrcmp(n, "href")) |