ignore incorrect unescaped HTML in <style> or <script> in a better way - grabti… | |
git clone git://git.codemadness.org/grabtitle | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit d908478d0f84bc275428fd71e934c993bb29211c | |
parent 0cca681092b680c5b80da62771d47fa383be6cd1 | |
Author: Hiltjo Posthuma <[email protected]> | |
Date: Mon, 10 Dec 2018 19:01:58 +0100 | |
ignore incorrect unescaped HTML in <style> or <script> in a better way | |
this way we can still use a (mostly) XML parser for HTML data. | |
Diffstat: | |
M grabtitle.c | 71 +++++++++++++++++++----------… | |
1 file changed, 44 insertions(+), 27 deletions(-) | |
--- | |
diff --git a/grabtitle.c b/grabtitle.c | |
@@ -16,28 +16,38 @@ | |
#endif | |
static XMLParser parser; | |
-static int istitle, ignore; | |
- | |
-static void | |
-xmltagstart(XMLParser *p, const char *t, size_t tl) | |
+static const char *state, *endtag; | |
+static int (*getnext)(void); | |
+ | |
+/* return a space for every character read until the end tag has been matched | |
+ in full (case-insensitively). This is used to parse incorrect HTML/XML that | |
+ contains unescaped HTML in script or style tags. */ | |
+static inline int | |
+getchar_ignore(void) | |
{ | |
- if ((tl == 6 && !strcasecmp(t, "script")) || | |
- (tl == 5 && !strcasecmp(t, "style"))) | |
- ignore = 1; | |
- if (!ignore && tl == 5 && !strcasecmp(t, "title")) | |
- istitle = 1; | |
+ int c; | |
+ | |
+ if ((c = getnext()) == EOF) | |
+ return EOF; | |
+ | |
+ if (tolower(c) == tolower((unsigned char)*state)) { | |
+ state++; | |
+ if (*state == '\0') { | |
+ parser.getnext = getnext; /* restore */ | |
+ return c; | |
+ } | |
+ } else { | |
+ state = endtag; | |
+ } | |
+ | |
+ return ' '; | |
} | |
static void | |
xmltagend(XMLParser *p, const char *t, size_t tl, int isshort) | |
{ | |
- if (ignore && ((tl == 6 && !strcasecmp(t, "script")) || | |
- (tl == 5 && !strcasecmp(t, "style")))) | |
- ignore = 0; | |
- if (istitle && tl == 5 && !strcasecmp(t, "title")) { | |
- putchar('\n'); | |
- exit(0); | |
- } | |
+ putchar('\n'); | |
+ exit(0); | |
} | |
/* data and CDATA */ | |
@@ -46,9 +56,6 @@ xmldata(XMLParser *p, const char *d, size_t dl) | |
{ | |
size_t i; | |
- if (!istitle) | |
- return; | |
- | |
for (i = 0; *d && i < dl; i++, d++) { | |
if (iscntrl((unsigned char)*d)) | |
putchar(' '); | |
@@ -63,15 +70,30 @@ xmldataentity(XMLParser *p, const char *d, size_t dl) | |
char buf[16]; | |
ssize_t len; | |
- if (!istitle) | |
- return; | |
- | |
if ((len = xml_entitytostr(d, buf, sizeof(buf)))) | |
xmldata(p, buf, (size_t)len); | |
else | |
xmldata(p, d, dl); | |
} | |
+static void | |
+xmltagstart(XMLParser *p, const char *t, size_t tl) | |
+{ | |
+ if (tl == 6 && !strcasecmp(t, "script")) { | |
+ state = endtag = "</script>"; | |
+ getnext = p->getnext; /* for restore */ | |
+ p->getnext = getchar_ignore; | |
+ } else if (tl == 5 && !strcasecmp(t, "style")) { | |
+ state = endtag = "</style>"; | |
+ getnext = p->getnext; /* for restore */ | |
+ p->getnext = getchar_ignore; | |
+ } else if (tl == 5 && !strcasecmp(t, "title")) { | |
+ p->xmltagend = xmltagend; | |
+ p->xmlcdata = p->xmldata = xmldata; | |
+ p->xmldataentity = xmldataentity; | |
+ } | |
+} | |
+ | |
int | |
main(int argc, char *argv[]) | |
{ | |
@@ -81,11 +103,6 @@ main(int argc, char *argv[]) | |
} | |
parser.xmltagstart = xmltagstart; | |
- parser.xmltagend = xmltagend; | |
- parser.xmldata = xmldata; | |
- parser.xmlcdata = xmldata; | |
- parser.xmldataentity = xmldataentity; | |
- | |
parser.getnext = getchar; | |
xml_parse(&parser); | |