Introduction
Introduction Statistics Contact Development Disclaimer Help
ignore incorrect unescaped HTML in <style> or <script> in a better way - grabtitle
git clone git://git.codemadness.org/grabtitle
Log
Files
Refs
README
LICENSE
---
commit d908478d0f84bc275428fd71e934c993bb29211c
parent 0cca681092b680c5b80da62771d47fa383be6cd1
Author: Hiltjo Posthuma <[email protected]>
Date: Mon, 10 Dec 2018 19:01:58 +0100
ignore incorrect unescaped HTML in <style> or <script> in a better way
this way we can still use a (mostly) XML parser for HTML data.
Diffstat:
M grabtitle.c | 71 +++++++++++++++++++----------…
1 file changed, 44 insertions(+), 27 deletions(-)
---
diff --git a/grabtitle.c b/grabtitle.c
@@ -16,28 +16,38 @@
#endif
static XMLParser parser;
-static int istitle, ignore;
-
-static void
-xmltagstart(XMLParser *p, const char *t, size_t tl)
+static const char *state, *endtag;
+static int (*getnext)(void);
+
+/* return a space for all data until some case-insensitive string occurs. This
+ is used to parse incorrect HTML/XML that contains unescaped HTML in script
+ or style tags. */
+static inline int
+getchar_ignore(void)
{
- if ((tl == 6 && !strcasecmp(t, "script")) ||
- (tl == 5 && !strcasecmp(t, "style")))
- ignore = 1;
- if (!ignore && tl == 5 && !strcasecmp(t, "title"))
- istitle = 1;
+ int c;
+
+ if ((c = getnext()) == EOF)
+ return EOF;
+
+ if (tolower(c) == tolower((unsigned char)*state)) {
+ state++;
+ if (*state == '\0') {
+ parser.getnext = getnext; /* restore */
+ return c;
+ }
+ } else {
+ state = endtag;
+ }
+
+ return ' ';
}
static void
xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
{
- if (ignore && ((tl == 6 && !strcasecmp(t, "script")) ||
- (tl == 5 && !strcasecmp(t, "style"))))
- ignore = 0;
- if (istitle && tl == 5 && !strcasecmp(t, "title")) {
- putchar('\n');
- exit(0);
- }
+ putchar('\n');
+ exit(0);
}
/* data and CDATA */
@@ -46,9 +56,6 @@ xmldata(XMLParser *p, const char *d, size_t dl)
{
size_t i;
- if (!istitle)
- return;
-
for (i = 0; *d && i < dl; i++, d++) {
if (iscntrl((unsigned char)*d))
putchar(' ');
@@ -63,15 +70,30 @@ xmldataentity(XMLParser *p, const char *d, size_t dl)
char buf[16];
ssize_t len;
- if (!istitle)
- return;
-
if ((len = xml_entitytostr(d, buf, sizeof(buf))))
xmldata(p, buf, (size_t)len);
else
xmldata(p, d, dl);
}
+static void
+xmltagstart(XMLParser *p, const char *t, size_t tl)
+{
+ if (tl == 6 && !strcasecmp(t, "script")) {
+ state = endtag = "</script>";
+ getnext = p->getnext; /* for restore */
+ p->getnext = getchar_ignore;
+ } else if (tl == 5 && !strcasecmp(t, "style")) {
+ state = endtag = "</style>";
+ getnext = p->getnext; /* for restore */
+ p->getnext = getchar_ignore;
+ } else if (tl == 5 && !strcasecmp(t, "title")) {
+ p->xmltagend = xmltagend;
+ p->xmlcdata = p->xmldata = xmldata;
+ p->xmldataentity = xmldataentity;
+ }
+}
+
int
main(int argc, char *argv[])
{
@@ -81,11 +103,6 @@ main(int argc, char *argv[])
}
parser.xmltagstart = xmltagstart;
- parser.xmltagend = xmltagend;
- parser.xmldata = xmldata;
- parser.xmlcdata = xmldata;
- parser.xmldataentity = xmldataentity;
-
parser.getnext = getchar;
xml_parse(&parser);
You are viewing proxied material from codemadness.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.