Introduction
Introduction Statistics Contact Development Disclaimer Help
ignore incorrect unescaped HTML in <style> or <script> in a better way - tscrape
git clone git://git.codemadness.org/tscrape
Log
Files
Refs
README
LICENSE
---
commit ed3a979265abe557e783ea22c6a09fb96241ff95
parent 0fac9621c44b76c38d911438b1966d665e3b8134
Author: Hiltjo Posthuma <[email protected]>
Date: Mon, 17 Dec 2018 18:32:50 +0100
ignore incorrect unescaped HTML in <style> or <script> in a better way
Diffstat:
M tscrape.c | 53 +++++++++++++++++++++--------…
1 file changed, 37 insertions(+), 16 deletions(-)
---
diff --git a/tscrape.c b/tscrape.c
@@ -40,6 +40,34 @@ static char retweetid[64];
static int state;
static XMLParser p;
+static const char *ignorestate, *endtag;
+static int (*getnext)(void);
+
+/* return a space for all data until some case-insensitive string occurs. This
+ is used to parse incorrect HTML/XML that contains unescaped HTML in script
+ or style tags. If you see some </script> tag in a CDATA or comment
+ section then e-mail W3C and tell them the web is too complex. */
+static inline int
+getchar_ignore(void)
+{
+ int c;
+
+ if ((c = getnext()) == EOF)
+ return EOF;
+
+ if (tolower(c) == tolower((unsigned char)*ignorestate)) {
+ ignorestate++;
+ if (*ignorestate == '\0') {
+ p.getnext = getnext; /* restore */
+ return c;
+ }
+ } else {
+ ignorestate = endtag;
+ }
+
+ return ' ';
+}
+
static void
printtweet(void)
{
@@ -100,16 +128,6 @@ xmltagend(XMLParser *x, const char *t, size_t tl, int issh…
state &= ~(Timestamp);
}
-static char ignoretag[8];
-static XMLParser xo; /* old context */
-
-static void
-xmlignoretagend(XMLParser *p, const char *t, size_t tl, int isshort)
-{
- if (!strcasecmp(t, ignoretag))
- memcpy(p, &xo, sizeof(*p)); /* restore context */
-}
-
static void
xmltagstart(XMLParser *x, const char *t, size_t tl)
{
@@ -122,12 +140,15 @@ xmltagstartparsed(XMLParser *x, const char *t, size_t tl,…
/* temporary replace the callback except the reader and end of tag
restore the context once we receive the same ignored tag in the
end tag handler */
- if (!strcasecmp(t, "script") || !strcasecmp(t, "style")) {
- strlcpy(ignoretag, t, sizeof(ignoretag));
- memcpy(&xo, x, sizeof(xo)); /* store old context */
- memset(x, 0, sizeof(*x));
- x->xmltagend = xmlignoretagend;
- x->getnext = xo.getnext;
+ if (!strcasecmp(t, "script")) {
+ ignorestate = endtag = "</script>";
+ getnext = x->getnext; /* for restore */
+ x->getnext = getchar_ignore;
+ return;
+ } else if (!strcasecmp(t, "style")) {
+ ignorestate = endtag = "</style>";
+ getnext = x->getnext; /* for restore */
+ x->getnext = getchar_ignore;
return;
}
You are viewing proxied material from codemadness.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.