GopherProxy

	ignore incorrect unescaped HTML in <style> or <script> in a better way - tscrap…
	git clone git://git.codemadness.org/tscrape
	Log
	Files
	Refs
	README
	LICENSE
	---
	commit ed3a979265abe557e783ea22c6a09fb96241ff95
	parent 0fac9621c44b76c38d911438b1966d665e3b8134
	Author: Hiltjo Posthuma <[email protected]>
	Date: Mon, 17 Dec 2018 18:32:50 +0100

	ignore incorrect unescaped HTML in <style> or <script> in a better way

	Diffstat:
	M tscrape.c \| 53 +++++++++++++++++++++--------…

	1 file changed, 37 insertions(+), 16 deletions(-)
	---
	diff --git a/tscrape.c b/tscrape.c
	@@ -40,6 +40,34 @@ static char retweetid[64];
	static int state;
	static XMLParser p;

	+static const char ignorestate, endtag;
	+static int (*getnext)(void);
	+
	+/* return a space for all data until some case-insensitive string occurs. This
	+ is used to parse incorrect HTML/XML that contains unescaped HTML in script
	+ or style tags. If you see some </script> tag in a CDATA or comment
	+ section then e-mail W3C and tell them the web is too complex. */
	+static inline int
	+getchar_ignore(void)
	+{
	+ int c;
	+
	+ if ((c = getnext()) == EOF)
	+ return EOF;
	+
	+ if (tolower(c) == tolower((unsigned char)*ignorestate)) {
	+ ignorestate++;
	+ if (*ignorestate == '\0') {
	+ p.getnext = getnext; /* restore */
	+ return c;
	+ }
	+ } else {
	+ ignorestate = endtag;
	+ }
	+
	+ return ' ';
	+}
	+
	static void
	printtweet(void)
	{
	@@ -100,16 +128,6 @@ xmltagend(XMLParser x, const char t, size_t tl, int issh…
	state &= ~(Timestamp);
	}

	-static char ignoretag[8];
	-static XMLParser xo; /* old context */
	-
	-static void
	-xmlignoretagend(XMLParser p, const char t, size_t tl, int isshort)
	-{
	- if (!strcasecmp(t, ignoretag))
	- memcpy(p, &xo, sizeof(p)); / restore context */
	-}
	-
	static void
	xmltagstart(XMLParser x, const char t, size_t tl)
	{
	@@ -122,12 +140,15 @@ xmltagstartparsed(XMLParser x, const char t, size_t tl,…
	/* temporary replace the callback except the reader and end of tag
	restore the context once we receive the same ignored tag in the
	end tag handler */
	- if (!strcasecmp(t, "script") \|\| !strcasecmp(t, "style")) {
	- strlcpy(ignoretag, t, sizeof(ignoretag));
	- memcpy(&xo, x, sizeof(xo)); /* store old context */
	- memset(x, 0, sizeof(*x));
	- x->xmltagend = xmlignoretagend;
	- x->getnext = xo.getnext;
	+ if (!strcasecmp(t, "script")) {
	+ ignorestate = endtag = "</script>";
	+ getnext = x->getnext; /* for restore */
	+ x->getnext = getchar_ignore;
	+ return;
	+ } else if (!strcasecmp(t, "style")) {
	+ ignorestate = endtag = "</style>";
	+ getnext = x->getnext; /* for restore */
	+ x->getnext = getchar_ignore;
	return;
	}