GopherProxy

	ignore incorrect unescaped HTML in <style> or <script> in a better way - grabti…
	git clone git://git.codemadness.org/grabtitle
	Log
	Files
	Refs
	README
	LICENSE
	---
	commit d908478d0f84bc275428fd71e934c993bb29211c
	parent 0cca681092b680c5b80da62771d47fa383be6cd1
	Author: Hiltjo Posthuma <[email protected]>
	Date: Mon, 10 Dec 2018 19:01:58 +0100

	ignore incorrect unescaped HTML in <style> or <script> in a better way

	this way we can still use a (mostly) XML parser for HTML data.

	Diffstat:
	M grabtitle.c | 71 +++++++++++++++++++----------…

	1 file changed, 44 insertions(+), 27 deletions(-)
	---
	diff --git a/grabtitle.c b/grabtitle.c
	@@ -16,28 +16,38 @@
	#endif

	static XMLParser parser;
	-static int istitle, ignore;
	-
	-static void
	-xmltagstart(XMLParser *p, const char *t, size_t tl)
	+static const char *state, *endtag;
	+static int (*getnext)(void);
	+
	+/* return a space for all data until some case-insensitive string occurs. This
	+ is used to parse incorrect HTML/XML that contains unescaped HTML in script
	+ or style tags. */
	+static inline int
	+getchar_ignore(void)
	{
	- if ((tl == 6 && !strcasecmp(t, "script")) ||
	- (tl == 5 && !strcasecmp(t, "style")))
	- ignore = 1;
	- if (!ignore && tl == 5 && !strcasecmp(t, "title"))
	- istitle = 1;
	+ int c;
	+
	+ if ((c = getnext()) == EOF)
	+ return EOF;
	+
	+ if (tolower(c) == tolower((unsigned char)*state)) {
	+ state++;
	+ if (*state == '\0') {
	+ parser.getnext = getnext; /* restore */
	+ return c;
	+ }
	+ } else {
	+ state = endtag;
	+ }
	+
	+ return ' ';
	}

	static void
	xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
	{
	- if (ignore && ((tl == 6 && !strcasecmp(t, "script")) ||
	- (tl == 5 && !strcasecmp(t, "style"))))
	- ignore = 0;
	- if (istitle && tl == 5 && !strcasecmp(t, "title")) {
	- putchar('\n');
	- exit(0);
	- }
	+ putchar('\n');
	+ exit(0);
	}

	/* data and CDATA */
	@@ -46,9 +56,6 @@ xmldata(XMLParser *p, const char *d, size_t dl)
	{
	size_t i;

	- if (!istitle)
	- return;
	-
	for (i = 0; *d && i < dl; i++, d++) {
	if (iscntrl((unsigned char)*d))
	putchar(' ');
	@@ -63,15 +70,30 @@ xmldataentity(XMLParser *p, const char *d, size_t dl)
	char buf[16];
	ssize_t len;

	- if (!istitle)
	- return;
	-
	if ((len = xml_entitytostr(d, buf, sizeof(buf))))
	xmldata(p, buf, (size_t)len);
	else
	xmldata(p, d, dl);
	}

	+static void
	+xmltagstart(XMLParser *p, const char *t, size_t tl)
	+{
	+ if (tl == 6 && !strcasecmp(t, "script")) {
	+ state = endtag = "</script>";
	+ getnext = p->getnext; /* for restore */
	+ p->getnext = getchar_ignore;
	+ } else if (tl == 5 && !strcasecmp(t, "style")) {
	+ state = endtag = "</style>";
	+ getnext = p->getnext; /* for restore */
	+ p->getnext = getchar_ignore;
	+ } else if (tl == 5 && !strcasecmp(t, "title")) {
	+ p->xmltagend = xmltagend;
	+ p->xmlcdata = p->xmldata = xmldata;
	+ p->xmldataentity = xmldataentity;
	+ }
	+}
	+
	int
	main(int argc, char *argv[])
	{
	@@ -81,11 +103,6 @@ main(int argc, char *argv[])
	}

	parser.xmltagstart = xmltagstart;
	- parser.xmltagend = xmltagend;
	- parser.xmldata = xmldata;
	- parser.xmlcdata = xmldata;
	- parser.xmldataentity = xmldataentity;
	-
	parser.getnext = getchar;
	xml_parse(&parser);