GopherProxy

	simplify ignore tags parsing - tscrape - twitter scraper
	git clone git://git.codemadness.org/tscrape
	Log
	Files
	Refs
	README
	LICENSE
	---
	commit cb8ed18e7f5f31e68c9d5ab11a6daa8677af6636
	parent 2dc167003132b6d9db8e779f26681c560c07a119
	Author: Hiltjo Posthuma <[email protected]>
	Date: Sat, 26 Aug 2017 12:43:15 +0200

	simplify ignore tags parsing

	Diffstat:
	M tscrape.c | 79 ++++++++++++-----------------…

	1 file changed, 30 insertions(+), 49 deletions(-)
	---
	diff --git a/tscrape.c b/tscrape.c
	@@ -38,15 +38,9 @@ static char classname[256];
	static char datatime[16];
	static char itemid[64];
	static char retweetid[64];
	-static int isignore, state;
	+static int state;
	static XMLParser p;

	-/* ignored tag, all text between this is interpreted literally and ignored */
	-static char *ignoretags[] = {
	- "style",
	- "script",
	-};
	-
	static void
	printtweet(void)
	{
	@@ -100,9 +94,6 @@ html_entitytostr(const char *s, char *buf, size_t bufsiz)
	static void
	xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
	{
	- if (isignore)
	- return;
	-
	if (!strcmp(t, "p"))
	state &= ~Text;
	else if (!strcmp(t, "span"))
	@@ -112,51 +103,44 @@ xmltagend(XMLParser *x, const char *t, size_t tl, int iss…
	static void
	xmltagstart(XMLParser *x, const char *t, size_t tl)
	{
	- int i;
	+ char tmp[64];
	+ int c, i;

	classname[0] = '\0';

	- for (i = 0; i < sizeof(ignoretags) / sizeof(*ignoretags); i++) {
	- if (!strcasecmp(ignoretags[i], t)) {
	- isignore = 1;
	+ /* HACK: ignored tag is parsed, hook into reader and read raw data
	+ until literal end tag (without using the normal parser).
	+ process (buffered) as xml[c]data (no entity) */
	+ if (strcasecmp(t, "script") && strcasecmp(t, "style"))
	+ return;
	+
	+startignore:
	+ while ((c = x->getnext()) != EOF) {
	+ if (c == '<')
	break;
	- }
	}
	+ if (c == EOF)
	+ return;
	+ if ((c = x->getnext()) != '/')
	+ goto startignore;
	+ for (i = 0; (c = x->getnext()) != EOF; i++) {
	+ if (c == '>')
	+ break;
	+ if (i + 1 >= sizeof(tmp))
	+ goto startignore;
	+ tmp[i] = c;
	+ }
	+ tmp[i] = '\0';
	+
	+ /* compare against current ignored tag */
	+ if (strcasecmp(t, tmp))
	+ goto startignore;
	}

	static void
	xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
	{
	- char tmp[64];
	- int c, i;
	-
	- if (isignore) {
	- /* HACK: ignored tag is parsed, hook into reader and read raw …
	- until literal end tag (without using the normal parser).
	- process (buffered) as xml[c]data (no entity) */
	-startignore:
	- while ((c = x->getnext()) != EOF) {
	- if (c == '<')
	- break;
	- }
	- if (c == EOF)
	- return;
	- if ((c = x->getnext()) != '/')
	- goto startignore;
	- for (i = 0; (c = x->getnext()) != EOF; i++) {
	- if (c == '>')
	- break;
	- if (i + 1 >= sizeof(tmp))
	- goto startignore;
	- tmp[i] = c;
	- }
	- tmp[i] = '\0';
	-
	- /* compare against current ignored tag */
	- if (!strcasecmp(t, tmp))
	- isignore = 0;
	- return;
	- }
	+ int i;

	if (!strcmp(t, "p") && isclassmatch(classname, STRP("js-tweet-text")))…
	if (state & (Item | Stream | Header))
	@@ -197,9 +181,6 @@ static void
	xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
	const char *v, size_t vl)
	{
	- if (isignore)
	- return;
	-
	/* NOTE: assumes classname attribute is set before data-* in current t…
	if (!state && !strcmp(t, "div") && isclassmatch(classname, STRP("user-…
	if (!strcmp(a, "data-screen-name")) {
	@@ -255,7 +236,7 @@ xmlattrentity(XMLParser *x, const char *t, size_t tl, const…
	char buf[16];
	ssize_t len;

	- if (!state || isignore)
	+ if (!state)
	return;
	if ((len = html_entitytostr(v, buf, sizeof(buf))) > 0)
	xmlattr(x, t, tl, a, al, buf, (size_t)len);