initial support to ignore literals in <script> and <style> - tscrape - twitter … | |
git clone git://git.codemadness.org/tscrape | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit 1ff56f1ce94cd62b0c16ee343917435c9048b8b8 | |
parent 006a11c3aced38fa2cc3915793c1b9e886d0ad41 | |
Author: Hiltjo Posthuma <[email protected]> | |
Date: Fri, 25 Aug 2017 17:44:37 +0200 | |
initial support to ignore literals in <script> and <style> | |
Diffstat: | |
M tscrape.c | 70 +++++++++++++++++++++++++++--… | |
1 file changed, 61 insertions(+), 9 deletions(-) | |
--- | |
diff --git a/tscrape.c b/tscrape.c | |
@@ -38,9 +38,15 @@ static char classname[256]; | |
static char datatime[16]; | |
static char itemid[64]; | |
static char retweetid[64]; | |
-static int state; | |
+static int isignore, state; | |
static XMLParser p; | |
+/* ignored tag, all text between this is interpreted literally and ignored */ | |
+static char *ignoretags[] = { | |
+ "style", | |
+ "script", | |
+}; | |
+ | |
static void | |
printtweet(void) | |
{ | |
@@ -94,6 +100,9 @@ html_entitytostr(const char *s, char *buf, size_t bufsiz) | |
static void | |
xmltagend(XMLParser *x, const char *t, size_t tl, int isshort) | |
{ | |
+ if (isignore) | |
+ return; | |
+ | |
if (!strcmp(t, "p")) | |
state &= ~Text; | |
else if (!strcmp(t, "span")) | |
@@ -103,35 +112,78 @@ xmltagend(XMLParser *x, const char *t, size_t tl, int iss… | |
static void | |
xmltagstart(XMLParser *x, const char *t, size_t tl) | |
{ | |
+ int i; | |
+ | |
classname[0] = '\0'; | |
+ | |
+ for (i = 0; i < sizeof(ignoretags) / sizeof(*ignoretags); i++) { | |
+ if (!strcasecmp(ignoretags[i], t)) { | |
+ isignore = 1; | |
+ break; | |
+ } | |
+ } | |
} | |
static void | |
xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort) | |
{ | |
- const char *c = classname; | |
+ char tmp[64]; | |
+ int c, i; | |
+ | |
+ if (isignore) { | |
+ /* HACK: ignored tag is parsed, hook into reader and read raw … | |
+ until literal end tag (without using the normal parser). | |
+ process (buffered) as xml[c]data (no entity) */ | |
+startignore: | |
+ while ((c = x->getnext()) != EOF) { | |
+ if (c == '<') | |
+ break; | |
+ } | |
+ if (c == EOF) | |
+ return; | |
+ if ((c = x->getnext()) != '/') | |
+ goto startignore; | |
+ for (i = 0; (c = x->getnext()) != EOF; i++) { | |
+ if (c == '>') | |
+ break; | |
+ if (i + 1 >= sizeof(tmp)) | |
+ goto startignore; | |
+ tmp[i] = c; | |
+ } | |
+ tmp[i] = '\0'; | |
+ | |
+ /* compare against current ignored tag */ | |
+ if (!strcasecmp(t, tmp)) | |
+ isignore = 0; | |
+ return; | |
+ } | |
- if (!strcmp(t, "p") && isclassmatch(c, STRP("js-tweet-text"))) { | |
+ if (!strcmp(t, "p") && isclassmatch(classname, STRP("js-tweet-text")))… | |
if (state & (Item | Stream | Header)) | |
state |= Text; | |
- } else if (!strcmp(t, "div") && isclassmatch(c, STRP("stream-item-foot… | |
+ } else if (!strcmp(t, "div") && | |
+ isclassmatch(classname, STRP("stream-item-footer"))) { | |
if (text[0] && username[0]) | |
printtweet(); | |
state = 0; | |
- } else if (!strcmp(t, "li") && isclassmatch(c, STRP("js-stream-item"))… | |
+ } else if (!strcmp(t, "li") && | |
+ isclassmatch(classname, STRP("js-stream-item"))) { | |
state |= Item; | |
datatime[0] = text[0] = timestamp[0] = itemfullname[0] = '\0'; | |
itemid[0] = itemusername[0] = retweetid[0] = '\0'; | |
ispinned = 0; | |
- if (isclassmatch(c, STRP("js-pinned"))) | |
+ if (isclassmatch(classname, STRP("js-pinned"))) | |
ispinned = 1; | |
} else if (state & Item) { | |
- if (!strcmp(t, "div") && isclassmatch(c, STRP("js-stream-tweet… | |
+ if (!strcmp(t, "div") && | |
+ isclassmatch(classname, STRP("js-stream-tweet"))) { | |
state &= ~(Text|Header); | |
state |= Stream; | |
- } else if (!strcmp(t, "a") && isclassmatch(c, STRP("js-action-… | |
+ } else if (!strcmp(t, "a") && | |
+ isclassmatch(classname, STRP("js-action-profile")))… | |
state |= Header; | |
- } else if (!strcmp(t, "span") && isclassmatch(c, STRP("js-shor… | |
+ } else if (!strcmp(t, "span") && | |
+ isclassmatch(classname, STRP("js-short-timestamp")))… | |
state |= Timestamp; | |
strlcpy(timestamp, datatime, sizeof(timestamp)); | |
datatime[0] = '\0'; |