Introduction
Introduction Statistics Contact Development Disclaimer Help
initial support to ignore literals in <script> and <style> - tscrape - twitter …
git clone git://git.codemadness.org/tscrape
Log
Files
Refs
README
LICENSE
---
commit 1ff56f1ce94cd62b0c16ee343917435c9048b8b8
parent 006a11c3aced38fa2cc3915793c1b9e886d0ad41
Author: Hiltjo Posthuma <[email protected]>
Date: Fri, 25 Aug 2017 17:44:37 +0200
initial support to ignore literals in <script> and <style>
Diffstat:
M tscrape.c | 70 +++++++++++++++++++++++++++--…
1 file changed, 61 insertions(+), 9 deletions(-)
---
diff --git a/tscrape.c b/tscrape.c
@@ -38,9 +38,15 @@ static char classname[256];
static char datatime[16];
static char itemid[64];
static char retweetid[64];
-static int state;
+static int isignore, state;
static XMLParser p;
+/* ignored tags: all text between the start and end tag is interpreted literally and ignored */
+static char *ignoretags[] = {
+ "style",
+ "script",
+};
+
static void
printtweet(void)
{
@@ -94,6 +100,9 @@ html_entitytostr(const char *s, char *buf, size_t bufsiz)
static void
xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
{
+ if (isignore)
+ return;
+
if (!strcmp(t, "p"))
state &= ~Text;
else if (!strcmp(t, "span"))
@@ -103,35 +112,78 @@ xmltagend(XMLParser *x, const char *t, size_t tl, int iss…
static void
xmltagstart(XMLParser *x, const char *t, size_t tl)
{
+ int i;
+
classname[0] = '\0';
+
+ for (i = 0; i < sizeof(ignoretags) / sizeof(*ignoretags); i++) {
+ if (!strcasecmp(ignoretags[i], t)) {
+ isignore = 1;
+ break;
+ }
+ }
}
static void
xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
{
- const char *c = classname;
+ char tmp[64];
+ int c, i;
+
+ if (isignore) {
+ /* HACK: ignored tag is parsed, hook into reader and read raw …
+ until literal end tag (without using the normal parser).
+ process (buffered) as xml[c]data (no entity) */
+startignore:
+ while ((c = x->getnext()) != EOF) {
+ if (c == '<')
+ break;
+ }
+ if (c == EOF)
+ return;
+ if ((c = x->getnext()) != '/')
+ goto startignore;
+ for (i = 0; (c = x->getnext()) != EOF; i++) {
+ if (c == '>')
+ break;
+ if (i + 1 >= sizeof(tmp))
+ goto startignore;
+ tmp[i] = c;
+ }
+ tmp[i] = '\0';
+
+ /* compare against current ignored tag */
+ if (!strcasecmp(t, tmp))
+ isignore = 0;
+ return;
+ }
- if (!strcmp(t, "p") && isclassmatch(c, STRP("js-tweet-text"))) {
+ if (!strcmp(t, "p") && isclassmatch(classname, STRP("js-tweet-text")))…
if (state & (Item | Stream | Header))
state |= Text;
- } else if (!strcmp(t, "div") && isclassmatch(c, STRP("stream-item-foot…
+ } else if (!strcmp(t, "div") &&
+ isclassmatch(classname, STRP("stream-item-footer"))) {
if (text[0] && username[0])
printtweet();
state = 0;
- } else if (!strcmp(t, "li") && isclassmatch(c, STRP("js-stream-item"))…
+ } else if (!strcmp(t, "li") &&
+ isclassmatch(classname, STRP("js-stream-item"))) {
state |= Item;
datatime[0] = text[0] = timestamp[0] = itemfullname[0] = '\0';
itemid[0] = itemusername[0] = retweetid[0] = '\0';
ispinned = 0;
- if (isclassmatch(c, STRP("js-pinned")))
+ if (isclassmatch(classname, STRP("js-pinned")))
ispinned = 1;
} else if (state & Item) {
- if (!strcmp(t, "div") && isclassmatch(c, STRP("js-stream-tweet…
+ if (!strcmp(t, "div") &&
+ isclassmatch(classname, STRP("js-stream-tweet"))) {
state &= ~(Text|Header);
state |= Stream;
- } else if (!strcmp(t, "a") && isclassmatch(c, STRP("js-action-…
+ } else if (!strcmp(t, "a") &&
+ isclassmatch(classname, STRP("js-action-profile")))…
state |= Header;
- } else if (!strcmp(t, "span") && isclassmatch(c, STRP("js-shor…
+ } else if (!strcmp(t, "span") &&
+ isclassmatch(classname, STRP("js-short-timestamp")))…
state |= Timestamp;
strlcpy(timestamp, datatime, sizeof(timestamp));
datatime[0] = '\0';
You are viewing proxied material from codemadness.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.