Introduction
Introduction Statistics Contact Development Disclaimer Help
simplify ignore tags parsing - tscrape - twitter scraper
git clone git://git.codemadness.org/tscrape
Log
Files
Refs
README
LICENSE
---
commit cb8ed18e7f5f31e68c9d5ab11a6daa8677af6636
parent 2dc167003132b6d9db8e779f26681c560c07a119
Author: Hiltjo Posthuma <[email protected]>
Date: Sat, 26 Aug 2017 12:43:15 +0200
simplify ignore tags parsing
Diffstat:
M tscrape.c | 79 ++++++++++++-----------------…
1 file changed, 30 insertions(+), 49 deletions(-)
---
diff --git a/tscrape.c b/tscrape.c
@@ -38,15 +38,9 @@ static char classname[256];
static char datatime[16];
static char itemid[64];
static char retweetid[64];
-static int isignore, state;
+static int state;
static XMLParser p;
-/* ignored tag, all text between this is interpreted literally and ignored */
-static char *ignoretags[] = {
- "style",
- "script",
-};
-
static void
printtweet(void)
{
@@ -100,9 +94,6 @@ html_entitytostr(const char *s, char *buf, size_t bufsiz)
static void
xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
{
- if (isignore)
- return;
-
if (!strcmp(t, "p"))
state &= ~Text;
else if (!strcmp(t, "span"))
@@ -112,51 +103,44 @@ xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
static void
xmltagstart(XMLParser *x, const char *t, size_t tl)
{
- int i;
+ char tmp[64];
+ int c, i;
classname[0] = '\0';
- for (i = 0; i < sizeof(ignoretags) / sizeof(*ignoretags); i++) {
- if (!strcasecmp(ignoretags[i], t)) {
- isignore = 1;
+ /* HACK: ignored tag is parsed, hook into reader and read raw data
+ until literal end tag (without using the normal parser).
+ process (buffered) as xml[c]data (no entity) */
+ if (strcasecmp(t, "script") && strcasecmp(t, "style"))
+ return;
+
+startignore:
+ while ((c = x->getnext()) != EOF) {
+ if (c == '<')
break;
- }
}
+ if (c == EOF)
+ return;
+ if ((c = x->getnext()) != '/')
+ goto startignore;
+ for (i = 0; (c = x->getnext()) != EOF; i++) {
+ if (c == '>')
+ break;
+ if (i + 1 >= sizeof(tmp))
+ goto startignore;
+ tmp[i] = c;
+ }
+ tmp[i] = '\0';
+
+ /* compare against current ignored tag */
+ if (strcasecmp(t, tmp))
+ goto startignore;
}
static void
xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
{
- char tmp[64];
- int c, i;
-
- if (isignore) {
- /* HACK: ignored tag is parsed, hook into reader and read raw data
- until literal end tag (without using the normal parser).
- process (buffered) as xml[c]data (no entity) */
-startignore:
- while ((c = x->getnext()) != EOF) {
- if (c == '<')
- break;
- }
- if (c == EOF)
- return;
- if ((c = x->getnext()) != '/')
- goto startignore;
- for (i = 0; (c = x->getnext()) != EOF; i++) {
- if (c == '>')
- break;
- if (i + 1 >= sizeof(tmp))
- goto startignore;
- tmp[i] = c;
- }
- tmp[i] = '\0';
-
- /* compare against current ignored tag */
- if (!strcasecmp(t, tmp))
- isignore = 0;
- return;
- }
+ int i;
if (!strcmp(t, "p") && isclassmatch(classname, STRP("js-tweet-text")))…
if (state & (Item | Stream | Header))
@@ -197,9 +181,6 @@ static void
xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
const char *v, size_t vl)
{
- if (isignore)
- return;
-
/* NOTE: assumes classname attribute is set before data-* in current t…
if (!state && !strcmp(t, "div") && isclassmatch(classname, STRP("user-…
if (!strcmp(a, "data-screen-name")) {
@@ -255,7 +236,7 @@ xmlattrentity(XMLParser *x, const char *t, size_t tl, const…
char buf[16];
ssize_t len;
- if (!state || isignore)
+ if (!state)
return;
if ((len = html_entitytostr(v, buf, sizeof(buf))) > 0)
xmlattr(x, t, tl, a, al, buf, (size_t)len);
You are viewing proxied material from codemadness.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.