improve and simplify ignore tag handling - tscrape - twitter scraper | |
git clone git://git.codemadness.org/tscrape | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit 7789dc04f4937dd68677a953320537b3da519f3b | |
parent e3bd0af8ac5af175c7dee7c24eadf238f5f4334f | |
Author: Hiltjo Posthuma <[email protected]> | |
Date: Sat, 26 Aug 2017 15:36:10 +0200 | |
improve and simplify ignore tag handling | |
Diffstat: | |
M tscrape.c | 53 +++++++++++++----------------… | |
1 file changed, 22 insertions(+), 31 deletions(-) | |
--- | |
diff --git a/tscrape.c b/tscrape.c | |
@@ -100,41 +100,20 @@ xmltagend(XMLParser *x, const char *t, size_t tl, int iss… | |
state &= ~(Timestamp); | |
} | |
+static char ignoretag[8]; | |
+static XMLParser xo; /* old context */ | |
+ | |
static void | |
-xmltagstart(XMLParser *x, const char *t, size_t tl) | |
+xmlignoretagend(XMLParser *p, const char *t, size_t tl, int isshort) | |
{ | |
- char tmp[64]; | |
- int c, i; | |
+ if (!strcasecmp(t, ignoretag)) | |
+ memcpy(p, &xo, sizeof(*p)); /* restore context */ | |
+} | |
+static void | |
+xmltagstart(XMLParser *x, const char *t, size_t tl) | |
+{ | |
classname[0] = '\0'; | |
- | |
- /* HACK: ignored tag is parsed, hook into reader and read raw data | |
- until literal end tag (without using the normal parser). | |
- process (buffered) as xml[c]data (no entity) */ | |
- if (strcasecmp(t, "script") && strcasecmp(t, "style")) | |
- return; | |
- | |
-startignore: | |
- while ((c = x->getnext()) != EOF) { | |
- if (c == '<') | |
- break; | |
- } | |
- if (c == EOF) | |
- return; | |
- if ((c = x->getnext()) != '/') | |
- goto startignore; | |
- for (i = 0; (c = x->getnext()) != EOF; i++) { | |
- if (c == '>') | |
- break; | |
- if (i + 1 >= sizeof(tmp)) | |
- goto startignore; | |
- tmp[i] = c; | |
- } | |
- tmp[i] = '\0'; | |
- | |
- /* compare against current ignored tag */ | |
- if (strcasecmp(t, tmp)) | |
- goto startignore; | |
} | |
static void | |
@@ -142,6 +121,18 @@ xmltagstartparsed(XMLParser *x, const char *t, size_t tl, … | |
{ | |
int i; | |
+ /* temporarily replace the callbacks, keeping only the reader and an | |
+ end tag handler; restore the context once we receive the same | |
+ ignored tag in the end tag handler */ | |
+ if (!strcasecmp(t, "script") || !strcasecmp(t, "style")) { | |
+ strlcpy(ignoretag, t, sizeof(ignoretag)); | |
+ memcpy(&xo, x, sizeof(xo)); /* store old context */ | |
+ memset(x, 0, sizeof(*x)); | |
+ x->xmltagend = xmlignoretagend; | |
+ x->getnext = xo.getnext; | |
+ return; | |
+ } | |
+ | |
if (!strcmp(t, "p") && isclassmatch(classname, STRP("js-tweet-text")))… | |
if (state & (Item | Stream | Header)) | |
state |= Text; |