Introduction
Introduction Statistics Contact Development Disclaimer Help
improve and simplify ignore tag handling - tscrape - twitter scraper
git clone git://git.codemadness.org/tscrape
Log
Files
Refs
README
LICENSE
---
commit 7789dc04f4937dd68677a953320537b3da519f3b
parent e3bd0af8ac5af175c7dee7c24eadf238f5f4334f
Author: Hiltjo Posthuma <[email protected]>
Date: Sat, 26 Aug 2017 15:36:10 +0200
improve and simplify ignore tag handling
Diffstat:
M tscrape.c | 53 +++++++++++++----------------…
1 file changed, 22 insertions(+), 31 deletions(-)
---
diff --git a/tscrape.c b/tscrape.c
@@ -100,41 +100,20 @@ xmltagend(XMLParser *x, const char *t, size_t tl, int iss…
state &= ~(Timestamp);
}
+static char ignoretag[8];
+static XMLParser xo; /* old context */
+
static void
-xmltagstart(XMLParser *x, const char *t, size_t tl)
+xmlignoretagend(XMLParser *p, const char *t, size_t tl, int isshort)
{
- char tmp[64];
- int c, i;
+ if (!strcasecmp(t, ignoretag))
+ memcpy(p, &xo, sizeof(*p)); /* restore context */
+}
+static void
+xmltagstart(XMLParser *x, const char *t, size_t tl)
+{
classname[0] = '\0';
-
- /* HACK: ignored tag is parsed, hook into reader and read raw data
- until literal end tag (without using the normal parser).
- process (buffered) as xml[c]data (no entity) */
- if (strcasecmp(t, "script") && strcasecmp(t, "style"))
- return;
-
-startignore:
- while ((c = x->getnext()) != EOF) {
- if (c == '<')
- break;
- }
- if (c == EOF)
- return;
- if ((c = x->getnext()) != '/')
- goto startignore;
- for (i = 0; (c = x->getnext()) != EOF; i++) {
- if (c == '>')
- break;
- if (i + 1 >= sizeof(tmp))
- goto startignore;
- tmp[i] = c;
- }
- tmp[i] = '\0';
-
- /* compare against current ignored tag */
- if (strcasecmp(t, tmp))
- goto startignore;
}
static void
@@ -142,6 +121,18 @@ xmltagstartparsed(XMLParser *x, const char *t, size_t tl, …
{
int i;
+ /* temporarily replace the callbacks, except the reader and end of tag;
+ restore the context once we receive the same ignored tag in the
+ end tag handler */
+ if (!strcasecmp(t, "script") || !strcasecmp(t, "style")) {
+ strlcpy(ignoretag, t, sizeof(ignoretag));
+ memcpy(&xo, x, sizeof(xo)); /* store old context */
+ memset(x, 0, sizeof(*x));
+ x->xmltagend = xmlignoretagend;
+ x->getnext = xo.getnext;
+ return;
+ }
+
if (!strcmp(t, "p") && isclassmatch(classname, STRP("js-tweet-text")))…
if (state & (Item | Stream | Header))
state |= Text;
You are viewing proxied material from codemadness.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.