sync xml improvements - tscrape - twitter scraper | |
git clone git://git.codemadness.org/tscrape | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit 227743f84d79e15f67b761e2d92e20dbc7083d81 | |
parent 7789dc04f4937dd68677a953320537b3da519f3b | |
Author: Hiltjo Posthuma <[email protected]> | |
Date: Sun, 11 Mar 2018 18:45:33 +0100 | |
sync xml improvements | |
... better CDATA and comment parsing, etc | |
Diffstat: | |
M xml.c | 90 +++++++++++++++++------------… | |
1 file changed, 50 insertions(+), 40 deletions(-) | |
--- | |
diff --git a/xml.c b/xml.c | |
@@ -1,3 +1,5 @@ | |
+#include <sys/types.h> | |
+ | |
#include <ctype.h> | |
#include <errno.h> | |
#include <limits.h> | |
@@ -90,8 +92,7 @@ xml_parseattrs(XMLParser *x) | |
break; | |
} | |
} | |
- namelen = 0; | |
- endname = 0; | |
+ namelen = endname = 0; | |
} else if (namelen < sizeof(x->name) - 1) { | |
x->name[namelen++] = c; | |
} | |
@@ -108,36 +109,41 @@ xml_parseattrs(XMLParser *x) | |
static void | |
xml_parsecomment(XMLParser *x) | |
{ | |
- static const char *end = "-->"; | |
size_t datalen = 0, i = 0; | |
- char tmp[4]; | |
int c; | |
if (x->xmlcommentstart) | |
x->xmlcommentstart(x); | |
while ((c = x->getnext()) != EOF) { | |
- if (c == end[i]) { | |
- if (end[++i] == '\0') { /* end */ | |
+ if (c == '-' || c == '>') { | |
+ if (x->xmlcomment) { | |
x->data[datalen] = '\0'; | |
+ x->xmlcomment(x, x->data, datalen); | |
+ datalen = 0; | |
+ } | |
+ } | |
+ | |
+ if (c == '-') { | |
+ if (++i > 2) { | |
if (x->xmlcomment) | |
- x->xmlcomment(x, x->data, datalen); | |
- if (x->xmlcommentend) | |
- x->xmlcommentend(x); | |
- return; | |
+ for (; i > 2; i--) | |
+ x->xmlcomment(x, "-", 1); | |
+ i = 2; | |
} | |
+ continue; | |
+ } else if (c == '>' && i == 2) { | |
+ if (x->xmlcommentend) | |
+ x->xmlcommentend(x); | |
+ return; | |
} else if (i) { | |
if (x->xmlcomment) { | |
- x->data[datalen] = '\0'; | |
- if (datalen) | |
- x->xmlcomment(x, x->data, datalen); | |
- memcpy(tmp, end, i); | |
- tmp[i] = '\0'; | |
- x->xmlcomment(x, tmp, i); | |
+ for (; i > 0; i--) | |
+ x->xmlcomment(x, "-", 1); | |
} | |
i = 0; | |
- x->data[0] = c; | |
- datalen = 1; | |
- } else if (datalen < sizeof(x->data) - 1) { | |
+ } | |
+ | |
+ if (datalen < sizeof(x->data) - 1) { | |
x->data[datalen++] = c; | |
} else { | |
x->data[datalen] = '\0'; | |
@@ -152,36 +158,40 @@ xml_parsecomment(XMLParser *x) | |
static void | |
xml_parsecdata(XMLParser *x) | |
{ | |
- static const char *end = "]]>"; | |
size_t datalen = 0, i = 0; | |
- char tmp[4]; | |
int c; | |
if (x->xmlcdatastart) | |
x->xmlcdatastart(x); | |
while ((c = x->getnext()) != EOF) { | |
- if (c == end[i]) { | |
- if (end[++i] == '\0') { /* end */ | |
+ if (c == ']' || c == '>') { | |
+ if (x->xmlcdata) { | |
x->data[datalen] = '\0'; | |
+ x->xmlcdata(x, x->data, datalen); | |
+ datalen = 0; | |
+ } | |
+ } | |
+ | |
+ if (c == ']') { | |
+ if (++i > 2) { | |
if (x->xmlcdata) | |
- x->xmlcdata(x, x->data, datalen); | |
- if (x->xmlcdataend) | |
- x->xmlcdataend(x); | |
- return; | |
+ for (; i > 2; i--) | |
+ x->xmlcdata(x, "]", 1); | |
+ i = 2; | |
} | |
+ continue; | |
+ } else if (c == '>' && i == 2) { | |
+ if (x->xmlcdataend) | |
+ x->xmlcdataend(x); | |
+ return; | |
} else if (i) { | |
- x->data[datalen] = '\0'; | |
- if (x->xmlcdata) { | |
- if (datalen) | |
- x->xmlcdata(x, x->data, datalen); | |
- memcpy(tmp, end, i); | |
- tmp[i] = '\0'; | |
- x->xmlcdata(x, tmp, i); | |
- } | |
+ if (x->xmlcdata) | |
+ for (; i > 0; i--) | |
+ x->xmlcdata(x, "]", 1); | |
i = 0; | |
- x->data[0] = c; | |
- datalen = 1; | |
- } else if (datalen < sizeof(x->data) - 1) { | |
+ } | |
+ | |
+ if (datalen < sizeof(x->data) - 1) { | |
x->data[datalen++] = c; | |
} else { | |
x->data[datalen] = '\0'; | |
@@ -221,7 +231,7 @@ xml_codepointtoutf8(uint32_t cp, uint32_t *utf) | |
ssize_t | |
xml_namedentitytostr(const char *e, char *buf, size_t bufsiz) | |
{ | |
- const struct { | |
+ static const struct { | |
char *entity; | |
int c; | |
} entities[] = { | |
@@ -268,7 +278,7 @@ xml_numericentitytostr(const char *e, char *buf, size_t buf… | |
return -1; | |
/* not a numeric entity */ | |
- if (!(e[0] == '&' && e[1] == '#')) | |
+ if (e[0] != '&' || e[1] != '#') | |
return 0; | |
/* e[1] == '#', numeric / hexadecimal entity */ |