XML tag parse improvements for PI and end tags - tscrape - twitter scraper | |
git clone git://git.codemadness.org/tscrape | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit 0fac9621c44b76c38d911438b1966d665e3b8134 | |
parent 24fad792de3bab17f1cf485450435761fb3b8657 | |
Author: Hiltjo Posthuma <[email protected]> | |
Date: Mon, 17 Dec 2018 18:25:08 +0100 | |
XML tag parse improvements for PI and end tags | |
- Stricter parsing of tags, no whitespace stripping after <. | |
- For end tags the "internal" context x->tag used to be "/sometag" (including | |
the leading slash). Make sure it now matches the tag parameter exactly. | |
- Reset the tag name (x->tag and x->taglen) after parsing an end tag. | |
- Make end tag handling more consistent. | |
- Remove temporary variable taglen. | |
Diffstat: | |
M xml.c | 52 +++++++++++++++++------------… | |
1 file changed, 29 insertions(+), 23 deletions(-) | |
--- | |
diff --git a/xml.c b/xml.c | |
@@ -334,8 +334,8 @@ xml_entitytostr(const char *e, char *buf, size_t bufsiz) | |
void | |
xml_parse(XMLParser *x) | |
{ | |
- int c, ispi; | |
- size_t datalen, tagdatalen, taglen; | |
+ size_t datalen, tagdatalen; | |
+ int c, isend; | |
if (!x->getnext) | |
return; | |
@@ -367,30 +367,32 @@ xml_parse(XMLParser *x) | |
} | |
} | |
} else { | |
- x->tag[0] = '\0'; | |
- x->taglen = 0; | |
- | |
/* normal tag (open, short open, close), proce… | |
- if (isspace(c)) | |
- while ((c = x->getnext()) != EOF && is… | |
- ; | |
- if (c == EOF) | |
- return; | |
x->tag[0] = c; | |
- ispi = (c == '?') ? 1 : 0; | |
- x->isshorttag = ispi; | |
- taglen = 1; | |
+ x->taglen = 1; | |
+ x->isshorttag = isend = 0; | |
+ | |
+ /* treat processing instruction as shorttag, d… | |
+ if (c == '?') { | |
+ x->isshorttag = 1; | |
+ } else if (c == '/') { | |
+ if ((c = x->getnext()) == EOF) | |
+ return; | |
+ x->tag[0] = c; | |
+ isend = 1; | |
+ } | |
+ | |
while ((c = x->getnext()) != EOF) { | |
if (c == '/') | |
x->isshorttag = 1; /* short ta… | |
else if (c == '>' || isspace(c)) { | |
- x->tag[taglen] = '\0'; | |
- if (x->tag[0] == '/') { /* end… | |
- x->taglen = --taglen; … | |
- if (taglen && x->xmlta… | |
- x->xmltagend(x… | |
+ x->tag[x->taglen] = '\0'; | |
+ if (isend) { /* end tag, start… | |
+ if (x->xmltagend) | |
+ x->xmltagend(x… | |
+ x->tag[0] = '\0'; | |
+ x->taglen = 0; | |
} else { | |
- x->taglen = taglen; | |
/* start tag */ | |
if (x->xmltagstart) | |
x->xmltagstart… | |
@@ -400,11 +402,15 @@ xml_parse(XMLParser *x) | |
x->xmltagstart… | |
} | |
/* call tagend for shortform o… | |
- if ((x->isshorttag || ispi) &&… | |
- x->xmltagend(x, x->tag… | |
+ if (x->isshorttag) { | |
+ if (x->xmltagend) | |
+ x->xmltagend(x… | |
+ x->tag[0] = '\0'; | |
+ x->taglen = 0; | |
+ } | |
break; | |
- } else if (taglen < sizeof(x->tag) - 1) | |
- x->tag[taglen++] = c; /* NOTE:… | |
+ } else if (x->taglen < sizeof(x->tag) … | |
+ x->tag[x->taglen++] = c; /* NO… | |
} | |
} | |
} else { |