sync XML improvements (from sfeed) - xmlparser - XML parser | |
git clone git://git.codemadness.org/xmlparser | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit 908a3c3d0c612673b32c2714d9f46bc723c7a38b | |
parent b2078dbb866bea46507ebb9d3d4c12c93c4f39f8 | |
Author: Hiltjo Posthuma <[email protected]> | |
Date: Sun, 16 Jun 2019 22:19:31 +0200 | |
sync XML improvements (from sfeed) | |
Diffstat: | |
M README | 10 +++++++++- | |
M skeleton.c | 2 ++ | |
M xml.c | 125 ++++++++++++++---------------… | |
M xml.h | 7 +++++++ | |
4 files changed, 76 insertions(+), 68 deletions(-) | |
--- | |
diff --git a/README b/README | |
@@ -5,7 +5,7 @@ XML parser | |
Dependencies | |
------------ | |
-- C compiler (C99 expected). | |
+- C compiler (C99). | |
Features | |
@@ -36,6 +36,8 @@ Caveats | |
------- | |
- It is not a compliant XML parser. | |
+- Performance: data is buffered even if a handler is not set: to make parsing | |
+ faster change this code from xml.c. | |
- The XML is not checked for errors so it will continue parsing XML data, this | |
is by design. | |
- Internally fixed-size buffers are used, callbacks like XMLParser.xmldata are | |
@@ -59,6 +61,12 @@ Interface / API | |
Should be trivial, see xml.c and xml.h and the examples below. | |
+Examples | |
+-------- | |
+ | |
+See skeleton.c for a base program to start quickly. | |
+ | |
+ | |
License | |
------- | |
diff --git a/skeleton.c b/skeleton.c | |
@@ -114,7 +114,9 @@ main(void) | |
x.xmltagstart = xmltagstart; | |
x.xmltagstartparsed = xmltagstartparsed; | |
+#ifndef GETNEXT | |
x.getnext = getchar; | |
+#endif | |
xml_parse(&x); | |
diff --git a/xml.c b/xml.c | |
@@ -15,7 +15,7 @@ xml_parseattrs(XMLParser *x) | |
size_t namelen = 0, valuelen; | |
int c, endsep, endname = 0, valuestart = 0; | |
- while ((c = x->getnext()) != EOF) { | |
+ while ((c = GETNEXT()) != EOF) { | |
if (isspace(c)) { | |
if (namelen) | |
endname = 1; | |
@@ -51,7 +51,7 @@ xml_parseattrs(XMLParser *x) | |
goto startvalue; | |
} | |
- while ((c = x->getnext()) != EOF) { | |
+ while ((c = GETNEXT()) != EOF) { | |
startvalue: | |
if (c == '&') { /* entities */ | |
x->data[valuelen] = '\0'; | |
@@ -60,7 +60,7 @@ startvalue: | |
x->xmlattr(x, x->tag, x->tagle… | |
x->data[0] = c; | |
valuelen = 1; | |
- while ((c = x->getnext()) != EOF) { | |
+ while ((c = GETNEXT()) != EOF) { | |
if (c == endsep || (endsep == … | |
break; | |
if (valuelen < sizeof(x->data)… | |
@@ -124,9 +124,9 @@ xml_parsecomment(XMLParser *x) | |
if (x->xmlcommentstart) | |
x->xmlcommentstart(x); | |
- while ((c = x->getnext()) != EOF) { | |
+ while ((c = GETNEXT()) != EOF) { | |
if (c == '-' || c == '>') { | |
- if (x->xmlcomment) { | |
+ if (x->xmlcomment && datalen) { | |
x->data[datalen] = '\0'; | |
x->xmlcomment(x, x->data, datalen); | |
datalen = 0; | |
@@ -173,9 +173,9 @@ xml_parsecdata(XMLParser *x) | |
if (x->xmlcdatastart) | |
x->xmlcdatastart(x); | |
- while ((c = x->getnext()) != EOF) { | |
+ while ((c = GETNEXT()) != EOF) { | |
if (c == ']' || c == '>') { | |
- if (x->xmlcdata) { | |
+ if (x->xmlcdata && datalen) { | |
x->data[datalen] = '\0'; | |
x->xmlcdata(x, x->data, datalen); | |
datalen = 0; | |
@@ -247,19 +247,19 @@ static int | |
namedentitytostr(const char *e, char *buf, size_t bufsiz) | |
{ | |
static const struct { | |
- char *entity; | |
+ const char *entity; | |
int c; | |
} entities[] = { | |
- { "&amp;", '&' }, | |
- { "&lt;", '<' }, | |
- { "&gt;", '>' }, | |
- { "&apos;", '\'' }, | |
- { "&quot;", '"' }, | |
- { "&AMP;", '&' }, | |
- { "&LT;", '<' }, | |
- { "&GT;", '>' }, | |
- { "&APOS;", '\'' }, | |
- { "&QUOT;", '"' } | |
+ { "amp;", '&' }, | |
+ { "lt;", '<' }, | |
+ { "gt;", '>' }, | |
+ { "apos;", '\'' }, | |
+ { "quot;", '"' }, | |
+ { "AMP;", '&' }, | |
+ { "LT;", '<' }, | |
+ { "GT;", '>' }, | |
+ { "APOS;", '\'' }, | |
+ { "QUOT;", '"' } | |
}; | |
size_t i; | |
@@ -267,10 +267,6 @@ namedentitytostr(const char *e, char *buf, size_t bufsiz) | |
if (bufsiz < 2) | |
return -1; | |
- /* doesn't start with &: can't match */ | |
- if (*e != '&') | |
- return 0; | |
- | |
for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) { | |
if (!strcmp(e, entities[i].entity)) { | |
buf[0] = entities[i].c; | |
@@ -292,12 +288,6 @@ numericentitytostr(const char *e, char *buf, size_t bufsiz) | |
if (bufsiz < 5) | |
return -1; | |
- /* not a numeric entity */ | |
- if (e[0] != '&' || e[1] != '#') | |
- return 0; | |
- | |
- /* e[1] == '#', numeric / hexadecimal entity */ | |
- e += 2; /* skip "&#" */ | |
errno = 0; | |
/* hex (16) or decimal (10) */ | |
if (*e == 'x') | |
@@ -318,37 +308,32 @@ numericentitytostr(const char *e, char *buf, size_t bufsi… | |
int | |
xml_entitytostr(const char *e, char *buf, size_t bufsiz) | |
{ | |
- /* buffer is too small */ | |
- if (bufsiz < 5) | |
- return -1; | |
/* doesn't start with & */ | |
if (e[0] != '&') | |
return 0; | |
- /* named entity */ | |
- if (e[1] != '#') | |
- return namedentitytostr(e, buf, bufsiz); | |
- else /* numeric entity */ | |
- return numericentitytostr(e, buf, bufsiz); | |
+ /* numeric entity */ | |
+ if (e[1] == '#') | |
+ return numericentitytostr(e + 2, buf, bufsiz); | |
+ else /* named entity */ | |
+ return namedentitytostr(e + 1, buf, bufsiz); | |
} | |
void | |
xml_parse(XMLParser *x) | |
{ | |
- int c, ispi; | |
- size_t datalen, tagdatalen, taglen; | |
+ size_t datalen, tagdatalen; | |
+ int c, isend; | |
- if (!x->getnext) | |
- return; | |
- while ((c = x->getnext()) != EOF && c != '<') | |
+ while ((c = GETNEXT()) != EOF && c != '<') | |
; /* skip until < */ | |
while (c != EOF) { | |
if (c == '<') { /* parse tag */ | |
- if ((c = x->getnext()) == EOF) | |
+ if ((c = GETNEXT()) == EOF) | |
return; | |
if (c == '!') { /* cdata and comments */ | |
- for (tagdatalen = 0; (c = x->getnext()) != EOF… | |
+ for (tagdatalen = 0; (c = GETNEXT()) != EOF;) { | |
/* NOTE: sizeof(x->data) must be atlea… | |
if (tagdatalen <= sizeof("[CDATA[") - … | |
x->data[tagdatalen++] = c; | |
@@ -367,30 +352,32 @@ xml_parse(XMLParser *x) | |
} | |
} | |
} else { | |
- x->tag[0] = '\0'; | |
- x->taglen = 0; | |
- | |
/* normal tag (open, short open, close), proce… | |
- if (isspace(c)) | |
- while ((c = x->getnext()) != EOF && is… | |
- ; | |
- if (c == EOF) | |
- return; | |
x->tag[0] = c; | |
- ispi = (c == '?') ? 1 : 0; | |
- x->isshorttag = ispi; | |
- taglen = 1; | |
- while ((c = x->getnext()) != EOF) { | |
+ x->taglen = 1; | |
+ x->isshorttag = isend = 0; | |
+ | |
+ /* treat processing instruction as shorttag, d… | |
+ if (c == '?') { | |
+ x->isshorttag = 1; | |
+ } else if (c == '/') { | |
+ if ((c = GETNEXT()) == EOF) | |
+ return; | |
+ x->tag[0] = c; | |
+ isend = 1; | |
+ } | |
+ | |
+ while ((c = GETNEXT()) != EOF) { | |
if (c == '/') | |
x->isshorttag = 1; /* short ta… | |
else if (c == '>' || isspace(c)) { | |
- x->tag[taglen] = '\0'; | |
- if (x->tag[0] == '/') { /* end… | |
- x->taglen = --taglen; … | |
- if (taglen && x->xmlta… | |
- x->xmltagend(x… | |
+ x->tag[x->taglen] = '\0'; | |
+ if (isend) { /* end tag, start… | |
+ if (x->xmltagend) | |
+ x->xmltagend(x… | |
+ x->tag[0] = '\0'; | |
+ x->taglen = 0; | |
} else { | |
- x->taglen = taglen; | |
/* start tag */ | |
if (x->xmltagstart) | |
x->xmltagstart… | |
@@ -400,11 +387,15 @@ xml_parse(XMLParser *x) | |
x->xmltagstart… | |
} | |
/* call tagend for shortform o… | |
- if ((x->isshorttag || ispi) &&… | |
- x->xmltagend(x, x->tag… | |
+ if (x->isshorttag) { | |
+ if (x->xmltagend) | |
+ x->xmltagend(x… | |
+ x->tag[0] = '\0'; | |
+ x->taglen = 0; | |
+ } | |
break; | |
- } else if (taglen < sizeof(x->tag) - 1) | |
- x->tag[taglen++] = c; /* NOTE:… | |
+ } else if (x->taglen < sizeof(x->tag) … | |
+ x->tag[x->taglen++] = c; /* NO… | |
} | |
} | |
} else { | |
@@ -412,7 +403,7 @@ xml_parse(XMLParser *x) | |
datalen = 0; | |
if (x->xmldatastart) | |
x->xmldatastart(x); | |
- while ((c = x->getnext()) != EOF) { | |
+ while ((c = GETNEXT()) != EOF) { | |
if (c == '&') { | |
if (datalen) { | |
x->data[datalen] = '\0'; | |
@@ -421,7 +412,7 @@ xml_parse(XMLParser *x) | |
} | |
x->data[0] = c; | |
datalen = 1; | |
- while ((c = x->getnext()) != EOF) { | |
+ while ((c = GETNEXT()) != EOF) { | |
if (c == '<') | |
break; | |
if (datalen < sizeof(x->data) … | |
diff --git a/xml.h b/xml.h | |
@@ -1,3 +1,6 @@ | |
+#ifndef _XML_H | |
+#define _XML_H | |
+ | |
typedef struct xmlparser { | |
/* handlers */ | |
void (*xmlattr)(struct xmlparser *, const char *, size_t, | |
@@ -23,7 +26,10 @@ typedef struct xmlparser { | |
void (*xmltagstartparsed)(struct xmlparser *, const char *, | |
size_t, int); | |
+#ifndef GETNEXT | |
+ #define GETNEXT (x)->getnext | |
int (*getnext)(void); | |
+#endif | |
/* current tag */ | |
char tag[1024]; | |
@@ -38,3 +44,4 @@ typedef struct xmlparser { | |
int xml_entitytostr(const char *, char *, size_t); | |
void xml_parse(XMLParser *); | |
+#endif |