several improvements and more efficient xml parser - xml2tsv - a simple xml-to-… | |
Log | |
Files | |
Refs | |
Tags | |
README | |
LICENSE | |
--- | |
commit 60c249ec24ab865c4a55759c7ffde2da99530b1d | |
parent b416c171bb34297d7f8bc4c027de7136a113d144 | |
Author: Hiltjo Posthuma <[email protected]> | |
Date: Wed, 30 Sep 2020 11:42:07 +0100 | |
several improvements and more efficient xml parser | |
Diffstat: | |
M xml.c | 36 +++--------------------------… | |
M xml.h | 12 +++++------- | |
M xml2tsv.c | 93 ++++++++++-------------------… | |
3 files changed, 38 insertions(+), 103 deletions(-) | |
--- | |
diff --git a/xml.c b/xml.c | |
@@ -116,49 +116,19 @@ startvalue: | |
static void | |
xml_parsecomment(XMLParser *x) | |
{ | |
- size_t datalen = 0, i = 0; | |
+ size_t i = 0; | |
int c; | |
- if (x->xmlcommentstart) | |
- x->xmlcommentstart(x); | |
while ((c = GETNEXT()) != EOF) { | |
- if (c == '-' || c == '>') { | |
- if (x->xmlcomment && datalen) { | |
- x->data[datalen] = '\0'; | |
- x->xmlcomment(x, x->data, datalen); | |
- datalen = 0; | |
- } | |
- } | |
- | |
if (c == '-') { | |
- if (++i > 2) { | |
- if (x->xmlcomment) | |
- for (; i > 2; i--) | |
- x->xmlcomment(x, "-", 1); | |
+ if (++i > 2) | |
i = 2; | |
- } | |
continue; | |
} else if (c == '>' && i == 2) { | |
- if (x->xmlcommentend) | |
- x->xmlcommentend(x); | |
return; | |
} else if (i) { | |
- if (x->xmlcomment) { | |
- for (; i > 0; i--) | |
- x->xmlcomment(x, "-", 1); | |
- } | |
i = 0; | |
} | |
- | |
- if (datalen < sizeof(x->data) - 1) { | |
- x->data[datalen++] = c; | |
- } else { | |
- x->data[datalen] = '\0'; | |
- if (x->xmlcomment) | |
- x->xmlcomment(x, x->data, datalen); | |
- x->data[0] = c; | |
- datalen = 1; | |
- } | |
} | |
} | |
@@ -286,7 +256,7 @@ numericentitytostr(const char *e, char *buf, size_t bufsiz) | |
l = strtol(++e, &end, 16); | |
else | |
l = strtol(e, &end, 10); | |
- /* invalid value or not a well-formed entity or invalid codepoint */ | |
+ /* invalid value or not a well-formed entity or invalid code point */ | |
if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff) | |
return -1; | |
len = codepointtoutf8(l, buf); | |
diff --git a/xml.h b/xml.h | |
@@ -1,5 +1,5 @@ | |
-#ifndef _XML_H | |
-#define _XML_H | |
+#ifndef _XML_H_ | |
+#define _XML_H_ | |
#include <stdio.h> | |
@@ -16,9 +16,6 @@ typedef struct xmlparser { | |
void (*xmlcdatastart)(struct xmlparser *); | |
void (*xmlcdata)(struct xmlparser *, const char *, size_t); | |
void (*xmlcdataend)(struct xmlparser *); | |
- void (*xmlcommentstart)(struct xmlparser *); | |
- void (*xmlcomment)(struct xmlparser *, const char *, size_t); | |
- void (*xmlcommentend)(struct xmlparser *); | |
void (*xmldata)(struct xmlparser *, const char *, size_t); | |
void (*xmldataend)(struct xmlparser *); | |
void (*xmldataentity)(struct xmlparser *, const char *, size_t); | |
@@ -29,8 +26,9 @@ typedef struct xmlparser { | |
size_t, int); | |
#ifndef GETNEXT | |
- #define GETNEXT (x)->getnext | |
- int (*getnext)(void); | |
+ /* GETNEXT overridden to reduce function call overhead and | |
+ further context optimizations. */ | |
+ #define GETNEXT getchar | |
#endif | |
/* current tag */ | |
diff --git a/xml2tsv.c b/xml2tsv.c | |
@@ -64,7 +64,7 @@ void stack_init(tstack_t *t){ | |
/* utility functions */ | |
/* quote_print: quote \\, \n, \t, and strip other ctrl chars */ | |
-void quote_print(FILE *f, const char *s){ | |
+void quote_print(const char *s){ | |
const char *tmp = s; | |
size_t len; | |
int i; | |
@@ -72,36 +72,45 @@ void quote_print(FILE *f, const char *s){ | |
len = strcspn(tmp, "\\\n\t"); | |
for(i=0; i<len; i++, tmp++){ | |
if (!iscntrl((unsigned char)*tmp)){ | |
- fwrite(tmp, 1, 1, f); | |
+ putchar(*tmp); | |
} | |
} | |
switch (*tmp){ | |
case '\n': | |
if (len > 0){ | |
- fprintf(f, "\\n"); | |
+ fputs("\\n", stdout); | |
} | |
tmp ++; | |
break; | |
case '\t': | |
- fprintf(f, "\\t"); | |
+ fputs("\\t", stdout); | |
tmp ++; | |
break; | |
case '\r': | |
- fprintf(f, "\\r"); | |
+ fputs("\\r", stdout); | |
tmp ++; | |
break; | |
case '\\': | |
- fprintf(f, "\\\\"); | |
+ fputs("\\\\", stdout); | |
tmp ++; | |
break; | |
} | |
} | |
} | |
-void print_cur_str(FILE *f, tstack_t *t){ | |
+void print_cur_str(tstack_t *t){ | |
int i; | |
for (i=0; i<=t->top; i++){ | |
- fprintf(f, "/%s", t->st[i]); | |
+ putchar('/'); | |
+ fputs(t->st[i], stdout); | |
+ } | |
+} | |
+ | |
+void print_cur_str_fp(FILE *f, tstack_t *t){ | |
+ int i; | |
+ for (i=0; i<=t->top; i++){ | |
+ fputc('/', f); | |
+ fputs(t->st[i], f); | |
} | |
} | |
@@ -110,13 +119,13 @@ void print_cur_str(FILE *f, tstack_t *t){ | |
tstack_t st; | |
char emitsep; | |
-/* xml callbacks */ | |
+/* XML callbacks */ | |
void | |
xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al, | |
const char *v, size_t vl) | |
{ | |
- printf("%s", v); | |
+ fputs(v, stdout); | |
} | |
void | |
@@ -133,56 +142,33 @@ xmlattrentity(XMLParser *x, const char *t, size_t tl, con… | |
} | |
void | |
-xmlattrend(XMLParser *x, const char *t, size_t tl, const char *a, size_t al) | |
-{ | |
-} | |
- | |
-void | |
xmlattrstart(XMLParser *x, const char *t, size_t tl, const char *a, size_t al) | |
{ | |
- printf("%c%s%c", SEP, a, SATTR); | |
+ putchar(SEP); | |
+ fputs(a, stdout); | |
+ putchar(SATTR); | |
} | |
void | |
xmlcdatastart(XMLParser *x) | |
{ | |
- printf("%c", SEP); | |
+ putchar(SEP); | |
} | |
void | |
xmlcdata(XMLParser *x, const char *d, size_t dl) | |
{ | |
- quote_print(stdout, d); | |
-} | |
- | |
-void | |
-xmlcdataend(XMLParser *x) | |
-{ | |
-} | |
- | |
-void | |
-xmlcommentstart(XMLParser *x) | |
-{ | |
-} | |
- | |
-void | |
-xmlcomment(XMLParser *x, const char *c, size_t cl) | |
-{ | |
-} | |
- | |
-void | |
-xmlcommentend(XMLParser *x) | |
-{ | |
+ quote_print(d); | |
} | |
void | |
xmldata(XMLParser *x, const char *d, size_t dl) | |
{ | |
if (strcspn(d, " \t\n") && emitsep){ | |
- printf("%c", SEP); | |
+ putchar(SEP); | |
emitsep = FALSE; | |
} | |
- quote_print(stdout, d); | |
+ quote_print(d); | |
} | |
void | |
@@ -220,12 +206,6 @@ xmltagend(XMLParser *x, const char *t, size_t tl, int issh… | |
if (strcmp(t, tag)){ | |
fprintf(stderr, "Error: tag-end '%s' closes tag '%s'\n", t, ta… | |
} | |
- | |
-/* if (isshort) { | |
- printf("\n"); | |
- print_cur_str(stdout, &st); | |
- } | |
-*/ | |
} | |
void | |
@@ -235,13 +215,8 @@ xmltagstart(XMLParser *x, const char *t, size_t tl) | |
fprintf(stderr, "Error: stack full. Ignoring tag '%s' (parent … | |
return; | |
} | |
- printf("\n"); | |
- print_cur_str(stdout, &st); | |
-} | |
- | |
-void | |
-xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort) | |
-{ | |
+ putchar('\n'); | |
+ print_cur_str(&st); | |
} | |
int | |
@@ -252,30 +227,22 @@ main(void) | |
XMLParser x = { 0 }; | |
x.xmlattr = xmlattr; | |
- x.xmlattrend = xmlattrend; | |
x.xmlattrstart = xmlattrstart; | |
x.xmlattrentity = xmlattrentity; | |
x.xmlcdatastart = xmlcdatastart; | |
x.xmlcdata = xmlcdata; | |
- x.xmlcdataend = xmlcdataend; | |
- x.xmlcommentstart = xmlcommentstart; | |
- x.xmlcomment = xmlcomment; | |
- x.xmlcommentend = xmlcommentend; | |
x.xmldata = xmldata; | |
x.xmldataend = xmldataend; | |
x.xmldataentity = xmldataentity; | |
x.xmldatastart = xmldatastart; | |
x.xmltagend = xmltagend; | |
x.xmltagstart = xmltagstart; | |
- x.xmltagstartparsed = xmltagstartparsed; | |
- | |
- x.getnext = getchar; | |
xml_parse(&x); | |
- printf("\n"); | |
+ putchar('\n'); | |
if (! stack_empty(&st)) { | |
fprintf(stderr, "Error: tags still open at EOF: "); | |
- print_cur_str(stderr, &st); | |
+ print_cur_str_fp(stderr, &st); | |
fprintf(stderr, "\n"); | |
} | |
return 0; |