Introduction
Introduction Statistics Contact Development Disclaimer Help
sync XML parser and some improvements - sub - subscene.com subtitle search
git clone git://git.codemadness.org/sub
Log
Files
Refs
README
LICENSE
---
commit 919b13a33a111b5f946652c2e2ce0a07200a3fe3
parent 6ef7f7e85bfb08f37166b9c8c450afb43bc7fc50
Author: Hiltjo Posthuma <[email protected]>
Date: Sun, 11 Mar 2018 18:51:49 +0100
sync XML parser and some improvements
Diffstat:
M sub.c | 8 +++++---
M xml.c | 439 ++++++++++++++++++++---------…
M xml.h | 79 +++++++++++++++--------------…
3 files changed, 322 insertions(+), 204 deletions(-)
---
diff --git a/sub.c b/sub.c
@@ -1,3 +1,5 @@
+#include <sys/types.h>
+
#include <ctype.h>
#include <errno.h>
#include <stdio.h>
@@ -165,14 +167,14 @@ main(void)
return 1;
}
- xmlparser_init(&parser, stdin);
-
parser.xmltagstart = xml_handler_start_element;
parser.xmltagend = xml_handler_end_element;
parser.xmlattr = xml_handler_attr;
parser.xmldata = xml_handler_data;
- xmlparser_parse(&parser);
+ parser.getnext = getchar;
+
+ xml_parse(&parser);
return 0;
}
diff --git a/xml.c b/xml.c
@@ -1,110 +1,104 @@
+#include <sys/types.h>
+
+#include <ctype.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdint.h>
#include <stdio.h>
-#include <string.h>
#include <stdlib.h>
-#include <ctype.h>
+#include <string.h>
#include "xml.h"
-static __inline__ int /* like getc(), but do some smart buffering */
-xmlparser_getnext(XMLParser *x) {
- return fgetc(x->fp);
-#if 0
- if(x->readoffset >= x->readlastbytes) {
- x->readoffset = 0;
- if(!(x->readlastbytes = fread(x->readbuf, 1, sizeof(x->readbuf…
- return EOF; /* 0 bytes read, assume EOF */
- }
- return (int)x->readbuf[x->readoffset++];
-#endif
-}
-
-static __inline__ void
-xmlparser_parseattrs(XMLParser *x) {
+static void
+xml_parseattrs(XMLParser *x)
+{
size_t namelen = 0, valuelen;
int c, endsep, endname = 0;
- while((c = xmlparser_getnext(x)) != EOF) {
- if(isspace(c)) { /* TODO: simplify endname ? */
- if(namelen)
+ while ((c = x->getnext()) != EOF) {
+ if (isspace(c)) { /* TODO: simplify endname ? */
+ if (namelen)
endname = 1;
continue;
}
- if(c == '?')
+ if (c == '?')
; /* ignore */
- else if(c == '=') {
+ else if (c == '=') {
x->name[namelen] = '\0';
- } else if(namelen && ((endname && isalpha(c)) || (c == '>' || …
+ } else if (namelen && ((endname && isalpha(c)) || (c == '>' ||…
/* attribute without value */
x->name[namelen] = '\0';
- if(x->xmlattrstart)
+ if (x->xmlattrstart)
x->xmlattrstart(x, x->tag, x->taglen, x->name,…
- if(x->xmlattr)
+ if (x->xmlattr)
x->xmlattr(x, x->tag, x->taglen, x->name, name…
- if(x->xmlattrend)
+ if (x->xmlattrend)
x->xmlattrend(x, x->tag, x->taglen, x->name, n…
endname = 0;
x->name[0] = c;
namelen = 1;
- } else if(namelen && (c == '\'' || c == '"')) {
+ } else if (namelen && (c == '\'' || c == '"')) {
/* attribute with value */
endsep = c; /* c is end separator */
- if(x->xmlattrstart)
+ if (x->xmlattrstart)
x->xmlattrstart(x, x->tag, x->taglen, x->name,…
- for(valuelen = 0; (c = xmlparser_getnext(x)) != EOF;) {
- if(c == '&' && x->xmlattrentity) { /* entities…
+ for (valuelen = 0; (c = x->getnext()) != EOF;) {
+ if (c == '&') { /* entities */
x->data[valuelen] = '\0';
/* call data function with data before…
- if(valuelen && x->xmlattr)
+ if (valuelen && x->xmlattr)
x->xmlattr(x, x->tag, x->tagle…
x->data[0] = c;
valuelen = 1;
- while((c = xmlparser_getnext(x)) != EO…
- if(c == endsep)
+ while ((c = x->getnext()) != EOF) {
+ if (c == endsep)
break;
- if(valuelen < sizeof(x->data) …
+ if (valuelen < sizeof(x->data)…
x->data[valuelen++] = …
else {
/* TODO: entity too lo…
x->data[valuelen] = '\…
- if(x->xmlattr)
+ if (x->xmlattr)
x->xmlattr(x, …
valuelen = 0;
break;
}
- if(c == ';') {
+ if (c == ';') {
x->data[valuelen] = '\…
- x->xmlattrentity(x, x-…
+ if (x->xmlattrentity)
+ x->xmlattrenti…
valuelen = 0;
break;
}
}
- } else if(c != endsep) {
- if(valuelen < sizeof(x->data) - 1) {
+ } else if (c != endsep) {
+ if (valuelen < sizeof(x->data) - 1) {
x->data[valuelen++] = c;
} else {
x->data[valuelen] = '\0';
- if(x->xmlattr)
+ if (x->xmlattr)
x->xmlattr(x, x->tag, …
x->data[0] = c;
valuelen = 1;
}
}
- if(c == endsep) {
+ if (c == endsep) {
x->data[valuelen] = '\0';
- if(x->xmlattr)
+ if (x->xmlattr)
x->xmlattr(x, x->tag, x->tagle…
- if(x->xmlattrend)
+ if (x->xmlattrend)
x->xmlattrend(x, x->tag, x->ta…
break;
}
}
- namelen = 0;
- endname = 0;
- } else if(namelen < sizeof(x->name) - 1)
+ namelen = endname = 0;
+ } else if (namelen < sizeof(x->name) - 1) {
x->name[namelen++] = c;
- if(c == '>') {
+ }
+ if (c == '>') {
break;
- } else if(c == '/') {
+ } else if (c == '/') {
x->isshorttag = 1;
namelen = 0;
x->name[0] = '\0';
@@ -112,37 +106,48 @@ xmlparser_parseattrs(XMLParser *x) {
}
}
-static __inline__ void
-xmlparser_parsecomment(XMLParser *x) {
+static void
+xml_parsecomment(XMLParser *x)
+{
size_t datalen = 0, i = 0;
int c;
- if(x->xmlcommentstart)
+ if (x->xmlcommentstart)
x->xmlcommentstart(x);
- while((c = xmlparser_getnext(x)) != EOF) {
- if(c == '-' && i < 2)
- i++;
- else if(c == '>') {
- if(i == 2) { /* -- */
- if(datalen >= 2) {
- datalen -= 2;
- x->data[datalen] = '\0';
- if(x->xmlcomment)
- x->xmlcomment(x, x->data, data…
- }
- if(x->xmlcommentend)
- x->xmlcommentend(x);
- break;
+ while ((c = x->getnext()) != EOF) {
+ if (c == '-' || c == '>') {
+ if (x->xmlcomment) {
+ x->data[datalen] = '\0';
+ x->xmlcomment(x, x->data, datalen);
+ datalen = 0;
+ }
+ }
+
+ if (c == '-') {
+ if (++i > 2) {
+ if (x->xmlcomment)
+ for (; i > 2; i--)
+ x->xmlcomment(x, "-", 1);
+ i = 2;
+ }
+ continue;
+ } else if (c == '>' && i == 2) {
+ if (x->xmlcommentend)
+ x->xmlcommentend(x);
+ return;
+ } else if (i) {
+ if (x->xmlcomment) {
+ for (; i > 0; i--)
+ x->xmlcomment(x, "-", 1);
}
i = 0;
}
- /* || (c == '-' && d >= sizeof(x->data) - 4)) { */
- /* TODO: what if the end has --, and it's cut on the boundary,…
- if(datalen < sizeof(x->data) - 1)
+
+ if (datalen < sizeof(x->data) - 1) {
x->data[datalen++] = c;
- else {
+ } else {
x->data[datalen] = '\0';
- if(x->xmlcomment)
+ if (x->xmlcomment)
x->xmlcomment(x, x->data, datalen);
x->data[0] = c;
datalen = 1;
@@ -150,43 +155,47 @@ xmlparser_parsecomment(XMLParser *x) {
}
}
-/* TODO:
- * <test><![CDATA[1234567dddd8]]]>
- *
- * with x->data of sizeof(15) gives 2 ] at end of cdata, should be 1
- * test comment function too for similar bug?
- *
- */
-static __inline__ void
-xmlparser_parsecdata(XMLParser *x) {
+static void
+xml_parsecdata(XMLParser *x)
+{
size_t datalen = 0, i = 0;
int c;
- if(x->xmlcdatastart)
+ if (x->xmlcdatastart)
x->xmlcdatastart(x);
- while((c = xmlparser_getnext(x)) != EOF) {
- if(c == ']' && i < 2) {
- i++;
- } else if(c == '>') {
- if(i == 2) { /* ]] */
- if(datalen >= 2) {
- datalen -= 2;
- x->data[datalen] = '\0';
- if(x->xmlcdata && datalen)
- x->xmlcdata(x, x->data, datale…
- }
- if(x->xmlcdataend)
- x->xmlcdataend(x);
- break;
+ while ((c = x->getnext()) != EOF) {
+ if (c == ']' || c == '>') {
+ if (x->xmlcdata) {
+ x->data[datalen] = '\0';
+ x->xmlcdata(x, x->data, datalen);
+ datalen = 0;
}
+ }
+
+ if (c == ']') {
+ if (++i > 2) {
+ if (x->xmlcdata)
+ for (; i > 2; i--)
+ x->xmlcdata(x, "]", 1);
+ i = 2;
+ }
+ continue;
+ } else if (c == '>' && i == 2) {
+ if (x->xmlcdataend)
+ x->xmlcdataend(x);
+ return;
+ } else if (i) {
+ if (x->xmlcdata)
+ for (; i > 0; i--)
+ x->xmlcdata(x, "]", 1);
i = 0;
}
- /* TODO: what if the end has ]>, and it's cut on the boundary …
- if(datalen < sizeof(x->data) - 1) {
+
+ if (datalen < sizeof(x->data) - 1) {
x->data[datalen++] = c;
} else {
x->data[datalen] = '\0';
- if(x->xmlcdata)
+ if (x->xmlcdata)
x->xmlcdata(x, x->data, datalen);
x->data[0] = c;
datalen = 1;
@@ -194,128 +203,240 @@ xmlparser_parsecdata(XMLParser *x) {
}
}
-void
-xmlparser_init(XMLParser *x, FILE *fp) {
- memset(x, 0, sizeof(XMLParser));
- x->fp = fp;
+int
+xml_codepointtoutf8(uint32_t cp, uint32_t *utf)
+{
+ if (cp >= 0x10000) {
+ /* 4 bytes */
+ *utf = 0xf0808080 | ((cp & 0xfc0000) << 6) |
+ ((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) |
+ (cp & 0x3f);
+ return 4;
+ } else if (cp >= 0x00800) {
+ /* 3 bytes */
+ *utf = 0xe08080 |
+ ((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) |
+ (cp & 0x3f);
+ return 3;
+ } else if (cp >= 0x80) {
+ /* 2 bytes */
+ *utf = 0xc080 |
+ ((cp & 0xfc0) << 2) | (cp & 0x3f);
+ return 2;
+ }
+ *utf = cp & 0xff;
+ return *utf ? 1 : 0; /* 1 byte */
+}
+
+ssize_t
+xml_namedentitytostr(const char *e, char *buf, size_t bufsiz)
+{
+ static const struct {
+ char *entity;
+ int c;
+ } entities[] = {
+ { .entity = "&amp;", .c = '&' },
+ { .entity = "&lt;", .c = '<' },
+ { .entity = "&gt;", .c = '>' },
+ { .entity = "&apos;", .c = '\'' },
+ { .entity = "&quot;", .c = '"' },
+ { .entity = "&AMP;", .c = '&' },
+ { .entity = "&LT;", .c = '<' },
+ { .entity = "&GT;", .c = '>' },
+ { .entity = "&APOS;", .c = '\'' },
+ { .entity = "&QUOT;", .c = '"' }
+ };
+ size_t i;
+
+ /* buffer is too small */
+ if (bufsiz < 2)
+ return -1;
+
+ /* doesn't start with &: can't match */
+ if (*e != '&')
+ return 0;
+
+ for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
+ if (!strcmp(e, entities[i].entity)) {
+ buf[0] = entities[i].c;
+ buf[1] = '\0';
+ return 1;
+ }
+ }
+ return 0;
+}
+
+ssize_t
+xml_numericentitytostr(const char *e, char *buf, size_t bufsiz)
+{
+ uint32_t l = 0, cp = 0;
+ size_t b, len;
+ char *end;
+
+ /* buffer is too small */
+ if (bufsiz < 5)
+ return -1;
+
+ /* not a numeric entity */
+ if (e[0] != '&' || e[1] != '#')
+ return 0;
+
+ /* e[1] == '#', numeric / hexadecimal entity */
+ e += 2; /* skip "&#" */
+ errno = 0;
+ /* hex (16) or decimal (10) */
+ if (*e == 'x')
+ l = strtoul(e + 1, &end, 16);
+ else
+ l = strtoul(e, &end, 10);
+ /* invalid value or not a well-formed entity */
+ if (errno || *end != ';')
+ return 0;
+ len = xml_codepointtoutf8(l, &cp);
+ /* make string */
+ for (b = 0; b < len; b++)
+ buf[b] = (cp >> (8 * (len - 1 - b))) & 0xff;
+ buf[len] = '\0';
+
+ return (ssize_t)len;
+}
+
+/* convert named- or numeric entity string to buffer string
+ * returns byte-length of string. */
+ssize_t
+xml_entitytostr(const char *e, char *buf, size_t bufsiz)
+{
+ /* buffer is too small */
+ if (bufsiz < 5)
+ return -1;
+ /* doesn't start with & */
+ if (e[0] != '&')
+ return 0;
+ /* named entity */
+ if (e[1] != '#')
+ return xml_namedentitytostr(e, buf, bufsiz);
+ else /* numeric entity */
+ return xml_numericentitytostr(e, buf, bufsiz);
}
void
-xmlparser_parse(XMLParser *x) {
+xml_parse(XMLParser *x)
+{
int c, ispi;
size_t datalen, tagdatalen, taglen;
- while((c = xmlparser_getnext(x)) != EOF && c != '<'); /* skip until < …
+ if (!x->getnext)
+ return;
+ while ((c = x->getnext()) != EOF && c != '<')
+ ; /* skip until < */
- while(c != EOF) {
- if(c == '<') { /* parse tag */
- if((c = xmlparser_getnext(x)) == EOF)
+ while (c != EOF) {
+ if (c == '<') { /* parse tag */
+ if ((c = x->getnext()) == EOF)
return;
x->tag[0] = '\0';
x->taglen = 0;
- if(c == '!') { /* cdata and comments */
- for(tagdatalen = 0; (c = xmlparser_getnext(x))…
- if(tagdatalen <= strlen("[CDATA[")) /*…
+ if (c == '!') { /* cdata and comments */
+ for (tagdatalen = 0; (c = x->getnext()) != EOF…
+ if (tagdatalen <= sizeof("[CDATA[") - …
x->data[tagdatalen++] = c; /* …
- if(c == '>')
+ if (c == '>')
break;
- else if(c == '-' && tagdatalen == strl…
- (x->data[0] == '-')) {…
- xmlparser_parsecomment(x);
+ else if (c == '-' && tagdatalen == siz…
+ (x->data[0] == '-')) {
+ xml_parsecomment(x);
break;
- } else if(c == '[') {
- if(tagdatalen == strlen("[CDAT…
- x->data[1] == 'C' && x…
- x->data[3] == 'A' && x…
- x->data[5] == 'A' && x…
- xmlparser_parsecdata(x…
+ } else if (c == '[') {
+ if (tagdatalen == sizeof("[CDA…
+ !strncmp(x->data, "[CDATA[…
+ xml_parsecdata(x);
break;
- #if 0
- } else {
- /* TODO ? */
- /* markup declaration …
- while((c = xmlparser_g…
- #endif
}
}
}
- } else { /* normal tag (open, short open, close), proc…
- if(isspace(c))
- while((c = xmlparser_getnext(x)) != EO…
- if(c == EOF)
+ } else {
+ /* normal tag (open, short open, close), proce…
+ if (isspace(c))
+ while ((c = x->getnext()) != EOF && is…
+ ;
+ if (c == EOF)
return;
x->tag[0] = c;
ispi = (c == '?') ? 1 : 0;
x->isshorttag = ispi;
taglen = 1;
- while((c = xmlparser_getnext(x)) != EOF) {
- if(c == '/') /* TODO: simplify short t…
+ while ((c = x->getnext()) != EOF) {
+ if (c == '/') /* TODO: simplify short …
x->isshorttag = 1; /* short ta…
- else if(c == '>' || isspace(c)) {
+ else if (c == '>' || isspace(c)) {
x->tag[taglen] = '\0';
- if(x->tag[0] == '/') { /* end …
+ if (x->tag[0] == '/') { /* end…
x->taglen = --taglen; …
- if(taglen && x->xmltag…
+ if (taglen && x->xmlta…
x->xmltagend(x…
} else {
x->taglen = taglen;
- if(x->xmltagstart)
- x->xmltagstart…
- if(isspace(c))
- xmlparser_pars…
- if(x->xmltagstartparse…
+ /* start tag */
+ if (x->xmltagstart)
+ x->xmltagstart…
+ if (isspace(c))
+ xml_parseattrs…
+ if (x->xmltagstartpars…
x->xmltagstart…
}
- if((x->isshorttag || ispi) && …
+ /* call tagend for shortform o…
+ if ((x->isshorttag || ispi) &&…
x->xmltagend(x, x->tag…
break;
- } else if(taglen < sizeof(x->tag) - 1)
+ } else if (taglen < sizeof(x->tag) - 1)
x->tag[taglen++] = c;
}
}
} else {
- /* parse data */
+ /* parse tag data */
datalen = 0;
- if(x->xmldatastart)
+ if (x->xmldatastart)
x->xmldatastart(x);
- while((c = xmlparser_getnext(x)) != EOF) {
- if(c == '&' && x->xmldataentity) {
- if(datalen) {
+ while ((c = x->getnext()) != EOF) {
+ if (c == '&') {
+ if (datalen) {
x->data[datalen] = '\0';
- x->xmldata(x, x->data, datalen…
+ if (x->xmldata)
+ x->xmldata(x, x->data,…
}
x->data[0] = c;
datalen = 1;
- while((c = xmlparser_getnext(x)) != EO…
- if(c == '<')
+ while ((c = x->getnext()) != EOF) {
+ if (c == '<')
break;
- if(datalen < sizeof(x->data) -…
+ if (datalen < sizeof(x->data) …
x->data[datalen++] = c;
- if(isspace(c))
+ if (isspace(c))
break;
- else if(c == ';') {
+ else if (c == ';') {
x->data[datalen] = '\0…
- x->xmldataentity(x, x-…
+ if (x->xmldataentity)
+ x->xmldataenti…
datalen = 0;
break;
}
}
- } else if(c != '<') {
- if(datalen < sizeof(x->data) - 1) {
+ } else if (c != '<') {
+ if (datalen < sizeof(x->data) - 1) {
x->data[datalen++] = c;
} else {
x->data[datalen] = '\0';
- if(x->xmldata)
+ if (x->xmldata)
x->xmldata(x, x->data,…
x->data[0] = c;
datalen = 1;
}
}
- if(c == '<') {
+ if (c == '<') {
x->data[datalen] = '\0';
- if(x->xmldata && datalen)
+ if (x->xmldata && datalen)
x->xmldata(x, x->data, datalen…
- if(x->xmldataend)
+ if (x->xmldataend)
x->xmldataend(x);
break;
}
diff --git a/xml.h b/xml.h
@@ -1,49 +1,44 @@
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-
typedef struct xmlparser {
/* handlers */
- void (*xmltagstart)(struct xmlparser *p, const char *tag, size_t tagle…
- void (*xmltagstartparsed)(struct xmlparser *p, const char *tag,
- size_t taglen, int isshort);
- void (*xmltagend)(struct xmlparser *p, const char *tag, size_t taglen,
- int isshort);
- void (*xmldatastart)(struct xmlparser *p);
- void (*xmldata)(struct xmlparser *p, const char *data, size_t datalen);
- void (*xmldataend)(struct xmlparser *p);
- void (*xmldataentity)(struct xmlparser *p, const char *data,
- size_t datalen);
- void (*xmlattrstart)(struct xmlparser *p, const char *tag, size_t tagl…
- const char *name, size_t namelen);
- void (*xmlattr)(struct xmlparser *p, const char *tag, size_t taglen,
- const char *name, size_t namelen, const char *value,
- size_t valuelen);
- void (*xmlattrend)(struct xmlparser *p, const char *tag, size_t taglen,
- const char *name, size_t namelen);
- void (*xmlattrentity)(struct xmlparser *p, const char *tag, size_t tag…
- const char *name, size_t namelen, const char *value,
- size_t valuelen);
- void (*xmlcdatastart)(struct xmlparser *p);
- void (*xmlcdata)(struct xmlparser *p, const char *data, size_t datalen…
- void (*xmlcdataend)(struct xmlparser *p);
- void (*xmlcommentstart)(struct xmlparser *p);
- void (*xmlcomment)(struct xmlparser *p, const char *comment,
- size_t commentlen);
- void (*xmlcommentend)(struct xmlparser *p);
+ void (*xmlattr)(struct xmlparser *, const char *, size_t,
+ const char *, size_t, const char *, size_t);
+ void (*xmlattrend)(struct xmlparser *, const char *, size_t,
+ const char *, size_t);
+ void (*xmlattrstart)(struct xmlparser *, const char *, size_t,
+ const char *, size_t);
+ void (*xmlattrentity)(struct xmlparser *, const char *, size_t,
+ const char *, size_t, const char *, size_t);
+ void (*xmlcdatastart)(struct xmlparser *);
+ void (*xmlcdata)(struct xmlparser *, const char *, size_t);
+ void (*xmlcdataend)(struct xmlparser *);
+ void (*xmlcommentstart)(struct xmlparser *);
+ void (*xmlcomment)(struct xmlparser *, const char *, size_t);
+ void (*xmlcommentend)(struct xmlparser *);
+ void (*xmldata)(struct xmlparser *, const char *, size_t);
+ void (*xmldataend)(struct xmlparser *);
+ void (*xmldataentity)(struct xmlparser *, const char *, size_t);
+ void (*xmldatastart)(struct xmlparser *);
+ void (*xmltagend)(struct xmlparser *, const char *, size_t, int);
+ void (*xmltagstart)(struct xmlparser *, const char *, size_t);
+ void (*xmltagstartparsed)(struct xmlparser *, const char *,
+ size_t, int);
- FILE *fp; /* file stream to read from */
+ int (*getnext)(void);
- /* private; internal state */
- char tag[1024]; /* current tag */
- int isshorttag; /* current tag is in short form ? */
+ /* current tag */
+ char tag[1024];
size_t taglen;
- char name[256]; /* current attribute name */
- char data[BUFSIZ]; /* data buffer used for tag and attribute data */
- size_t readoffset;
- size_t readlastbytes;
- unsigned char readbuf[BUFSIZ]; /* read buffer used by xmlparser_getnex…
+ /* current tag is in short form ? <tag /> */
+ int isshorttag;
+ /* current attribute name */
+ char name[256];
+ /* data buffer used for tag data, cdata and attribute data */
+ char data[BUFSIZ];
} XMLParser;
-void xmlparser_init(XMLParser *x, FILE *fp);
-void xmlparser_parse(XMLParser *x);
+int xml_codepointtoutf8(uint32_t, uint32_t *);
+ssize_t xml_entitytostr(const char *, char *, size_t);
+ssize_t xml_namedentitytostr(const char *, char *, size_t);
+ssize_t xml_numericentitytostr(const char *, char *, size_t);
+
+void xml_parse(XMLParser *);
You are viewing proxied material from codemadness.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.