xml: sync many XML parser improvements - grabtitle - stupid HTML title grabber | |
git clone git://git.codemadness.org/grabtitle | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit 3fae05df6ed35c258c73314f9daa07b92314e03e | |
parent 0af2d13062af1f2bb254de507233ed28e8f8c459 | |
Author: Hiltjo Posthuma <[email protected]> | |
Date: Sun, 26 Aug 2018 15:27:55 +0200 | |
xml: sync many XML parser improvements | |
Diffstat: | |
M grabtitle.c | 1 - | |
M xml.c | 162 ++++++++++++++++++-----------… | |
M xml.h | 10 +++------- | |
3 files changed, 95 insertions(+), 78 deletions(-) | |
--- | |
diff --git a/grabtitle.c b/grabtitle.c | |
@@ -1,7 +1,6 @@ | |
#include <sys/types.h> | |
#include <errno.h> | |
-#include <stdint.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
diff --git a/xml.c b/xml.c | |
@@ -3,7 +3,6 @@ | |
#include <ctype.h> | |
#include <errno.h> | |
#include <limits.h> | |
-#include <stdint.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
@@ -14,19 +13,20 @@ static void | |
xml_parseattrs(XMLParser *x) | |
{ | |
size_t namelen = 0, valuelen; | |
- int c, endsep, endname = 0; | |
+ int c, endsep, endname = 0, valuestart = 0; | |
while ((c = x->getnext()) != EOF) { | |
- if (isspace(c)) { /* TODO: simplify endname ? */ | |
+ if (isspace(c)) { | |
if (namelen) | |
endname = 1; | |
continue; | |
- } | |
- if (c == '?') | |
+ } else if (c == '?') | |
; /* ignore */ | |
else if (c == '=') { | |
x->name[namelen] = '\0'; | |
- } else if (namelen && ((endname && isalpha(c)) || (c == '>' ||… | |
+ valuestart = 1; | |
+ endname = 1; | |
+ } else if (namelen && ((endname && !valuestart && isalpha(c)) … | |
/* attribute without value */ | |
x->name[namelen] = '\0'; | |
if (x->xmlattrstart) | |
@@ -38,12 +38,21 @@ xml_parseattrs(XMLParser *x) | |
endname = 0; | |
x->name[0] = c; | |
namelen = 1; | |
- } else if (namelen && (c == '\'' || c == '"')) { | |
+ } else if (namelen && valuestart) { | |
/* attribute with value */ | |
- endsep = c; /* c is end separator */ | |
if (x->xmlattrstart) | |
x->xmlattrstart(x, x->tag, x->taglen, x->name,… | |
- for (valuelen = 0; (c = x->getnext()) != EOF;) { | |
+ | |
+ valuelen = 0; | |
+ if (c == '\'' || c == '"') { | |
+ endsep = c; | |
+ } else { | |
+ endsep = ' '; /* isspace() */ | |
+ goto startvalue; | |
+ } | |
+ | |
+ while ((c = x->getnext()) != EOF) { | |
+startvalue: | |
if (c == '&') { /* entities */ | |
x->data[valuelen] = '\0'; | |
/* call data function with data before… | |
@@ -52,16 +61,17 @@ xml_parseattrs(XMLParser *x) | |
x->data[0] = c; | |
valuelen = 1; | |
while ((c = x->getnext()) != EOF) { | |
- if (c == endsep) | |
+ if (c == endsep || (endsep == … | |
break; | |
if (valuelen < sizeof(x->data)… | |
x->data[valuelen++] = … | |
else { | |
- /* TODO: entity too lo… | |
+ /* entity too long for… | |
x->data[valuelen] = '\… | |
if (x->xmlattr) | |
x->xmlattr(x, … | |
- valuelen = 0; | |
+ x->data[0] = c; | |
+ valuelen = 1; | |
break; | |
} | |
if (c == ';') { | |
@@ -72,7 +82,7 @@ xml_parseattrs(XMLParser *x) | |
break; | |
} | |
} | |
- } else if (c != endsep) { | |
+ } else if (c != endsep && !(endsep == ' ' && (… | |
if (valuelen < sizeof(x->data) - 1) { | |
x->data[valuelen++] = c; | |
} else { | |
@@ -83,7 +93,7 @@ xml_parseattrs(XMLParser *x) | |
valuelen = 1; | |
} | |
} | |
- if (c == endsep) { | |
+ if (c == endsep || (endsep == ' ' && (c == '>'… | |
x->data[valuelen] = '\0'; | |
if (x->xmlattr) | |
x->xmlattr(x, x->tag, x->tagle… | |
@@ -92,7 +102,7 @@ xml_parseattrs(XMLParser *x) | |
break; | |
} | |
} | |
- namelen = endname = 0; | |
+ namelen = endname = valuestart = 0; | |
} else if (namelen < sizeof(x->name) - 1) { | |
x->name[namelen++] = c; | |
} | |
@@ -100,8 +110,8 @@ xml_parseattrs(XMLParser *x) | |
break; | |
} else if (c == '/') { | |
x->isshorttag = 1; | |
- namelen = 0; | |
x->name[0] = '\0'; | |
+ namelen = 0; | |
} | |
} | |
} | |
@@ -203,48 +213,53 @@ xml_parsecdata(XMLParser *x) | |
} | |
} | |
-int | |
-xml_codepointtoutf8(uint32_t cp, uint32_t *utf) | |
+static int | |
+codepointtoutf8(long r, char *s) | |
{ | |
- if (cp >= 0x10000) { | |
- /* 4 bytes */ | |
- *utf = 0xf0808080 | ((cp & 0xfc0000) << 6) | | |
- ((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) | | |
- (cp & 0x3f); | |
- return 4; | |
- } else if (cp >= 0x00800) { | |
- /* 3 bytes */ | |
- *utf = 0xe08080 | | |
- ((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) | | |
- (cp & 0x3f); | |
- return 3; | |
- } else if (cp >= 0x80) { | |
- /* 2 bytes */ | |
- *utf = 0xc080 | | |
- ((cp & 0xfc0) << 2) | (cp & 0x3f); | |
+ if (r == 0) { | |
+ return 0; /* NUL byte */ | |
+ } else if (r <= 0x7F) { | |
+ /* 1 byte: 0aaaaaaa */ | |
+ s[0] = r; | |
+ return 1; | |
+ } else if (r <= 0x07FF) { | |
+ /* 2 bytes: 00000aaa aabbbbbb */ | |
+ s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */ | |
+ s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */ | |
return 2; | |
+ } else if (r <= 0xFFFF) { | |
+ /* 3 bytes: aaaabbbb bbcccccc */ | |
+ s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */ | |
+ s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */ | |
+ s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */ | |
+ return 3; | |
+ } else { | |
+ /* 4 bytes: 000aaabb bbbbcccc ccdddddd */ | |
+ s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */ | |
+ s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */ | |
+ s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */ | |
+ s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */ | |
+ return 4; | |
} | |
- *utf = cp & 0xff; | |
- return *utf ? 1 : 0; /* 1 byte */ | |
} | |
-ssize_t | |
-xml_namedentitytostr(const char *e, char *buf, size_t bufsiz) | |
+static int | |
+namedentitytostr(const char *e, char *buf, size_t bufsiz) | |
{ | |
static const struct { | |
char *entity; | |
int c; | |
} entities[] = { | |
- { .entity = "&amp;", .c = '&' }, | |
- { .entity = "&lt;", .c = '<' }, | |
- { .entity = "&gt;", .c = '>' }, | |
- { .entity = "&apos;", .c = '\'' }, | |
- { .entity = "&quot;", .c = '"' }, | |
- { .entity = "&AMP;", .c = '&' }, | |
- { .entity = "&LT;", .c = '<' }, | |
- { .entity = "&GT;", .c = '>' }, | |
- { .entity = "&APOS;", .c = '\'' }, | |
- { .entity = "&QUOT;", .c = '"' } | |
+ { "&amp;", '&' }, | |
+ { "&lt;", '<' }, | |
+ { "&gt;", '>' }, | |
+ { "&apos;", '\'' }, | |
+ { "&quot;", '"' }, | |
+ { "&AMP;", '&' }, | |
+ { "&LT;", '<' }, | |
+ { "&GT;", '>' }, | |
+ { "&APOS;", '\'' }, | |
+ { "&QUOT;", '"' } | |
}; | |
size_t i; | |
@@ -266,11 +281,11 @@ xml_namedentitytostr(const char *e, char *buf, size_t buf… | |
return 0; | |
} | |
-ssize_t | |
-xml_numericentitytostr(const char *e, char *buf, size_t bufsiz) | |
+static int | |
+numericentitytostr(const char *e, char *buf, size_t bufsiz) | |
{ | |
- uint32_t l = 0, cp = 0; | |
- size_t b, len; | |
+ long l; | |
+ int len; | |
char *end; | |
/* buffer is too small */ | |
@@ -289,21 +304,18 @@ xml_numericentitytostr(const char *e, char *buf, size_t b… | |
l = strtoul(e + 1, &end, 16); | |
else | |
l = strtoul(e, &end, 10); | |
- /* invalid value or not a well-formed entity */ | |
- if (errno || *end != ';') | |
+ /* invalid value or not a well-formed entity or too high codepoint */ | |
+ if (errno || *end != ';' || l > 0x10FFFF) | |
return 0; | |
- len = xml_codepointtoutf8(l, &cp); | |
- /* make string */ | |
- for (b = 0; b < len; b++) | |
- buf[b] = (cp >> (8 * (len - 1 - b))) & 0xff; | |
+ len = codepointtoutf8(l, buf); | |
buf[len] = '\0'; | |
- return (ssize_t)len; | |
+ return len; | |
} | |
/* convert named- or numeric entity string to buffer string | |
* returns byte-length of string. */ | |
-ssize_t | |
+int | |
xml_entitytostr(const char *e, char *buf, size_t bufsiz) | |
{ | |
/* buffer is too small */ | |
@@ -314,9 +326,9 @@ xml_entitytostr(const char *e, char *buf, size_t bufsiz) | |
return 0; | |
/* named entity */ | |
if (e[1] != '#') | |
- return xml_namedentitytostr(e, buf, bufsiz); | |
+ return namedentitytostr(e, buf, bufsiz); | |
else /* numeric entity */ | |
- return xml_numericentitytostr(e, buf, bufsiz); | |
+ return numericentitytostr(e, buf, bufsiz); | |
} | |
void | |
@@ -334,12 +346,12 @@ xml_parse(XMLParser *x) | |
if (c == '<') { /* parse tag */ | |
if ((c = x->getnext()) == EOF) | |
return; | |
- x->tag[0] = '\0'; | |
- x->taglen = 0; | |
+ | |
if (c == '!') { /* cdata and comments */ | |
for (tagdatalen = 0; (c = x->getnext()) != EOF… | |
- if (tagdatalen <= sizeof("[CDATA[") - … | |
- x->data[tagdatalen++] = c; /* … | |
+ /* NOTE: sizeof(x->data) must be atlea… | |
+ if (tagdatalen <= sizeof("[CDATA[") - … | |
+ x->data[tagdatalen++] = c; | |
if (c == '>') | |
break; | |
else if (c == '-' && tagdatalen == siz… | |
@@ -355,6 +367,9 @@ xml_parse(XMLParser *x) | |
} | |
} | |
} else { | |
+ x->tag[0] = '\0'; | |
+ x->taglen = 0; | |
+ | |
/* normal tag (open, short open, close), proce… | |
if (isspace(c)) | |
while ((c = x->getnext()) != EOF && is… | |
@@ -366,7 +381,7 @@ xml_parse(XMLParser *x) | |
x->isshorttag = ispi; | |
taglen = 1; | |
while ((c = x->getnext()) != EOF) { | |
- if (c == '/') /* TODO: simplify short … | |
+ if (c == '/') | |
x->isshorttag = 1; /* short ta… | |
else if (c == '>' || isspace(c)) { | |
x->tag[taglen] = '\0'; | |
@@ -389,7 +404,7 @@ xml_parse(XMLParser *x) | |
x->xmltagend(x, x->tag… | |
break; | |
} else if (taglen < sizeof(x->tag) - 1) | |
- x->tag[taglen++] = c; | |
+ x->tag[taglen++] = c; /* NOTE:… | |
} | |
} | |
} else { | |
@@ -411,9 +426,16 @@ xml_parse(XMLParser *x) | |
break; | |
if (datalen < sizeof(x->data) … | |
x->data[datalen++] = c; | |
- if (isspace(c)) | |
+ else { | |
+ /* entity too long for… | |
+ x->data[datalen] = '\0… | |
+ if (x->xmldata) | |
+ x->xmldata(x, … | |
+ x->data[0] = c; | |
+ datalen = 1; | |
break; | |
- else if (c == ';') { | |
+ } | |
+ if (c == ';') { | |
x->data[datalen] = '\0… | |
if (x->xmldataentity) | |
x->xmldataenti… | |
diff --git a/xml.h b/xml.h | |
@@ -31,14 +31,10 @@ typedef struct xmlparser { | |
/* current tag is in short form ? <tag /> */ | |
int isshorttag; | |
/* current attribute name */ | |
- char name[256]; | |
+ char name[1024]; | |
/* data buffer used for tag data, cdata and attribute data */ | |
char data[BUFSIZ]; | |
} XMLParser; | |
-int xml_codepointtoutf8(uint32_t, uint32_t *); | |
-ssize_t xml_entitytostr(const char *, char *, size_t); | |
-ssize_t xml_namedentitytostr(const char *, char *, size_t); | |
-ssize_t xml_numericentitytostr(const char *, char *, size_t); | |
- | |
-void xml_parse(XMLParser *); | |
+int xml_entitytostr(const char *, char *, size_t); | |
+void xml_parse(XMLParser *); |