sync xml.{c,h} - grabtitle - stupid HTML title grabber | |
git clone git://git.codemadness.org/grabtitle | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit 8e2bee7e85c6a6fbdb2b9ef84c69f8f74ab5b77c | |
parent 0ffe161701f6f9ecde66204f5784e6709d647a1e | |
Author: Hiltjo Posthuma <[email protected]> | |
Date: Sat, 30 May 2020 13:36:43 +0200 | |
sync xml.{c,h} | |
Diffstat: | |
M xml.c | 113 ++++++++++++++---------------… | |
M xml.h | 5 +++++ | |
2 files changed, 55 insertions(+), 63 deletions(-) | |
--- | |
diff --git a/xml.c b/xml.c | |
@@ -15,7 +15,7 @@ xml_parseattrs(XMLParser *x) | |
size_t namelen = 0; | |
int c, endsep, endname = 0, valuestart = 0; | |
- while ((c = x->getnext()) != EOF) { | |
+ while ((c = GETNEXT()) != EOF) { | |
if (isspace(c)) { | |
if (namelen) | |
endname = 1; | |
@@ -32,12 +32,12 @@ xml_parseattrs(XMLParser *x) | |
/* attribute with value */ | |
if (c == '\'' || c == '"') { | |
endsep = c; | |
- while ((c = x->getnext()) != EOF) { | |
+ while ((c = GETNEXT()) != EOF) { | |
if (c == endsep) | |
break; | |
} | |
} else { | |
- while ((c = x->getnext()) != EOF) { | |
+ while ((c = GETNEXT()) != EOF) { | |
if (c == '>' || isspace(c)) | |
break; | |
} | |
@@ -61,7 +61,7 @@ xml_parsecomment(XMLParser *x) | |
size_t i = 0; | |
int c; | |
- while ((c = x->getnext()) != EOF) { | |
+ while ((c = GETNEXT()) != EOF) { | |
if (c == '-') { | |
if (i < 2) | |
i++; | |
@@ -79,7 +79,7 @@ xml_parsecdata(XMLParser *x) | |
size_t datalen = 0, i = 0; | |
int c; | |
- while ((c = x->getnext()) != EOF) { | |
+ while ((c = GETNEXT()) != EOF) { | |
if (c == ']' || c == '>') { | |
if (x->xmlcdata) { | |
x->data[datalen] = '\0'; | |
@@ -147,44 +147,42 @@ codepointtoutf8(long r, char *s) | |
} | |
} | |
+struct namedentity { | |
+ const char *entity; | |
+ long cp; | |
+}; | |
+ | |
+int | |
+namedentitycmp(const void *v1, const void *v2) | |
+{ | |
+ struct namedentity *n1 = (struct namedentity *)v1; | |
+ struct namedentity *n2 = (struct namedentity *)v2; | |
+ | |
+ return strcmp(n1->entity, n2->entity); | |
+} | |
+ | |
static int | |
namedentitytostr(const char *e, char *buf, size_t bufsiz) | |
{ | |
- static const struct { | |
- char *entity; | |
- int c; | |
- } entities[] = { | |
- { "&amp;", '&' }, | |
- { "&lt;", '<' }, | |
- { "&gt;", '>' }, | |
- { "&apos;", '\'' }, | |
- { "&quot;", '"' }, | |
- { "&nbsp;", ' ' }, | |
- { "&AMP;", '&' }, | |
- { "&LT;", '<' }, | |
- { "&GT;", '>' }, | |
- { "&APOS;", '\'' }, | |
- { "&QUOT;", '"' }, | |
- { "&NBSP;", ' ' }, | |
+ static const struct namedentity entities[] = { | |
+#include "namedentities.h" | |
}; | |
+ struct namedentity find, *found; | |
size_t i; | |
/* buffer is too small */ | |
- if (bufsiz < 2) | |
+ if (bufsiz < 5) | |
return -1; | |
- /* doesn't start with &: can't match */ | |
- if (*e != '&') | |
- return 0; | |
- | |
- for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) { | |
- if (!strcmp(e, entities[i].entity)) { | |
- buf[0] = entities[i].c; | |
- buf[1] = '\0'; | |
- return 1; | |
- } | |
+ find.entity = e; | |
+ found = bsearch(&find, entities, sizeof(entities) / sizeof(*entities), | |
+ sizeof(*entities), namedentitycmp); | |
+ if (found) { | |
+ i = codepointtoutf8(found->cp, buf); | |
+ buf[i] = '\0'; | |
+ return i; | |
} | |
- return 0; | |
+ return -1; | |
} | |
static int | |
@@ -198,21 +196,15 @@ numericentitytostr(const char *e, char *buf, size_t bufsi… | |
if (bufsiz < 5) | |
return -1; | |
- /* not a numeric entity */ | |
- if (e[0] != '&' || e[1] != '#') | |
- return 0; | |
- | |
- /* e[1] == '#', numeric / hexadecimal entity */ | |
- e += 2; /* skip "&#" */ | |
errno = 0; | |
/* hex (16) or decimal (10) */ | |
if (*e == 'x') | |
- l = strtoul(e + 1, &end, 16); | |
+ l = strtol(++e, &end, 16); | |
else | |
- l = strtoul(e, &end, 10); | |
- /* invalid value or not a well-formed entity or too high codepoint */ | |
- if (errno || *end != ';' || l > 0x10FFFF) | |
- return 0; | |
+ l = strtol(e, &end, 10); | |
+ /* invalid value or not a well-formed entity or invalid codepoint */ | |
+ if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff) | |
+ return -1; | |
len = codepointtoutf8(l, buf); | |
buf[len] = '\0'; | |
@@ -220,21 +212,18 @@ numericentitytostr(const char *e, char *buf, size_t bufsi… | |
} | |
/* convert named- or numeric entity string to buffer string | |
- * returns byte-length of string. */ | |
+ * returns byte-length of string or -1 on failure. */ | |
int | |
xml_entitytostr(const char *e, char *buf, size_t bufsiz) | |
{ | |
- /* buffer is too small */ | |
- if (bufsiz < 5) | |
- return -1; | |
/* doesn't start with & */ | |
if (e[0] != '&') | |
- return 0; | |
- /* named entity */ | |
- if (e[1] != '#') | |
- return namedentitytostr(e, buf, bufsiz); | |
- else /* numeric entity */ | |
- return numericentitytostr(e, buf, bufsiz); | |
+ return -1; | |
+ /* numeric entity */ | |
+ if (e[1] == '#') | |
+ return numericentitytostr(e + 2, buf, bufsiz); | |
+ else /* named entity */ | |
+ return namedentitytostr(e + 1, buf, bufsiz); | |
} | |
void | |
@@ -243,18 +232,16 @@ xml_parse(XMLParser *x) | |
size_t datalen, tagdatalen; | |
int c, isend; | |
- if (!x->getnext) | |
- return; | |
- while ((c = x->getnext()) != EOF && c != '<') | |
+ while ((c = GETNEXT()) != EOF && c != '<') | |
; /* skip until < */ | |
while (c != EOF) { | |
if (c == '<') { /* parse tag */ | |
- if ((c = x->getnext()) == EOF) | |
+ if ((c = GETNEXT()) == EOF) | |
return; | |
if (c == '!') { /* cdata and comments */ | |
- for (tagdatalen = 0; (c = x->getnext()) != EOF… | |
+ for (tagdatalen = 0; (c = GETNEXT()) != EOF;) { | |
/* NOTE: sizeof(x->data) must be atlea… | |
if (tagdatalen <= sizeof("[CDATA[") - … | |
x->data[tagdatalen++] = c; | |
@@ -282,13 +269,13 @@ xml_parse(XMLParser *x) | |
if (c == '?') { | |
x->isshorttag = 1; | |
} else if (c == '/') { | |
- if ((c = x->getnext()) == EOF) | |
+ if ((c = GETNEXT()) == EOF) | |
return; | |
x->tag[0] = c; | |
isend = 1; | |
} | |
- while ((c = x->getnext()) != EOF) { | |
+ while ((c = GETNEXT()) != EOF) { | |
if (c == '/') | |
x->isshorttag = 1; /* short ta… | |
else if (c == '>' || isspace(c)) { | |
@@ -320,7 +307,7 @@ xml_parse(XMLParser *x) | |
} else { | |
/* parse tag data */ | |
datalen = 0; | |
- while ((c = x->getnext()) != EOF) { | |
+ while ((c = GETNEXT()) != EOF) { | |
if (c == '&') { | |
if (datalen) { | |
x->data[datalen] = '\0'; | |
@@ -329,7 +316,7 @@ xml_parse(XMLParser *x) | |
} | |
x->data[0] = c; | |
datalen = 1; | |
- while ((c = x->getnext()) != EOF) { | |
+ while ((c = GETNEXT()) != EOF) { | |
if (c == '<') | |
break; | |
if (datalen < sizeof(x->data) … | |
diff --git a/xml.h b/xml.h | |
@@ -1,3 +1,6 @@ | |
+#ifndef _XML_H | |
+#define _XML_H | |
+ | |
typedef struct xmlparser { | |
/* handlers */ | |
void (*xmlcdata)(struct xmlparser *, const char *, size_t); | |
@@ -6,6 +9,7 @@ typedef struct xmlparser { | |
void (*xmltagend)(struct xmlparser *, const char *, size_t, int); | |
void (*xmltagstart)(struct xmlparser *, const char *, size_t); | |
+#define GETNEXT (x)->getnext | |
int (*getnext)(void); | |
/* current tag */ | |
@@ -19,3 +23,4 @@ typedef struct xmlparser { | |
int xml_entitytostr(const char *, char *, size_t); | |
void xml_parse(XMLParser *); | |
+#endif |