Introduction
Introduction Statistics Contact Development Disclaimer Help
make separate repo for shared XML code "library" - xmlparser - XML parser
git clone git://git.codemadness.org/xmlparser
Log
Files
Refs
README
LICENSE
---
commit 287a59a2d0fc7c1f98d33e6142409c755fd39216
Author: Hiltjo Posthuma <[email protected]>
Date: Fri, 2 Nov 2018 17:48:53 +0100
make separate repo for shared XML code "library"
Diffstat:
A LICENSE | 15 +++++++++++++++
A README | 66 +++++++++++++++++++++++++++++…
A skeleton.c | 122 +++++++++++++++++++++++++++++…
A xml.c | 468 +++++++++++++++++++++++++++++…
A xml.h | 40 +++++++++++++++++++++++++++++…
5 files changed, 711 insertions(+), 0 deletions(-)
---
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,15 @@
+ISC License
+
+Copyright (c) 2011-2018 Hiltjo Posthuma <[email protected]>
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/README b/README
@@ -0,0 +1,66 @@
+XML parser
+==========
+
+
+Dependencies
+------------
+
+- C compiler (C99 expected).
+
+
+Features
+--------
+
+- Relatively small parser.
+- Pretty simple API.
+- Pretty fast.
+- Portable.
+- No dynamic memory allocation.
+
+
+Supports
+--------
+
+- Tags in short-form (<img src="lolcat.jpg" title="Meow" />).
+- Tag attributes.
+- Short attributes without an explicitly set value (<input type="checkbox" chec…
+- Comments
+- CDATA sections.
+- Helper function (xml_entitytostr) to convert XML 1.0 / HTML 2.0 named entiti…
+ and numeric entities to UTF-8.
+- Reading XML from a fd, string buffer or implement a custom reader:
+ see: XMLParser.getnext.
+
+
+Caveats
+-------
+
+- It is not a compliant XML parser.
+- The XML is not checked for errors, so the parser will continue parsing
+ malformed XML data; this is by design.
+- Internally fixed-size buffers are used, callbacks like XMLParser.xmldata are
+ called multiple times for the same tag if the data size is bigger than the
+ internal buffer size (sizeof(XMLParser.data)). To differentiate between new
+ calls for data you can use the xml*start and xml*end handlers.
+- The XML specification puts no limit on the length of tag and attribute
+ names. For the sake of simplicity and sanity this XML parser takes some
+ liberties: tag and attribute names are truncated to the size of the
+ internal buffers.
+
+
+Files used
+----------
+
+xml.c and xml.h
+
+
+Interface / API
+---------------
+
+Should be trivial, see xml.c and xml.h and the examples below.
+
+
+License
+-------
+
+ISC, see LICENSE file.
+
diff --git a/skeleton.c b/skeleton.c
@@ -0,0 +1,122 @@
+#include <sys/types.h>
+
+#include <stdio.h>
+
+#include "xml.h"
+
+/*
+ * Handler stubs: each function below matches one callback pointer in
+ * XMLParser and is intentionally empty; fill in the ones you need.
+ * In the handlers that take them, t/tl is the current tag name and its
+ * length, and a/al is the current attribute name and its length.
+ */
+
+/* Attribute value text; v/vl is one chunk of the value, so this can be called
+ * several times for a single attribute. */
+void
+xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
+ const char *v, size_t vl)
+{
+}
+
+/* Entity inside an attribute value; v/vl is the entity text collected from
+ * '&' up to and including ';'. */
+void
+xmlattrentity(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
+ const char *v, size_t vl)
+{
+}
+
+/* Called after the value of an attribute has been reported. */
+void
+xmlattrend(XMLParser *x, const char *t, size_t tl, const char *a, size_t al)
+{
+}
+
+/* Called before the value of an attribute is reported. */
+void
+xmlattrstart(XMLParser *x, const char *t, size_t tl, const char *a, size_t al)
+{
+}
+
+/* Called once when a CDATA section begins. */
+void
+xmlcdatastart(XMLParser *x)
+{
+}
+
+/* CDATA text; d/dl is one chunk, so this can be called several times per
+ * section. */
+void
+xmlcdata(XMLParser *x, const char *d, size_t dl)
+{
+}
+
+/* Called when the "]]>" terminator of a CDATA section has been read. */
+void
+xmlcdataend(XMLParser *x)
+{
+}
+
+/* Called once when a comment begins. */
+void
+xmlcommentstart(XMLParser *x)
+{
+}
+
+/* Comment text; c/cl is one chunk, so this can be called several times per
+ * comment. */
+void
+xmlcomment(XMLParser *x, const char *c, size_t cl)
+{
+}
+
+/* Called when the "-->" terminator of a comment has been read. */
+void
+xmlcommentend(XMLParser *x)
+{
+}
+
+/* Text between tags; d/dl is one chunk, so this can be called several times
+ * for the same run of text. */
+void
+xmldata(XMLParser *x, const char *d, size_t dl)
+{
+}
+
+/* Called when a run of text between tags ends at the next '<'. */
+void
+xmldataend(XMLParser *x)
+{
+}
+
+/* Entity inside text between tags; d/dl is the entity text collected from
+ * '&' up to and including ';'. */
+void
+xmldataentity(XMLParser *x, const char *d, size_t dl)
+{
+}
+
+/* Called when the parser starts reading text between tags. */
+void
+xmldatastart(XMLParser *x)
+{
+}
+
+/* Called for an end tag and for tags in short form; isshort presumably
+ * mirrors XMLParser.isshorttag (the call's arguments are not fully visible
+ * here, confirm against xml.c). */
+void
+xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
+{
+}
+
+/* Called when the name of a start tag has been read, before its attributes
+ * are parsed. */
+void
+xmltagstart(XMLParser *x, const char *t, size_t tl)
+{
+}
+
+/* Called for a start tag after its attributes (if any) have been parsed. */
+void
+xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
+{
+}
+
+/* Parse XML from stdin, dispatching every event to the (empty) handlers
+ * defined above. */
+int
+main(void)
+{
+	/*
+	 * Wire up all handlers and the reader in a single initializer; every
+	 * member not named here (buffers, taglen, isshorttag) is
+	 * zero-initialized.
+	 */
+	XMLParser x = {
+		.xmlattr = xmlattr,
+		.xmlattrend = xmlattrend,
+		.xmlattrstart = xmlattrstart,
+		.xmlattrentity = xmlattrentity,
+		.xmlcdatastart = xmlcdatastart,
+		.xmlcdata = xmlcdata,
+		.xmlcdataend = xmlcdataend,
+		.xmlcommentstart = xmlcommentstart,
+		.xmlcomment = xmlcomment,
+		.xmlcommentend = xmlcommentend,
+		.xmldata = xmldata,
+		.xmldataend = xmldataend,
+		.xmldataentity = xmldataentity,
+		.xmldatastart = xmldatastart,
+		.xmltagend = xmltagend,
+		.xmltagstart = xmltagstart,
+		.xmltagstartparsed = xmltagstartparsed,
+		/* read the input one byte at a time from stdin */
+		.getnext = getchar
+	};
+
+	xml_parse(&x);
+
+	return 0;
+}
diff --git a/xml.c b/xml.c
@@ -0,0 +1,468 @@
+#include <sys/types.h>
+
+#include <ctype.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "xml.h"
+
+static void
+xml_parseattrs(XMLParser *x)
+{
+ size_t namelen = 0, valuelen;
+ int c, endsep, endname = 0, valuestart = 0;
+
+ while ((c = x->getnext()) != EOF) {
+ if (isspace(c)) {
+ if (namelen)
+ endname = 1;
+ continue;
+ } else if (c == '?')
+ ; /* ignore */
+ else if (c == '=') {
+ x->name[namelen] = '\0';
+ valuestart = 1;
+ endname = 1;
+ } else if (namelen && ((endname && !valuestart && isalpha(c)) …
+ /* attribute without value */
+ x->name[namelen] = '\0';
+ if (x->xmlattrstart)
+ x->xmlattrstart(x, x->tag, x->taglen, x->name,…
+ if (x->xmlattr)
+ x->xmlattr(x, x->tag, x->taglen, x->name, name…
+ if (x->xmlattrend)
+ x->xmlattrend(x, x->tag, x->taglen, x->name, n…
+ endname = 0;
+ x->name[0] = c;
+ namelen = 1;
+ } else if (namelen && valuestart) {
+ /* attribute with value */
+ if (x->xmlattrstart)
+ x->xmlattrstart(x, x->tag, x->taglen, x->name,…
+
+ valuelen = 0;
+ if (c == '\'' || c == '"') {
+ endsep = c;
+ } else {
+ endsep = ' '; /* isspace() */
+ goto startvalue;
+ }
+
+ while ((c = x->getnext()) != EOF) {
+startvalue:
+ if (c == '&') { /* entities */
+ x->data[valuelen] = '\0';
+ /* call data function with data before…
+ if (valuelen && x->xmlattr)
+ x->xmlattr(x, x->tag, x->tagle…
+ x->data[0] = c;
+ valuelen = 1;
+ while ((c = x->getnext()) != EOF) {
+ if (c == endsep || (endsep == …
+ break;
+ if (valuelen < sizeof(x->data)…
+ x->data[valuelen++] = …
+ else {
+ /* entity too long for…
+ x->data[valuelen] = '\…
+ if (x->xmlattr)
+ x->xmlattr(x, …
+ x->data[0] = c;
+ valuelen = 1;
+ break;
+ }
+ if (c == ';') {
+ x->data[valuelen] = '\…
+ if (x->xmlattrentity)
+ x->xmlattrenti…
+ valuelen = 0;
+ break;
+ }
+ }
+ } else if (c != endsep && !(endsep == ' ' && (…
+ if (valuelen < sizeof(x->data) - 1) {
+ x->data[valuelen++] = c;
+ } else {
+ x->data[valuelen] = '\0';
+ if (x->xmlattr)
+ x->xmlattr(x, x->tag, …
+ x->data[0] = c;
+ valuelen = 1;
+ }
+ }
+ if (c == endsep || (endsep == ' ' && (c == '>'…
+ x->data[valuelen] = '\0';
+ if (x->xmlattr)
+ x->xmlattr(x, x->tag, x->tagle…
+ if (x->xmlattrend)
+ x->xmlattrend(x, x->tag, x->ta…
+ break;
+ }
+ }
+ namelen = endname = valuestart = 0;
+ } else if (namelen < sizeof(x->name) - 1) {
+ x->name[namelen++] = c;
+ }
+ if (c == '>') {
+ break;
+ } else if (c == '/') {
+ x->isshorttag = 1;
+ x->name[0] = '\0';
+ namelen = 0;
+ }
+ }
+}
+
+/*
+ * Read the body of a comment through x->getnext() until "-->" or EOF.
+ * Calls xmlcommentstart first, reports the text through xmlcomment (possibly
+ * in several chunks) and calls xmlcommentend when the terminator is found.
+ * On EOF the function just returns: xmlcommentend is not called and text
+ * still pending in x->data is not reported.
+ */
+static void
+xml_parsecomment(XMLParser *x)
+{
+ /* datalen: bytes pending in x->data; i: consecutive '-' seen (max. 2 kept) */
+ size_t datalen = 0, i = 0;
+ int c;
+
+ if (x->xmlcommentstart)
+ x->xmlcommentstart(x);
+ while ((c = x->getnext()) != EOF) {
+ /* a possible terminator character: report the pending text first.
+ * There is no datalen check, so the handler can get an empty string;
+ * without a handler the text simply stays in the buffer. */
+ if (c == '-' || c == '>') {
+ if (x->xmlcomment) {
+ x->data[datalen] = '\0';
+ x->xmlcomment(x, x->data, datalen);
+ datalen = 0;
+ }
+ }
+
+ if (c == '-') {
+ /* more than two dashes in a row: only the last two can belong to
+ * "-->", the surplus is reported as comment text */
+ if (++i > 2) {
+ if (x->xmlcomment)
+ for (; i > 2; i--)
+ x->xmlcomment(x, "-", 1);
+ i = 2;
+ }
+ continue;
+ } else if (c == '>' && i == 2) {
+ /* "-->": end of the comment */
+ if (x->xmlcommentend)
+ x->xmlcommentend(x);
+ return;
+ } else if (i) {
+ /* the withheld dashes were not a terminator: report them as text */
+ if (x->xmlcomment) {
+ for (; i > 0; i--)
+ x->xmlcomment(x, "-", 1);
+ }
+ i = 0;
+ }
+
+ /* buffer the character; when the buffer is full report its content
+ * and start over with the current character */
+ if (datalen < sizeof(x->data) - 1) {
+ x->data[datalen++] = c;
+ } else {
+ x->data[datalen] = '\0';
+ if (x->xmlcomment)
+ x->xmlcomment(x, x->data, datalen);
+ x->data[0] = c;
+ datalen = 1;
+ }
+ }
+}
+
+/*
+ * Read the body of a CDATA section through x->getnext() until "]]>" or EOF.
+ * Calls xmlcdatastart first, reports the text through xmlcdata (possibly in
+ * several chunks) and calls xmlcdataend when the terminator is found.
+ * On EOF the function just returns: xmlcdataend is not called and text still
+ * pending in x->data is not reported.
+ */
+static void
+xml_parsecdata(XMLParser *x)
+{
+ /* datalen: bytes pending in x->data; i: consecutive ']' seen (max. 2 kept) */
+ size_t datalen = 0, i = 0;
+ int c;
+
+ if (x->xmlcdatastart)
+ x->xmlcdatastart(x);
+ while ((c = x->getnext()) != EOF) {
+ /* a possible terminator character: report the pending text first.
+ * There is no datalen check, so the handler can get an empty string;
+ * without a handler the text simply stays in the buffer. */
+ if (c == ']' || c == '>') {
+ if (x->xmlcdata) {
+ x->data[datalen] = '\0';
+ x->xmlcdata(x, x->data, datalen);
+ datalen = 0;
+ }
+ }
+
+ if (c == ']') {
+ /* more than two brackets in a row: only the last two can belong to
+ * "]]>", the surplus is reported as CDATA text */
+ if (++i > 2) {
+ if (x->xmlcdata)
+ for (; i > 2; i--)
+ x->xmlcdata(x, "]", 1);
+ i = 2;
+ }
+ continue;
+ } else if (c == '>' && i == 2) {
+ /* "]]>": end of the CDATA section */
+ if (x->xmlcdataend)
+ x->xmlcdataend(x);
+ return;
+ } else if (i) {
+ /* the withheld brackets were not a terminator: report them as text */
+ if (x->xmlcdata)
+ for (; i > 0; i--)
+ x->xmlcdata(x, "]", 1);
+ i = 0;
+ }
+
+ /* buffer the character; when the buffer is full report its content
+ * and start over with the current character */
+ if (datalen < sizeof(x->data) - 1) {
+ x->data[datalen++] = c;
+ } else {
+ x->data[datalen] = '\0';
+ if (x->xmlcdata)
+ x->xmlcdata(x, x->data, datalen);
+ x->data[0] = c;
+ datalen = 1;
+ }
+ }
+}
+
+/*
+ * Encode code point r as UTF-8 into s and return the number of bytes stored
+ * (1 to 4). Nothing is stored and 0 is returned when r is 0. No terminating
+ * NUL is written; s must have room for 4 bytes.
+ */
+static int
+codepointtoutf8(long r, char *s)
+{
+	if (r == 0)
+		return 0; /* NUL byte */
+
+	if (r <= 0x7F) {
+		/* 1 byte: 0aaaaaaa */
+		s[0] = r;
+		return 1;
+	}
+
+	if (r <= 0x07FF) {
+		/* 2 bytes: 110aaaaa 10bbbbbb */
+		s[0] = 0xC0 | ((r >> 6) & 0x1F);
+		s[1] = 0x80 | (r & 0x3F);
+		return 2;
+	}
+
+	if (r <= 0xFFFF) {
+		/* 3 bytes: 1110aaaa 10bbbbbb 10cccccc */
+		s[0] = 0xE0 | ((r >> 12) & 0x0F);
+		s[1] = 0x80 | ((r >> 6) & 0x3F);
+		s[2] = 0x80 | (r & 0x3F);
+		return 3;
+	}
+
+	/* 4 bytes: 11110aaa 10bbbbbb 10cccccc 10dddddd */
+	s[0] = 0xF0 | ((r >> 18) & 0x07);
+	s[1] = 0x80 | ((r >> 12) & 0x3F);
+	s[2] = 0x80 | ((r >> 6) & 0x3F);
+	s[3] = 0x80 | (r & 0x3F);
+	return 4;
+}
+
+/*
+ * Translate a named entity such as "&amp;" (string e, including the leading
+ * '&' and the trailing ';') into its character, stored NUL-terminated in buf.
+ * Returns 1 on a match, 0 when e is not one of the known entities and -1 when
+ * bufsiz is smaller than 2.
+ */
+static int
+namedentitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	static const struct {
+		char *entity;
+		int c;
+	} table[] = {
+		{ "&amp;",  '&'  }, { "&AMP;",  '&'  },
+		{ "&lt;",   '<'  }, { "&LT;",   '<'  },
+		{ "&gt;",   '>'  }, { "&GT;",   '>'  },
+		{ "&apos;", '\'' }, { "&APOS;", '\'' },
+		{ "&quot;", '"'  }, { "&QUOT;", '"'  }
+	};
+	const size_t count = sizeof(table) / sizeof(table[0]);
+	size_t n;
+
+	/* need room for one character plus the terminating NUL */
+	if (bufsiz < 2)
+		return -1;
+
+	/* every known entity starts with '&' */
+	if (e[0] != '&')
+		return 0;
+
+	for (n = 0; n < count; n++) {
+		if (strcmp(e, table[n].entity) != 0)
+			continue;
+		buf[0] = table[n].c;
+		buf[1] = '\0';
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Translate a numeric entity ("&#65;" or "&#x41;", including the leading "&#"
+ * and the trailing ';') into UTF-8, stored NUL-terminated in buf.
+ * Returns the number of bytes stored (without the NUL), 0 when e is not a
+ * well-formed numeric entity or does not name a valid code point, and -1 when
+ * bufsiz is smaller than 5 (4 bytes of UTF-8 plus the NUL).
+ */
+static int
+numericentitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	long l;
+	int len;
+	char *end;
+
+	/* buffer is too small */
+	if (bufsiz < 5)
+		return -1;
+
+	/* not a numeric entity */
+	if (e[0] != '&' || e[1] != '#')
+		return 0;
+
+	e += 2; /* skip "&#" */
+	errno = 0;
+	/*
+	 * hex (16) or decimal (10). strtol() instead of strtoul(): the result is
+	 * kept in a signed long, and with strtoul() an input such as "-5" or a
+	 * value above LONG_MAX wrapped to a negative number that slipped past
+	 * the upper-bound check below.
+	 */
+	if (*e == 'x')
+		l = strtol(e + 1, &end, 16);
+	else
+		l = strtol(e, &end, 10);
+	/*
+	 * conversion error, not terminated by ';', negative, above the last
+	 * Unicode code point, or a UTF-16 surrogate (not encodable as UTF-8)
+	 */
+	if (errno || *end != ';' || l < 0 || l > 0x10FFFF ||
+	    (l >= 0xD800 && l <= 0xDFFF))
+		return 0;
+	len = codepointtoutf8(l, buf);
+	buf[len] = '\0';
+
+	return len;
+}
+
+/* Convert a named or numeric entity string (e.g. "&amp;" or "&#65;") to its
+ * string form in buf.
+ * Returns the byte-length of the string, 0 if e is not a recognized entity
+ * and -1 if bufsiz is smaller than 5. */
+int
+xml_entitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	/* buffer is too small */
+	if (bufsiz < 5)
+		return -1;
+
+	/* doesn't start with & */
+	if (e[0] != '&')
+		return 0;
+
+	/* "&#..." is a numeric entity, anything else a named one */
+	return (e[1] == '#') ? numericentitytostr(e, buf, bufsiz)
+	                     : namedentitytostr(e, buf, bufsiz);
+}
+
+void
+xml_parse(XMLParser *x)
+{
+ int c, ispi;
+ size_t datalen, tagdatalen, taglen;
+
+ if (!x->getnext)
+ return;
+ while ((c = x->getnext()) != EOF && c != '<')
+ ; /* skip until < */
+
+ while (c != EOF) {
+ if (c == '<') { /* parse tag */
+ if ((c = x->getnext()) == EOF)
+ return;
+
+ if (c == '!') { /* cdata and comments */
+ for (tagdatalen = 0; (c = x->getnext()) != EOF…
+ /* NOTE: sizeof(x->data) must be atlea…
+ if (tagdatalen <= sizeof("[CDATA[") - …
+ x->data[tagdatalen++] = c;
+ if (c == '>')
+ break;
+ else if (c == '-' && tagdatalen == siz…
+ (x->data[0] == '-')) {
+ xml_parsecomment(x);
+ break;
+ } else if (c == '[') {
+ if (tagdatalen == sizeof("[CDA…
+ !strncmp(x->data, "[CDATA[…
+ xml_parsecdata(x);
+ break;
+ }
+ }
+ }
+ } else {
+ x->tag[0] = '\0';
+ x->taglen = 0;
+
+ /* normal tag (open, short open, close), proce…
+ if (isspace(c))
+ while ((c = x->getnext()) != EOF && is…
+ ;
+ if (c == EOF)
+ return;
+ x->tag[0] = c;
+ ispi = (c == '?') ? 1 : 0;
+ x->isshorttag = ispi;
+ taglen = 1;
+ while ((c = x->getnext()) != EOF) {
+ if (c == '/')
+ x->isshorttag = 1; /* short ta…
+ else if (c == '>' || isspace(c)) {
+ x->tag[taglen] = '\0';
+ if (x->tag[0] == '/') { /* end…
+ x->taglen = --taglen; …
+ if (taglen && x->xmlta…
+ x->xmltagend(x…
+ } else {
+ x->taglen = taglen;
+ /* start tag */
+ if (x->xmltagstart)
+ x->xmltagstart…
+ if (isspace(c))
+ xml_parseattrs…
+ if (x->xmltagstartpars…
+ x->xmltagstart…
+ }
+ /* call tagend for shortform o…
+ if ((x->isshorttag || ispi) &&…
+ x->xmltagend(x, x->tag…
+ break;
+ } else if (taglen < sizeof(x->tag) - 1)
+ x->tag[taglen++] = c; /* NOTE:…
+ }
+ }
+ } else {
+ /* parse tag data */
+ datalen = 0;
+ if (x->xmldatastart)
+ x->xmldatastart(x);
+ while ((c = x->getnext()) != EOF) {
+ if (c == '&') {
+ if (datalen) {
+ x->data[datalen] = '\0';
+ if (x->xmldata)
+ x->xmldata(x, x->data,…
+ }
+ x->data[0] = c;
+ datalen = 1;
+ while ((c = x->getnext()) != EOF) {
+ if (c == '<')
+ break;
+ if (datalen < sizeof(x->data) …
+ x->data[datalen++] = c;
+ else {
+ /* entity too long for…
+ x->data[datalen] = '\0…
+ if (x->xmldata)
+ x->xmldata(x, …
+ x->data[0] = c;
+ datalen = 1;
+ break;
+ }
+ if (c == ';') {
+ x->data[datalen] = '\0…
+ if (x->xmldataentity)
+ x->xmldataenti…
+ datalen = 0;
+ break;
+ }
+ }
+ } else if (c != '<') {
+ if (datalen < sizeof(x->data) - 1) {
+ x->data[datalen++] = c;
+ } else {
+ x->data[datalen] = '\0';
+ if (x->xmldata)
+ x->xmldata(x, x->data,…
+ x->data[0] = c;
+ datalen = 1;
+ }
+ }
+ if (c == '<') {
+ x->data[datalen] = '\0';
+ if (x->xmldata && datalen)
+ x->xmldata(x, x->data, datalen…
+ if (x->xmldataend)
+ x->xmldataend(x);
+ break;
+ }
+ }
+ }
+ }
+}
diff --git a/xml.h b/xml.h
@@ -0,0 +1,40 @@
+typedef struct xmlparser {
+ /* handlers */
+ void (*xmlattr)(struct xmlparser *, const char *, size_t,
+ const char *, size_t, const char *, size_t);
+ void (*xmlattrend)(struct xmlparser *, const char *, size_t,
+ const char *, size_t);
+ void (*xmlattrstart)(struct xmlparser *, const char *, size_t,
+ const char *, size_t);
+ void (*xmlattrentity)(struct xmlparser *, const char *, size_t,
+ const char *, size_t, const char *, size_t);
+ void (*xmlcdatastart)(struct xmlparser *);
+ void (*xmlcdata)(struct xmlparser *, const char *, size_t);
+ void (*xmlcdataend)(struct xmlparser *);
+ void (*xmlcommentstart)(struct xmlparser *);
+ void (*xmlcomment)(struct xmlparser *, const char *, size_t);
+ void (*xmlcommentend)(struct xmlparser *);
+ void (*xmldata)(struct xmlparser *, const char *, size_t);
+ void (*xmldataend)(struct xmlparser *);
+ void (*xmldataentity)(struct xmlparser *, const char *, size_t);
+ void (*xmldatastart)(struct xmlparser *);
+ void (*xmltagend)(struct xmlparser *, const char *, size_t, int);
+ void (*xmltagstart)(struct xmlparser *, const char *, size_t);
+ void (*xmltagstartparsed)(struct xmlparser *, const char *,
+ size_t, int);
+
+ int (*getnext)(void);
+
+ /* current tag */
+ char tag[1024];
+ size_t taglen;
+ /* current tag is in short form ? <tag /> */
+ int isshorttag;
+ /* current attribute name */
+ char name[1024];
+ /* data buffer used for tag data, cdata and attribute data */
+ char data[BUFSIZ];
+} XMLParser;
+
+int xml_entitytostr(const char *, char *, size_t);
+void xml_parse(XMLParser *);
You are viewing proxied material from codemadness.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.