GopherProxy

	initial insertion - grabtitle - stupid HTML title grabber
	git clone git://git.codemadness.org/grabtitle
	Log
	Files
	Refs
	README
	LICENSE
	---
	commit 5c21827b86be877d3d5df7f7a9b810822e4f8e22
	Author: Hiltjo Posthuma <[email protected]>
	Date: Sat, 31 Mar 2018 12:59:22 +0200

	initial insertion

	Diffstat:
	A .gitignore | 3 +++
	A Makefile | 5 +++++
	A run.sh | 10 ++++++++++
	A title.c | 86 ++++++++++++++++++++++++++++++
	A xml.c | 446 ++++++++++++++++++++++++++++++
	A xml.h | 44 +++++++++++++++++++++++++++++…

	6 files changed, 594 insertions(+), 0 deletions(-)
	---
	diff --git a/.gitignore b/.gitignore
	@@ -0,0 +1,3 @@
	+title
	+*.o
	+*.core
	diff --git a/Makefile b/Makefile
	@@ -0,0 +1,5 @@
	+build:
	+ cc xml.c title.c -o title
	+
	+clean:
	+ rm -f title
	diff --git a/run.sh b/run.sh
	@@ -0,0 +1,10 @@
	+#!/bin/sh
	+
	+url="http://codemadness.org"
	+curl \
	+ -s \
	+ -L --max-redirs 3 \
	+ -m 5 \
	+ -H 'User-Agent:' \
	+ "$url" | \
	+ ./title
	diff --git a/title.c b/title.c
	@@ -0,0 +1,86 @@
	+#include <sys/types.h>
	+
	+#include <errno.h>
	+#include <stdint.h>
	+#include <stdio.h>
	+#include <stdlib.h>
	+#include <string.h>
	+#include <strings.h>
	+
	+#include "xml.h"
	+
	+#ifdef USE_PLEDGE
	+#include <unistd.h>
	+#else
	+#define pledge(a,b) 0
	+#endif
	+
	+static XMLParser parser;
	+static int istitle, titlelen;
	+static char title[4096];
	+
	+static void
	+xmltagstart(XMLParser *p, const char *t, size_t tl)
	+{
	+ if (tl == 5 && !strcasecmp(t, "title"))
	+ istitle = 1;
	+}
	+
	+static void
	+xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
	+{
	+ if (istitle && tl == 5 && !strcasecmp(t, "title")) {
	+ puts(title);
	+ exit(0);
	+ }
	+}
	+
	+/* data and CDATA */
	+static void
	+xmldata(XMLParser *p, const char *d, size_t dl)
	+{
	+ if (!istitle)
	+ return;
	+ if (titlelen + dl + 1 >= sizeof(title))
	+ return;
	+ memcpy(title + titlelen, d, dl);
	+ titlelen += dl;
	+ title[titlelen] = '\0';
	+}
	+
	+static void
	+xmldataentity(XMLParser *p, const char *d, size_t dl)
	+{
	+ char buf[16];
	+
	+ if (xml_entitytostr(d, buf, sizeof(buf))) {
	+ d = buf;
	+ dl = strlen(buf);
	+ }
	+
	+ if (titlelen + dl + 1 >= sizeof(title))
	+ return;
	+ memcpy(title + titlelen, d, dl);
	+ titlelen += dl;
	+ title[titlelen] = '\0';
	+}
	+
	+int
	+main(int argc, char *argv[])
	+{
	+ if (pledge("stdio", NULL) == -1) {
	+ fprintf(stderr, "pledge: %s\n", strerror(errno));
	+ return 2;
	+ }
	+
	+ parser.xmltagstart = xmltagstart;
	+ parser.xmltagend = xmltagend;
	+ parser.xmldata = xmldata;
	+ parser.xmlcdata = xmldata;
	+ parser.xmldataentity = xmldataentity;
	+
	+ parser.getnext = getchar;
	+ xml_parse(&parser);
	+
	+ return 1;
	+}
	diff --git a/xml.c b/xml.c
	@@ -0,0 +1,446 @@
	+#include <sys/types.h>
	+
	+#include <ctype.h>
	+#include <errno.h>
	+#include <limits.h>
	+#include <stdint.h>
	+#include <stdio.h>
	+#include <stdlib.h>
	+#include <string.h>
	+
	+#include "xml.h"
	+
	+static void
	+xml_parseattrs(XMLParser *x)
	+{
	+ size_t namelen = 0, valuelen;
	+ int c, endsep, endname = 0;
	+
	+ while ((c = x->getnext()) != EOF) {
	+ if (isspace(c)) { /* TODO: simplify endname ? */
	+ if (namelen)
	+ endname = 1;
	+ continue;
	+ }
	+ if (c == '?')
	+ ; /* ignore */
	+ else if (c == '=') {
	+ x->name[namelen] = '\0';
	+ } else if (namelen && ((endname && isalpha(c)) || (c == '>' ||…
	+ /* attribute without value */
	+ x->name[namelen] = '\0';
	+ if (x->xmlattrstart)
	+ x->xmlattrstart(x, x->tag, x->taglen, x->name,…
	+ if (x->xmlattr)
	+ x->xmlattr(x, x->tag, x->taglen, x->name, name…
	+ if (x->xmlattrend)
	+ x->xmlattrend(x, x->tag, x->taglen, x->name, n…
	+ endname = 0;
	+ x->name[0] = c;
	+ namelen = 1;
	+ } else if (namelen && (c == '\'' || c == '"')) {
	+ /* attribute with value */
	+ endsep = c; /* c is end separator */
	+ if (x->xmlattrstart)
	+ x->xmlattrstart(x, x->tag, x->taglen, x->name,…
	+ for (valuelen = 0; (c = x->getnext()) != EOF;) {
	+ if (c == '&') { /* entities */
	+ x->data[valuelen] = '\0';
	+ /* call data function with data before…
	+ if (valuelen && x->xmlattr)
	+ x->xmlattr(x, x->tag, x->tagle…
	+ x->data[0] = c;
	+ valuelen = 1;
	+ while ((c = x->getnext()) != EOF) {
	+ if (c == endsep)
	+ break;
	+ if (valuelen < sizeof(x->data)…
	+ x->data[valuelen++] = …
	+ else {
	+ /* TODO: entity too lo…
	+ x->data[valuelen] = '\…
	+ if (x->xmlattr)
	+ x->xmlattr(x, …
	+ valuelen = 0;
	+ break;
	+ }
	+ if (c == ';') {
	+ x->data[valuelen] = '\…
	+ if (x->xmlattrentity)
	+ x->xmlattrenti…
	+ valuelen = 0;
	+ break;
	+ }
	+ }
	+ } else if (c != endsep) {
	+ if (valuelen < sizeof(x->data) - 1) {
	+ x->data[valuelen++] = c;
	+ } else {
	+ x->data[valuelen] = '\0';
	+ if (x->xmlattr)
	+ x->xmlattr(x, x->tag, …
	+ x->data[0] = c;
	+ valuelen = 1;
	+ }
	+ }
	+ if (c == endsep) {
	+ x->data[valuelen] = '\0';
	+ if (x->xmlattr)
	+ x->xmlattr(x, x->tag, x->tagle…
	+ if (x->xmlattrend)
	+ x->xmlattrend(x, x->tag, x->ta…
	+ break;
	+ }
	+ }
	+ namelen = endname = 0;
	+ } else if (namelen < sizeof(x->name) - 1) {
	+ x->name[namelen++] = c;
	+ }
	+ if (c == '>') {
	+ break;
	+ } else if (c == '/') {
	+ x->isshorttag = 1;
	+ namelen = 0;
	+ x->name[0] = '\0';
	+ }
	+ }
	+}
	+
	+static void
	+xml_parsecomment(XMLParser *x)
	+{
	+ size_t datalen = 0, i = 0;
	+ int c;
	+
	+ if (x->xmlcommentstart)
	+ x->xmlcommentstart(x);
	+ while ((c = x->getnext()) != EOF) {
	+ if (c == '-' || c == '>') {
	+ if (x->xmlcomment) {
	+ x->data[datalen] = '\0';
	+ x->xmlcomment(x, x->data, datalen);
	+ datalen = 0;
	+ }
	+ }
	+
	+ if (c == '-') {
	+ if (++i > 2) {
	+ if (x->xmlcomment)
	+ for (; i > 2; i--)
	+ x->xmlcomment(x, "-", 1);
	+ i = 2;
	+ }
	+ continue;
	+ } else if (c == '>' && i == 2) {
	+ if (x->xmlcommentend)
	+ x->xmlcommentend(x);
	+ return;
	+ } else if (i) {
	+ if (x->xmlcomment) {
	+ for (; i > 0; i--)
	+ x->xmlcomment(x, "-", 1);
	+ }
	+ i = 0;
	+ }
	+
	+ if (datalen < sizeof(x->data) - 1) {
	+ x->data[datalen++] = c;
	+ } else {
	+ x->data[datalen] = '\0';
	+ if (x->xmlcomment)
	+ x->xmlcomment(x, x->data, datalen);
	+ x->data[0] = c;
	+ datalen = 1;
	+ }
	+ }
	+}
	+
	+static void
	+xml_parsecdata(XMLParser *x)
	+{
	+ size_t datalen = 0, i = 0;
	+ int c;
	+
	+ if (x->xmlcdatastart)
	+ x->xmlcdatastart(x);
	+ while ((c = x->getnext()) != EOF) {
	+ if (c == ']' || c == '>') {
	+ if (x->xmlcdata) {
	+ x->data[datalen] = '\0';
	+ x->xmlcdata(x, x->data, datalen);
	+ datalen = 0;
	+ }
	+ }
	+
	+ if (c == ']') {
	+ if (++i > 2) {
	+ if (x->xmlcdata)
	+ for (; i > 2; i--)
	+ x->xmlcdata(x, "]", 1);
	+ i = 2;
	+ }
	+ continue;
	+ } else if (c == '>' && i == 2) {
	+ if (x->xmlcdataend)
	+ x->xmlcdataend(x);
	+ return;
	+ } else if (i) {
	+ if (x->xmlcdata)
	+ for (; i > 0; i--)
	+ x->xmlcdata(x, "]", 1);
	+ i = 0;
	+ }
	+
	+ if (datalen < sizeof(x->data) - 1) {
	+ x->data[datalen++] = c;
	+ } else {
	+ x->data[datalen] = '\0';
	+ if (x->xmlcdata)
	+ x->xmlcdata(x, x->data, datalen);
	+ x->data[0] = c;
	+ datalen = 1;
	+ }
	+ }
	+}
	+
	+int
	+xml_codepointtoutf8(uint32_t cp, uint32_t *utf)
	+{
	+ if (cp >= 0x10000) {
	+ /* 4 bytes */
	+ *utf = 0xf0808080 | ((cp & 0xfc0000) << 6) |
	+ ((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) |
	+ (cp & 0x3f);
	+ return 4;
	+ } else if (cp >= 0x00800) {
	+ /* 3 bytes */
	+ *utf = 0xe08080 |
	+ ((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) |
	+ (cp & 0x3f);
	+ return 3;
	+ } else if (cp >= 0x80) {
	+ /* 2 bytes */
	+ *utf = 0xc080 |
	+ ((cp & 0xfc0) << 2) | (cp & 0x3f);
	+ return 2;
	+ }
	+ *utf = cp & 0xff;
	+ return *utf ? 1 : 0; /* 1 byte */
	+}
	+
	+ssize_t
	+xml_namedentitytostr(const char *e, char *buf, size_t bufsiz)
	+{
	+ static const struct {
	+ char *entity;
	+ int c;
	+ } entities[] = {
	+ { .entity = "&amp;", .c = '&' },
	+ { .entity = "&lt;", .c = '<' },
	+ { .entity = "&gt;", .c = '>' },
	+ { .entity = "&apos;", .c = '\'' },
	+ { .entity = "&quot;", .c = '"' },
	+ { .entity = "&AMP;", .c = '&' },
	+ { .entity = "&LT;", .c = '<' },
	+ { .entity = "&GT;", .c = '>' },
	+ { .entity = "&APOS;", .c = '\'' },
	+ { .entity = "&QUOT;", .c = '"' }
	+ };
	+ size_t i;
	+
	+ /* buffer is too small */
	+ if (bufsiz < 2)
	+ return -1;
	+
	+ /* doesn't start with &: can't match */
	+ if (*e != '&')
	+ return 0;
	+
	+ for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
	+ if (!strcmp(e, entities[i].entity)) {
	+ buf[0] = entities[i].c;
	+ buf[1] = '\0';
	+ return 1;
	+ }
	+ }
	+ return 0;
	+}
	+
	+ssize_t
	+xml_numericentitytostr(const char *e, char *buf, size_t bufsiz)
	+{
	+ uint32_t l = 0, cp = 0;
	+ size_t b, len;
	+ char *end;
	+
	+ /* buffer is too small */
	+ if (bufsiz < 5)
	+ return -1;
	+
	+ /* not a numeric entity */
	+ if (e[0] != '&' || e[1] != '#')
	+ return 0;
	+
	+ /* e[1] == '#', numeric / hexadecimal entity */
	+ e += 2; /* skip "&#" */
	+ errno = 0;
	+ /* hex (16) or decimal (10) */
	+ if (*e == 'x')
	+ l = strtoul(e + 1, &end, 16);
	+ else
	+ l = strtoul(e, &end, 10);
	+ /* invalid value or not a well-formed entity */
	+ if (errno || *end != ';')
	+ return 0;
	+ len = xml_codepointtoutf8(l, &cp);
	+ /* make string */
	+ for (b = 0; b < len; b++)
	+ buf[b] = (cp >> (8 * (len - 1 - b))) & 0xff;
	+ buf[len] = '\0';
	+
	+ return (ssize_t)len;
	+}
	+
	+/* convert named- or numeric entity string to buffer string
	+ * returns byte-length of string. */
	+ssize_t
	+xml_entitytostr(const char *e, char *buf, size_t bufsiz)
	+{
	+ /* buffer is too small */
	+ if (bufsiz < 5)
	+ return -1;
	+ /* doesn't start with & */
	+ if (e[0] != '&')
	+ return 0;
	+ /* named entity */
	+ if (e[1] != '#')
	+ return xml_namedentitytostr(e, buf, bufsiz);
	+ else /* numeric entity */
	+ return xml_numericentitytostr(e, buf, bufsiz);
	+}
	+
	+void
	+xml_parse(XMLParser *x)
	+{
	+ int c, ispi;
	+ size_t datalen, tagdatalen, taglen;
	+
	+ if (!x->getnext)
	+ return;
	+ while ((c = x->getnext()) != EOF && c != '<')
	+ ; /* skip until < */
	+
	+ while (c != EOF) {
	+ if (c == '<') { /* parse tag */
	+ if ((c = x->getnext()) == EOF)
	+ return;
	+ x->tag[0] = '\0';
	+ x->taglen = 0;
	+ if (c == '!') { /* cdata and comments */
	+ for (tagdatalen = 0; (c = x->getnext()) != EOF…
	+ if (tagdatalen <= sizeof("[CDATA[") - …
	+ x->data[tagdatalen++] = c; /* …
	+ if (c == '>')
	+ break;
	+ else if (c == '-' && tagdatalen == siz…
	+ (x->data[0] == '-')) {
	+ xml_parsecomment(x);
	+ break;
	+ } else if (c == '[') {
	+ if (tagdatalen == sizeof("[CDA…
	+ !strncmp(x->data, "[CDATA[…
	+ xml_parsecdata(x);
	+ break;
	+ }
	+ }
	+ }
	+ } else {
	+ /* normal tag (open, short open, close), proce…
	+ if (isspace(c))
	+ while ((c = x->getnext()) != EOF && is…
	+ ;
	+ if (c == EOF)
	+ return;
	+ x->tag[0] = c;
	+ ispi = (c == '?') ? 1 : 0;
	+ x->isshorttag = ispi;
	+ taglen = 1;
	+ while ((c = x->getnext()) != EOF) {
	+ if (c == '/') /* TODO: simplify short …
	+ x->isshorttag = 1; /* short ta…
	+ else if (c == '>' || isspace(c)) {
	+ x->tag[taglen] = '\0';
	+ if (x->tag[0] == '/') { /* end…
	+ x->taglen = --taglen; …
	+ if (taglen && x->xmlta…
	+ x->xmltagend(x…
	+ } else {
	+ x->taglen = taglen;
	+ /* start tag */
	+ if (x->xmltagstart)
	+ x->xmltagstart…
	+ if (isspace(c))
	+ xml_parseattrs…
	+ if (x->xmltagstartpars…
	+ x->xmltagstart…
	+ }
	+ /* call tagend for shortform o…
	+ if ((x->isshorttag || ispi) &&…
	+ x->xmltagend(x, x->tag…
	+ break;
	+ } else if (taglen < sizeof(x->tag) - 1)
	+ x->tag[taglen++] = c;
	+ }
	+ }
	+ } else {
	+ /* parse tag data */
	+ datalen = 0;
	+ if (x->xmldatastart)
	+ x->xmldatastart(x);
	+ while ((c = x->getnext()) != EOF) {
	+ if (c == '&') {
	+ if (datalen) {
	+ x->data[datalen] = '\0';
	+ if (x->xmldata)
	+ x->xmldata(x, x->data,…
	+ }
	+ x->data[0] = c;
	+ datalen = 1;
	+ while ((c = x->getnext()) != EOF) {
	+ if (c == '<')
	+ break;
	+ if (datalen < sizeof(x->data) …
	+ x->data[datalen++] = c;
	+ if (isspace(c))
	+ break;
	+ else if (c == ';') {
	+ x->data[datalen] = '\0…
	+ if (x->xmldataentity)
	+ x->xmldataenti…
	+ datalen = 0;
	+ break;
	+ }
	+ }
	+ } else if (c != '<') {
	+ if (datalen < sizeof(x->data) - 1) {
	+ x->data[datalen++] = c;
	+ } else {
	+ x->data[datalen] = '\0';
	+ if (x->xmldata)
	+ x->xmldata(x, x->data,…
	+ x->data[0] = c;
	+ datalen = 1;
	+ }
	+ }
	+ if (c == '<') {
	+ x->data[datalen] = '\0';
	+ if (x->xmldata && datalen)
	+ x->xmldata(x, x->data, datalen…
	+ if (x->xmldataend)
	+ x->xmldataend(x);
	+ break;
	+ }
	+ }
	+ }
	+ }
	+}
	diff --git a/xml.h b/xml.h
	@@ -0,0 +1,44 @@
	+typedef struct xmlparser {
	+ /* handlers */
	+ void (*xmlattr)(struct xmlparser *, const char *, size_t,
	+ const char *, size_t, const char *, size_t);
	+ void (*xmlattrend)(struct xmlparser *, const char *, size_t,
	+ const char *, size_t);
	+ void (*xmlattrstart)(struct xmlparser *, const char *, size_t,
	+ const char *, size_t);
	+ void (*xmlattrentity)(struct xmlparser *, const char *, size_t,
	+ const char *, size_t, const char *, size_t);
	+ void (*xmlcdatastart)(struct xmlparser *);
	+ void (*xmlcdata)(struct xmlparser *, const char *, size_t);
	+ void (*xmlcdataend)(struct xmlparser *);
	+ void (*xmlcommentstart)(struct xmlparser *);
	+ void (*xmlcomment)(struct xmlparser *, const char *, size_t);
	+ void (*xmlcommentend)(struct xmlparser *);
	+ void (*xmldata)(struct xmlparser *, const char *, size_t);
	+ void (*xmldataend)(struct xmlparser *);
	+ void (*xmldataentity)(struct xmlparser *, const char *, size_t);
	+ void (*xmldatastart)(struct xmlparser *);
	+ void (*xmltagend)(struct xmlparser *, const char *, size_t, int);
	+ void (*xmltagstart)(struct xmlparser *, const char *, size_t);
	+ void (*xmltagstartparsed)(struct xmlparser *, const char *,
	+ size_t, int);
	+
	+ int (*getnext)(void);
	+
	+ /* current tag */
	+ char tag[1024];
	+ size_t taglen;
	+ /* current tag is in short form ? <tag /> */
	+ int isshorttag;
	+ /* current attribute name */
	+ char name[256];
	+ /* data buffer used for tag data, cdata and attribute data */
	+ char data[BUFSIZ];
	+} XMLParser;
	+
	+int xml_codepointtoutf8(uint32_t, uint32_t *);
	+ssize_t xml_entitytostr(const char *, char *, size_t);
	+ssize_t xml_namedentitytostr(const char *, char *, size_t);
	+ssize_t xml_numericentitytostr(const char *, char *, size_t);
	+
	+void xml_parse(XMLParser *);