Introduction
Introduction Statistics Contact Development Disclaimer Help
initial insertion - grabtitle - stupid HTML title grabber
git clone git://git.codemadness.org/grabtitle
Log
Files
Refs
README
LICENSE
---
commit 5c21827b86be877d3d5df7f7a9b810822e4f8e22
Author: Hiltjo Posthuma <[email protected]>
Date: Sat, 31 Mar 2018 12:59:22 +0200
initial insertion
Diffstat:
A .gitignore | 3 +++
A Makefile | 5 +++++
A run.sh | 10 ++++++++++
A title.c | 86 ++++++++++++++++++++++++++++++
A xml.c | 446 ++++++++++++++++++++++++++++++
A xml.h | 44 +++++++++++++++++++++++++++++…
6 files changed, 594 insertions(+), 0 deletions(-)
---
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+title
+*.o
+*.core
diff --git a/Makefile b/Makefile
@@ -0,0 +1,5 @@
+# Compile both C sources in one cc invocation into the "title" binary.
+build:
+ cc xml.c title.c -o title
+
+# Remove the binary produced by the build target.
+clean:
+ rm -f title
diff --git a/run.sh b/run.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+
+url="http://codemadness.org"
+curl \
+ -s \
+ -L --max-redirs 3 \
+ -m 5 \
+ -H 'User-Agent:' \
+ "$url" | \
+ ./title
diff --git a/title.c b/title.c
@@ -0,0 +1,86 @@
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+
+#include "xml.h"
+
+/* With USE_PLEDGE defined, pledge() is taken from <unistd.h>; otherwise the
+ * call is replaced by a macro that always evaluates to 0. */
+#ifdef USE_PLEDGE
+#include <unistd.h>
+#else
+#define pledge(a,b) 0
+#endif
+
+/* parser instance passed to xml_parse() by main() */
+static XMLParser parser;
+/* istitle: set to 1 by xmltagstart() when a "title" tag starts;
+ * titlelen: number of bytes currently stored in title */
+static int istitle, titlelen;
+/* collected title text; the data callbacks NUL-terminate it after each append */
+static char title[4096];
+
+/* Start-tag callback: remember that a "title" element has been opened.
+ * The name is matched case-insensitively and only when it is 5 bytes long. */
+static void
+xmltagstart(XMLParser *p, const char *t, size_t tl)
+{
+	if (tl != 5)
+		return;
+	if (strcasecmp(t, "title") == 0)
+		istitle = 1;
+}
+
+/* End-tag callback: once the closing "title" tag arrives after a title was
+ * opened, print the collected text and terminate the process with status 0. */
+static void
+xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
+{
+	if (!istitle || tl != 5)
+		return;
+	if (strcasecmp(t, "title") != 0)
+		return;
+
+	puts(title);
+	exit(0);
+}
+
+/* Text callback, registered for both plain data and CDATA.
+ * While inside the title, append the chunk to the title buffer and keep the
+ * buffer NUL-terminated. A chunk that does not fit is dropped as a whole. */
+static void
+xmldata(XMLParser *p, const char *d, size_t dl)
+{
+	if (istitle && titlelen + dl + 1 < sizeof(title)) {
+		memcpy(title + titlelen, d, dl);
+		titlelen += dl;
+		title[titlelen] = '\0';
+	}
+}
+
+/* Entity callback: append an entity found in text data to the title.
+ *
+ * d/dl hold the raw entity text. If xml_entitytostr() can convert it, the
+ * converted bytes are appended; otherwise the raw entity text is appended
+ * unchanged. Like xmldata(), nothing is stored unless a title is open, so
+ * entities elsewhere in the document cannot leak into the title, and an
+ * entity that does not fit in the remaining buffer space is dropped. */
+static void
+xmldataentity(XMLParser *p, const char *d, size_t dl)
+{
+	char buf[16];
+	ssize_t len;
+
+	if (!istitle)
+		return;
+
+	/* only a positive length means buf holds a converted string;
+	 * 0 (no conversion) and -1 (buffer too small) keep the raw text */
+	len = xml_entitytostr(d, buf, sizeof(buf));
+	if (len > 0) {
+		d = buf;
+		dl = (size_t)len;
+	}
+
+	if (titlelen + dl + 1 >= sizeof(title))
+		return;
+	memcpy(title + titlelen, d, dl);
+	titlelen += dl;
+	title[titlelen] = '\0';
+}
+
+/* Entry point: restrict the process with pledge("stdio"), register the
+ * callbacks and parse standard input.
+ * Returns 2 when pledge() fails. Returns 1 when xml_parse() comes back,
+ * because a successfully reported title exits with status 0 from within
+ * xmltagend() instead. */
+int
+main(int argc, char *argv[])
+{
+	XMLParser *x = &parser;
+
+	if (pledge("stdio", NULL) == -1) {
+		fprintf(stderr, "pledge: %s\n", strerror(errno));
+		return 2;
+	}
+
+	/* input is read one character at a time from stdin */
+	x->getnext = getchar;
+
+	/* plain data and CDATA share one handler */
+	x->xmldata = xmldata;
+	x->xmlcdata = xmldata;
+	x->xmldataentity = xmldataentity;
+	x->xmltagstart = xmltagstart;
+	x->xmltagend = xmltagend;
+
+	xml_parse(x);
+
+	return 1;
+}
diff --git a/xml.c b/xml.c
@@ -0,0 +1,446 @@
+#include <sys/types.h>
+
+#include <ctype.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "xml.h"
+
+/* Parse the attributes of the current tag.
+ * Reads characters through x->getnext() until EOF or '>' and reports each
+ * attribute through the optional xmlattrstart / xmlattr / xmlattrentity /
+ * xmlattrend handlers. The attribute name is collected in x->name and the
+ * value in x->data. A '/' sets x->isshorttag.
+ * NOTE(review): several lines of this listing are cut off ("…"), so the
+ * exact callback arguments and parts of some conditions are not visible. */
+static void
+xml_parseattrs(XMLParser *x)
+{
+ size_t namelen = 0, valuelen;
+ int c, endsep, endname = 0;
+
+ while ((c = x->getnext()) != EOF) {
+ /* whitespace after at least one name character marks the end of the name */
+ if (isspace(c)) { /* TODO: simplify endname ? */
+ if (namelen)
+ endname = 1;
+ continue;
+ }
+ if (c == '?')
+ ; /* ignore */
+ else if (c == '=') {
+ x->name[namelen] = '\0';
+ } else if (namelen && ((endname && isalpha(c)) || (c == '>' ||…
+ /* attribute without value */
+ x->name[namelen] = '\0';
+ if (x->xmlattrstart)
+ x->xmlattrstart(x, x->tag, x->taglen, x->name,…
+ if (x->xmlattr)
+ x->xmlattr(x, x->tag, x->taglen, x->name, name…
+ if (x->xmlattrend)
+ x->xmlattrend(x, x->tag, x->taglen, x->name, n…
+ endname = 0;
+ /* the current character becomes the first byte of the name buffer */
+ x->name[0] = c;
+ namelen = 1;
+ } else if (namelen && (c == '\'' || c == '"')) {
+ /* attribute with value */
+ endsep = c; /* c is end separator */
+ if (x->xmlattrstart)
+ x->xmlattrstart(x, x->tag, x->taglen, x->name,…
+ for (valuelen = 0; (c = x->getnext()) != EOF;) {
+ if (c == '&') { /* entities */
+ x->data[valuelen] = '\0';
+ /* call data function with data before…
+ if (valuelen && x->xmlattr)
+ x->xmlattr(x, x->tag, x->tagle…
+ x->data[0] = c;
+ valuelen = 1;
+ while ((c = x->getnext()) != EOF) {
+ if (c == endsep)
+ break;
+ if (valuelen < sizeof(x->data)…
+ x->data[valuelen++] = …
+ else {
+ /* TODO: entity too lo…
+ x->data[valuelen] = '\…
+ if (x->xmlattr)
+ x->xmlattr(x, …
+ valuelen = 0;
+ break;
+ }
+ if (c == ';') {
+ x->data[valuelen] = '\…
+ if (x->xmlattrentity)
+ x->xmlattrenti…
+ valuelen = 0;
+ break;
+ }
+ }
+ } else if (c != endsep) {
+ if (valuelen < sizeof(x->data) - 1) {
+ x->data[valuelen++] = c;
+ } else {
+ x->data[valuelen] = '\0';
+ if (x->xmlattr)
+ x->xmlattr(x, x->tag, …
+ x->data[0] = c;
+ valuelen = 1;
+ }
+ }
+ if (c == endsep) {
+ x->data[valuelen] = '\0';
+ if (x->xmlattr)
+ x->xmlattr(x, x->tag, x->tagle…
+ if (x->xmlattrend)
+ x->xmlattrend(x, x->tag, x->ta…
+ break;
+ }
+ }
+ namelen = endname = 0;
+ } else if (namelen < sizeof(x->name) - 1) {
+ x->name[namelen++] = c;
+ }
+ /* '>' ends the attribute list; '/' flags the short form and resets the name */
+ if (c == '>') {
+ break;
+ } else if (c == '/') {
+ x->isshorttag = 1;
+ namelen = 0;
+ x->name[0] = '\0';
+ }
+ }
+}
+
+/* Parse the body of a comment up to and including the closing "-->".
+ * Calls xmlcommentstart once, streams the text through xmlcomment in
+ * chunks (x->data is the staging buffer) and calls xmlcommentend when the
+ * terminator is seen. Dashes are held back until it is known whether they
+ * belong to the terminator. Returns silently on EOF. */
+static void
+xml_parsecomment(XMLParser *x)
+{
+	size_t len = 0, dashes = 0;
+	int ch;
+
+	if (x->xmlcommentstart)
+		x->xmlcommentstart(x);
+
+	for (;;) {
+		ch = x->getnext();
+		if (ch == EOF)
+			return;
+
+		/* hand over the buffered text before a possible terminator */
+		if ((ch == '-' || ch == '>') && x->xmlcomment) {
+			x->data[len] = '\0';
+			x->xmlcomment(x, x->data, len);
+			len = 0;
+		}
+
+		if (ch == '-') {
+			/* keep at most two dashes pending, emit the surplus */
+			if (++dashes > 2) {
+				if (x->xmlcomment) {
+					while (dashes > 2) {
+						x->xmlcomment(x, "-", 1);
+						dashes--;
+					}
+				}
+				dashes = 2;
+			}
+			continue;
+		}
+		if (ch == '>' && dashes == 2) {
+			if (x->xmlcommentend)
+				x->xmlcommentend(x);
+			return;
+		}
+		if (dashes) {
+			/* the pending dashes were ordinary text after all */
+			if (x->xmlcomment) {
+				while (dashes > 0) {
+					x->xmlcomment(x, "-", 1);
+					dashes--;
+				}
+			}
+			dashes = 0;
+		}
+
+		if (len < sizeof(x->data) - 1) {
+			x->data[len++] = ch;
+		} else {
+			/* buffer full: flush it and start over with this byte */
+			x->data[len] = '\0';
+			if (x->xmlcomment)
+				x->xmlcomment(x, x->data, len);
+			x->data[0] = ch;
+			len = 1;
+		}
+	}
+}
+
+/* Parse the body of a CDATA section up to and including the closing "]]>".
+ * Calls xmlcdatastart once, streams the text through xmlcdata in chunks
+ * (x->data is the staging buffer) and calls xmlcdataend when the
+ * terminator is seen. Closing brackets are held back until it is known
+ * whether they belong to the terminator. Returns silently on EOF. */
+static void
+xml_parsecdata(XMLParser *x)
+{
+	size_t len = 0, brackets = 0;
+	int ch;
+
+	if (x->xmlcdatastart)
+		x->xmlcdatastart(x);
+
+	for (;;) {
+		ch = x->getnext();
+		if (ch == EOF)
+			return;
+
+		/* hand over the buffered text before a possible terminator */
+		if ((ch == ']' || ch == '>') && x->xmlcdata) {
+			x->data[len] = '\0';
+			x->xmlcdata(x, x->data, len);
+			len = 0;
+		}
+
+		if (ch == ']') {
+			/* keep at most two brackets pending, emit the surplus */
+			if (++brackets > 2) {
+				if (x->xmlcdata) {
+					while (brackets > 2) {
+						x->xmlcdata(x, "]", 1);
+						brackets--;
+					}
+				}
+				brackets = 2;
+			}
+			continue;
+		}
+		if (ch == '>' && brackets == 2) {
+			if (x->xmlcdataend)
+				x->xmlcdataend(x);
+			return;
+		}
+		if (brackets) {
+			/* the pending brackets were ordinary text after all */
+			if (x->xmlcdata) {
+				while (brackets > 0) {
+					x->xmlcdata(x, "]", 1);
+					brackets--;
+				}
+			}
+			brackets = 0;
+		}
+
+		if (len < sizeof(x->data) - 1) {
+			x->data[len++] = ch;
+		} else {
+			/* buffer full: flush it and start over with this byte */
+			x->data[len] = '\0';
+			if (x->xmlcdata)
+				x->xmlcdata(x, x->data, len);
+			x->data[0] = ch;
+			len = 1;
+		}
+	}
+}
+
+/* Encode the Unicode code point cp as UTF-8.
+ *
+ * The encoded bytes are packed big-endian into *utf: the first byte of the
+ * sequence is the most significant of the used bytes, the last byte is the
+ * least significant one.
+ * Returns the number of bytes in the sequence (1-4). Returns 0 and sets
+ * *utf to 0 for code point 0 and for values that have no UTF-8 encoding:
+ * anything above U+10FFFF and the surrogate range U+D800..U+DFFF. */
+int
+xml_codepointtoutf8(uint32_t cp, uint32_t *utf)
+{
+	/* reject values that cannot be encoded instead of emitting invalid
+	 * lead bytes */
+	if (cp > 0x10ffff || (cp >= 0xd800 && cp <= 0xdfff)) {
+		*utf = 0;
+		return 0;
+	}
+
+	if (cp >= 0x10000) {
+		/* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+		*utf = 0xf0808080 | ((cp & 0x1c0000) << 6) |
+		    ((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) |
+		    (cp & 0x3f);
+		return 4;
+	} else if (cp >= 0x00800) {
+		/* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
+		*utf = 0xe08080 |
+		    ((cp & 0xf000) << 4) | ((cp & 0xfc0) << 2) |
+		    (cp & 0x3f);
+		return 3;
+	} else if (cp >= 0x80) {
+		/* 2 bytes: 110xxxxx 10xxxxxx */
+		*utf = 0xc080 |
+		    ((cp & 0x7c0) << 2) | (cp & 0x3f);
+		return 2;
+	}
+	/* 1 byte (ASCII); code point 0 yields length 0 */
+	*utf = cp;
+	return *utf ? 1 : 0;
+}
+
+/* Translate one of the five predefined named entities (all-lowercase or
+ * all-uppercase spelling, including the leading '&' and trailing ';')
+ * into its single character.
+ * Returns 1 and stores the NUL-terminated character in buf on a match,
+ * 0 when e is not a known entity, -1 when bufsiz is smaller than 2. */
+ssize_t
+xml_namedentitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	static const struct {
+		const char *entity;
+		int c;
+	} entities[] = {
+		{ "&amp;",  '&'  },
+		{ "&lt;",   '<'  },
+		{ "&gt;",   '>'  },
+		{ "&apos;", '\'' },
+		{ "&quot;", '"'  },
+		{ "&AMP;",  '&'  },
+		{ "&LT;",   '<'  },
+		{ "&GT;",   '>'  },
+		{ "&APOS;", '\'' },
+		{ "&QUOT;", '"'  }
+	};
+	const size_t count = sizeof(entities) / sizeof(entities[0]);
+	size_t idx;
+
+	/* room is needed for one character plus the terminator */
+	if (bufsiz < 2)
+		return -1;
+
+	/* every table entry starts with '&' */
+	if (e[0] != '&')
+		return 0;
+
+	for (idx = 0; idx < count; idx++) {
+		if (strcmp(e, entities[idx].entity) != 0)
+			continue;
+		buf[0] = entities[idx].c;
+		buf[1] = '\0';
+		return 1;
+	}
+	return 0;
+}
+
+/* Convert a numeric character reference ("&#NNN;" decimal or "&#xHHH;"
+ * hexadecimal) into its UTF-8 byte sequence.
+ *
+ * Returns the number of bytes stored in buf (NUL-terminated) on success,
+ * 0 when e is not a well-formed numeric entity or its value is outside
+ * the Unicode range, and -1 when bufsiz is smaller than 5 (four UTF-8
+ * bytes plus the terminator). */
+ssize_t
+xml_numericentitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	unsigned long l;
+	uint32_t cp = 0;
+	size_t b, len;
+	char *end;
+	int base = 10;
+
+	/* buffer is too small */
+	if (bufsiz < 5)
+		return -1;
+
+	/* not a numeric entity */
+	if (e[0] != '&' || e[1] != '#')
+		return 0;
+
+	e += 2; /* skip "&#" */
+	if (*e == 'x') {
+		base = 16;
+		e++;
+	}
+
+	/* strtoul() would skip leading whitespace and accept a sign, so
+	 * require the number to start with a digit ("&#-1;" is invalid) */
+	if (base == 16 ? !isxdigit((unsigned char)*e) :
+	    !isdigit((unsigned char)*e))
+		return 0;
+
+	errno = 0;
+	l = strtoul(e, &end, base);
+	/* invalid value, not terminated by ';', or beyond the last code
+	 * point; checked before narrowing so large values cannot wrap
+	 * around into a valid one */
+	if (errno || *end != ';' || l > 0x10ffff)
+		return 0;
+
+	len = xml_codepointtoutf8((uint32_t)l, &cp);
+	/* cp holds the sequence packed big-endian: unpack it into buf */
+	for (b = 0; b < len; b++)
+		buf[b] = (cp >> (8 * (len - 1 - b))) & 0xff;
+	buf[len] = '\0';
+
+	return (ssize_t)len;
+}
+
+/* Convert an entity string (named or numeric) into buf.
+ * Dispatches on the character after '&': '#' selects the numeric
+ * converter, anything else the named one.
+ * Returns the byte length of the converted string, 0 when e does not start
+ * with '&' or could not be converted, and -1 when bufsiz is smaller than 5. */
+ssize_t
+xml_entitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	if (bufsiz < 5)
+		return -1;
+	if (e[0] != '&')
+		return 0;
+
+	return (e[1] == '#')
+	    ? xml_numericentitytostr(e, buf, bufsiz)
+	    : xml_namedentitytostr(e, buf, bufsiz);
+}
+
+/* Main parse loop: read characters through x->getnext() until EOF and
+ * report tags, attributes, comments, CDATA and text data through the
+ * handlers stored in x. Does nothing when x->getnext is not set.
+ * Tag names are collected in x->tag, text data in x->data.
+ * NOTE(review): many lines of this listing are cut off ("…"), so some
+ * conditions and all callback argument lists are only partly visible. */
+void
+xml_parse(XMLParser *x)
+{
+ int c, ispi;
+ size_t datalen, tagdatalen, taglen;
+
+ if (!x->getnext)
+ return;
+ while ((c = x->getnext()) != EOF && c != '<')
+ ; /* skip until < */
+
+ /* each iteration handles either one tag (c == '<') or one run of text data */
+ while (c != EOF) {
+ if (c == '<') { /* parse tag */
+ if ((c = x->getnext()) == EOF)
+ return;
+ x->tag[0] = '\0';
+ x->taglen = 0;
+ if (c == '!') { /* cdata and comments */
+ for (tagdatalen = 0; (c = x->getnext()) != EOF…
+ if (tagdatalen <= sizeof("[CDATA[") - …
+ x->data[tagdatalen++] = c; /* …
+ if (c == '>')
+ break;
+ else if (c == '-' && tagdatalen == siz…
+ (x->data[0] == '-')) {
+ xml_parsecomment(x);
+ break;
+ } else if (c == '[') {
+ if (tagdatalen == sizeof("[CDA…
+ !strncmp(x->data, "[CDATA[…
+ xml_parsecdata(x);
+ break;
+ }
+ }
+ }
+ } else {
+ /* normal tag (open, short open, close), proce…
+ if (isspace(c))
+ while ((c = x->getnext()) != EOF && is…
+ ;
+ if (c == EOF)
+ return;
+ x->tag[0] = c;
+ ispi = (c == '?') ? 1 : 0;
+ x->isshorttag = ispi;
+ taglen = 1;
+ while ((c = x->getnext()) != EOF) {
+ if (c == '/') /* TODO: simplify short …
+ x->isshorttag = 1; /* short ta…
+ else if (c == '>' || isspace(c)) {
+ x->tag[taglen] = '\0';
+ if (x->tag[0] == '/') { /* end…
+ x->taglen = --taglen; …
+ if (taglen && x->xmlta…
+ x->xmltagend(x…
+ } else {
+ x->taglen = taglen;
+ /* start tag */
+ if (x->xmltagstart)
+ x->xmltagstart…
+ if (isspace(c))
+ xml_parseattrs…
+ if (x->xmltagstartpars…
+ x->xmltagstart…
+ }
+ /* call tagend for shortform o…
+ if ((x->isshorttag || ispi) &&…
+ x->xmltagend(x, x->tag…
+ break;
+ } else if (taglen < sizeof(x->tag) - 1)
+ x->tag[taglen++] = c;
+ }
+ }
+ } else {
+ /* parse tag data */
+ /* text is buffered in x->data and handed to xmldata when the buffer
+  * fills up, before an entity, and at the next '<' */
+ datalen = 0;
+ if (x->xmldatastart)
+ x->xmldatastart(x);
+ while ((c = x->getnext()) != EOF) {
+ if (c == '&') {
+ if (datalen) {
+ x->data[datalen] = '\0';
+ if (x->xmldata)
+ x->xmldata(x, x->data,…
+ }
+ x->data[0] = c;
+ datalen = 1;
+ while ((c = x->getnext()) != EOF) {
+ if (c == '<')
+ break;
+ if (datalen < sizeof(x->data) …
+ x->data[datalen++] = c;
+ if (isspace(c))
+ break;
+ else if (c == ';') {
+ x->data[datalen] = '\0…
+ if (x->xmldataentity)
+ x->xmldataenti…
+ datalen = 0;
+ break;
+ }
+ }
+ } else if (c != '<') {
+ if (datalen < sizeof(x->data) - 1) {
+ x->data[datalen++] = c;
+ } else {
+ x->data[datalen] = '\0';
+ if (x->xmldata)
+ x->xmldata(x, x->data,…
+ x->data[0] = c;
+ datalen = 1;
+ }
+ }
+ if (c == '<') {
+ x->data[datalen] = '\0';
+ if (x->xmldata && datalen)
+ x->xmldata(x, x->data, datalen…
+ if (x->xmldataend)
+ x->xmldataend(x);
+ break;
+ }
+ }
+ }
+ }
+}
diff --git a/xml.h b/xml.h
@@ -0,0 +1,44 @@
+/* Parser context: user-supplied callbacks, the input source and the
+ * scratch buffers the parser works in.
+ * NOTE(review): this header includes nothing itself, so size_t, uint32_t,
+ * ssize_t and BUFSIZ must already be declared by the including file. */
+typedef struct xmlparser {
+ /* handlers */
+ /* In the visible parser code each handler is tested for NULL before it
+  * is called, so unused ones can stay unset. The string arguments come in
+  * pointer/length pairs. */
+ void (*xmlattr)(struct xmlparser *, const char *, size_t,
+ const char *, size_t, const char *, size_t);
+ void (*xmlattrend)(struct xmlparser *, const char *, size_t,
+ const char *, size_t);
+ void (*xmlattrstart)(struct xmlparser *, const char *, size_t,
+ const char *, size_t);
+ void (*xmlattrentity)(struct xmlparser *, const char *, size_t,
+ const char *, size_t, const char *, size_t);
+ void (*xmlcdatastart)(struct xmlparser *);
+ void (*xmlcdata)(struct xmlparser *, const char *, size_t);
+ void (*xmlcdataend)(struct xmlparser *);
+ void (*xmlcommentstart)(struct xmlparser *);
+ void (*xmlcomment)(struct xmlparser *, const char *, size_t);
+ void (*xmlcommentend)(struct xmlparser *);
+ void (*xmldata)(struct xmlparser *, const char *, size_t);
+ void (*xmldataend)(struct xmlparser *);
+ void (*xmldataentity)(struct xmlparser *, const char *, size_t);
+ void (*xmldatastart)(struct xmlparser *);
+ void (*xmltagend)(struct xmlparser *, const char *, size_t, int);
+ void (*xmltagstart)(struct xmlparser *, const char *, size_t);
+ void (*xmltagstartparsed)(struct xmlparser *, const char *,
+ size_t, int);
+
+ /* input source: returns the next character, or EOF at end of input;
+  * xml_parse() returns immediately when this is not set */
+ int (*getnext)(void);
+
+ /* current tag */
+ char tag[1024];
+ size_t taglen;
+ /* current tag is in short form ? <tag /> */
+ int isshorttag;
+ /* current attribute name */
+ char name[256];
+ /* data buffer used for tag data, cdata and attribute data */
+ char data[BUFSIZ];
+} XMLParser;
+
+/* encode a code point as UTF-8, packed into the uint32_t; returns the byte count */
+int xml_codepointtoutf8(uint32_t, uint32_t *);
+/* entity string -> NUL-terminated string in the buffer (entity, buffer, buffer size);
+ * return the resulting byte length, 0 when nothing was converted, -1 when the
+ * buffer is too small */
+ssize_t xml_entitytostr(const char *, char *, size_t);
+ssize_t xml_namedentitytostr(const char *, char *, size_t);
+ssize_t xml_numericentitytostr(const char *, char *, size_t);
+
+/* run the parser: reads through getnext until EOF and invokes the handlers */
+void xml_parse(XMLParser *);
You are viewing proxied material from codemadness.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.