GopherProxy

	work-in-progress: support the new Twitter site - tscrape - twitter scraper
	git clone git://git.codemadness.org/tscrape
	Log
	Files
	Refs
	README
	LICENSE
	---
	commit c3e76b0f57c58b284cd13ce008c082525c8ee28a
	parent 663dab7d9883a291ed570a743fb89a16e1a01d85
	Author: Hiltjo Posthuma <[email protected]>
	Date: Fri, 5 Jun 2020 14:51:58 +0200

	work-in-progress: support the new Twitter site

	Scraping doesn't work anymore. Use the Twitter JSON API.

	Major thanks to leot for helping with this.

	Diffstat:
	M Makefile | 18 +++++++++---------
	M README | 28 ++++++++++++----------------
	A json.c | 313 +++++++++++++++++++++++++++++…
	A json.h | 26 ++++++++++++++++++++++++++
	M tscrape.c | 591 +++++++++++++++++++----------…
	M tscrape_plain.c | 2 +-
	M tscrape_update | 31 +++++++++++++++++++++++++++++…
	M tscraperc.example | 8 ++++----
	M util.c | 40 -----------------------------…
	M util.h | 2 --
	D xml.c | 451 -----------------------------…
	D xml.h | 49 -----------------------------…

	12 files changed, 755 insertions(+), 804 deletions(-)
	---
	diff --git a/Makefile b/Makefile
	@@ -25,17 +25,17 @@ SCRIPTS = \
	SRC = ${BIN:=.c}
	HDR = \
	util.h\
	- xml.h
	+ json.h

	LIBUTIL = libutil.a
	LIBUTILSRC = \
	util.c
	LIBUTILOBJ = ${LIBUTILSRC:.c=.o}

	-LIBXML = libxml.a
	-LIBXMLSRC = \
	- xml.c
	-LIBXMLOBJ = ${LIBXMLSRC:.c=.o}
	+LIBJSON = libjson.a
	+LIBJSONSRC = \
	+ json.c
	+LIBJSONOBJ = ${LIBJSONSRC:.c=.o}

	COMPATSRC = \
	strlcat.c\
	@@ -44,7 +44,7 @@ COMPATOBJ =\
	strlcat.o\
	strlcpy.o

	-LIB = ${LIBUTIL} ${LIBXML} ${COMPATOBJ}
	+LIB = ${LIBUTIL} ${LIBJSON} ${COMPATOBJ}

	MAN1 = ${BIN:=.1}\
	${SCRIPTS:=.1}
	@@ -59,7 +59,7 @@ all: $(BIN)

	${BIN}: ${LIB} ${@:=.o}

	-OBJ = ${SRC:.c=.o} ${LIBXMLOBJ} ${LIBUTILOBJ} ${COMPATOBJ}
	+OBJ = ${SRC:.c=.o} ${LIBJSONOBJ} ${LIBUTILOBJ} ${COMPATOBJ}

	${OBJ}: ${HDR}

	@@ -73,7 +73,7 @@ ${LIBUTIL}: ${LIBUTILOBJ}
	${AR} rc $@ $?
	${RANLIB} $@

	-${LIBXML}: ${LIBXMLOBJ}
	+${LIBJSON}: ${LIBJSONOBJ}
	${AR} rc $@ $?
	${RANLIB} $@

	@@ -81,7 +81,7 @@ dist:
	rm -rf "${NAME}-${VERSION}"
	mkdir -p "${NAME}-${VERSION}"
	cp -f ${MAN1} ${MAN5} ${DOC} ${HDR} \
	- ${SRC} ${LIBXMLSRC} ${LIBUTILSRC} ${COMPATSRC} ${SCRIPTS} \
	+ ${SRC} ${LIBJSONSRC} ${LIBUTILSRC} ${COMPATSRC} ${SCRIPTS} \
	Makefile \
	tscraperc.example style.css \
	"${NAME}-${VERSION}"
	diff --git a/README b/README
	@@ -1,13 +1,16 @@
	tscrape
	-------

	-Twitter feed HTML scraper.
	+Twitter feed parser.

	-It scrapes HTML from stdin and outputs it to a TAB-separated format that can be
	-easier parsed with various (UNIX) tools. There are formatting programs included
	-to convert this TAB-separated format to various other formats. There are also
	-some programs and scripts included to import and export OPML and to fetch,
	-filter, merge and order items.
	+It parses JSON from stdin and outputs it to a TAB-separated format that can be
	+processed more easily with various (UNIX) tools. There are formatting programs
	+included to convert this TAB-separated format to various other formats. There
	+are also some programs and scripts included to import and export OPML and to
	+fetch, filter, merge and order items.
	+
	+The name tscrape is historical: it used to scrape the HTML from the Twitter
	+page. It now uses the contents of the JSON API instead.


	Build and install
	@@ -20,20 +23,13 @@ $ make
	Usage
	-----

	- curl -H 'User-Agent:' -s 'https://twitter.com/namehere' \| tscrape
	-
	-or
	-
	- ftp -o - -U '' 'https://twitter.com/namehere' 2>/dev/null \| tscrape
	-
	-or
	-
	- hurl 'https://twitter.com/namehere' \| tscrape
	+* Create a tscraperc configuration file in ~/.tscrape/tscraperc, see tscraperc…
	+* Run tscrape_update


	Using sfeed to convert the tscrape TSV output to an Atom feed:

	- hurl 'https://twitter.com/namehere' \| tscrape \| \
	+ tscrape < ~/.tscrape/feeds/name \| \
	awk 'BEGIN { OFS = FS = "\t"; }
	{
	print $1 OFS $4 OFS "https://twitter.com/" $6 "/status/" $5 \
	diff --git a/json.c b/json.c
	@@ -0,0 +1,313 @@
	+#include <ctype.h>
	+#include <errno.h>
	+#include <stdint.h>
	+#include <stdio.h>
	+#include <stdlib.h>
	+#include <string.h>
	+
	+#define GETNEXT getchar
	+
	+#include "json.h"
	+
	+static int
	+codepointtoutf8(long r, char *s)
	+{
	+ if (r == 0) {
	+ return 0; /* NUL byte */
	+ } else if (r <= 0x7F) {
	+ /* 1 byte: 0aaaaaaa */
	+ s[0] = r;
	+ return 1;
	+ } else if (r <= 0x07FF) {
	+ /* 2 bytes: 00000aaa aabbbbbb */
	+ s[0] = 0xC0 \| ((r & 0x0007C0) >> 6); /* 110aaaaa */
	+ s[1] = 0x80 \| (r & 0x00003F); /* 10bbbbbb */
	+ return 2;
	+ } else if (r <= 0xFFFF) {
	+ /* 3 bytes: aaaabbbb bbcccccc */
	+ s[0] = 0xE0 \| ((r & 0x00F000) >> 12); /* 1110aaaa */
	+ s[1] = 0x80 \| ((r & 0x000FC0) >> 6); /* 10bbbbbb */
	+ s[2] = 0x80 \| (r & 0x00003F); /* 10cccccc */
	+ return 3;
	+ } else {
	+ /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
	+ s[0] = 0xF0 \| ((r & 0x1C0000) >> 18); /* 11110aaa */
	+ s[1] = 0x80 \| ((r & 0x03F000) >> 12); /* 10bbbbbb */
	+ s[2] = 0x80 \| ((r & 0x000FC0) >> 6); /* 10cccccc */
	+ s[3] = 0x80 \| (r & 0x00003F); /* 10dddddd */
	+ return 4;
	+ }
	+}
	+
	+static int
	+hexdigit(int c)
	+{
	+ if (c >= '0' && c <= '9')
	+ return c - '0';
	+ else if (c >= 'a' && c <= 'f')
	+ return 10 + (c - 'a');
	+ else if (c >= 'A' && c <= 'F')
	+ return 10 + (c - 'A');
	+ return 0;
	+}
	+
	+static int
	+capacity(char *value, size_t sz, size_t cur, size_t inc)
	+{
	+ size_t need, newsiz;
	+ char *newp;
	+
	+ /* check for addition overflow */
	+ if (cur > SIZE_MAX - inc) {
	+ errno = EOVERFLOW;
	+ return -1;
	+ }
	+ need = cur + inc;
	+
	+ if (need > *sz) {
	+ if (need > SIZE_MAX / 2) {
	+ newsiz = SIZE_MAX;
	+ } else {
	+ for (newsiz = sz < 64 ? 64 : sz; newsiz <= need; new…
	+ ;
	+ }
	+ if (!(newp = realloc(*value, newsiz)))
	+ return -1; /* up to caller to free *value */
	+ *value = newp;
	+ *sz = newsiz;
	+ }
	+ return 0;
	+}
	+
	+#define EXPECT_VALUE "{[\"-0123456789tfn"
	+#define EXPECT_STRING "\""
	+#define EXPECT_END "}],"
	+#define EXPECT_OBJECT_STRING EXPECT_STRING "}"
	+#define EXPECT_OBJECT_KEY ":"
	+#define EXPECT_ARRAY_VALUE EXPECT_VALUE "]"
	+
	+#define JSON_INVALID() do { ret = JSON_ERROR_INVALID; goto end; } while …
	+
	+int
	+parsejson(void (cb)(struct json_node , size_t, const char *))
	+{
	+ struct json_node nodes[JSON_MAX_NODE_DEPTH] = { 0 };
	+ size_t depth = 0, p = 0, len, sz = 0;
	+ long cp, hi, lo;
	+ char pri[128], *str = NULL;
	+ int c, i, escape, iskey = 0, ret = JSON_ERROR_MEM;
	+ const char *expect = EXPECT_VALUE;
	+
	+ if (capacity(&(nodes[0].name), &(nodes[0].namesiz), 0, 1) == -1)
	+ goto end;
	+ nodes[0].name[0] = '\0';
	+
	+ while (1) {
	+ c = GETNEXT();
	+handlechr:
	+ if (c == EOF)
	+ break;
	+
	+ /* skip JSON white-space, (NOTE: no \v, \f, \b etc) */
	+ if (c == ' ' \|\| c == '\t' \|\| c == '\n' \|\| c == '\r')
	+ continue;
	+
	+ if (!c \|\| !strchr(expect, c))
	+ JSON_INVALID();
	+
	+ switch (c) {
	+ case ':':
	+ iskey = 0;
	+ expect = EXPECT_VALUE;
	+ break;
	+ case '"':
	+ nodes[depth].type = JSON_TYPE_STRING;
	+ escape = 0;
	+ len = 0;
	+ while (1) {
	+ c = GETNEXT();
	+chr:
	+ /* EOF or control char: 0x7f is not defined as…
	+ if (c < 0x20)
	+ JSON_INVALID();
	+
	+ if (escape) {
	+escchr:
	+ escape = 0;
	+ switch (c) {
	+ case '"': /* FALLTHROUGH */
	+ case '\\':
	+ case '/': break;
	+ case 'b': c = '\b'; break;
	+ case 'f': c = '\f'; break;
	+ case 'n': c = '\n'; break;
	+ case 'r': c = '\r'; break;
	+ case 't': c = '\t'; break;
	+ case 'u': /* hex hex hex hex */
	+ if (capacity(&str, &sz, len, 4…
	+ goto end;
	+ for (i = 12, cp = 0; i >= 0; i…
	+ if ((c = GETNEXT()) ==…
	+ JSON_INVALID()…
	+ cp \|= (hexdigit(c) << …
	+ }
	+ /* RFC8259 - 7. Strings - surr…
	+ * 0xd800 - 0xdb7f - high surr…
	+ if (cp >= 0xd800 && cp <= 0xdb…
	+ if ((c = GETNEXT()) !=…
	+ len += codepoi…
	+ goto chr;
	+ }
	+ if ((c = GETNEXT()) !=…
	+ len += codepoi…
	+ goto escchr;
	+ }
	+ for (hi = cp, i = 12, …
	+ if ((c = GETNE…
	+ JSON_I…
	+ lo \|= (hexdigi…
	+ }
	+ /* 0xdc00 - 0xdfff - l…
	+ if (lo >= 0xdc00 && lo…
	+ cp = (hi << 10…
	+ } else {
	+ /* handle grac…
	+ len += codepoi…
	+ if (capacity(&…
	+ goto e…
	+ len += codepoi…
	+ continue;
	+ }
	+ }
	+ len += codepointtoutf8(cp, &st…
	+ continue;
	+ default:
	+ JSON_INVALID(); /* invalid esc…
	+ }
	+ if (capacity(&str, &sz, len, 1) == -1)
	+ goto end;
	+ str[len++] = c;
	+ } else if (c == '\\') {
	+ escape = 1;
	+ } else if (c == '"') {
	+ if (capacity(&str, &sz, len, 1) == -1)
	+ goto end;
	+ str[len++] = '\0';
	+
	+ if (iskey) {
	+ /* copy string as key, includi…
	+ if (capacity(&(nodes[depth].na…
	+ goto end;
	+ memcpy(nodes[depth].name, str,…
	+ } else {
	+ cb(nodes, depth + 1, str);
	+ }
	+ break;
	+ } else {
	+ if (capacity(&str, &sz, len, 1) == -1)
	+ goto end;
	+ str[len++] = c;
	+ }
	+ }
	+ if (iskey)
	+ expect = EXPECT_OBJECT_KEY;
	+ else
	+ expect = EXPECT_END;
	+ break;
	+ case '[':
	+ case '{':
	+ if (depth + 1 >= JSON_MAX_NODE_DEPTH)
	+ JSON_INVALID(); /* too deep */
	+
	+ nodes[depth].index = 0;
	+ if (c == '[') {
	+ nodes[depth].type = JSON_TYPE_ARRAY;
	+ expect = EXPECT_ARRAY_VALUE;
	+ } else if (c == '{') {
	+ iskey = 1;
	+ nodes[depth].type = JSON_TYPE_OBJECT;
	+ expect = EXPECT_OBJECT_STRING;
	+ }
	+
	+ cb(nodes, depth + 1, "");
	+
	+ depth++;
	+ nodes[depth].index = 0;
	+ if (capacity(&(nodes[depth].name), &(nodes[depth].name…
	+ goto end;
	+ nodes[depth].name[0] = '\0';
	+ break;
	+ case ']':
	+ case '}':
	+ if (!depth \|\|
	+ (c == ']' && nodes[depth - 1].type != JSON_TYPE_ARR…
	+ (c == '}' && nodes[depth - 1].type != JSON_TYPE_OBJ…
	+ JSON_INVALID(); /* unbalanced nodes */
	+
	+ nodes[--depth].index++;
	+ expect = EXPECT_END;
	+ break;
	+ case ',':
	+ if (!depth)
	+ JSON_INVALID(); /* unbalanced nodes */
	+
	+ nodes[depth - 1].index++;
	+ if (nodes[depth - 1].type == JSON_TYPE_OBJECT) {
	+ iskey = 1;
	+ expect = EXPECT_STRING;
	+ } else {
	+ expect = EXPECT_VALUE;
	+ }
	+ break;
	+ case 't': /* true */
	+ if (GETNEXT() != 'r' \|\| GETNEXT() != 'u' \|\| GETNEXT() …
	+ JSON_INVALID();
	+ nodes[depth].type = JSON_TYPE_BOOL;
	+ cb(nodes, depth + 1, "true");
	+ expect = EXPECT_END;
	+ break;
	+ case 'f': /* false */
	+ if (GETNEXT() != 'a' \|\| GETNEXT() != 'l' \|\| GETNEXT() …
	+ GETNEXT() != 'e')
	+ JSON_INVALID();
	+ nodes[depth].type = JSON_TYPE_BOOL;
	+ cb(nodes, depth + 1, "false");
	+ expect = EXPECT_END;
	+ break;
	+ case 'n': /* null */
	+ if (GETNEXT() != 'u' \|\| GETNEXT() != 'l' \|\| GETNEXT() …
	+ JSON_INVALID();
	+ nodes[depth].type = JSON_TYPE_NULL;
	+ cb(nodes, depth + 1, "null");
	+ expect = EXPECT_END;
	+ break;
	+ default: /* number */
	+ nodes[depth].type = JSON_TYPE_NUMBER;
	+ p = 0;
	+ pri[p++] = c;
	+ expect = EXPECT_END;
	+ while (1) {
	+ c = GETNEXT();
	+ if (c == EOF \|\|
	+ !c \|\| !strchr("0123456789eE+-.", c) \|\|
	+ p + 1 >= sizeof(pri)) {
	+ pri[p] = '\0';
	+ cb(nodes, depth + 1, pri);
	+ goto handlechr; /* do not read next ch…
	+ } else {
	+ pri[p++] = c;
	+ }
	+ }
	+ }
	+ }
	+ if (depth)
	+ JSON_INVALID(); /* unbalanced nodes */
	+
	+ ret = 0; /* success */
	+end:
	+ for (depth = 0; depth < sizeof(nodes) / sizeof(nodes[0]); depth++)
	+ free(nodes[depth].name);
	+ free(str);
	+
	+ return ret;
	+}
	diff --git a/json.h b/json.h
	@@ -0,0 +1,26 @@
	+#include <stddef.h>
	+
	+enum JSONType {
	+ JSON_TYPE_ARRAY = 'a',
	+ JSON_TYPE_OBJECT = 'o',
	+ JSON_TYPE_STRING = 's',
	+ JSON_TYPE_BOOL = 'b',
	+ JSON_TYPE_NULL = '?',
	+ JSON_TYPE_NUMBER = 'n'
	+};
	+
	+enum JSONError {
	+ JSON_ERROR_MEM = -2,
	+ JSON_ERROR_INVALID = -1
	+};
	+
	+#define JSON_MAX_NODE_DEPTH 64
	+
	+struct json_node {
	+ enum JSONType type;
	+ char *name;
	+ size_t namesiz;
	+ size_t index; /* count/index for array or object type */
	+};
	+
	+int parsejson(void (cb)(struct json_node , size_t, const char *));
	diff --git a/tscrape.c b/tscrape.c
	@@ -2,107 +2,52 @@

	#include <ctype.h>
	#include <err.h>
	+#include <stdlib.h>
	#include <stdio.h>
	#include <string.h>
	#include <strings.h>
	+#include <time.h>
	#include <unistd.h>

	-#include "xml.h"
	+#include "json.h"
	#include "util.h"

	#define STRP(s) s,sizeof(s)-1

	-/* states */
	-enum {
	- Item = 1,
	- Stream = 2,
	- Header = 4,
	- Timestamp = 8,
	- Text = 16
	+/* a tweet */
	+struct tweet {
	+ char fullname[1024];
	+ int ispinned;
	+ char itemusername[1024];
	+ char itemfullname[1024];
	+ char full_text[4096];
	+ char username[1024];
	+ time_t timestamp;
	+ char datatime[16];
	+ char itemid[64];
	+ char retweetid[64];
	+
	+ struct tweet *next;
	};

	-/* data */
	-static char fullname[1024];
	-static int ispinned;
	-static char itemusername[1024];
	-static char itemfullname[1024];
	-static char timestamp[16];
	-static char text[4096];
	-static char username[1024];
	-
	-static char classname[256];
	-static char datatime[16];
	-static char itemid[64];
	-static char retweetid[64];
	-static int state;
	-static XMLParser p;
	-
	-static const char ignorestate, endtag;
	-static int (*getnext)(void);
	-
	-/* return a space for all data until some case-insensitive string occurs. This
	- is used to parse incorrect HTML/XML that contains unescaped HTML in script
	- or style tags. If you see some </script> tag in a CDATA or comment
	- section then e-mail W3C and tell them the web is too complex. */
	-static inline int
	-getnext_ignore(void)
	-{
	- int c;
	-
	- if ((c = getnext()) == EOF)
	- return EOF;
	+/* url entities and their replacements */
	+struct url {
	+ char url[256];
	+ size_t url_len;
	+ char expanded_url[1024];

	- if (tolower(c) == tolower((unsigned char)*ignorestate)) {
	- ignorestate++;
	- if (*ignorestate == '\0') {
	- p.getnext = getnext; /* restore */
	- return c;
	- }
	- } else {
	- ignorestate = endtag;
	- }
	-
	- return ' ';
	-}
	-
	-static void
	-printtweet(void)
	-{
	- char buf[32];
	- time_t t;
	-
	- if (parsetime(timestamp, &t, buf, sizeof(buf)) != -1)
	- printf("%lld", (long long)t);
	- putchar('\t');
	- printescape(username);
	- putchar('\t');
	- printescape(fullname);
	- putchar('\t');
	- printescape(text);
	- putchar('\t');
	- printescape(itemid);
	- putchar('\t');
	- printescape(itemusername);
	- putchar('\t');
	- printescape(itemfullname);
	- putchar('\t');
	- printescape(retweetid);
	- putchar('\t');
	- printf("%d", ispinned);
	- putchar('\n');
	-}
	+ struct url *next;
	+};

	-static int
	-isclassmatch(const char classes, const char clss, size_t len)
	-{
	- const char *p;
	+static struct tweet tweets, tc;
	+static struct url urls, uc;
	+static char url[256];

	- if (!(p = strstr(classes, clss)))
	- return 0;
	- return (p == classes \|\| isspace((unsigned char)p[-1])) &&
	- (isspace((unsigned char)p[len]) \|\| !p[len]);
	-}
	+#define MAX_PINNED 5
	+static char pinnedids[MAX_PINNED][64];
	+static size_t npinned;

	+#if 0
	/* convert XML and some HTML entities */
	static int
	html_entitytostr(const char s, char buf, size_t bufsiz)
	@@ -115,192 +60,378 @@ html_entitytostr(const char s, char buf, size_t bufsiz)
	return (ssize_t)strlcpy(buf, " ", bufsiz);
	return len;
	}
	+#endif

	-static void
	-xmltagend(XMLParser x, const char t, size_t tl, int isshort)
	+long long
	+datetounix(long long year, int mon, int day, int hour, int min, int sec)
	{
	- if (!strcmp(t, "p"))
	- state &= ~Text;
	- else if (!strcmp(t, "span"))
	- state &= ~(Timestamp);
	+ static const int secs_through_month[] = {
	+ 0, 31 * 86400, 59 * 86400, 90 * 86400,
	+ 120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400,
	+ 243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 };
	+ int is_leap = 0, cycles, centuries = 0, leaps = 0, rem;
	+ long long t;
	+
	+ if (year - 2ULL <= 136) {
	+ leaps = (year - 68) >> 2;
	+ if (!((year - 68) & 3)) {
	+ leaps--;
	+ is_leap = 1;
	+ } else {
	+ is_leap = 0;
	+ }
	+ t = 31536000 * (year - 70) + 86400 * leaps;
	+ } else {
	+ cycles = (year - 100) / 400;
	+ rem = (year - 100) % 400;
	+ if (rem < 0) {
	+ cycles--;
	+ rem += 400;
	+ }
	+ if (!rem) {
	+ is_leap = 1;
	+ } else {
	+ if (rem >= 300)
	+ centuries = 3, rem -= 300;
	+ else if (rem >= 200)
	+ centuries = 2, rem -= 200;
	+ else if (rem >= 100)
	+ centuries = 1, rem -= 100;
	+ if (rem) {
	+ leaps = rem / 4U;
	+ rem %= 4U;
	+ is_leap = !rem;
	+ }
	+ }
	+ leaps += 97 * cycles + 24 * centuries - is_leap;
	+ t = (year - 100) * 31536000LL + leaps * 86400LL + 946684800 + …
	+ }
	+ t += secs_through_month[mon];
	+ if (is_leap && mon >= 2)
	+ t += 86400;
	+ t += 86400LL * (day - 1);
	+ t += 3600LL * hour;
	+ t += 60LL * min;
	+ t += sec;
	+
	+ return t;
	}

	-static void
	-xmltagstart(XMLParser x, const char t, size_t tl)
	+/* parse time format: "Wed May 27 04:12:34 +0000 2020"
	+ assumes tz offset is "+0000" */
	+static int
	+parsetime(const char s, time_t tp)
	{
	- classname[0] = '\0';
	+ static char *mons[] = {
	+ "Jan", "Feb", "Mar", "Apr", "May", "Jun",
	+ "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
	+ };
	+ int year, mon = 0, mday, hour, min, sec, i;
	+ char tzbuf[6], monbuf[4], wdaybuf[4];
	+
	+ for (; s && isspace((unsigned char)s); s++)
	+ ;
	+ i = sscanf(s, "%3s %3s %02d %02d:%02d:%02d %5s %4d",
	+ wdaybuf, monbuf, &mday, &hour, &min, &sec, tzbuf, &year);
	+ if (i != 8)
	+ return -1;
	+ for (i = 0; i < sizeof(mons) / sizeof(mons[0]); i++) {
	+ if (!strcmp(mons[i], monbuf)) {
	+ mon = i + 1;
	+ break;
	+ }
	+ }
	+ if (mon == 0)
	+ return -1;
	+
	+ /* invalid range */
	+ if (year < 0 \|\| year > 9999 \|\|
	+ mon < 1 \|\| mon > 12 \|\|
	+ mday < 1 \|\| mday > 31 \|\|
	+ hour < 0 \|\| hour > 23 \|\|
	+ min < 0 \|\| min> 59 \|\|
	+ sec < 0 \|\| sec > 59)
	+ return -1;
	+
	+ if (tp)
	+ *tp = datetounix(year - 1900, mon - 1, mday, hour, min, sec);
	+ return 0;
	}

	static void
	-xmltagstartparsed(XMLParser x, const char t, size_t tl, int isshort)
	+printescape(const char *s)
	{
	- /* temporary replace the callback except the reader and end of tag
	- restore the context once we receive the same ignored tag in the
	- end tag handler */
	- if (!strcasecmp(t, "script")) {
	- ignorestate = endtag = "</script>";
	- getnext = x->getnext; /* for restore */
	- x->getnext = getnext_ignore;
	- return;
	- } else if (!strcasecmp(t, "style")) {
	- ignorestate = endtag = "</style>";
	- getnext = x->getnext; /* for restore */
	- x->getnext = getnext_ignore;
	- return;
	+ for (; *s; s++) {
	+ if (!iscntrl((unsigned char)*s))
	+ putchar(*s);
	}
	-
	- if (!strcmp(t, "p") && isclassmatch(classname, STRP("js-tweet-text")))…
	- if (state & (Item \| Stream \| Header))
	- state \|= Text;
	- } else if (!strcmp(t, "div") &&
	- isclassmatch(classname, STRP("stream-item-footer"))) {
	- if (text[0] && username[0])
	- printtweet();
	- state = 0;
	- } else if (!strcmp(t, "li") &&
	- isclassmatch(classname, STRP("js-stream-item"))) {
	- if (state & Item)
	- return;
	- state \|= Item;
	- datatime[0] = text[0] = timestamp[0] = itemfullname[0] = '\0';
	- itemid[0] = itemusername[0] = retweetid[0] = '\0';
	- ispinned = 0;
	- if (isclassmatch(classname, STRP("js-pinned")))
	- ispinned = 1;
	- } else if (state & Item) {
	- if (!strcmp(t, "div") &&
	- isclassmatch(classname, STRP("js-stream-tweet"))) {
	- state &= ~(Text\|Header);
	- state \|= Stream;
	- } else if (!strcmp(t, "a") &&
	- isclassmatch(classname, STRP("js-action-profile")))…
	- state \|= Header;
	- } else if (!strcmp(t, "span") &&
	- isclassmatch(classname, STRP("js-short-timestamp")))…
	- state \|= Timestamp;
	- strlcpy(timestamp, datatime, sizeof(timestamp));
	- datatime[0] = '\0';
	- }
	- }
	- if ((state & Text) && !strcmp(t, "a") && !isspace((unsigned char)text[…
	- strlcat(text, " ", sizeof(text));
	}

	+/* print text and expand urls */
	static void
	-xmlattr(XMLParser x, const char t, size_t tl, const char *a, size_t al,
	- const char *v, size_t vl)
	+printexpand(const char *s)
	{
	- /* NOTE: assumes classname attribute is set before data-* in current t…
	- if (!state && !strcmp(t, "div") && isclassmatch(classname, STRP("user-…
	- if (!strcmp(a, "data-screen-name")) {
	- strlcat(username, " ", sizeof(username));
	- strlcat(username, v, sizeof(username));
	- } else if (!strcmp(a, "data-name")) {
	- strlcat(fullname, " ", sizeof(fullname));
	- strlcat(fullname, v, sizeof(fullname));
	- }
	- }
	-
	- if (!strcmp(a, "class")) {
	- strlcat(classname, v, sizeof(classname));
	- } else if (state & Item) {
	- if (!strcmp(t, "div")) {
	- if (!strcmp(a, "data-item-id"))
	- strlcpy(itemid, v, sizeof(itemid));
	- else if (!strcmp(a, "data-retweet-id"))
	- strlcpy(retweetid, v, sizeof(retweetid));
	-
	- if (isclassmatch(classname, STRP("js-stream-tweet"))) {
	- if (!strcmp(a, "data-screen-name")) {
	- strlcat(itemusername, " ", sizeof(item…
	- strlcat(itemusername, v, sizeof(itemus…
	- } else if (!strcmp(a, "data-name")) {
	- strlcat(itemfullname, " ", sizeof(item…
	- strlcat(itemfullname, v, sizeof(itemfu…
	- }
	+ struct url *u;
	+
	+ for (; *s; s++) {
	+ if (iscntrl((unsigned char)*s))
	+ continue;
	+ for (u = urls; u; u = u->next) {
	+ if (!strncmp(s, u->url, u->url_len)) {
	+ s += u->url_len;
	+ printescape(u->expanded_url);
	+ break;
	}
	- } else if (!strcmp(t, "span") && !strcmp(a, "data-time")) {
	- /* UNIX timestamp */
	- strlcpy(datatime, v, sizeof(datatime));
	- }
	- /* NOTE: can be <div data-image-url>. */
	- if (!strcmp(a, "data-image-url")) {
	- strlcat(text, " ", sizeof(text));
	- strlcat(text, v, sizeof(text));
	- }
	-
	- /* indication it has a video */
	- if (itemid[0] && !strcmp(a, "data-playable-media-url")) {
	- strlcat(text, " ", sizeof(text));
	- strlcat(text, "https://twitter.com/i/videos/", sizeof(…
	- strlcat(text, itemid, sizeof(text));
	}
	+ if (!u)
	+ putchar(*s);
	}
	}

	static void
	-xmlattrentity(XMLParser x, const char t, size_t tl, const char *a, size_t al,
	- const char *v, size_t vl)
	+printtweet(struct tweet *t)
	{
	- char buf[16];
	- int len;
	+ if (t->timestamp != -1)
	+ printf("%lld", (long long)t->timestamp);
	+ putchar('\t');
	+ printescape(t->username);
	+ putchar('\t');
	+ printescape(t->fullname);
	+ putchar('\t');
	+ printexpand(t->full_text);
	+ putchar('\t');
	+ printescape(t->itemid);
	+ putchar('\t');
	+ if (t->itemusername[0])
	+ printescape(t->itemusername);
	+ else
	+ printescape(t->username);
	+ putchar('\t');
	+ if (t->itemfullname[0])
	+ printescape(t->itemfullname);
	+ else
	+ printescape(t->fullname);
	+ putchar('\t');
	+ printescape(t->retweetid);
	+ putchar('\t');
	+ printf("%d", t->ispinned);
	+ putchar('\n');
	+}

	- if (!state)
	+void
	+addpinned(const char *str)
	+{
	+ if (npinned + 1 >= MAX_PINNED)
	return;
	- if ((len = html_entitytostr(v, buf, sizeof(buf))) > 0)
	- xmlattr(x, t, tl, a, al, buf, (size_t)len);
	- else
	- xmlattr(x, t, tl, a, al, v, vl);
	+ strlcpy(pinnedids[npinned], str, sizeof(pinnedids[0]));
	+ npinned++;
	}

	-static void
	-xmldata(XMLParser x, const char d, size_t dl)
	+void
	+addtweet(void)
	{
	- if (state & Text) {
	- if (!isclassmatch(classname, STRP("u-hidden")))
	- strlcat(text, d, sizeof(text));
	- }
	+ struct tweet *t;
	+
	+ if (!(t = calloc(1, sizeof(*t))))
	+ err(1, "calloc");
	+ t->timestamp = -1;
	+ if (tweets)
	+ tc = tc->next = t;
	+ else
	+ tweets = tc = t;
	}

	-static void
	-xmldataentity(XMLParser x, const char d, size_t dl)
	+void
	+addurl(const char url, const char expanded_url)
	{
	- char buf[16];
	- int len;
	+ struct url *u;

	- if (!(state & Text))
	- return;
	- if ((len = html_entitytostr(d, buf, sizeof(buf))) > 0)
	- xmldata(x, buf, (size_t)len);
	+ if (!(u = calloc(1, sizeof(*u))))
	+ err(1, "calloc");
	+ strlcpy(u->url, url, sizeof(u->url));
	+ u->url_len = strlen(u->url);
	+ strlcpy(u->expanded_url, expanded_url, sizeof(u->expanded_url));
	+
	+ if (urls)
	+ uc = uc->next = u;
	else
	- xmldata(x, d, dl);
	+ urls = uc = u;
	}

	-static void
	-xmlcdata(XMLParser x, const char d, size_t dl)
	+void
	+processnodes(struct json_node nodes, size_t depth, const char str)
	{
	- xmldata(x, d, dl);
	+ if (depth == 2 &&
	+ nodes[0].type == JSON_TYPE_ARRAY &&
	+ nodes[1].type == JSON_TYPE_OBJECT) {
	+ addtweet();
	+ }
	+
	+ if (tc) {
	+ if (depth == 3 &&
	+ nodes[0].type == JSON_TYPE_ARRAY &&
	+ nodes[1].type == JSON_TYPE_OBJECT &&
	+ nodes[2].type == JSON_TYPE_STRING) {
	+ if (!strcmp(nodes[2].name, "created_at")) {
	+ parsetime(str, &tc->timestamp);
	+ } else if (!strcmp(nodes[2].name, "id_str")) {
	+ strlcpy(tc->itemid, str, sizeof(tc->itemid));
	+ } else if (!strcmp(nodes[2].name, "full_text")) {
	+ /* if set by retweet text don't override */
	+ if (!tc->full_text[0])
	+ strlcpy(tc->full_text, str, sizeof(tc-…
	+ }
	+ }
	+ if (depth == 4 &&
	+ nodes[0].type == JSON_TYPE_ARRAY &&
	+ nodes[1].type == JSON_TYPE_OBJECT &&
	+ nodes[2].type == JSON_TYPE_OBJECT &&
	+ !strcmp(nodes[2].name, "user")) {
	+ if (nodes[3].type == JSON_TYPE_STRING) {
	+ if (!strcmp(nodes[3].name, "name")) {
	+ strlcpy(tc->fullname, str, sizeof(tc->…
	+ } else if (!strcmp(nodes[3].name, "screen_name…
	+ strlcpy(tc->username, str, sizeof(tc->…
	+ }
	+ }
	+ }
	+
	+ if (depth == 4 &&
	+ nodes[0].type == JSON_TYPE_ARRAY &&
	+ nodes[1].type == JSON_TYPE_OBJECT &&
	+ nodes[2].type == JSON_TYPE_OBJECT &&
	+ nodes[3].type == JSON_TYPE_STRING &&
	+ !strcmp(nodes[2].name, "retweeted_status")) {
	+ if (!strcmp(nodes[3].name, "id_str")) {
	+// printf("DEBUG: retweet: id: %s\n", str);
	+ strlcpy(tc->retweetid, str, sizeof(tc->retweet…
	+ } else if (!strcmp(nodes[3].name, "full_text")) {
	+ strlcpy(tc->full_text, str, sizeof(tc->full_te…
	+// printf("DEBUG: retweet: full_text: %s\n", st…
	+ }
	+ }
	+
	+ if (depth == 5 &&
	+ nodes[0].type == JSON_TYPE_ARRAY &&
	+ nodes[1].type == JSON_TYPE_OBJECT &&
	+ nodes[2].type == JSON_TYPE_OBJECT &&
	+ nodes[3].type == JSON_TYPE_OBJECT &&
	+ nodes[4].type == JSON_TYPE_STRING &&
	+ !strcmp(nodes[2].name, "retweeted_status") &&
	+ !strcmp(nodes[3].name, "user")) {
	+ if (!strcmp(nodes[4].name, "name")) {
	+ strlcpy(tc->itemfullname, str, sizeof(tc->item…
	+// printf("DEBUG: retweeted_status.user.name: %…
	+ } else if (!strcmp(nodes[4].name, "screen_name")) {
	+ strlcpy(tc->itemusername, str, sizeof(tc->item…
	+// printf("DEBUG: retweeted_status.user.screen_…
	+ }
	+ }
	+ }
	+
	+ if (depth == 5 &&
	+ nodes[0].type == JSON_TYPE_ARRAY &&
	+ nodes[1].type == JSON_TYPE_OBJECT &&
	+ nodes[2].type == JSON_TYPE_OBJECT &&
	+ !strcmp(nodes[2].name, "user")) {
	+ if (nodes[3].type == JSON_TYPE_ARRAY &&
	+ !strcmp(nodes[3].name, "pinned_tweet_ids")) {
	+ if (nodes[4].type == JSON_TYPE_NUMBER) {
	+ addpinned(str);
	+// printf("DEBUG: pinned_tweets_ids[%zu]: %s\n",
	+// nodes[4].index, str);
	+ }
	+ }
	+ }
	+
	+ if (depth == 6 &&
	+ nodes[0].type == JSON_TYPE_ARRAY &&
	+ nodes[1].type == JSON_TYPE_OBJECT &&
	+ nodes[2].type == JSON_TYPE_OBJECT &&
	+ nodes[3].type == JSON_TYPE_ARRAY &&
	+ nodes[4].type == JSON_TYPE_OBJECT &&
	+ nodes[5].type == JSON_TYPE_STRING &&
	+ !strcmp(nodes[2].name, "entities") &&
	+ !strcmp(nodes[3].name, "urls")) {
	+ if (!strcmp(nodes[5].name, "url")) {
	+// printf("DEBUG: url: %s\n", str);
	+ strlcpy(url, str, sizeof(url));
	+ } else if (!strcmp(nodes[5].name, "expanded_url")) {
	+// printf("DEBUG: expanded_url: %s\n", str);
	+ /* assumes "expanded_url" is specified after "url" */
	+ addurl(url, str);
	+ url[0] = '\0';
	+ }
	+ }
	+
	+ /* [].entities.media[].url */
	+ if (depth == 6 &&
	+ nodes[0].type == JSON_TYPE_ARRAY &&
	+ nodes[1].type == JSON_TYPE_OBJECT &&
	+ nodes[2].type == JSON_TYPE_OBJECT &&
	+ nodes[3].type == JSON_TYPE_ARRAY &&
	+ nodes[4].type == JSON_TYPE_OBJECT &&
	+ nodes[5].type == JSON_TYPE_STRING &&
	+ !strcmp(nodes[2].name, "entities") &&
	+ !strcmp(nodes[3].name, "media")) {
	+ if (!strcmp(nodes[5].name, "url")) {
	+// printf("DEBUG: url: %s\n", str);
	+ strlcpy(url, str, sizeof(url));
	+ } else if (!strcmp(nodes[5].name, "expanded_url")) {
	+// printf("DEBUG: expanded_url: %s\n", str);
	+ /* assumes "expanded_url" is specified after "url" */
	+ addurl(url, str);
	+ url[0] = '\0';
	+ }
	+ }
	+
	+// TODO: retweeted.status.entities.urls[]
	+#if 0
	+ if (depth == 6 &&
	+ nodes[0].type == JSON_TYPE_ARRAY &&
	+ nodes[1].type == JSON_TYPE_OBJECT &&
	+ nodes[2].type == JSON_TYPE_OBJECT &&
	+ nodes[3].type == JSON_TYPE_OBJECT &&
	+ nodes[4].type == JSON_TYPE_ARRAY &&
	+ nodes[5].type == JSON_TYPE_STRING &&
	+ !strcmp(nodes[2].name, "retweeted_status") &&
	+ !strcmp(nodes[3].name, "entities") &&
	+ !strcmp(nodes[4].name, "urls")) {
	+ if (!strcmp(nodes[5].name, "url")) {
	+ printf("DEBUG: url: %s\n", str);
	+ } else if (!strcmp(nodes[5].name, "expanded_url")) {
	+ printf("DEBUG: expanded_url: %s\n", str);
	+ }
	+ }
	+#endif
	}

	int
	main(void)
	{
	+ struct tweet *t;
	+ size_t i;
	+ int r;
	+
	if (pledge("stdio", NULL) == -1)
	err(1, "pledge");

	- /* handlers */
	- p.xmlattr = xmlattr;
	- p.xmlattrentity = xmlattrentity;
	- p.xmlcdata = xmlcdata;
	- p.xmldata = xmldata;
	- p.xmldataentity = xmldataentity;
	- p.xmltagstart = xmltagstart;
	- p.xmltagend = xmltagend;
	- p.xmltagstartparsed = xmltagstartparsed;
	- /* reader (stdin) */
	- p.getnext = getchar;
	-
	- xml_parse(&p);
	+ r = parsejson(processnodes);
	+ if (r != 0)
	+ errx(1, "invalid JSON");
	+
	+ // TODO: TEST: make sure the last tweet is printed too (addtweet() log…
	+ for (t = tweets; t; t = t->next) {
	+ /* check for pinned tweets */
	+ for (i = 0; i < npinned; i++) {
	+ if (!strcmp(t->itemid, pinnedids[i])) {
	+// printf("DEBUG: pinned: %s\n", pinnedids[i]);
	+ t->ispinned = 1;
	+ break;
	+ }
	+ }
	+ printtweet(t);
	+ }

	return 0;
	}
	diff --git a/tscrape_plain.c b/tscrape_plain.c
	@@ -51,7 +51,7 @@ printfeed(FILE fp, const char feedname)

	printutf8pad(stdout, fields[FieldItemFullname], 25, ' ');
	fputs(" ", stdout);
	- printescape(fields[FieldText]);
	+ fputs(fields[FieldText], stdout);
	putchar('\n');
	}
	}
	diff --git a/tscrape_update b/tscrape_update
	@@ -9,6 +9,12 @@ tscrapepath="$HOME/.tscrape/feeds"
	# feeds are finished at a time.
	maxjobs=8

	+# Twitter authentication bearer (seems to be static).
	+bearer="AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk…
	+
	+# guest token.
	+token=""
	+
	# load config (evaluate shellscript).
	# loadconfig(configfile)
	loadconfig() {
	@@ -36,12 +42,26 @@ log() {
	printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2
	}

	+# acquire guest token.
	+# guesttoken()
	+guesttoken() {
	+ # fail on redirects, hide User-Agent, timeout is 15 seconds.
	+ curl -X POST -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
	+ -H "Authorization: Bearer ${bearer}" \
	+ 'https://api.twitter.com/1.1/guest/activate.json' 2>/dev/null …
	+ sed -nE 's@.{"guest_token":"([^"])"}.*@\1@p'
	+}
	+
	# fetch a feed via HTTP/HTTPS etc.
	-# fetch(name, url, feedfile)
	+# fetch(name, twittername, feedfile)
	fetch() {
	+ url="https://api.twitter.com/1.1/statuses/user_timeline.json?screen_na…
	+
	# fail on redirects, hide User-Agent, timeout is 15 seconds.
	curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
	- "$2" 2>/dev/null
	+ -H "Authorization: Bearer ${bearer}" \
	+ -H "x-guest-token: $token" \
	+ "${url}" 2>/dev/null
	}

	# filter fields.
	@@ -151,6 +171,13 @@ feeds() {
	echo "See tscraperc.example for an example." >&2
	}

	+# get guest token.
	+token=$(guesttoken)
	+if [ -z "${token}" ]; then
	+ echo "Failed to acquire guest token" >&2
	+ exit 1
	+fi
	+
	# job counter.
	curjobs=0
	# signal number received for parent.
	diff --git a/tscraperc.example b/tscraperc.example
	@@ -2,8 +2,8 @@

	# list of feeds to fetch:
	feeds() {
	- # feed <name> <feedurl>
	- feed "Rich Felker" "https://twitter.com/richfelker"
	- feed "Internet of shit" "https://twitter.com/internetofshit"
	- feed "Donald Trump" "https://twitter.com/realdonaldtrump"
	+ # feed <name> <twittername>
	+ feed "Rich Felker" "richfelker"
	+ feed "Internet of shit" "internetofshit"
	+ feed "Donald Trump" "realdonaldtrump"
	}
	diff --git a/util.c b/util.c
	@@ -106,43 +106,3 @@ printutf8pad(FILE fp, const char s, size_t len, int pad)
	for (; col < len; ++col)
	putc(pad, fp);
	}
	-
	-void
	-printescape(const char *s)
	-{
	- int r;
	- const char *e;
	-
	- /* strip leading and trailing white-space */
	- for (; s && isspace((unsigned char)s); s++)
	- ;
	- for (e = s + strlen(s); e > s && isspace((unsigned char)*(e - 1)); e--)
	- ;
	-
	- for (r = 0; *s && s < e; s++) {
	- if (iscntrl((unsigned char)s) \|\| isspace((unsigned char)s)) {
	- r = 1;
	- continue;
	- }
	- if (r) {
	- r = 0;
	- putchar(' ');
	- }
	- putchar(*s);
	- }
	-}
	-
	-int
	-parsetime(const char *s, time_t *t, char *buf, size_t bufsiz)
	-{
	- struct tm *tm;
	-
	- if (strtotime(s, t))
	- return -1;
	- if (!(tm = localtime(t)))
	- return -1;
	- if (!strftime(buf, bufsiz, "%Y-%m-%d %H:%M", tm))
	- return -1;
	-
	- return 0;
	-}
	diff --git a/util.h b/util.h
	@@ -30,8 +30,6 @@ enum {
	};

	size_t parseline(char *, char *[FieldLast]);
	-int parsetime(const char *, time_t *, char *, size_t);
	-void printescape(const char *);
	void printutf8pad(FILE *, const char *, size_t, int);
	int strtotime(const char *, time_t *);
	void xmlencode(const char *, FILE *);
	diff --git a/xml.c b/xml.c
	@@ -1,451 +0,0 @@
	-#include <ctype.h>
	-#include <errno.h>
	-#include <stdio.h>
	-#include <stdlib.h>
	-#include <string.h>
	-
	-#include "xml.h"
	-
	-static void
	-xml_parseattrs(XMLParser *x)
	-{
	- size_t namelen = 0, valuelen;
	- int c, endsep, endname = 0, valuestart = 0;
	-
	- while ((c = GETNEXT()) != EOF) {
	- if (isspace(c)) {
	- if (namelen)
	- endname = 1;
	- continue;
	- } else if (c == '?')
	- ; /* ignore */
	- else if (c == '=') {
	- x->name[namelen] = '\0';
	- valuestart = 1;
	- endname = 1;
	- } else if (namelen && ((endname && !valuestart && isalpha(c)) …
	- /* attribute without value */
	- x->name[namelen] = '\0';
	- if (x->xmlattrstart)
	- x->xmlattrstart(x, x->tag, x->taglen, x->name,…
	- if (x->xmlattr)
	- x->xmlattr(x, x->tag, x->taglen, x->name, name…
	- if (x->xmlattrend)
	- x->xmlattrend(x, x->tag, x->taglen, x->name, n…
	- endname = 0;
	- x->name[0] = c;
	- namelen = 1;
	- } else if (namelen && valuestart) {
	- /* attribute with value */
	- if (x->xmlattrstart)
	- x->xmlattrstart(x, x->tag, x->taglen, x->name,…
	-
	- valuelen = 0;
	- if (c == '\'' || c == '"') {
	- endsep = c;
	- } else {
	- endsep = ' '; /* isspace() */
	- goto startvalue;
	- }
	-
	- while ((c = GETNEXT()) != EOF) {
	-startvalue:
	- if (c == '&') { /* entities */
	- x->data[valuelen] = '\0';
	- /* call data function with data before…
	- if (valuelen && x->xmlattr)
	- x->xmlattr(x, x->tag, x->tagle…
	- x->data[0] = c;
	- valuelen = 1;
	- while ((c = GETNEXT()) != EOF) {
	- if (c == endsep || (endsep == …
	- break;
	- if (valuelen < sizeof(x->data)…
	- x->data[valuelen++] = …
	- else {
	- /* entity too long for…
	- x->data[valuelen] = '\…
	- if (x->xmlattr)
	- x->xmlattr(x, …
	- x->data[0] = c;
	- valuelen = 1;
	- break;
	- }
	- if (c == ';') {
	- x->data[valuelen] = '\…
	- if (x->xmlattrentity)
	- x->xmlattrenti…
	- valuelen = 0;
	- break;
	- }
	- }
	- } else if (c != endsep && !(endsep == ' ' && (…
	- if (valuelen < sizeof(x->data) - 1) {
	- x->data[valuelen++] = c;
	- } else {
	- x->data[valuelen] = '\0';
	- if (x->xmlattr)
	- x->xmlattr(x, x->tag, …
	- x->data[0] = c;
	- valuelen = 1;
	- }
	- }
	- if (c == endsep || (endsep == ' ' && (c == '>'…
	- x->data[valuelen] = '\0';
	- if (x->xmlattr)
	- x->xmlattr(x, x->tag, x->tagle…
	- if (x->xmlattrend)
	- x->xmlattrend(x, x->tag, x->ta…
	- break;
	- }
	- }
	- namelen = endname = valuestart = 0;
	- } else if (namelen < sizeof(x->name) - 1) {
	- x->name[namelen++] = c;
	- }
	- if (c == '>') {
	- break;
	- } else if (c == '/') {
	- x->isshorttag = 1;
	- x->name[0] = '\0';
	- namelen = 0;
	- }
	- }
	-}
	-
	-static void
	-xml_parsecomment(XMLParser *x)
	-{
	- size_t datalen = 0, i = 0;
	- int c;
	-
	- if (x->xmlcommentstart)
	- x->xmlcommentstart(x);
	- while ((c = GETNEXT()) != EOF) {
	- if (c == '-' || c == '>') {
	- if (x->xmlcomment && datalen) {
	- x->data[datalen] = '\0';
	- x->xmlcomment(x, x->data, datalen);
	- datalen = 0;
	- }
	- }
	-
	- if (c == '-') {
	- if (++i > 2) {
	- if (x->xmlcomment)
	- for (; i > 2; i--)
	- x->xmlcomment(x, "-", 1);
	- i = 2;
	- }
	- continue;
	- } else if (c == '>' && i == 2) {
	- if (x->xmlcommentend)
	- x->xmlcommentend(x);
	- return;
	- } else if (i) {
	- if (x->xmlcomment) {
	- for (; i > 0; i--)
	- x->xmlcomment(x, "-", 1);
	- }
	- i = 0;
	- }
	-
	- if (datalen < sizeof(x->data) - 1) {
	- x->data[datalen++] = c;
	- } else {
	- x->data[datalen] = '\0';
	- if (x->xmlcomment)
	- x->xmlcomment(x, x->data, datalen);
	- x->data[0] = c;
	- datalen = 1;
	- }
	- }
	-}
	-
	-static void
	-xml_parsecdata(XMLParser *x)
	-{
	- size_t datalen = 0, i = 0;
	- int c;
	-
	- if (x->xmlcdatastart)
	- x->xmlcdatastart(x);
	- while ((c = GETNEXT()) != EOF) {
	- if (c == ']' || c == '>') {
	- if (x->xmlcdata && datalen) {
	- x->data[datalen] = '\0';
	- x->xmlcdata(x, x->data, datalen);
	- datalen = 0;
	- }
	- }
	-
	- if (c == ']') {
	- if (++i > 2) {
	- if (x->xmlcdata)
	- for (; i > 2; i--)
	- x->xmlcdata(x, "]", 1);
	- i = 2;
	- }
	- continue;
	- } else if (c == '>' && i == 2) {
	- if (x->xmlcdataend)
	- x->xmlcdataend(x);
	- return;
	- } else if (i) {
	- if (x->xmlcdata)
	- for (; i > 0; i--)
	- x->xmlcdata(x, "]", 1);
	- i = 0;
	- }
	-
	- if (datalen < sizeof(x->data) - 1) {
	- x->data[datalen++] = c;
	- } else {
	- x->data[datalen] = '\0';
	- if (x->xmlcdata)
	- x->xmlcdata(x, x->data, datalen);
	- x->data[0] = c;
	- datalen = 1;
	- }
	- }
	-}
	-
	-static int
	-codepointtoutf8(long r, char *s)
	-{
	- if (r == 0) {
	- return 0; /* NUL byte */
	- } else if (r <= 0x7F) {
	- /* 1 byte: 0aaaaaaa */
	- s[0] = r;
	- return 1;
	- } else if (r <= 0x07FF) {
	- /* 2 bytes: 00000aaa aabbbbbb */
	- s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */
	- s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */
	- return 2;
	- } else if (r <= 0xFFFF) {
	- /* 3 bytes: aaaabbbb bbcccccc */
	- s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
	- s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */
	- s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */
	- return 3;
	- } else {
	- /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
	- s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
	- s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
	- s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */
	- s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */
	- return 4;
	- }
	-}
	-
	-static int
	-namedentitytostr(const char *e, char *buf, size_t bufsiz)
	-{
	- static const struct {
	- const char *entity;
	- int c;
	- } entities[] = {
	- { "amp;", '&' },
	- { "lt;", '<' },
	- { "gt;", '>' },
	- { "apos;", '\'' },
	- { "quot;", '"' },
	- };
	- size_t i;
	-
	- /* buffer is too small */
	- if (bufsiz < 2)
	- return -1;
	-
	- for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
	- if (!strcmp(e, entities[i].entity)) {
	- buf[0] = entities[i].c;
	- buf[1] = '\0';
	- return 1;
	- }
	- }
	- return -1;
	-}
	-
	-static int
	-numericentitytostr(const char *e, char *buf, size_t bufsiz)
	-{
	- long l;
	- int len;
	- char *end;
	-
	- /* buffer is too small */
	- if (bufsiz < 5)
	- return -1;
	-
	- errno = 0;
	- /* hex (16) or decimal (10) */
	- if (*e == 'x')
	- l = strtol(++e, &end, 16);
	- else
	- l = strtol(e, &end, 10);
	- /* invalid value or not a well-formed entity or invalid code point */
	- if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff)
	- return -1;
	- len = codepointtoutf8(l, buf);
	- buf[len] = '\0';
	-
	- return len;
	-}
	-
	-/* convert named- or numeric entity string to buffer string
	- * returns byte-length of string or -1 on failure. */
	-int
	-xml_entitytostr(const char *e, char *buf, size_t bufsiz)
	-{
	- /* doesn't start with & */
	- if (e[0] != '&')
	- return -1;
	- /* numeric entity */
	- if (e[1] == '#')
	- return numericentitytostr(e + 2, buf, bufsiz);
	- else /* named entity */
	- return namedentitytostr(e + 1, buf, bufsiz);
	-}
	-
	-void
	-xml_parse(XMLParser *x)
	-{
	- size_t datalen, tagdatalen;
	- int c, isend;
	-
	- while ((c = GETNEXT()) != EOF && c != '<')
	- ; /* skip until < */
	-
	- while (c != EOF) {
	- if (c == '<') { /* parse tag */
	- if ((c = GETNEXT()) == EOF)
	- return;
	-
	- if (c == '!') { /* cdata and comments */
	- for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
	- /* NOTE: sizeof(x->data) must be atlea…
	- if (tagdatalen <= sizeof("[CDATA[") - …
	- x->data[tagdatalen++] = c;
	- if (c == '>')
	- break;
	- else if (c == '-' && tagdatalen == siz…
	- (x->data[0] == '-')) {
	- xml_parsecomment(x);
	- break;
	- } else if (c == '[') {
	- if (tagdatalen == sizeof("[CDA…
	- !strncmp(x->data, "[CDATA[…
	- xml_parsecdata(x);
	- break;
	- }
	- }
	- }
	- } else {
	- /* normal tag (open, short open, close), proce…
	- x->tag[0] = c;
	- x->taglen = 1;
	- x->isshorttag = isend = 0;
	-
	- /* treat processing instruction as shorttag, d…
	- if (c == '?') {
	- x->isshorttag = 1;
	- } else if (c == '/') {
	- if ((c = GETNEXT()) == EOF)
	- return;
	- x->tag[0] = c;
	- isend = 1;
	- }
	-
	- while ((c = GETNEXT()) != EOF) {
	- if (c == '/')
	- x->isshorttag = 1; /* short ta…
	- else if (c == '>' || isspace(c)) {
	- x->tag[x->taglen] = '\0';
	- if (isend) { /* end tag, start…
	- if (x->xmltagend)
	- x->xmltagend(x…
	- x->tag[0] = '\0';
	- x->taglen = 0;
	- } else {
	- /* start tag */
	- if (x->xmltagstart)
	- x->xmltagstart…
	- if (isspace(c))
	- xml_parseattrs…
	- if (x->xmltagstartpars…
	- x->xmltagstart…
	- }
	- /* call tagend for shortform o…
	- if (x->isshorttag) {
	- if (x->xmltagend)
	- x->xmltagend(x…
	- x->tag[0] = '\0';
	- x->taglen = 0;
	- }
	- break;
	- } else if (x->taglen < sizeof(x->tag) …
	- x->tag[x->taglen++] = c; /* NO…
	- }
	- }
	- } else {
	- /* parse tag data */
	- datalen = 0;
	- if (x->xmldatastart)
	- x->xmldatastart(x);
	- while ((c = GETNEXT()) != EOF) {
	- if (c == '&') {
	- if (datalen) {
	- x->data[datalen] = '\0';
	- if (x->xmldata)
	- x->xmldata(x, x->data,…
	- }
	- x->data[0] = c;
	- datalen = 1;
	- while ((c = GETNEXT()) != EOF) {
	- if (c == '<')
	- break;
	- if (datalen < sizeof(x->data) …
	- x->data[datalen++] = c;
	- else {
	- /* entity too long for…
	- x->data[datalen] = '\0…
	- if (x->xmldata)
	- x->xmldata(x, …
	- x->data[0] = c;
	- datalen = 1;
	- break;
	- }
	- if (c == ';') {
	- x->data[datalen] = '\0…
	- if (x->xmldataentity)
	- x->xmldataenti…
	- datalen = 0;
	- break;
	- }
	- }
	- } else if (c != '<') {
	- if (datalen < sizeof(x->data) - 1) {
	- x->data[datalen++] = c;
	- } else {
	- x->data[datalen] = '\0';
	- if (x->xmldata)
	- x->xmldata(x, x->data,…
	- x->data[0] = c;
	- datalen = 1;
	- }
	- }
	- if (c == '<') {
	- x->data[datalen] = '\0';
	- if (x->xmldata && datalen)
	- x->xmldata(x, x->data, datalen…
	- if (x->xmldataend)
	- x->xmldataend(x);
	- break;
	- }
	- }
	- }
	- }
	-}
	diff --git a/xml.h b/xml.h
	@@ -1,49 +0,0 @@
	-#ifndef _XML_H
	-#define _XML_H
	-
	-#include <stdio.h>
	-
	-typedef struct xmlparser {
	- /* handlers */
	- void (*xmlattr)(struct xmlparser *, const char *, size_t,
	- const char *, size_t, const char *, size_t);
	- void (*xmlattrend)(struct xmlparser *, const char *, size_t,
	- const char *, size_t);
	- void (*xmlattrstart)(struct xmlparser *, const char *, size_t,
	- const char *, size_t);
	- void (*xmlattrentity)(struct xmlparser *, const char *, size_t,
	- const char *, size_t, const char *, size_t);
	- void (*xmlcdatastart)(struct xmlparser *);
	- void (*xmlcdata)(struct xmlparser *, const char *, size_t);
	- void (*xmlcdataend)(struct xmlparser *);
	- void (*xmlcommentstart)(struct xmlparser *);
	- void (*xmlcomment)(struct xmlparser *, const char *, size_t);
	- void (*xmlcommentend)(struct xmlparser *);
	- void (*xmldata)(struct xmlparser *, const char *, size_t);
	- void (*xmldataend)(struct xmlparser *);
	- void (*xmldataentity)(struct xmlparser *, const char *, size_t);
	- void (*xmldatastart)(struct xmlparser *);
	- void (*xmltagend)(struct xmlparser *, const char *, size_t, int);
	- void (*xmltagstart)(struct xmlparser *, const char *, size_t);
	- void (*xmltagstartparsed)(struct xmlparser *, const char *,
	- size_t, int);
	-
	-#ifndef GETNEXT
	- #define GETNEXT (x)->getnext
	- int (*getnext)(void);
	-#endif
	-
	- /* current tag */
	- char tag[1024];
	- size_t taglen;
	- /* current tag is in short form ? <tag /> */
	- int isshorttag;
	- /* current attribute name */
	- char name[1024];
	- /* data buffer used for tag data, cdata and attribute data */
	- char data[BUFSIZ];
	-} XMLParser;
	-
	-int xml_entitytostr(const char *, char *, size_t);
	-void xml_parse(XMLParser *);
	-#endif