Introduction
Introduction Statistics Contact Development Disclaimer Help
work-in-progress: support the new Twitter site - tscrape - twitter scraper
git clone git://git.codemadness.org/tscrape
Log
Files
Refs
README
LICENSE
---
commit c3e76b0f57c58b284cd13ce008c082525c8ee28a
parent 663dab7d9883a291ed570a743fb89a16e1a01d85
Author: Hiltjo Posthuma <[email protected]>
Date: Fri, 5 Jun 2020 14:51:58 +0200
work-in-progress: support the new Twitter site
Scraping doesn't work anymore. Use the Twitter JSON API.
Major thanks to leot for helping with this.
Diffstat:
M Makefile | 18 +++++++++---------
M README | 28 ++++++++++++----------------
A json.c | 313 +++++++++++++++++++++++++++++…
A json.h | 26 ++++++++++++++++++++++++++
M tscrape.c | 591 +++++++++++++++++++----------…
M tscrape_plain.c | 2 +-
M tscrape_update | 31 +++++++++++++++++++++++++++++…
M tscraperc.example | 8 ++++----
M util.c | 40 -----------------------------…
M util.h | 2 --
D xml.c | 451 -----------------------------…
D xml.h | 49 -----------------------------…
12 files changed, 755 insertions(+), 804 deletions(-)
---
diff --git a/Makefile b/Makefile
@@ -25,17 +25,17 @@ SCRIPTS = \
SRC = ${BIN:=.c}
HDR = \
util.h\
- xml.h
+ json.h
LIBUTIL = libutil.a
LIBUTILSRC = \
util.c
LIBUTILOBJ = ${LIBUTILSRC:.c=.o}
-LIBXML = libxml.a
-LIBXMLSRC = \
- xml.c
-LIBXMLOBJ = ${LIBXMLSRC:.c=.o}
+LIBJSON = libjson.a
+LIBJSONSRC = \
+ json.c
+LIBJSONOBJ = ${LIBJSONSRC:.c=.o}
COMPATSRC = \
strlcat.c\
@@ -44,7 +44,7 @@ COMPATOBJ =\
strlcat.o\
strlcpy.o
-LIB = ${LIBUTIL} ${LIBXML} ${COMPATOBJ}
+LIB = ${LIBUTIL} ${LIBJSON} ${COMPATOBJ}
MAN1 = ${BIN:=.1}\
${SCRIPTS:=.1}
@@ -59,7 +59,7 @@ all: $(BIN)
${BIN}: ${LIB} ${@:=.o}
-OBJ = ${SRC:.c=.o} ${LIBXMLOBJ} ${LIBUTILOBJ} ${COMPATOBJ}
+OBJ = ${SRC:.c=.o} ${LIBJSONOBJ} ${LIBUTILOBJ} ${COMPATOBJ}
${OBJ}: ${HDR}
@@ -73,7 +73,7 @@ ${LIBUTIL}: ${LIBUTILOBJ}
${AR} rc $@ $?
${RANLIB} $@
-${LIBXML}: ${LIBXMLOBJ}
+${LIBJSON}: ${LIBJSONOBJ}
${AR} rc $@ $?
${RANLIB} $@
@@ -81,7 +81,7 @@ dist:
rm -rf "${NAME}-${VERSION}"
mkdir -p "${NAME}-${VERSION}"
cp -f ${MAN1} ${MAN5} ${DOC} ${HDR} \
- ${SRC} ${LIBXMLSRC} ${LIBUTILSRC} ${COMPATSRC} ${SCRIPTS} \
+ ${SRC} ${LIBJSONSRC} ${LIBUTILSRC} ${COMPATSRC} ${SCRIPTS} \
Makefile \
tscraperc.example style.css \
"${NAME}-${VERSION}"
diff --git a/README b/README
@@ -1,13 +1,16 @@
tscrape
-------
-Twitter feed HTML scraper.
+Twitter feed parser.
-It scrapes HTML from stdin and outputs it to a TAB-separated format that can be
-easier parsed with various (UNIX) tools. There are formatting programs included
-to convert this TAB-separated format to various other formats. There are also
-some programs and scripts included to import and export OPML and to fetch,
-filter, merge and order items.
+It parses JSON from stdin and outputs it to a TAB-separated format that can be
+processed more easily with various (UNIX) tools. There are formatting programs
+included to convert this TAB-separated format to various other formats. There
+are also some programs and scripts included to import and export OPML and to
+fetch, filter, merge and order items.
+
+The name tscrape is used because it used to scrape the HTML from the Twitter
+page. It is now using the JSON API contents.
Build and install
@@ -20,20 +23,13 @@ $ make
Usage
-----
- curl -H 'User-Agent:' -s 'https://twitter.com/namehere' | tscrape
-
-or
-
- ftp -o - -U '' 'https://twitter.com/namehere' 2>/dev/null | tscrape
-
-or
-
- hurl 'https://twitter.com/namehere' | tscrape
+* Create a tscraperc configuration file in ~/.tscrape/tscraperc, see tscraperc…
+* Run tscrape_update
Using sfeed to convert the tscrape TSV output to an Atom feed:
- hurl 'https://twitter.com/namehere' | tscrape | \
+ tscrape < ~/.tscrape/feeds/name | \
awk 'BEGIN { OFS = FS = "\t"; }
{
print $1 OFS $4 OFS "https://twitter.com/" $6 "/status/" $5 \
diff --git a/json.c b/json.c
@@ -0,0 +1,313 @@
+#include <ctype.h>
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define GETNEXT getchar
+
+#include "json.h"
+
+static int /* returns the number of bytes stored in s: 0 to 4 */
+codepointtoutf8(long r, char *s) /* encode the code point r as UTF-8 into s; no terminating NUL is added */
+{
+ if (r == 0) {
+ return 0; /* NUL byte: nothing is stored for it */
+ } else if (r <= 0x7F) {
+ /* 1 byte: 0aaaaaaa */
+ s[0] = r;
+ return 1;
+ } else if (r <= 0x07FF) {
+ /* 2 bytes: 00000aaa aabbbbbb */
+ s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */
+ s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */
+ return 2;
+ } else if (r <= 0xFFFF) {
+ /* 3 bytes: aaaabbbb bbcccccc */
+ s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
+ s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */
+ s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */
+ return 3;
+ } else { /* every larger value takes this form; r is not range-checked */
+ /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
+ s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
+ s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
+ s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */
+ s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */
+ return 4;
+ }
+}
+
+static int /* returns the value 0-15 of the hexadecimal digit c */
+hexdigit(int c)
+{
+ if (c >= '0' && c <= '9')
+ return c - '0';
+ else if (c >= 'a' && c <= 'f')
+ return 10 + (c - 'a');
+ else if (c >= 'A' && c <= 'F')
+ return 10 + (c - 'A');
+ return 0; /* anything else is treated like '0'; no error is reported */
+}
+
+static int
+capacity(char **value, size_t *sz, size_t cur, size_t inc)
+{
+ size_t need, newsiz;
+ char *newp;
+
+ /* check for addition overflow */
+ if (cur > SIZE_MAX - inc) {
+ errno = EOVERFLOW;
+ return -1;
+ }
+ need = cur + inc;
+
+ if (need > *sz) {
+ if (need > SIZE_MAX / 2) {
+ newsiz = SIZE_MAX;
+ } else {
+ for (newsiz = *sz < 64 ? 64 : *sz; newsiz <= need; new…
+ ;
+ }
+ if (!(newp = realloc(*value, newsiz)))
+ return -1; /* up to caller to free *value */
+ *value = newp;
+ *sz = newsiz;
+ }
+ return 0;
+}
+
+#define EXPECT_VALUE "{[\"-0123456789tfn"
+#define EXPECT_STRING "\""
+#define EXPECT_END "}],"
+#define EXPECT_OBJECT_STRING EXPECT_STRING "}"
+#define EXPECT_OBJECT_KEY ":"
+#define EXPECT_ARRAY_VALUE EXPECT_VALUE "]"
+
+#define JSON_INVALID() do { ret = JSON_ERROR_INVALID; goto end; } while …
+
+int
+parsejson(void (*cb)(struct json_node *, size_t, const char *))
+{
+ struct json_node nodes[JSON_MAX_NODE_DEPTH] = { 0 };
+ size_t depth = 0, p = 0, len, sz = 0;
+ long cp, hi, lo;
+ char pri[128], *str = NULL;
+ int c, i, escape, iskey = 0, ret = JSON_ERROR_MEM;
+ const char *expect = EXPECT_VALUE;
+
+ if (capacity(&(nodes[0].name), &(nodes[0].namesiz), 0, 1) == -1)
+ goto end;
+ nodes[0].name[0] = '\0';
+
+ while (1) {
+ c = GETNEXT();
+handlechr:
+ if (c == EOF)
+ break;
+
+ /* skip JSON white-space, (NOTE: no \v, \f, \b etc) */
+ if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
+ continue;
+
+ if (!c || !strchr(expect, c))
+ JSON_INVALID();
+
+ switch (c) {
+ case ':':
+ iskey = 0;
+ expect = EXPECT_VALUE;
+ break;
+ case '"':
+ nodes[depth].type = JSON_TYPE_STRING;
+ escape = 0;
+ len = 0;
+ while (1) {
+ c = GETNEXT();
+chr:
+ /* EOF or control char: 0x7f is not defined as…
+ if (c < 0x20)
+ JSON_INVALID();
+
+ if (escape) {
+escchr:
+ escape = 0;
+ switch (c) {
+ case '"': /* FALLTHROUGH */
+ case '\\':
+ case '/': break;
+ case 'b': c = '\b'; break;
+ case 'f': c = '\f'; break;
+ case 'n': c = '\n'; break;
+ case 'r': c = '\r'; break;
+ case 't': c = '\t'; break;
+ case 'u': /* hex hex hex hex */
+ if (capacity(&str, &sz, len, 4…
+ goto end;
+ for (i = 12, cp = 0; i >= 0; i…
+ if ((c = GETNEXT()) ==…
+ JSON_INVALID()…
+ cp |= (hexdigit(c) << …
+ }
+ /* RFC8259 - 7. Strings - surr…
+ * 0xd800 - 0xdb7f - high surr…
+ if (cp >= 0xd800 && cp <= 0xdb…
+ if ((c = GETNEXT()) !=…
+ len += codepoi…
+ goto chr;
+ }
+ if ((c = GETNEXT()) !=…
+ len += codepoi…
+ goto escchr;
+ }
+ for (hi = cp, i = 12, …
+ if ((c = GETNE…
+ JSON_I…
+ lo |= (hexdigi…
+ }
+ /* 0xdc00 - 0xdfff - l…
+ if (lo >= 0xdc00 && lo…
+ cp = (hi << 10…
+ } else {
+ /* handle grac…
+ len += codepoi…
+ if (capacity(&…
+ goto e…
+ len += codepoi…
+ continue;
+ }
+ }
+ len += codepointtoutf8(cp, &st…
+ continue;
+ default:
+ JSON_INVALID(); /* invalid esc…
+ }
+ if (capacity(&str, &sz, len, 1) == -1)
+ goto end;
+ str[len++] = c;
+ } else if (c == '\\') {
+ escape = 1;
+ } else if (c == '"') {
+ if (capacity(&str, &sz, len, 1) == -1)
+ goto end;
+ str[len++] = '\0';
+
+ if (iskey) {
+ /* copy string as key, includi…
+ if (capacity(&(nodes[depth].na…
+ goto end;
+ memcpy(nodes[depth].name, str,…
+ } else {
+ cb(nodes, depth + 1, str);
+ }
+ break;
+ } else {
+ if (capacity(&str, &sz, len, 1) == -1)
+ goto end;
+ str[len++] = c;
+ }
+ }
+ if (iskey)
+ expect = EXPECT_OBJECT_KEY;
+ else
+ expect = EXPECT_END;
+ break;
+ case '[':
+ case '{':
+ if (depth + 1 >= JSON_MAX_NODE_DEPTH)
+ JSON_INVALID(); /* too deep */
+
+ nodes[depth].index = 0;
+ if (c == '[') {
+ nodes[depth].type = JSON_TYPE_ARRAY;
+ expect = EXPECT_ARRAY_VALUE;
+ } else if (c == '{') {
+ iskey = 1;
+ nodes[depth].type = JSON_TYPE_OBJECT;
+ expect = EXPECT_OBJECT_STRING;
+ }
+
+ cb(nodes, depth + 1, "");
+
+ depth++;
+ nodes[depth].index = 0;
+ if (capacity(&(nodes[depth].name), &(nodes[depth].name…
+ goto end;
+ nodes[depth].name[0] = '\0';
+ break;
+ case ']':
+ case '}':
+ if (!depth ||
+ (c == ']' && nodes[depth - 1].type != JSON_TYPE_ARR…
+ (c == '}' && nodes[depth - 1].type != JSON_TYPE_OBJ…
+ JSON_INVALID(); /* unbalanced nodes */
+
+ nodes[--depth].index++;
+ expect = EXPECT_END;
+ break;
+ case ',':
+ if (!depth)
+ JSON_INVALID(); /* unbalanced nodes */
+
+ nodes[depth - 1].index++;
+ if (nodes[depth - 1].type == JSON_TYPE_OBJECT) {
+ iskey = 1;
+ expect = EXPECT_STRING;
+ } else {
+ expect = EXPECT_VALUE;
+ }
+ break;
+ case 't': /* true */
+ if (GETNEXT() != 'r' || GETNEXT() != 'u' || GETNEXT() …
+ JSON_INVALID();
+ nodes[depth].type = JSON_TYPE_BOOL;
+ cb(nodes, depth + 1, "true");
+ expect = EXPECT_END;
+ break;
+ case 'f': /* false */
+ if (GETNEXT() != 'a' || GETNEXT() != 'l' || GETNEXT() …
+ GETNEXT() != 'e')
+ JSON_INVALID();
+ nodes[depth].type = JSON_TYPE_BOOL;
+ cb(nodes, depth + 1, "false");
+ expect = EXPECT_END;
+ break;
+ case 'n': /* null */
+ if (GETNEXT() != 'u' || GETNEXT() != 'l' || GETNEXT() …
+ JSON_INVALID();
+ nodes[depth].type = JSON_TYPE_NULL;
+ cb(nodes, depth + 1, "null");
+ expect = EXPECT_END;
+ break;
+ default: /* number */
+ nodes[depth].type = JSON_TYPE_NUMBER;
+ p = 0;
+ pri[p++] = c;
+ expect = EXPECT_END;
+ while (1) {
+ c = GETNEXT();
+ if (c == EOF ||
+ !c || !strchr("0123456789eE+-.", c) ||
+ p + 1 >= sizeof(pri)) {
+ pri[p] = '\0';
+ cb(nodes, depth + 1, pri);
+ goto handlechr; /* do not read next ch…
+ } else {
+ pri[p++] = c;
+ }
+ }
+ }
+ }
+ if (depth)
+ JSON_INVALID(); /* unbalanced nodes */
+
+ ret = 0; /* success */
+end:
+ for (depth = 0; depth < sizeof(nodes) / sizeof(nodes[0]); depth++)
+ free(nodes[depth].name);
+ free(str);
+
+ return ret;
+}
diff --git a/json.h b/json.h
@@ -0,0 +1,26 @@
+#include <stddef.h>
+
+enum JSONType { /* kind of a JSON node; every constant is a printable character */
+ JSON_TYPE_ARRAY = 'a',
+ JSON_TYPE_OBJECT = 'o',
+ JSON_TYPE_STRING = 's',
+ JSON_TYPE_BOOL = 'b',
+ JSON_TYPE_NULL = '?',
+ JSON_TYPE_NUMBER = 'n'
+};
+
+enum JSONError { /* negative results of parsejson(), which returns 0 on success */
+ JSON_ERROR_MEM = -2, /* a buffer could not be grown */
+ JSON_ERROR_INVALID = -1 /* malformed input, unbalanced nodes or nesting too deep */
+};
+
+#define JSON_MAX_NODE_DEPTH 64
+
+struct json_node { /* one level of the path from the root to the value passed to the callback */
+ enum JSONType type;
+ char *name; /* object key of this level, "" when none; allocated and freed by parsejson() */
+ size_t namesiz; /* allocated size of name */
+ size_t index; /* count/index for array or object type */
+};
+
+int parsejson(void (*cb)(struct json_node *, size_t, const char *));
diff --git a/tscrape.c b/tscrape.c
@@ -2,107 +2,52 @@
#include <ctype.h>
#include <err.h>
+#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>
+#include <time.h>
#include <unistd.h>
-#include "xml.h"
+#include "json.h"
#include "util.h"
#define STRP(s) s,sizeof(s)-1
-/* states */
-enum {
- Item = 1,
- Stream = 2,
- Header = 4,
- Timestamp = 8,
- Text = 16
+/* a tweet: one node of the singly linked list that starts at tweets;
+ nodes are allocated zero-filled and appended by addtweet() */
+struct tweet {
+ char fullname[1024]; /* user.name */
+ int ispinned; /* set to 1 by main() when itemid is one of pinnedids */
+ char itemusername[1024]; /* retweeted_status.user.screen_name */
+ char itemfullname[1024]; /* retweeted_status.user.name */
+ char full_text[4096]; /* full_text; the retweeted_status one when that is present */
+ char username[1024]; /* user.screen_name */
+ time_t timestamp; /* parsed created_at, -1 when not set */
+ char datatime[16]; /* not referenced by the code shown here */
+ char itemid[64]; /* id_str */
+ char retweetid[64]; /* retweeted_status.id_str */
+
+ struct tweet *next;
};
-/* data */
-static char fullname[1024];
-static int ispinned;
-static char itemusername[1024];
-static char itemfullname[1024];
-static char timestamp[16];
-static char text[4096];
-static char username[1024];
-
-static char classname[256];
-static char datatime[16];
-static char itemid[64];
-static char retweetid[64];
-static int state;
-static XMLParser p;
-
-static const char *ignorestate, *endtag;
-static int (*getnext)(void);
-
-/* return a space for all data until some case-insensitive string occurs. This
- is used to parse incorrect HTML/XML that contains unescaped HTML in script
- or style tags. If you see some </script> tag in a CDATA or comment
- section then e-mail W3C and tell them the web is too complex. */
-static inline int
-getnext_ignore(void)
-{
- int c;
-
- if ((c = getnext()) == EOF)
- return EOF;
+/* url entities and their replacements: one node of the singly linked list
+ that starts at urls, appended by addurl() */
+struct url {
+ char url[256]; /* text to look for in the tweet text */
+ size_t url_len; /* strlen(url), stored by addurl() */
+ char expanded_url[1024]; /* printed in place of url by printexpand() */
- if (tolower(c) == tolower((unsigned char)*ignorestate)) {
- ignorestate++;
- if (*ignorestate == '\0') {
- p.getnext = getnext; /* restore */
- return c;
- }
- } else {
- ignorestate = endtag;
- }
-
- return ' ';
-}
-
-static void
-printtweet(void)
-{
- char buf[32];
- time_t t;
-
- if (parsetime(timestamp, &t, buf, sizeof(buf)) != -1)
- printf("%lld", (long long)t);
- putchar('\t');
- printescape(username);
- putchar('\t');
- printescape(fullname);
- putchar('\t');
- printescape(text);
- putchar('\t');
- printescape(itemid);
- putchar('\t');
- printescape(itemusername);
- putchar('\t');
- printescape(itemfullname);
- putchar('\t');
- printescape(retweetid);
- putchar('\t');
- printf("%d", ispinned);
- putchar('\n');
-}
+ struct url *next;
+};
-static int
-isclassmatch(const char *classes, const char *clss, size_t len)
-{
- const char *p;
+static struct tweet *tweets, *tc;
+static struct url *urls, *uc;
+static char url[256];
- if (!(p = strstr(classes, clss)))
- return 0;
- return (p == classes || isspace((unsigned char)p[-1])) &&
- (isspace((unsigned char)p[len]) || !p[len]);
-}
+#define MAX_PINNED 5
+static char pinnedids[MAX_PINNED][64];
+static size_t npinned;
+#if 0
/* convert XML and some HTML entities */
static int
html_entitytostr(const char *s, char *buf, size_t bufsiz)
@@ -115,192 +60,378 @@ html_entitytostr(const char *s, char *buf, size_t bufsiz)
return (ssize_t)strlcpy(buf, " ", bufsiz);
return len;
}
+#endif
-static void
-xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
+long long
+datetounix(long long year, int mon, int day, int hour, int min, int sec)
{
- if (!strcmp(t, "p"))
- state &= ~Text;
- else if (!strcmp(t, "span"))
- state &= ~(Timestamp);
+ static const int secs_through_month[] = {
+ 0, 31 * 86400, 59 * 86400, 90 * 86400,
+ 120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400,
+ 243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 };
+ int is_leap = 0, cycles, centuries = 0, leaps = 0, rem;
+ long long t;
+
+ if (year - 2ULL <= 136) {
+ leaps = (year - 68) >> 2;
+ if (!((year - 68) & 3)) {
+ leaps--;
+ is_leap = 1;
+ } else {
+ is_leap = 0;
+ }
+ t = 31536000 * (year - 70) + 86400 * leaps;
+ } else {
+ cycles = (year - 100) / 400;
+ rem = (year - 100) % 400;
+ if (rem < 0) {
+ cycles--;
+ rem += 400;
+ }
+ if (!rem) {
+ is_leap = 1;
+ } else {
+ if (rem >= 300)
+ centuries = 3, rem -= 300;
+ else if (rem >= 200)
+ centuries = 2, rem -= 200;
+ else if (rem >= 100)
+ centuries = 1, rem -= 100;
+ if (rem) {
+ leaps = rem / 4U;
+ rem %= 4U;
+ is_leap = !rem;
+ }
+ }
+ leaps += 97 * cycles + 24 * centuries - is_leap;
+ t = (year - 100) * 31536000LL + leaps * 86400LL + 946684800 + …
+ }
+ t += secs_through_month[mon];
+ if (is_leap && mon >= 2)
+ t += 86400;
+ t += 86400LL * (day - 1);
+ t += 3600LL * hour;
+ t += 60LL * min;
+ t += sec;
+
+ return t;
}
-static void
-xmltagstart(XMLParser *x, const char *t, size_t tl)
+/* parse time format: "Wed May 27 04:12:34 +0000 2020"
+ assumes tz offset is "+0000": the offset field is read but never applied.
+ s: string to parse, leading white-space is skipped.
+ tp: receives the UNIX timestamp on success, may be NULL.
+ returns 0 on success, -1 when the string does not match the format,
+ the month name is unknown or a field is out of range. */
+static int
+parsetime(const char *s, time_t *tp)
{
- classname[0] = '\0';
+ static char *mons[] = {
+ "Jan", "Feb", "Mar", "Apr", "May", "Jun",
+ "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
+ };
+ int year, mon = 0, mday, hour, min, sec, i;
+ char tzbuf[6], monbuf[4], wdaybuf[4];
+
+ for (; *s && isspace((unsigned char)*s); s++)
+ ;
+ i = sscanf(s, "%3s %3s %02d %02d:%02d:%02d %5s %4d",
+ wdaybuf, monbuf, &mday, &hour, &min, &sec, tzbuf, &year);
+ if (i != 8) /* all 8 fields must have been converted */
+ return -1;
+ for (i = 0; i < sizeof(mons) / sizeof(mons[0]); i++) {
+ if (!strcmp(mons[i], monbuf)) {
+ mon = i + 1; /* 1-based, 0 means no month name matched */
+ break;
+ }
+ }
+ if (mon == 0)
+ return -1;
+
+ /* invalid range */
+ if (year < 0 || year > 9999 ||
+ mon < 1 || mon > 12 ||
+ mday < 1 || mday > 31 ||
+ hour < 0 || hour > 23 ||
+ min < 0 || min> 59 ||
+ sec < 0 || sec > 59)
+ return -1;
+
+ if (tp) /* the year is passed as years since 1900 and the month 0-based */
+ *tp = datetounix(year - 1900, mon - 1, mday, hour, min, sec);
+ return 0;
}
static void
-xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
+printescape(const char *s) /* write s to stdout, leaving out every byte iscntrl() reports as a control character */
{
- /* temporary replace the callback except the reader and end of tag
- restore the context once we receive the same ignored tag in the
- end tag handler */
- if (!strcasecmp(t, "script")) {
- ignorestate = endtag = "</script>";
- getnext = x->getnext; /* for restore */
- x->getnext = getnext_ignore;
- return;
- } else if (!strcasecmp(t, "style")) {
- ignorestate = endtag = "</style>";
- getnext = x->getnext; /* for restore */
- x->getnext = getnext_ignore;
- return;
+ for (; *s; s++) {
+ if (!iscntrl((unsigned char)*s))
+ putchar(*s);
}
-
- if (!strcmp(t, "p") && isclassmatch(classname, STRP("js-tweet-text")))…
- if (state & (Item | Stream | Header))
- state |= Text;
- } else if (!strcmp(t, "div") &&
- isclassmatch(classname, STRP("stream-item-footer"))) {
- if (text[0] && username[0])
- printtweet();
- state = 0;
- } else if (!strcmp(t, "li") &&
- isclassmatch(classname, STRP("js-stream-item"))) {
- if (state & Item)
- return;
- state |= Item;
- datatime[0] = text[0] = timestamp[0] = itemfullname[0] = '\0';
- itemid[0] = itemusername[0] = retweetid[0] = '\0';
- ispinned = 0;
- if (isclassmatch(classname, STRP("js-pinned")))
- ispinned = 1;
- } else if (state & Item) {
- if (!strcmp(t, "div") &&
- isclassmatch(classname, STRP("js-stream-tweet"))) {
- state &= ~(Text|Header);
- state |= Stream;
- } else if (!strcmp(t, "a") &&
- isclassmatch(classname, STRP("js-action-profile")))…
- state |= Header;
- } else if (!strcmp(t, "span") &&
- isclassmatch(classname, STRP("js-short-timestamp")))…
- state |= Timestamp;
- strlcpy(timestamp, datatime, sizeof(timestamp));
- datatime[0] = '\0';
- }
- }
- if ((state & Text) && !strcmp(t, "a") && !isspace((unsigned char)text[…
- strlcat(text, " ", sizeof(text));
}
+/* print text and expand urls */
static void
-xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
- const char *v, size_t vl)
+printexpand(const char *s)
{
- /* NOTE: assumes classname attribute is set before data-* in current t…
- if (!state && !strcmp(t, "div") && isclassmatch(classname, STRP("user-…
- if (!strcmp(a, "data-screen-name")) {
- strlcat(username, " ", sizeof(username));
- strlcat(username, v, sizeof(username));
- } else if (!strcmp(a, "data-name")) {
- strlcat(fullname, " ", sizeof(fullname));
- strlcat(fullname, v, sizeof(fullname));
- }
- }
-
- if (!strcmp(a, "class")) {
- strlcat(classname, v, sizeof(classname));
- } else if (state & Item) {
- if (!strcmp(t, "div")) {
- if (!strcmp(a, "data-item-id"))
- strlcpy(itemid, v, sizeof(itemid));
- else if (!strcmp(a, "data-retweet-id"))
- strlcpy(retweetid, v, sizeof(retweetid));
-
- if (isclassmatch(classname, STRP("js-stream-tweet"))) {
- if (!strcmp(a, "data-screen-name")) {
- strlcat(itemusername, " ", sizeof(item…
- strlcat(itemusername, v, sizeof(itemus…
- } else if (!strcmp(a, "data-name")) {
- strlcat(itemfullname, " ", sizeof(item…
- strlcat(itemfullname, v, sizeof(itemfu…
- }
+ struct url *u;
+
+ for (; *s; s++) {
+ if (iscntrl((unsigned char)*s))
+ continue;
+ for (u = urls; u; u = u->next) {
+ if (!strncmp(s, u->url, u->url_len)) {
+ s += u->url_len;
+ printescape(u->expanded_url);
+ break;
}
- } else if (!strcmp(t, "span") && !strcmp(a, "data-time")) {
- /* UNIX timestamp */
- strlcpy(datatime, v, sizeof(datatime));
- }
- /* NOTE: can be <div data-image-url>. */
- if (!strcmp(a, "data-image-url")) {
- strlcat(text, " ", sizeof(text));
- strlcat(text, v, sizeof(text));
- }
-
- /* indication it has a video */
- if (itemid[0] && !strcmp(a, "data-playable-media-url")) {
- strlcat(text, " ", sizeof(text));
- strlcat(text, "https://twitter.com/i/videos/", sizeof(…
- strlcat(text, itemid, sizeof(text));
}
+ if (!u)
+ putchar(*s);
}
}
static void
-xmlattrentity(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
- const char *v, size_t vl)
+printtweet(struct tweet *t) /* write t to stdout as one line of 9 TAB-separated fields */
{
- char buf[16];
- int len;
+ if (t->timestamp != -1) /* -1: no timestamp was stored, the field stays empty */
+ printf("%lld", (long long)t->timestamp);
+ putchar('\t');
+ printescape(t->username);
+ putchar('\t');
+ printescape(t->fullname);
+ putchar('\t');
+ printexpand(t->full_text); /* the only field that goes through url expansion */
+ putchar('\t');
+ printescape(t->itemid);
+ putchar('\t');
+ if (t->itemusername[0]) /* empty unless filled from retweeted_status.user */
+ printescape(t->itemusername);
+ else
+ printescape(t->username);
+ putchar('\t');
+ if (t->itemfullname[0]) /* same fallback as for itemusername */
+ printescape(t->itemfullname);
+ else
+ printescape(t->fullname);
+ putchar('\t');
+ printescape(t->retweetid);
+ putchar('\t');
+ printf("%d", t->ispinned);
+ putchar('\n');
+}
- if (!state)
+/* remember the id of a pinned tweet so main() can flag the matching tweet.
+ str is copied (and truncated to fit) into the next free slot of pinnedids;
+ ids that arrive once all MAX_PINNED slots are used are ignored. */
+void
+addpinned(const char *str)
+{
+ /* npinned is the number of used slots, so the table is only full
+ once it reaches MAX_PINNED; testing npinned + 1 wasted the last slot */
+ if (npinned >= MAX_PINNED)
return;
- if ((len = html_entitytostr(v, buf, sizeof(buf))) > 0)
- xmlattr(x, t, tl, a, al, buf, (size_t)len);
- else
- xmlattr(x, t, tl, a, al, v, vl);
+ strlcpy(pinnedids[npinned], str, sizeof(pinnedids[0]));
+ npinned++;
}
-static void
-xmldata(XMLParser *x, const char *d, size_t dl)
+void
+addtweet(void) /* append a new zero-filled tweet to the tweets list and make it the current one (tc) */
{
- if (state & Text) {
- if (!isclassmatch(classname, STRP("u-hidden")))
- strlcat(text, d, sizeof(text));
- }
+ struct tweet *t;
+
+ if (!(t = calloc(1, sizeof(*t))))
+ err(1, "calloc"); /* err() with status 1: allocation failure is not reported to the caller */
+ t->timestamp = -1; /* marks "no timestamp" until one is parsed */
+ if (tweets)
+ tc = tc->next = t; /* link after the current tail and advance it */
+ else
+ tweets = tc = t; /* first node: head and tail at once */
}
-static void
-xmldataentity(XMLParser *x, const char *d, size_t dl)
+void
+addurl(const char *url, const char *expanded_url) /* append a url -> expanded_url pair to the urls list; both strings are copied and truncated to the field sizes */
{
- char buf[16];
- int len;
+ struct url *u;
- if (!(state & Text))
- return;
- if ((len = html_entitytostr(d, buf, sizeof(buf))) > 0)
- xmldata(x, buf, (size_t)len);
+ if (!(u = calloc(1, sizeof(*u))))
+ err(1, "calloc");
+ strlcpy(u->url, url, sizeof(u->url)); /* the url parameter hides the file-scope url buffer in this function */
+ u->url_len = strlen(u->url); /* length of the stored, possibly truncated, copy */
+ strlcpy(u->expanded_url, expanded_url, sizeof(u->expanded_url));
+
+ if (urls)
+ uc = uc->next = u; /* link after the current tail and advance it */
else
- xmldata(x, d, dl);
+ urls = uc = u; /* first node: head and tail at once */
}
-static void
-xmlcdata(XMLParser *x, const char *d, size_t dl)
+void
+processnodes(struct json_node *nodes, size_t depth, const char *str)
{
- xmldata(x, d, dl);
+ if (depth == 2 &&
+ nodes[0].type == JSON_TYPE_ARRAY &&
+ nodes[1].type == JSON_TYPE_OBJECT) {
+ addtweet();
+ }
+
+ if (tc) {
+ if (depth == 3 &&
+ nodes[0].type == JSON_TYPE_ARRAY &&
+ nodes[1].type == JSON_TYPE_OBJECT &&
+ nodes[2].type == JSON_TYPE_STRING) {
+ if (!strcmp(nodes[2].name, "created_at")) {
+ parsetime(str, &tc->timestamp);
+ } else if (!strcmp(nodes[2].name, "id_str")) {
+ strlcpy(tc->itemid, str, sizeof(tc->itemid));
+ } else if (!strcmp(nodes[2].name, "full_text")) {
+ /* if set by retweet text don't override */
+ if (!tc->full_text[0])
+ strlcpy(tc->full_text, str, sizeof(tc-…
+ }
+ }
+ if (depth == 4 &&
+ nodes[0].type == JSON_TYPE_ARRAY &&
+ nodes[1].type == JSON_TYPE_OBJECT &&
+ nodes[2].type == JSON_TYPE_OBJECT &&
+ !strcmp(nodes[2].name, "user")) {
+ if (nodes[3].type == JSON_TYPE_STRING) {
+ if (!strcmp(nodes[3].name, "name")) {
+ strlcpy(tc->fullname, str, sizeof(tc->…
+ } else if (!strcmp(nodes[3].name, "screen_name…
+ strlcpy(tc->username, str, sizeof(tc->…
+ }
+ }
+ }
+
+ if (depth == 4 &&
+ nodes[0].type == JSON_TYPE_ARRAY &&
+ nodes[1].type == JSON_TYPE_OBJECT &&
+ nodes[2].type == JSON_TYPE_OBJECT &&
+ nodes[3].type == JSON_TYPE_STRING &&
+ !strcmp(nodes[2].name, "retweeted_status")) {
+ if (!strcmp(nodes[3].name, "id_str")) {
+// printf("DEBUG: retweet: id: %s\n", str);
+ strlcpy(tc->retweetid, str, sizeof(tc->retweet…
+ } else if (!strcmp(nodes[3].name, "full_text")) {
+ strlcpy(tc->full_text, str, sizeof(tc->full_te…
+// printf("DEBUG: retweet: full_text: %s\n", st…
+ }
+ }
+
+ if (depth == 5 &&
+ nodes[0].type == JSON_TYPE_ARRAY &&
+ nodes[1].type == JSON_TYPE_OBJECT &&
+ nodes[2].type == JSON_TYPE_OBJECT &&
+ nodes[3].type == JSON_TYPE_OBJECT &&
+ nodes[4].type == JSON_TYPE_STRING &&
+ !strcmp(nodes[2].name, "retweeted_status") &&
+ !strcmp(nodes[3].name, "user")) {
+ if (!strcmp(nodes[4].name, "name")) {
+ strlcpy(tc->itemfullname, str, sizeof(tc->item…
+// printf("DEBUG: retweeted_status.user.name: %…
+ } else if (!strcmp(nodes[4].name, "screen_name")) {
+ strlcpy(tc->itemusername, str, sizeof(tc->item…
+// printf("DEBUG: retweeted_status.user.screen_…
+ }
+ }
+ }
+
+ if (depth == 5 &&
+ nodes[0].type == JSON_TYPE_ARRAY &&
+ nodes[1].type == JSON_TYPE_OBJECT &&
+ nodes[2].type == JSON_TYPE_OBJECT &&
+ !strcmp(nodes[2].name, "user")) {
+ if (nodes[3].type == JSON_TYPE_ARRAY &&
+ !strcmp(nodes[3].name, "pinned_tweet_ids")) {
+ if (nodes[4].type == JSON_TYPE_NUMBER) {
+ addpinned(str);
+// printf("DEBUG: pinned_tweets_ids[%zu]: %s\n",
+// nodes[4].index, str);
+ }
+ }
+ }
+
+ if (depth == 6 &&
+ nodes[0].type == JSON_TYPE_ARRAY &&
+ nodes[1].type == JSON_TYPE_OBJECT &&
+ nodes[2].type == JSON_TYPE_OBJECT &&
+ nodes[3].type == JSON_TYPE_ARRAY &&
+ nodes[4].type == JSON_TYPE_OBJECT &&
+ nodes[5].type == JSON_TYPE_STRING &&
+ !strcmp(nodes[2].name, "entities") &&
+ !strcmp(nodes[3].name, "urls")) {
+ if (!strcmp(nodes[5].name, "url")) {
+// printf("DEBUG: url: %s\n", str);
+ strlcpy(url, str, sizeof(url));
+ } else if (!strcmp(nodes[5].name, "expanded_url")) {
+// printf("DEBUG: expanded_url: %s\n", str);
+ /* assumes "expanded_url" is specified after "url" */
+ addurl(url, str);
+ url[0] = '\0';
+ }
+ }
+
+ /* [].entities.media[].url */
+ if (depth == 6 &&
+ nodes[0].type == JSON_TYPE_ARRAY &&
+ nodes[1].type == JSON_TYPE_OBJECT &&
+ nodes[2].type == JSON_TYPE_OBJECT &&
+ nodes[3].type == JSON_TYPE_ARRAY &&
+ nodes[4].type == JSON_TYPE_OBJECT &&
+ nodes[5].type == JSON_TYPE_STRING &&
+ !strcmp(nodes[2].name, "entities") &&
+ !strcmp(nodes[3].name, "media")) {
+ if (!strcmp(nodes[5].name, "url")) {
+// printf("DEBUG: url: %s\n", str);
+ strlcpy(url, str, sizeof(url));
+ } else if (!strcmp(nodes[5].name, "expanded_url")) {
+// printf("DEBUG: expanded_url: %s\n", str);
+ /* assumes "expanded_url" is specified after "url" */
+ addurl(url, str);
+ url[0] = '\0';
+ }
+ }
+
+// TODO: retweeted.status.entities.urls[]
+#if 0
+ if (depth == 6 &&
+ nodes[0].type == JSON_TYPE_ARRAY &&
+ nodes[1].type == JSON_TYPE_OBJECT &&
+ nodes[2].type == JSON_TYPE_OBJECT &&
+ nodes[3].type == JSON_TYPE_OBJECT &&
+ nodes[4].type == JSON_TYPE_ARRAY &&
+ nodes[5].type == JSON_TYPE_STRING &&
+ !strcmp(nodes[2].name, "retweeted_status") &&
+ !strcmp(nodes[3].name, "entities") &&
+ !strcmp(nodes[4].name, "urls")) {
+ if (!strcmp(nodes[5].name, "url")) {
+ printf("DEBUG: url: %s\n", str);
+ } else if (!strcmp(nodes[5].name, "expanded_url")) {
+ printf("DEBUG: expanded_url: %s\n", str);
+ }
+ }
+#endif
}
int
main(void)
{
+ struct tweet *t;
+ size_t i;
+ int r;
+
if (pledge("stdio", NULL) == -1)
err(1, "pledge");
- /* handlers */
- p.xmlattr = xmlattr;
- p.xmlattrentity = xmlattrentity;
- p.xmlcdata = xmlcdata;
- p.xmldata = xmldata;
- p.xmldataentity = xmldataentity;
- p.xmltagstart = xmltagstart;
- p.xmltagend = xmltagend;
- p.xmltagstartparsed = xmltagstartparsed;
- /* reader (stdin) */
- p.getnext = getchar;
-
- xml_parse(&p);
+ r = parsejson(processnodes);
+ if (r != 0)
+ errx(1, "invalid JSON");
+
+ // TODO: TEST: make sure the last tweet is printed too (addtweet() log…
+ for (t = tweets; t; t = t->next) {
+ /* check for pinned tweets */
+ for (i = 0; i < npinned; i++) {
+ if (!strcmp(t->itemid, pinnedids[i])) {
+// printf("DEBUG: pinned: %s\n", pinnedids[i]);
+ t->ispinned = 1;
+ break;
+ }
+ }
+ printtweet(t);
+ }
return 0;
}
diff --git a/tscrape_plain.c b/tscrape_plain.c
@@ -51,7 +51,7 @@ printfeed(FILE *fp, const char *feedname)
printutf8pad(stdout, fields[FieldItemFullname], 25, ' ');
fputs(" ", stdout);
- printescape(fields[FieldText]);
+ fputs(fields[FieldText], stdout);
putchar('\n');
}
}
diff --git a/tscrape_update b/tscrape_update
@@ -9,6 +9,12 @@ tscrapepath="$HOME/.tscrape/feeds"
# feeds are finished at a time.
maxjobs=8
+# Twitter authentication bearer (seems to be static).
+bearer="AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk…
+
+# guest token.
+token=""
+
# load config (evaluate shellscript).
# loadconfig(configfile)
loadconfig() {
@@ -36,12 +42,26 @@ log() {
printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2
}
+# acquire guest token.
+# guesttoken()
+guesttoken() {
+ # fail on redirects, hide User-Agent, timeout is 15 seconds.
+ curl -X POST -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
+ -H "Authorization: Bearer ${bearer}" \
+ 'https://api.twitter.com/1.1/guest/activate.json' 2>/dev/null …
+ sed -nE 's@.*{"guest_token":"([^"]*)"}.*@\1@p'
+}
+
# fetch a feed via HTTP/HTTPS etc.
-# fetch(name, url, feedfile)
+# fetch(name, twittername, feedfile)
fetch() {
+ url="https://api.twitter.com/1.1/statuses/user_timeline.json?screen_na…
+
# fail on redirects, hide User-Agent, timeout is 15 seconds.
curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
- "$2" 2>/dev/null
+ -H "Authorization: Bearer ${bearer}" \
+ -H "x-guest-token: $token" \
+ "${url}" 2>/dev/null
}
# filter fields.
@@ -151,6 +171,13 @@ feeds() {
echo "See tscraperc.example for an example." >&2
}
+# get guest token.
+token=$(guesttoken)
+if [ -z "${token}" ]; then
+ echo "Failed to acquire guest token" >&2
+ exit 1
+fi
+
# job counter.
curjobs=0
# signal number received for parent.
diff --git a/tscraperc.example b/tscraperc.example
@@ -2,8 +2,8 @@
# list of feeds to fetch:
feeds() {
- # feed <name> <feedurl>
- feed "Rich Felker" "https://twitter.com/richfelker"
- feed "Internet of shit" "https://twitter.com/internetofshit"
- feed "Donald Trump" "https://twitter.com/realdonaldtrump"
+ # feed <name> <twittername>
+ feed "Rich Felker" "richfelker"
+ feed "Internet of shit" "internetofshit"
+ feed "Donald Trump" "realdonaldtrump"
}
diff --git a/util.c b/util.c
@@ -106,43 +106,3 @@ printutf8pad(FILE *fp, const char *s, size_t len, int pad)
for (; col < len; ++col)
putc(pad, fp);
}
-
-void
-printescape(const char *s)
-{
- int r;
- const char *e;
-
- /* strip leading and trailing white-space */
- for (; *s && isspace((unsigned char)*s); s++)
- ;
- for (e = s + strlen(s); e > s && isspace((unsigned char)*(e - 1)); e--)
- ;
-
- for (r = 0; *s && s < e; s++) {
- if (iscntrl((unsigned char)*s) || isspace((unsigned char)*s)) {
- r = 1;
- continue;
- }
- if (r) {
- r = 0;
- putchar(' ');
- }
- putchar(*s);
- }
-}
-
-int
-parsetime(const char *s, time_t *t, char *buf, size_t bufsiz)
-{
- struct tm *tm;
-
- if (strtotime(s, t))
- return -1;
- if (!(tm = localtime(t)))
- return -1;
- if (!strftime(buf, bufsiz, "%Y-%m-%d %H:%M", tm))
- return -1;
-
- return 0;
-}
diff --git a/util.h b/util.h
@@ -30,8 +30,6 @@ enum {
};
size_t parseline(char *, char *[FieldLast]);
-int parsetime(const char *, time_t *, char *, size_t);
-void printescape(const char *);
void printutf8pad(FILE *, const char *, size_t, int);
int strtotime(const char *, time_t *);
void xmlencode(const char *, FILE *);
diff --git a/xml.c b/xml.c
@@ -1,451 +0,0 @@
-#include <ctype.h>
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "xml.h"
-
-static void
-xml_parseattrs(XMLParser *x)
-{
- size_t namelen = 0, valuelen;
- int c, endsep, endname = 0, valuestart = 0;
-
- while ((c = GETNEXT()) != EOF) {
- if (isspace(c)) {
- if (namelen)
- endname = 1;
- continue;
- } else if (c == '?')
- ; /* ignore */
- else if (c == '=') {
- x->name[namelen] = '\0';
- valuestart = 1;
- endname = 1;
- } else if (namelen && ((endname && !valuestart && isalpha(c)) …
- /* attribute without value */
- x->name[namelen] = '\0';
- if (x->xmlattrstart)
- x->xmlattrstart(x, x->tag, x->taglen, x->name,…
- if (x->xmlattr)
- x->xmlattr(x, x->tag, x->taglen, x->name, name…
- if (x->xmlattrend)
- x->xmlattrend(x, x->tag, x->taglen, x->name, n…
- endname = 0;
- x->name[0] = c;
- namelen = 1;
- } else if (namelen && valuestart) {
- /* attribute with value */
- if (x->xmlattrstart)
- x->xmlattrstart(x, x->tag, x->taglen, x->name,…
-
- valuelen = 0;
- if (c == '\'' || c == '"') {
- endsep = c;
- } else {
- endsep = ' '; /* isspace() */
- goto startvalue;
- }
-
- while ((c = GETNEXT()) != EOF) {
-startvalue:
- if (c == '&') { /* entities */
- x->data[valuelen] = '\0';
- /* call data function with data before…
- if (valuelen && x->xmlattr)
- x->xmlattr(x, x->tag, x->tagle…
- x->data[0] = c;
- valuelen = 1;
- while ((c = GETNEXT()) != EOF) {
- if (c == endsep || (endsep == …
- break;
- if (valuelen < sizeof(x->data)…
- x->data[valuelen++] = …
- else {
- /* entity too long for…
- x->data[valuelen] = '\…
- if (x->xmlattr)
- x->xmlattr(x, …
- x->data[0] = c;
- valuelen = 1;
- break;
- }
- if (c == ';') {
- x->data[valuelen] = '\…
- if (x->xmlattrentity)
- x->xmlattrenti…
- valuelen = 0;
- break;
- }
- }
- } else if (c != endsep && !(endsep == ' ' && (…
- if (valuelen < sizeof(x->data) - 1) {
- x->data[valuelen++] = c;
- } else {
- x->data[valuelen] = '\0';
- if (x->xmlattr)
- x->xmlattr(x, x->tag, …
- x->data[0] = c;
- valuelen = 1;
- }
- }
- if (c == endsep || (endsep == ' ' && (c == '>'…
- x->data[valuelen] = '\0';
- if (x->xmlattr)
- x->xmlattr(x, x->tag, x->tagle…
- if (x->xmlattrend)
- x->xmlattrend(x, x->tag, x->ta…
- break;
- }
- }
- namelen = endname = valuestart = 0;
- } else if (namelen < sizeof(x->name) - 1) {
- x->name[namelen++] = c;
- }
- if (c == '>') {
- break;
- } else if (c == '/') {
- x->isshorttag = 1;
- x->name[0] = '\0';
- namelen = 0;
- }
- }
-}
-
-static void
-xml_parsecomment(XMLParser *x)
-{
- size_t datalen = 0, i = 0;
- int c;
-
- if (x->xmlcommentstart)
- x->xmlcommentstart(x);
- while ((c = GETNEXT()) != EOF) {
- if (c == '-' || c == '>') {
- if (x->xmlcomment && datalen) {
- x->data[datalen] = '\0';
- x->xmlcomment(x, x->data, datalen);
- datalen = 0;
- }
- }
-
- if (c == '-') {
- if (++i > 2) {
- if (x->xmlcomment)
- for (; i > 2; i--)
- x->xmlcomment(x, "-", 1);
- i = 2;
- }
- continue;
- } else if (c == '>' && i == 2) {
- if (x->xmlcommentend)
- x->xmlcommentend(x);
- return;
- } else if (i) {
- if (x->xmlcomment) {
- for (; i > 0; i--)
- x->xmlcomment(x, "-", 1);
- }
- i = 0;
- }
-
- if (datalen < sizeof(x->data) - 1) {
- x->data[datalen++] = c;
- } else {
- x->data[datalen] = '\0';
- if (x->xmlcomment)
- x->xmlcomment(x, x->data, datalen);
- x->data[0] = c;
- datalen = 1;
- }
- }
-}
-
-static void
-xml_parsecdata(XMLParser *x)
-{
- size_t datalen = 0, i = 0;
- int c;
-
- if (x->xmlcdatastart)
- x->xmlcdatastart(x);
- while ((c = GETNEXT()) != EOF) {
- if (c == ']' || c == '>') {
- if (x->xmlcdata && datalen) {
- x->data[datalen] = '\0';
- x->xmlcdata(x, x->data, datalen);
- datalen = 0;
- }
- }
-
- if (c == ']') {
- if (++i > 2) {
- if (x->xmlcdata)
- for (; i > 2; i--)
- x->xmlcdata(x, "]", 1);
- i = 2;
- }
- continue;
- } else if (c == '>' && i == 2) {
- if (x->xmlcdataend)
- x->xmlcdataend(x);
- return;
- } else if (i) {
- if (x->xmlcdata)
- for (; i > 0; i--)
- x->xmlcdata(x, "]", 1);
- i = 0;
- }
-
- if (datalen < sizeof(x->data) - 1) {
- x->data[datalen++] = c;
- } else {
- x->data[datalen] = '\0';
- if (x->xmlcdata)
- x->xmlcdata(x, x->data, datalen);
- x->data[0] = c;
- datalen = 1;
- }
- }
-}
-
-static int
-codepointtoutf8(long r, char *s)
-{
- if (r == 0) {
- return 0; /* NUL byte */
- } else if (r <= 0x7F) {
- /* 1 byte: 0aaaaaaa */
- s[0] = r;
- return 1;
- } else if (r <= 0x07FF) {
- /* 2 bytes: 00000aaa aabbbbbb */
- s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */
- s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */
- return 2;
- } else if (r <= 0xFFFF) {
- /* 3 bytes: aaaabbbb bbcccccc */
- s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
- s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */
- s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */
- return 3;
- } else {
- /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
- s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
- s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
- s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */
- s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */
- return 4;
- }
-}
-
-static int
-namedentitytostr(const char *e, char *buf, size_t bufsiz)
-{
- static const struct {
- const char *entity;
- int c;
- } entities[] = {
- { "amp;", '&' },
- { "lt;", '<' },
- { "gt;", '>' },
- { "apos;", '\'' },
- { "quot;", '"' },
- };
- size_t i;
-
- /* buffer is too small */
- if (bufsiz < 2)
- return -1;
-
- for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
- if (!strcmp(e, entities[i].entity)) {
- buf[0] = entities[i].c;
- buf[1] = '\0';
- return 1;
- }
- }
- return -1;
-}
-
-static int
-numericentitytostr(const char *e, char *buf, size_t bufsiz)
-{
- long l;
- int len;
- char *end;
-
- /* buffer is too small */
- if (bufsiz < 5)
- return -1;
-
- errno = 0;
- /* hex (16) or decimal (10) */
- if (*e == 'x')
- l = strtol(++e, &end, 16);
- else
- l = strtol(e, &end, 10);
- /* invalid value or not a well-formed entity or invalid code point */
- if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff)
- return -1;
- len = codepointtoutf8(l, buf);
- buf[len] = '\0';
-
- return len;
-}
-
-/* convert named- or numeric entity string to buffer string
- * returns byte-length of string or -1 on failure. */
-int
-xml_entitytostr(const char *e, char *buf, size_t bufsiz)
-{
- /* doesn't start with & */
- if (e[0] != '&')
- return -1;
- /* numeric entity */
- if (e[1] == '#')
- return numericentitytostr(e + 2, buf, bufsiz);
- else /* named entity */
- return namedentitytostr(e + 1, buf, bufsiz);
-}
-
-void
-xml_parse(XMLParser *x)
-{
- size_t datalen, tagdatalen;
- int c, isend;
-
- while ((c = GETNEXT()) != EOF && c != '<')
- ; /* skip until < */
-
- while (c != EOF) {
- if (c == '<') { /* parse tag */
- if ((c = GETNEXT()) == EOF)
- return;
-
- if (c == '!') { /* cdata and comments */
- for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
- /* NOTE: sizeof(x->data) must be atlea…
- if (tagdatalen <= sizeof("[CDATA[") - …
- x->data[tagdatalen++] = c;
- if (c == '>')
- break;
- else if (c == '-' && tagdatalen == siz…
- (x->data[0] == '-')) {
- xml_parsecomment(x);
- break;
- } else if (c == '[') {
- if (tagdatalen == sizeof("[CDA…
- !strncmp(x->data, "[CDATA[…
- xml_parsecdata(x);
- break;
- }
- }
- }
- } else {
- /* normal tag (open, short open, close), proce…
- x->tag[0] = c;
- x->taglen = 1;
- x->isshorttag = isend = 0;
-
- /* treat processing instruction as shorttag, d…
- if (c == '?') {
- x->isshorttag = 1;
- } else if (c == '/') {
- if ((c = GETNEXT()) == EOF)
- return;
- x->tag[0] = c;
- isend = 1;
- }
-
- while ((c = GETNEXT()) != EOF) {
- if (c == '/')
- x->isshorttag = 1; /* short ta…
- else if (c == '>' || isspace(c)) {
- x->tag[x->taglen] = '\0';
- if (isend) { /* end tag, start…
- if (x->xmltagend)
- x->xmltagend(x…
- x->tag[0] = '\0';
- x->taglen = 0;
- } else {
- /* start tag */
- if (x->xmltagstart)
- x->xmltagstart…
- if (isspace(c))
- xml_parseattrs…
- if (x->xmltagstartpars…
- x->xmltagstart…
- }
- /* call tagend for shortform o…
- if (x->isshorttag) {
- if (x->xmltagend)
- x->xmltagend(x…
- x->tag[0] = '\0';
- x->taglen = 0;
- }
- break;
- } else if (x->taglen < sizeof(x->tag) …
- x->tag[x->taglen++] = c; /* NO…
- }
- }
- } else {
- /* parse tag data */
- datalen = 0;
- if (x->xmldatastart)
- x->xmldatastart(x);
- while ((c = GETNEXT()) != EOF) {
- if (c == '&') {
- if (datalen) {
- x->data[datalen] = '\0';
- if (x->xmldata)
- x->xmldata(x, x->data,…
- }
- x->data[0] = c;
- datalen = 1;
- while ((c = GETNEXT()) != EOF) {
- if (c == '<')
- break;
- if (datalen < sizeof(x->data) …
- x->data[datalen++] = c;
- else {
- /* entity too long for…
- x->data[datalen] = '\0…
- if (x->xmldata)
- x->xmldata(x, …
- x->data[0] = c;
- datalen = 1;
- break;
- }
- if (c == ';') {
- x->data[datalen] = '\0…
- if (x->xmldataentity)
- x->xmldataenti…
- datalen = 0;
- break;
- }
- }
- } else if (c != '<') {
- if (datalen < sizeof(x->data) - 1) {
- x->data[datalen++] = c;
- } else {
- x->data[datalen] = '\0';
- if (x->xmldata)
- x->xmldata(x, x->data,…
- x->data[0] = c;
- datalen = 1;
- }
- }
- if (c == '<') {
- x->data[datalen] = '\0';
- if (x->xmldata && datalen)
- x->xmldata(x, x->data, datalen…
- if (x->xmldataend)
- x->xmldataend(x);
- break;
- }
- }
- }
- }
-}
diff --git a/xml.h b/xml.h
@@ -1,49 +0,0 @@
-#ifndef _XML_H
-#define _XML_H
-
-#include <stdio.h>
-
-typedef struct xmlparser {
- /* handlers */
- void (*xmlattr)(struct xmlparser *, const char *, size_t,
- const char *, size_t, const char *, size_t);
- void (*xmlattrend)(struct xmlparser *, const char *, size_t,
- const char *, size_t);
- void (*xmlattrstart)(struct xmlparser *, const char *, size_t,
- const char *, size_t);
- void (*xmlattrentity)(struct xmlparser *, const char *, size_t,
- const char *, size_t, const char *, size_t);
- void (*xmlcdatastart)(struct xmlparser *);
- void (*xmlcdata)(struct xmlparser *, const char *, size_t);
- void (*xmlcdataend)(struct xmlparser *);
- void (*xmlcommentstart)(struct xmlparser *);
- void (*xmlcomment)(struct xmlparser *, const char *, size_t);
- void (*xmlcommentend)(struct xmlparser *);
- void (*xmldata)(struct xmlparser *, const char *, size_t);
- void (*xmldataend)(struct xmlparser *);
- void (*xmldataentity)(struct xmlparser *, const char *, size_t);
- void (*xmldatastart)(struct xmlparser *);
- void (*xmltagend)(struct xmlparser *, const char *, size_t, int);
- void (*xmltagstart)(struct xmlparser *, const char *, size_t);
- void (*xmltagstartparsed)(struct xmlparser *, const char *,
- size_t, int);
-
-#ifndef GETNEXT
- #define GETNEXT (x)->getnext
- int (*getnext)(void);
-#endif
-
- /* current tag */
- char tag[1024];
- size_t taglen;
- /* current tag is in short form ? <tag /> */
- int isshorttag;
- /* current attribute name */
- char name[1024];
- /* data buffer used for tag data, cdata and attribute data */
- char data[BUFSIZ];
-} XMLParser;
-
-int xml_entitytostr(const char *, char *, size_t);
-void xml_parse(XMLParser *);
-#endif
You are viewing proxied material from codemadness.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.