work-in-progress: support the new Twitter site - tscrape - twitter scraper | |
git clone git://git.codemadness.org/tscrape | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit c3e76b0f57c58b284cd13ce008c082525c8ee28a | |
parent 663dab7d9883a291ed570a743fb89a16e1a01d85 | |
Author: Hiltjo Posthuma <[email protected]> | |
Date: Fri, 5 Jun 2020 14:51:58 +0200 | |
work-in-progress: support the new Twitter site | |
Scraping doesn't work anymore. Use the Twitter JSON API. | |
Major thanks to leot for helping with this. | |
Diffstat: | |
M Makefile | 18 +++++++++--------- | |
M README | 28 ++++++++++++---------------- | |
A json.c | 313 +++++++++++++++++++++++++++++… | |
A json.h | 26 ++++++++++++++++++++++++++ | |
M tscrape.c | 591 +++++++++++++++++++----------… | |
M tscrape_plain.c | 2 +- | |
M tscrape_update | 31 +++++++++++++++++++++++++++++… | |
M tscraperc.example | 8 ++++---- | |
M util.c | 40 -----------------------------… | |
M util.h | 2 -- | |
D xml.c | 451 -----------------------------… | |
D xml.h | 49 -----------------------------… | |
12 files changed, 755 insertions(+), 804 deletions(-) | |
--- | |
diff --git a/Makefile b/Makefile | |
@@ -25,17 +25,17 @@ SCRIPTS = \ | |
SRC = ${BIN:=.c} | |
HDR = \ | |
util.h\ | |
- xml.h | |
+ json.h | |
LIBUTIL = libutil.a | |
LIBUTILSRC = \ | |
util.c | |
LIBUTILOBJ = ${LIBUTILSRC:.c=.o} | |
-LIBXML = libxml.a | |
-LIBXMLSRC = \ | |
- xml.c | |
-LIBXMLOBJ = ${LIBXMLSRC:.c=.o} | |
+LIBJSON = libjson.a | |
+LIBJSONSRC = \ | |
+ json.c | |
+LIBJSONOBJ = ${LIBJSONSRC:.c=.o} | |
COMPATSRC = \ | |
strlcat.c\ | |
@@ -44,7 +44,7 @@ COMPATOBJ =\ | |
strlcat.o\ | |
strlcpy.o | |
-LIB = ${LIBUTIL} ${LIBXML} ${COMPATOBJ} | |
+LIB = ${LIBUTIL} ${LIBJSON} ${COMPATOBJ} | |
MAN1 = ${BIN:=.1}\ | |
${SCRIPTS:=.1} | |
@@ -59,7 +59,7 @@ all: $(BIN) | |
${BIN}: ${LIB} ${@:=.o} | |
-OBJ = ${SRC:.c=.o} ${LIBXMLOBJ} ${LIBUTILOBJ} ${COMPATOBJ} | |
+OBJ = ${SRC:.c=.o} ${LIBJSONOBJ} ${LIBUTILOBJ} ${COMPATOBJ} | |
${OBJ}: ${HDR} | |
@@ -73,7 +73,7 @@ ${LIBUTIL}: ${LIBUTILOBJ} | |
${AR} rc $@ $? | |
${RANLIB} $@ | |
-${LIBXML}: ${LIBXMLOBJ} | |
+${LIBJSON}: ${LIBJSONOBJ} | |
${AR} rc $@ $? | |
${RANLIB} $@ | |
@@ -81,7 +81,7 @@ dist: | |
rm -rf "${NAME}-${VERSION}" | |
mkdir -p "${NAME}-${VERSION}" | |
cp -f ${MAN1} ${MAN5} ${DOC} ${HDR} \ | |
- ${SRC} ${LIBXMLSRC} ${LIBUTILSRC} ${COMPATSRC} ${SCRIPTS} \ | |
+ ${SRC} ${LIBJSONSRC} ${LIBUTILSRC} ${COMPATSRC} ${SCRIPTS} \ | |
Makefile \ | |
tscraperc.example style.css \ | |
"${NAME}-${VERSION}" | |
diff --git a/README b/README | |
@@ -1,13 +1,16 @@ | |
tscrape | |
------- | |
-Twitter feed HTML scraper. | |
+Twitter feed parser. | |
-It scrapes HTML from stdin and outputs it to a TAB-separated format that can be | |
-easier parsed with various (UNIX) tools. There are formatting programs included | |
-to convert this TAB-separated format to various other formats. There are also | |
-some programs and scripts included to import and export OPML and to fetch, | |
-filter, merge and order items. | |
+It parses JSON from stdin and outputs it to a TAB-separated format that can be | |
+processed more easily with various (UNIX) tools. There are formatting programs | |
+included to convert this TAB-separated format to various other formats. There | |
+are also some programs and scripts included to import and export OPML and to | |
+fetch, filter, merge and order items. | |
+ | |
+The name tscrape is used because it used to scrape the HTML from the Twitter | |
+page. It is now using the JSON API contents. | |
Build and install | |
@@ -20,20 +23,13 @@ $ make | |
Usage | |
----- | |
- curl -H 'User-Agent:' -s 'https://twitter.com/namehere' | tscrape | |
- | |
-or | |
- | |
- ftp -o - -U '' 'https://twitter.com/namehere' 2>/dev/null | tscrape | |
- | |
-or | |
- | |
- hurl 'https://twitter.com/namehere' | tscrape | |
+* Create a tscraperc configuration file in ~/.tscrape/tscraperc, see tscraperc… | |
+* Run tscrape_update | |
Using sfeed to convert the tscrape TSV output to an Atom feed: | |
- hurl 'https://twitter.com/namehere' | tscrape | \ | |
+ tscrape < ~/.tscrape/feeds/name | \ | |
awk 'BEGIN { OFS = FS = "\t"; } | |
{ | |
print $1 OFS $4 OFS "https://twitter.com/" $6 "/status/" $5 \ | |
diff --git a/json.c b/json.c | |
@@ -0,0 +1,313 @@ | |
+#include <ctype.h> | |
+#include <errno.h> | |
+#include <stdint.h> | |
+#include <stdio.h> | |
+#include <stdlib.h> | |
+#include <string.h> | |
+ | |
+#define GETNEXT getchar | |
+ | |
+#include "json.h" | |
+/* Encode codepoint r as UTF-8 into s and return the number of bytes written. | |
+ * Returns 0 and writes nothing when r is 0 (NUL); otherwise 1 to 4 bytes are | |
+ * stored. The output is not NUL-terminated and r is not range-checked, so the | |
+ * caller must provide room for up to 4 bytes. */ | |
+static int | |
+codepointtoutf8(long r, char *s) | |
+{ | |
+ if (r == 0) { | |
+ return 0; /* NUL byte */ | |
+ } else if (r <= 0x7F) { | |
+ /* 1 byte: 0aaaaaaa */ | |
+ s[0] = r; | |
+ return 1; | |
+ } else if (r <= 0x07FF) { | |
+ /* 2 bytes: 00000aaa aabbbbbb */ | |
+ s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */ | |
+ s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */ | |
+ return 2; | |
+ } else if (r <= 0xFFFF) { | |
+ /* 3 bytes: aaaabbbb bbcccccc */ | |
+ s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */ | |
+ s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */ | |
+ s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */ | |
+ return 3; | |
+ } else { | |
+ /* 4 bytes: 000aaabb bbbbcccc ccdddddd */ | |
+ s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */ | |
+ s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */ | |
+ s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */ | |
+ s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */ | |
+ return 4; | |
+ } | |
+} | |
+/* Return the numeric value (0-15) of the hexadecimal digit c, accepting both | |
+ * lowercase and uppercase letters. Any other character yields 0, so invalid | |
+ * input cannot be told apart from the digit '0'. */ | |
+static int | |
+hexdigit(int c) | |
+{ | |
+ if (c >= '0' && c <= '9') | |
+ return c - '0'; | |
+ else if (c >= 'a' && c <= 'f') | |
+ return 10 + (c - 'a'); | |
+ else if (c >= 'A' && c <= 'F') | |
+ return 10 + (c - 'A'); | |
+ return 0; | |
+} | |
+ | |
+static int | |
+capacity(char **value, size_t *sz, size_t cur, size_t inc) | |
+{ | |
+ size_t need, newsiz; | |
+ char *newp; | |
+ | |
+ /* check for addition overflow */ | |
+ if (cur > SIZE_MAX - inc) { | |
+ errno = EOVERFLOW; | |
+ return -1; | |
+ } | |
+ need = cur + inc; | |
+ | |
+ if (need > *sz) { | |
+ if (need > SIZE_MAX / 2) { | |
+ newsiz = SIZE_MAX; | |
+ } else { | |
+ for (newsiz = *sz < 64 ? 64 : *sz; newsiz <= need; new… | |
+ ; | |
+ } | |
+ if (!(newp = realloc(*value, newsiz))) | |
+ return -1; /* up to caller to free *value */ | |
+ *value = newp; | |
+ *sz = newsiz; | |
+ } | |
+ return 0; | |
+} | |
+ | |
+#define EXPECT_VALUE "{[\"-0123456789tfn" | |
+#define EXPECT_STRING "\"" | |
+#define EXPECT_END "}]," | |
+#define EXPECT_OBJECT_STRING EXPECT_STRING "}" | |
+#define EXPECT_OBJECT_KEY ":" | |
+#define EXPECT_ARRAY_VALUE EXPECT_VALUE "]" | |
+ | |
+#define JSON_INVALID() do { ret = JSON_ERROR_INVALID; goto end; } while … | |
+ | |
+int | |
+parsejson(void (*cb)(struct json_node *, size_t, const char *)) | |
+{ | |
+ struct json_node nodes[JSON_MAX_NODE_DEPTH] = { 0 }; | |
+ size_t depth = 0, p = 0, len, sz = 0; | |
+ long cp, hi, lo; | |
+ char pri[128], *str = NULL; | |
+ int c, i, escape, iskey = 0, ret = JSON_ERROR_MEM; | |
+ const char *expect = EXPECT_VALUE; | |
+ | |
+ if (capacity(&(nodes[0].name), &(nodes[0].namesiz), 0, 1) == -1) | |
+ goto end; | |
+ nodes[0].name[0] = '\0'; | |
+ | |
+ while (1) { | |
+ c = GETNEXT(); | |
+handlechr: | |
+ if (c == EOF) | |
+ break; | |
+ | |
+ /* skip JSON white-space, (NOTE: no \v, \f, \b etc) */ | |
+ if (c == ' ' || c == '\t' || c == '\n' || c == '\r') | |
+ continue; | |
+ | |
+ if (!c || !strchr(expect, c)) | |
+ JSON_INVALID(); | |
+ | |
+ switch (c) { | |
+ case ':': | |
+ iskey = 0; | |
+ expect = EXPECT_VALUE; | |
+ break; | |
+ case '"': | |
+ nodes[depth].type = JSON_TYPE_STRING; | |
+ escape = 0; | |
+ len = 0; | |
+ while (1) { | |
+ c = GETNEXT(); | |
+chr: | |
+ /* EOF or control char: 0x7f is not defined as… | |
+ if (c < 0x20) | |
+ JSON_INVALID(); | |
+ | |
+ if (escape) { | |
+escchr: | |
+ escape = 0; | |
+ switch (c) { | |
+ case '"': /* FALLTHROUGH */ | |
+ case '\\': | |
+ case '/': break; | |
+ case 'b': c = '\b'; break; | |
+ case 'f': c = '\f'; break; | |
+ case 'n': c = '\n'; break; | |
+ case 'r': c = '\r'; break; | |
+ case 't': c = '\t'; break; | |
+ case 'u': /* hex hex hex hex */ | |
+ if (capacity(&str, &sz, len, 4… | |
+ goto end; | |
+ for (i = 12, cp = 0; i >= 0; i… | |
+ if ((c = GETNEXT()) ==… | |
+ JSON_INVALID()… | |
+ cp |= (hexdigit(c) << … | |
+ } | |
+ /* RFC8259 - 7. Strings - surr… | |
+ * 0xd800 - 0xdb7f - high surr… | |
+ if (cp >= 0xd800 && cp <= 0xdb… | |
+ if ((c = GETNEXT()) !=… | |
+ len += codepoi… | |
+ goto chr; | |
+ } | |
+ if ((c = GETNEXT()) !=… | |
+ len += codepoi… | |
+ goto escchr; | |
+ } | |
+ for (hi = cp, i = 12, … | |
+ if ((c = GETNE… | |
+ JSON_I… | |
+ lo |= (hexdigi… | |
+ } | |
+ /* 0xdc00 - 0xdfff - l… | |
+ if (lo >= 0xdc00 && lo… | |
+ cp = (hi << 10… | |
+ } else { | |
+ /* handle grac… | |
+ len += codepoi… | |
+ if (capacity(&… | |
+ goto e… | |
+ len += codepoi… | |
+ continue; | |
+ } | |
+ } | |
+ len += codepointtoutf8(cp, &st… | |
+ continue; | |
+ default: | |
+ JSON_INVALID(); /* invalid esc… | |
+ } | |
+ if (capacity(&str, &sz, len, 1) == -1) | |
+ goto end; | |
+ str[len++] = c; | |
+ } else if (c == '\\') { | |
+ escape = 1; | |
+ } else if (c == '"') { | |
+ if (capacity(&str, &sz, len, 1) == -1) | |
+ goto end; | |
+ str[len++] = '\0'; | |
+ | |
+ if (iskey) { | |
+ /* copy string as key, includi… | |
+ if (capacity(&(nodes[depth].na… | |
+ goto end; | |
+ memcpy(nodes[depth].name, str,… | |
+ } else { | |
+ cb(nodes, depth + 1, str); | |
+ } | |
+ break; | |
+ } else { | |
+ if (capacity(&str, &sz, len, 1) == -1) | |
+ goto end; | |
+ str[len++] = c; | |
+ } | |
+ } | |
+ if (iskey) | |
+ expect = EXPECT_OBJECT_KEY; | |
+ else | |
+ expect = EXPECT_END; | |
+ break; | |
+ case '[': | |
+ case '{': | |
+ if (depth + 1 >= JSON_MAX_NODE_DEPTH) | |
+ JSON_INVALID(); /* too deep */ | |
+ | |
+ nodes[depth].index = 0; | |
+ if (c == '[') { | |
+ nodes[depth].type = JSON_TYPE_ARRAY; | |
+ expect = EXPECT_ARRAY_VALUE; | |
+ } else if (c == '{') { | |
+ iskey = 1; | |
+ nodes[depth].type = JSON_TYPE_OBJECT; | |
+ expect = EXPECT_OBJECT_STRING; | |
+ } | |
+ | |
+ cb(nodes, depth + 1, ""); | |
+ | |
+ depth++; | |
+ nodes[depth].index = 0; | |
+ if (capacity(&(nodes[depth].name), &(nodes[depth].name… | |
+ goto end; | |
+ nodes[depth].name[0] = '\0'; | |
+ break; | |
+ case ']': | |
+ case '}': | |
+ if (!depth || | |
+ (c == ']' && nodes[depth - 1].type != JSON_TYPE_ARR… | |
+ (c == '}' && nodes[depth - 1].type != JSON_TYPE_OBJ… | |
+ JSON_INVALID(); /* unbalanced nodes */ | |
+ | |
+ nodes[--depth].index++; | |
+ expect = EXPECT_END; | |
+ break; | |
+ case ',': | |
+ if (!depth) | |
+ JSON_INVALID(); /* unbalanced nodes */ | |
+ | |
+ nodes[depth - 1].index++; | |
+ if (nodes[depth - 1].type == JSON_TYPE_OBJECT) { | |
+ iskey = 1; | |
+ expect = EXPECT_STRING; | |
+ } else { | |
+ expect = EXPECT_VALUE; | |
+ } | |
+ break; | |
+ case 't': /* true */ | |
+ if (GETNEXT() != 'r' || GETNEXT() != 'u' || GETNEXT() … | |
+ JSON_INVALID(); | |
+ nodes[depth].type = JSON_TYPE_BOOL; | |
+ cb(nodes, depth + 1, "true"); | |
+ expect = EXPECT_END; | |
+ break; | |
+ case 'f': /* false */ | |
+ if (GETNEXT() != 'a' || GETNEXT() != 'l' || GETNEXT() … | |
+ GETNEXT() != 'e') | |
+ JSON_INVALID(); | |
+ nodes[depth].type = JSON_TYPE_BOOL; | |
+ cb(nodes, depth + 1, "false"); | |
+ expect = EXPECT_END; | |
+ break; | |
+ case 'n': /* null */ | |
+ if (GETNEXT() != 'u' || GETNEXT() != 'l' || GETNEXT() … | |
+ JSON_INVALID(); | |
+ nodes[depth].type = JSON_TYPE_NULL; | |
+ cb(nodes, depth + 1, "null"); | |
+ expect = EXPECT_END; | |
+ break; | |
+ default: /* number */ | |
+ nodes[depth].type = JSON_TYPE_NUMBER; | |
+ p = 0; | |
+ pri[p++] = c; | |
+ expect = EXPECT_END; | |
+ while (1) { | |
+ c = GETNEXT(); | |
+ if (c == EOF || | |
+ !c || !strchr("0123456789eE+-.", c) || | |
+ p + 1 >= sizeof(pri)) { | |
+ pri[p] = '\0'; | |
+ cb(nodes, depth + 1, pri); | |
+ goto handlechr; /* do not read next ch… | |
+ } else { | |
+ pri[p++] = c; | |
+ } | |
+ } | |
+ } | |
+ } | |
+ if (depth) | |
+ JSON_INVALID(); /* unbalanced nodes */ | |
+ | |
+ ret = 0; /* success */ | |
+end: | |
+ for (depth = 0; depth < sizeof(nodes) / sizeof(nodes[0]); depth++) | |
+ free(nodes[depth].name); | |
+ free(str); | |
+ | |
+ return ret; | |
+} | |
diff --git a/json.h b/json.h | |
@@ -0,0 +1,26 @@ | |
+#include <stddef.h> | |
+ | |
+enum JSONType { | |
+ JSON_TYPE_ARRAY = 'a', | |
+ JSON_TYPE_OBJECT = 'o', | |
+ JSON_TYPE_STRING = 's', | |
+ JSON_TYPE_BOOL = 'b', | |
+ JSON_TYPE_NULL = '?', | |
+ JSON_TYPE_NUMBER = 'n' | |
+}; | |
+ | |
+enum JSONError { | |
+ JSON_ERROR_MEM = -2, | |
+ JSON_ERROR_INVALID = -1 | |
+}; | |
+ | |
+#define JSON_MAX_NODE_DEPTH 64 | |
+ | |
+struct json_node { | |
+ enum JSONType type; | |
+ char *name; | |
+ size_t namesiz; | |
+ size_t index; /* count/index for array or object type */ | |
+}; | |
+ | |
+int parsejson(void (*cb)(struct json_node *, size_t, const char *)); | |
diff --git a/tscrape.c b/tscrape.c | |
@@ -2,107 +2,52 @@ | |
#include <ctype.h> | |
#include <err.h> | |
+#include <stdlib.h> | |
#include <stdio.h> | |
#include <string.h> | |
#include <strings.h> | |
+#include <time.h> | |
#include <unistd.h> | |
-#include "xml.h" | |
+#include "json.h" | |
#include "util.h" | |
#define STRP(s) s,sizeof(s)-1 | |
-/* states */ | |
-enum { | |
- Item = 1, | |
- Stream = 2, | |
- Header = 4, | |
- Timestamp = 8, | |
- Text = 16 | |
+/* a tweet */ | |
+struct tweet { | |
+ char fullname[1024]; | |
+ int ispinned; | |
+ char itemusername[1024]; | |
+ char itemfullname[1024]; | |
+ char full_text[4096]; | |
+ char username[1024]; | |
+ time_t timestamp; | |
+ char datatime[16]; | |
+ char itemid[64]; | |
+ char retweetid[64]; | |
+ | |
+ struct tweet *next; | |
}; | |
-/* data */ | |
-static char fullname[1024]; | |
-static int ispinned; | |
-static char itemusername[1024]; | |
-static char itemfullname[1024]; | |
-static char timestamp[16]; | |
-static char text[4096]; | |
-static char username[1024]; | |
- | |
-static char classname[256]; | |
-static char datatime[16]; | |
-static char itemid[64]; | |
-static char retweetid[64]; | |
-static int state; | |
-static XMLParser p; | |
- | |
-static const char *ignorestate, *endtag; | |
-static int (*getnext)(void); | |
- | |
-/* return a space for all data until some case-insensitive string occurs. This | |
- is used to parse incorrect HTML/XML that contains unescaped HTML in script | |
- or style tags. If you see some </script> tag in a CDATA or comment | |
- section then e-mail W3C and tell them the web is too complex. */ | |
-static inline int | |
-getnext_ignore(void) | |
-{ | |
- int c; | |
- | |
- if ((c = getnext()) == EOF) | |
- return EOF; | |
+/* url entities and their replacements */ | |
+struct url { | |
+ char url[256]; | |
+ size_t url_len; | |
+ char expanded_url[1024]; | |
- if (tolower(c) == tolower((unsigned char)*ignorestate)) { | |
- ignorestate++; | |
- if (*ignorestate == '\0') { | |
- p.getnext = getnext; /* restore */ | |
- return c; | |
- } | |
- } else { | |
- ignorestate = endtag; | |
- } | |
- | |
- return ' '; | |
-} | |
- | |
-static void | |
-printtweet(void) | |
-{ | |
- char buf[32]; | |
- time_t t; | |
- | |
- if (parsetime(timestamp, &t, buf, sizeof(buf)) != -1) | |
- printf("%lld", (long long)t); | |
- putchar('\t'); | |
- printescape(username); | |
- putchar('\t'); | |
- printescape(fullname); | |
- putchar('\t'); | |
- printescape(text); | |
- putchar('\t'); | |
- printescape(itemid); | |
- putchar('\t'); | |
- printescape(itemusername); | |
- putchar('\t'); | |
- printescape(itemfullname); | |
- putchar('\t'); | |
- printescape(retweetid); | |
- putchar('\t'); | |
- printf("%d", ispinned); | |
- putchar('\n'); | |
-} | |
+ struct url *next; | |
+}; | |
-static int | |
-isclassmatch(const char *classes, const char *clss, size_t len) | |
-{ | |
- const char *p; | |
+static struct tweet *tweets, *tc; | |
+static struct url *urls, *uc; | |
+static char url[256]; | |
- if (!(p = strstr(classes, clss))) | |
- return 0; | |
- return (p == classes || isspace((unsigned char)p[-1])) && | |
- (isspace((unsigned char)p[len]) || !p[len]); | |
-} | |
+#define MAX_PINNED 5 | |
+static char pinnedids[MAX_PINNED][64]; | |
+static size_t npinned; | |
+#if 0 | |
/* convert XML and some HTML entities */ | |
static int | |
html_entitytostr(const char *s, char *buf, size_t bufsiz) | |
@@ -115,192 +60,378 @@ html_entitytostr(const char *s, char *buf, size_t bufsiz) | |
return (ssize_t)strlcpy(buf, " ", bufsiz); | |
return len; | |
} | |
+#endif | |
-static void | |
-xmltagend(XMLParser *x, const char *t, size_t tl, int isshort) | |
+long long | |
+datetounix(long long year, int mon, int day, int hour, int min, int sec) | |
{ | |
- if (!strcmp(t, "p")) | |
- state &= ~Text; | |
- else if (!strcmp(t, "span")) | |
- state &= ~(Timestamp); | |
+ static const int secs_through_month[] = { | |
+ 0, 31 * 86400, 59 * 86400, 90 * 86400, | |
+ 120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400, | |
+ 243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 }; | |
+ int is_leap = 0, cycles, centuries = 0, leaps = 0, rem; | |
+ long long t; | |
+ | |
+ if (year - 2ULL <= 136) { | |
+ leaps = (year - 68) >> 2; | |
+ if (!((year - 68) & 3)) { | |
+ leaps--; | |
+ is_leap = 1; | |
+ } else { | |
+ is_leap = 0; | |
+ } | |
+ t = 31536000 * (year - 70) + 86400 * leaps; | |
+ } else { | |
+ cycles = (year - 100) / 400; | |
+ rem = (year - 100) % 400; | |
+ if (rem < 0) { | |
+ cycles--; | |
+ rem += 400; | |
+ } | |
+ if (!rem) { | |
+ is_leap = 1; | |
+ } else { | |
+ if (rem >= 300) | |
+ centuries = 3, rem -= 300; | |
+ else if (rem >= 200) | |
+ centuries = 2, rem -= 200; | |
+ else if (rem >= 100) | |
+ centuries = 1, rem -= 100; | |
+ if (rem) { | |
+ leaps = rem / 4U; | |
+ rem %= 4U; | |
+ is_leap = !rem; | |
+ } | |
+ } | |
+ leaps += 97 * cycles + 24 * centuries - is_leap; | |
+ t = (year - 100) * 31536000LL + leaps * 86400LL + 946684800 + … | |
+ } | |
+ t += secs_through_month[mon]; | |
+ if (is_leap && mon >= 2) | |
+ t += 86400; | |
+ t += 86400LL * (day - 1); | |
+ t += 3600LL * hour; | |
+ t += 60LL * min; | |
+ t += sec; | |
+ | |
+ return t; | |
} | |
-static void | |
-xmltagstart(XMLParser *x, const char *t, size_t tl) | |
+/* parse time format: "Wed May 27 04:12:34 +0000 2020" | |
+ assumes tz offset is "+0000" */ | |
+static int | |
+parsetime(const char *s, time_t *tp) | |
{ | |
- classname[0] = '\0'; | |
+ static char *mons[] = { | |
+ "Jan", "Feb", "Mar", "Apr", "May", "Jun", | |
+ "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", | |
+ }; | |
+ int year, mon = 0, mday, hour, min, sec, i; | |
+ char tzbuf[6], monbuf[4], wdaybuf[4]; | |
+ | |
+ for (; *s && isspace((unsigned char)*s); s++) | |
+ ; | |
+ i = sscanf(s, "%3s %3s %02d %02d:%02d:%02d %5s %4d", | |
+ wdaybuf, monbuf, &mday, &hour, &min, &sec, tzbuf, &year); | |
+ if (i != 8) | |
+ return -1; | |
+ for (i = 0; i < sizeof(mons) / sizeof(mons[0]); i++) { | |
+ if (!strcmp(mons[i], monbuf)) { | |
+ mon = i + 1; | |
+ break; | |
+ } | |
+ } | |
+ if (mon == 0) | |
+ return -1; | |
+ | |
+ /* invalid range */ | |
+ if (year < 0 || year > 9999 || | |
+ mon < 1 || mon > 12 || | |
+ mday < 1 || mday > 31 || | |
+ hour < 0 || hour > 23 || | |
+ min < 0 || min> 59 || | |
+ sec < 0 || sec > 59) | |
+ return -1; | |
+ | |
+ if (tp) | |
+ *tp = datetounix(year - 1900, mon - 1, mday, hour, min, sec); | |
+ return 0; | |
} | |
static void | |
-xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort) | |
+printescape(const char *s) | |
{ | |
- /* temporary replace the callback except the reader and end of tag | |
- restore the context once we receive the same ignored tag in the | |
- end tag handler */ | |
- if (!strcasecmp(t, "script")) { | |
- ignorestate = endtag = "</script>"; | |
- getnext = x->getnext; /* for restore */ | |
- x->getnext = getnext_ignore; | |
- return; | |
- } else if (!strcasecmp(t, "style")) { | |
- ignorestate = endtag = "</style>"; | |
- getnext = x->getnext; /* for restore */ | |
- x->getnext = getnext_ignore; | |
- return; | |
+ for (; *s; s++) { | |
+ if (!iscntrl((unsigned char)*s)) | |
+ putchar(*s); | |
} | |
- | |
- if (!strcmp(t, "p") && isclassmatch(classname, STRP("js-tweet-text")))… | |
- if (state & (Item | Stream | Header)) | |
- state |= Text; | |
- } else if (!strcmp(t, "div") && | |
- isclassmatch(classname, STRP("stream-item-footer"))) { | |
- if (text[0] && username[0]) | |
- printtweet(); | |
- state = 0; | |
- } else if (!strcmp(t, "li") && | |
- isclassmatch(classname, STRP("js-stream-item"))) { | |
- if (state & Item) | |
- return; | |
- state |= Item; | |
- datatime[0] = text[0] = timestamp[0] = itemfullname[0] = '\0'; | |
- itemid[0] = itemusername[0] = retweetid[0] = '\0'; | |
- ispinned = 0; | |
- if (isclassmatch(classname, STRP("js-pinned"))) | |
- ispinned = 1; | |
- } else if (state & Item) { | |
- if (!strcmp(t, "div") && | |
- isclassmatch(classname, STRP("js-stream-tweet"))) { | |
- state &= ~(Text|Header); | |
- state |= Stream; | |
- } else if (!strcmp(t, "a") && | |
- isclassmatch(classname, STRP("js-action-profile")))… | |
- state |= Header; | |
- } else if (!strcmp(t, "span") && | |
- isclassmatch(classname, STRP("js-short-timestamp")))… | |
- state |= Timestamp; | |
- strlcpy(timestamp, datatime, sizeof(timestamp)); | |
- datatime[0] = '\0'; | |
- } | |
- } | |
- if ((state & Text) && !strcmp(t, "a") && !isspace((unsigned char)text[… | |
- strlcat(text, " ", sizeof(text)); | |
} | |
+/* Print text s to stdout and expand urls: wherever s starts with the url of | |
+ * an entry in the global urls list, that entry's expanded_url is printed (via | |
+ * printescape()) instead of the matched bytes. Control characters in s are | |
+ * dropped. */ | |
static void | |
-xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al, | |
- const char *v, size_t vl) | |
+printexpand(const char *s) | |
{ | |
- /* NOTE: assumes classname attribute is set before data-* in current t… | |
- if (!state && !strcmp(t, "div") && isclassmatch(classname, STRP("user-… | |
- if (!strcmp(a, "data-screen-name")) { | |
- strlcat(username, " ", sizeof(username)); | |
- strlcat(username, v, sizeof(username)); | |
- } else if (!strcmp(a, "data-name")) { | |
- strlcat(fullname, " ", sizeof(fullname)); | |
- strlcat(fullname, v, sizeof(fullname)); | |
- } | |
- } | |
- | |
- if (!strcmp(a, "class")) { | |
- strlcat(classname, v, sizeof(classname)); | |
- } else if (state & Item) { | |
- if (!strcmp(t, "div")) { | |
- if (!strcmp(a, "data-item-id")) | |
- strlcpy(itemid, v, sizeof(itemid)); | |
- else if (!strcmp(a, "data-retweet-id")) | |
- strlcpy(retweetid, v, sizeof(retweetid)); | |
- | |
- if (isclassmatch(classname, STRP("js-stream-tweet"))) { | |
- if (!strcmp(a, "data-screen-name")) { | |
- strlcat(itemusername, " ", sizeof(item… | |
- strlcat(itemusername, v, sizeof(itemus… | |
- } else if (!strcmp(a, "data-name")) { | |
- strlcat(itemfullname, " ", sizeof(item… | |
- strlcat(itemfullname, v, sizeof(itemfu… | |
- } | |
+ struct url *u; | |
+ | |
+ for (; *s; s++) { | |
+ if (iscntrl((unsigned char)*s)) | |
+ continue; | |
+ for (u = urls; u; u = u->next) { | |
+ /* skip entries with an empty url: strncmp() with a | |
+ length of 0 would match at every position */ | |
+ if (u->url_len && !strncmp(s, u->url, u->url_len)) { | |
+ /* stop on the last byte of the url: the s++ of the | |
+ outer loop then moves to the byte after it and | |
+ never steps over the terminating NUL */ | |
+ s += u->url_len - 1; | |
+ printescape(u->expanded_url); | |
+ break; | |
} | |
- } else if (!strcmp(t, "span") && !strcmp(a, "data-time")) { | |
- /* UNIX timestamp */ | |
- strlcpy(datatime, v, sizeof(datatime)); | |
- } | |
- /* NOTE: can be <div data-image-url>. */ | |
- if (!strcmp(a, "data-image-url")) { | |
- strlcat(text, " ", sizeof(text)); | |
- strlcat(text, v, sizeof(text)); | |
- } | |
- | |
- /* indication it has a video */ | |
- if (itemid[0] && !strcmp(a, "data-playable-media-url")) { | |
- strlcat(text, " ", sizeof(text)); | |
- strlcat(text, "https://twitter.com/i/videos/", sizeof(… | |
- strlcat(text, itemid, sizeof(text)); | |
} | |
+ if (!u) | |
+ putchar(*s); | |
} | |
} | |
static void | |
-xmlattrentity(XMLParser *x, const char *t, size_t tl, const char *a, size_t al, | |
- const char *v, size_t vl) | |
+printtweet(struct tweet *t) | |
{ | |
- char buf[16]; | |
- int len; | |
+ if (t->timestamp != -1) | |
+ printf("%lld", (long long)t->timestamp); | |
+ putchar('\t'); | |
+ printescape(t->username); | |
+ putchar('\t'); | |
+ printescape(t->fullname); | |
+ putchar('\t'); | |
+ printexpand(t->full_text); | |
+ putchar('\t'); | |
+ printescape(t->itemid); | |
+ putchar('\t'); | |
+ if (t->itemusername[0]) | |
+ printescape(t->itemusername); | |
+ else | |
+ printescape(t->username); | |
+ putchar('\t'); | |
+ if (t->itemfullname[0]) | |
+ printescape(t->itemfullname); | |
+ else | |
+ printescape(t->fullname); | |
+ putchar('\t'); | |
+ printescape(t->retweetid); | |
+ putchar('\t'); | |
+ printf("%d", t->ispinned); | |
+ putchar('\n'); | |
+} | |
- if (!state) | |
+/* Remember str as the id of a pinned tweet by copying it (truncated to fit) | |
+ * into the next free slot of pinnedids. Ids are silently ignored once all | |
+ * MAX_PINNED slots are in use. */ | |
+void | |
+addpinned(const char *str) | |
+{ | |
+ /* npinned is the index of the next free slot: full only at MAX_PINNED */ | |
+ if (npinned >= MAX_PINNED) | |
return; | |
- if ((len = html_entitytostr(v, buf, sizeof(buf))) > 0) | |
- xmlattr(x, t, tl, a, al, buf, (size_t)len); | |
- else | |
- xmlattr(x, t, tl, a, al, v, vl); | |
+ strlcpy(pinnedids[npinned], str, sizeof(pinnedids[0])); | |
+ npinned++; | |
} | |
-static void | |
-xmldata(XMLParser *x, const char *d, size_t dl) | |
+void | |
+addtweet(void) | |
{ | |
- if (state & Text) { | |
- if (!isclassmatch(classname, STRP("u-hidden"))) | |
- strlcat(text, d, sizeof(text)); | |
- } | |
+ struct tweet *t; | |
+ | |
+ if (!(t = calloc(1, sizeof(*t)))) | |
+ err(1, "calloc"); | |
+ t->timestamp = -1; | |
+ if (tweets) | |
+ tc = tc->next = t; | |
+ else | |
+ tweets = tc = t; | |
} | |
-static void | |
-xmldataentity(XMLParser *x, const char *d, size_t dl) | |
+void | |
+addurl(const char *url, const char *expanded_url) | |
{ | |
- char buf[16]; | |
- int len; | |
+ struct url *u; | |
- if (!(state & Text)) | |
- return; | |
- if ((len = html_entitytostr(d, buf, sizeof(buf))) > 0) | |
- xmldata(x, buf, (size_t)len); | |
+ if (!(u = calloc(1, sizeof(*u)))) | |
+ err(1, "calloc"); | |
+ strlcpy(u->url, url, sizeof(u->url)); | |
+ u->url_len = strlen(u->url); | |
+ strlcpy(u->expanded_url, expanded_url, sizeof(u->expanded_url)); | |
+ | |
+ if (urls) | |
+ uc = uc->next = u; | |
else | |
- xmldata(x, d, dl); | |
+ urls = uc = u; | |
} | |
-static void | |
-xmlcdata(XMLParser *x, const char *d, size_t dl) | |
+void | |
+processnodes(struct json_node *nodes, size_t depth, const char *str) | |
{ | |
- xmldata(x, d, dl); | |
+ if (depth == 2 && | |
+ nodes[0].type == JSON_TYPE_ARRAY && | |
+ nodes[1].type == JSON_TYPE_OBJECT) { | |
+ addtweet(); | |
+ } | |
+ | |
+ if (tc) { | |
+ if (depth == 3 && | |
+ nodes[0].type == JSON_TYPE_ARRAY && | |
+ nodes[1].type == JSON_TYPE_OBJECT && | |
+ nodes[2].type == JSON_TYPE_STRING) { | |
+ if (!strcmp(nodes[2].name, "created_at")) { | |
+ parsetime(str, &tc->timestamp); | |
+ } else if (!strcmp(nodes[2].name, "id_str")) { | |
+ strlcpy(tc->itemid, str, sizeof(tc->itemid)); | |
+ } else if (!strcmp(nodes[2].name, "full_text")) { | |
+ /* if set by retweet text don't override */ | |
+ if (!tc->full_text[0]) | |
+ strlcpy(tc->full_text, str, sizeof(tc-… | |
+ } | |
+ } | |
+ if (depth == 4 && | |
+ nodes[0].type == JSON_TYPE_ARRAY && | |
+ nodes[1].type == JSON_TYPE_OBJECT && | |
+ nodes[2].type == JSON_TYPE_OBJECT && | |
+ !strcmp(nodes[2].name, "user")) { | |
+ if (nodes[3].type == JSON_TYPE_STRING) { | |
+ if (!strcmp(nodes[3].name, "name")) { | |
+ strlcpy(tc->fullname, str, sizeof(tc->… | |
+ } else if (!strcmp(nodes[3].name, "screen_name… | |
+ strlcpy(tc->username, str, sizeof(tc->… | |
+ } | |
+ } | |
+ } | |
+ | |
+ if (depth == 4 && | |
+ nodes[0].type == JSON_TYPE_ARRAY && | |
+ nodes[1].type == JSON_TYPE_OBJECT && | |
+ nodes[2].type == JSON_TYPE_OBJECT && | |
+ nodes[3].type == JSON_TYPE_STRING && | |
+ !strcmp(nodes[2].name, "retweeted_status")) { | |
+ if (!strcmp(nodes[3].name, "id_str")) { | |
+// printf("DEBUG: retweet: id: %s\n", str); | |
+ strlcpy(tc->retweetid, str, sizeof(tc->retweet… | |
+ } else if (!strcmp(nodes[3].name, "full_text")) { | |
+ strlcpy(tc->full_text, str, sizeof(tc->full_te… | |
+// printf("DEBUG: retweet: full_text: %s\n", st… | |
+ } | |
+ } | |
+ | |
+ if (depth == 5 && | |
+ nodes[0].type == JSON_TYPE_ARRAY && | |
+ nodes[1].type == JSON_TYPE_OBJECT && | |
+ nodes[2].type == JSON_TYPE_OBJECT && | |
+ nodes[3].type == JSON_TYPE_OBJECT && | |
+ nodes[4].type == JSON_TYPE_STRING && | |
+ !strcmp(nodes[2].name, "retweeted_status") && | |
+ !strcmp(nodes[3].name, "user")) { | |
+ if (!strcmp(nodes[4].name, "name")) { | |
+ strlcpy(tc->itemfullname, str, sizeof(tc->item… | |
+// printf("DEBUG: retweeted_status.user.name: %… | |
+ } else if (!strcmp(nodes[4].name, "screen_name")) { | |
+ strlcpy(tc->itemusername, str, sizeof(tc->item… | |
+// printf("DEBUG: retweeted_status.user.screen_… | |
+ } | |
+ } | |
+ } | |
+ | |
+ if (depth == 5 && | |
+ nodes[0].type == JSON_TYPE_ARRAY && | |
+ nodes[1].type == JSON_TYPE_OBJECT && | |
+ nodes[2].type == JSON_TYPE_OBJECT && | |
+ !strcmp(nodes[2].name, "user")) { | |
+ if (nodes[3].type == JSON_TYPE_ARRAY && | |
+ !strcmp(nodes[3].name, "pinned_tweet_ids")) { | |
+ if (nodes[4].type == JSON_TYPE_NUMBER) { | |
+ addpinned(str); | |
+// printf("DEBUG: pinned_tweets_ids[%zu]: %s\n", | |
+// nodes[4].index, str); | |
+ } | |
+ } | |
+ } | |
+ | |
+ if (depth == 6 && | |
+ nodes[0].type == JSON_TYPE_ARRAY && | |
+ nodes[1].type == JSON_TYPE_OBJECT && | |
+ nodes[2].type == JSON_TYPE_OBJECT && | |
+ nodes[3].type == JSON_TYPE_ARRAY && | |
+ nodes[4].type == JSON_TYPE_OBJECT && | |
+ nodes[5].type == JSON_TYPE_STRING && | |
+ !strcmp(nodes[2].name, "entities") && | |
+ !strcmp(nodes[3].name, "urls")) { | |
+ if (!strcmp(nodes[5].name, "url")) { | |
+// printf("DEBUG: url: %s\n", str); | |
+ strlcpy(url, str, sizeof(url)); | |
+ } else if (!strcmp(nodes[5].name, "expanded_url")) { | |
+// printf("DEBUG: expanded_url: %s\n", str); | |
+ /* assumes "expanded_url" is specified after "url" */ | |
+ addurl(url, str); | |
+ url[0] = '\0'; | |
+ } | |
+ } | |
+ | |
+ /* [].entities.media[].url */ | |
+ if (depth == 6 && | |
+ nodes[0].type == JSON_TYPE_ARRAY && | |
+ nodes[1].type == JSON_TYPE_OBJECT && | |
+ nodes[2].type == JSON_TYPE_OBJECT && | |
+ nodes[3].type == JSON_TYPE_ARRAY && | |
+ nodes[4].type == JSON_TYPE_OBJECT && | |
+ nodes[5].type == JSON_TYPE_STRING && | |
+ !strcmp(nodes[2].name, "entities") && | |
+ !strcmp(nodes[3].name, "media")) { | |
+ if (!strcmp(nodes[5].name, "url")) { | |
+// printf("DEBUG: url: %s\n", str); | |
+ strlcpy(url, str, sizeof(url)); | |
+ } else if (!strcmp(nodes[5].name, "expanded_url")) { | |
+// printf("DEBUG: expanded_url: %s\n", str); | |
+ /* assumes "expanded_url" is specified after "url" */ | |
+ addurl(url, str); | |
+ url[0] = '\0'; | |
+ } | |
+ } | |
+ | |
+// TODO: retweeted.status.entities.urls[] | |
+#if 0 | |
+ if (depth == 6 && | |
+ nodes[0].type == JSON_TYPE_ARRAY && | |
+ nodes[1].type == JSON_TYPE_OBJECT && | |
+ nodes[2].type == JSON_TYPE_OBJECT && | |
+ nodes[3].type == JSON_TYPE_OBJECT && | |
+ nodes[4].type == JSON_TYPE_ARRAY && | |
+ nodes[5].type == JSON_TYPE_STRING && | |
+ !strcmp(nodes[2].name, "retweeted_status") && | |
+ !strcmp(nodes[3].name, "entities") && | |
+ !strcmp(nodes[4].name, "urls")) { | |
+ if (!strcmp(nodes[5].name, "url")) { | |
+ printf("DEBUG: url: %s\n", str); | |
+ } else if (!strcmp(nodes[5].name, "expanded_url")) { | |
+ printf("DEBUG: expanded_url: %s\n", str); | |
+ } | |
+ } | |
+#endif | |
} | |
int | |
main(void) | |
{ | |
+ struct tweet *t; | |
+ size_t i; | |
+ int r; | |
+ | |
if (pledge("stdio", NULL) == -1) | |
err(1, "pledge"); | |
- /* handlers */ | |
- p.xmlattr = xmlattr; | |
- p.xmlattrentity = xmlattrentity; | |
- p.xmlcdata = xmlcdata; | |
- p.xmldata = xmldata; | |
- p.xmldataentity = xmldataentity; | |
- p.xmltagstart = xmltagstart; | |
- p.xmltagend = xmltagend; | |
- p.xmltagstartparsed = xmltagstartparsed; | |
- /* reader (stdin) */ | |
- p.getnext = getchar; | |
- | |
- xml_parse(&p); | |
+ r = parsejson(processnodes); | |
+ if (r != 0) | |
+ errx(1, "invalid JSON"); | |
+ | |
+ // TODO: TEST: make sure the last tweet is printed too (addtweet() log… | |
+ for (t = tweets; t; t = t->next) { | |
+ /* check for pinned tweets */ | |
+ for (i = 0; i < npinned; i++) { | |
+ if (!strcmp(t->itemid, pinnedids[i])) { | |
+// printf("DEBUG: pinned: %s\n", pinnedids[i]); | |
+ t->ispinned = 1; | |
+ break; | |
+ } | |
+ } | |
+ printtweet(t); | |
+ } | |
return 0; | |
} | |
diff --git a/tscrape_plain.c b/tscrape_plain.c | |
@@ -51,7 +51,7 @@ printfeed(FILE *fp, const char *feedname) | |
printutf8pad(stdout, fields[FieldItemFullname], 25, ' '); | |
fputs(" ", stdout); | |
- printescape(fields[FieldText]); | |
+ fputs(fields[FieldText], stdout); | |
putchar('\n'); | |
} | |
} | |
diff --git a/tscrape_update b/tscrape_update | |
@@ -9,6 +9,12 @@ tscrapepath="$HOME/.tscrape/feeds" | |
# feeds are finished at a time. | |
maxjobs=8 | |
+# Twitter authentication bearer (seems to be static). | |
+bearer="AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk… | |
+ | |
+# guest token. | |
+token="" | |
+ | |
# load config (evaluate shellscript). | |
# loadconfig(configfile) | |
loadconfig() { | |
@@ -36,12 +42,26 @@ log() { | |
printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2 | |
} | |
+# acquire guest token. | |
+# guesttoken() | |
+guesttoken() { | |
+ # fail on redirects, hide User-Agent, timeout is 15 seconds. | |
+ curl -X POST -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \ | |
+ -H "Authorization: Bearer ${bearer}" \ | |
+ 'https://api.twitter.com/1.1/guest/activate.json' 2>/dev/null … | |
+ sed -nE 's@.*{"guest_token":"([^"]*)"}.*@\1@p' | |
+} | |
+ | |
# fetch a feed via HTTP/HTTPS etc. | |
-# fetch(name, url, feedfile) | |
+# fetch(name, twittername, feedfile) | |
fetch() { | |
+ url="https://api.twitter.com/1.1/statuses/user_timeline.json?screen_na… | |
+ | |
# fail on redirects, hide User-Agent, timeout is 15 seconds. | |
curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \ | |
- "$2" 2>/dev/null | |
+ -H "Authorization: Bearer ${bearer}" \ | |
+ -H "x-guest-token: $token" \ | |
+ "${url}" 2>/dev/null | |
} | |
# filter fields. | |
@@ -151,6 +171,13 @@ feeds() { | |
echo "See tscraperc.example for an example." >&2 | |
} | |
+# get guest token. | |
+token=$(guesttoken) | |
+if [ -z "${token}" ]; then | |
+ echo "Failed to acquire guest token" >&2 | |
+ exit 1 | |
+fi | |
+ | |
# job counter. | |
curjobs=0 | |
# signal number received for parent. | |
diff --git a/tscraperc.example b/tscraperc.example | |
@@ -2,8 +2,8 @@ | |
# list of feeds to fetch: | |
feeds() { | |
- # feed <name> <feedurl> | |
- feed "Rich Felker" "https://twitter.com/richfelker" | |
- feed "Internet of shit" "https://twitter.com/internetofshit" | |
- feed "Donald Trump" "https://twitter.com/realdonaldtrump" | |
+ # feed <name> <twittername> | |
+ feed "Rich Felker" "richfelker" | |
+ feed "Internet of shit" "internetofshit" | |
+ feed "Donald Trump" "realdonaldtrump" | |
} | |
diff --git a/util.c b/util.c | |
@@ -106,43 +106,3 @@ printutf8pad(FILE *fp, const char *s, size_t len, int pad) | |
for (; col < len; ++col) | |
putc(pad, fp); | |
} | |
- | |
-void | |
-printescape(const char *s) | |
-{ | |
- int r; | |
- const char *e; | |
- | |
- /* strip leading and trailing white-space */ | |
- for (; *s && isspace((unsigned char)*s); s++) | |
- ; | |
- for (e = s + strlen(s); e > s && isspace((unsigned char)*(e - 1)); e--) | |
- ; | |
- | |
- for (r = 0; *s && s < e; s++) { | |
- if (iscntrl((unsigned char)*s) || isspace((unsigned char)*s)) { | |
- r = 1; | |
- continue; | |
- } | |
- if (r) { | |
- r = 0; | |
- putchar(' '); | |
- } | |
- putchar(*s); | |
- } | |
-} | |
- | |
-int | |
-parsetime(const char *s, time_t *t, char *buf, size_t bufsiz) | |
-{ | |
- struct tm *tm; | |
- | |
- if (strtotime(s, t)) | |
- return -1; | |
- if (!(tm = localtime(t))) | |
- return -1; | |
- if (!strftime(buf, bufsiz, "%Y-%m-%d %H:%M", tm)) | |
- return -1; | |
- | |
- return 0; | |
-} | |
diff --git a/util.h b/util.h | |
@@ -30,8 +30,6 @@ enum { | |
}; | |
size_t parseline(char *, char *[FieldLast]); | |
-int parsetime(const char *, time_t *, char *, size_t); | |
-void printescape(const char *); | |
void printutf8pad(FILE *, const char *, size_t, int); | |
int strtotime(const char *, time_t *); | |
void xmlencode(const char *, FILE *); | |
diff --git a/xml.c b/xml.c | |
@@ -1,451 +0,0 @@ | |
-#include <ctype.h> | |
-#include <errno.h> | |
-#include <stdio.h> | |
-#include <stdlib.h> | |
-#include <string.h> | |
- | |
-#include "xml.h" | |
- | |
-static void | |
-xml_parseattrs(XMLParser *x) | |
-{ | |
- size_t namelen = 0, valuelen; | |
- int c, endsep, endname = 0, valuestart = 0; | |
- | |
- while ((c = GETNEXT()) != EOF) { | |
- if (isspace(c)) { | |
- if (namelen) | |
- endname = 1; | |
- continue; | |
- } else if (c == '?') | |
- ; /* ignore */ | |
- else if (c == '=') { | |
- x->name[namelen] = '\0'; | |
- valuestart = 1; | |
- endname = 1; | |
- } else if (namelen && ((endname && !valuestart && isalpha(c)) … | |
- /* attribute without value */ | |
- x->name[namelen] = '\0'; | |
- if (x->xmlattrstart) | |
- x->xmlattrstart(x, x->tag, x->taglen, x->name,… | |
- if (x->xmlattr) | |
- x->xmlattr(x, x->tag, x->taglen, x->name, name… | |
- if (x->xmlattrend) | |
- x->xmlattrend(x, x->tag, x->taglen, x->name, n… | |
- endname = 0; | |
- x->name[0] = c; | |
- namelen = 1; | |
- } else if (namelen && valuestart) { | |
- /* attribute with value */ | |
- if (x->xmlattrstart) | |
- x->xmlattrstart(x, x->tag, x->taglen, x->name,… | |
- | |
- valuelen = 0; | |
- if (c == '\'' || c == '"') { | |
- endsep = c; | |
- } else { | |
- endsep = ' '; /* isspace() */ | |
- goto startvalue; | |
- } | |
- | |
- while ((c = GETNEXT()) != EOF) { | |
-startvalue: | |
- if (c == '&') { /* entities */ | |
- x->data[valuelen] = '\0'; | |
- /* call data function with data before… | |
- if (valuelen && x->xmlattr) | |
- x->xmlattr(x, x->tag, x->tagle… | |
- x->data[0] = c; | |
- valuelen = 1; | |
- while ((c = GETNEXT()) != EOF) { | |
- if (c == endsep || (endsep == … | |
- break; | |
- if (valuelen < sizeof(x->data)… | |
- x->data[valuelen++] = … | |
- else { | |
- /* entity too long for… | |
- x->data[valuelen] = '\… | |
- if (x->xmlattr) | |
- x->xmlattr(x, … | |
- x->data[0] = c; | |
- valuelen = 1; | |
- break; | |
- } | |
- if (c == ';') { | |
- x->data[valuelen] = '\… | |
- if (x->xmlattrentity) | |
- x->xmlattrenti… | |
- valuelen = 0; | |
- break; | |
- } | |
- } | |
- } else if (c != endsep && !(endsep == ' ' && (… | |
- if (valuelen < sizeof(x->data) - 1) { | |
- x->data[valuelen++] = c; | |
- } else { | |
- x->data[valuelen] = '\0'; | |
- if (x->xmlattr) | |
- x->xmlattr(x, x->tag, … | |
- x->data[0] = c; | |
- valuelen = 1; | |
- } | |
- } | |
- if (c == endsep || (endsep == ' ' && (c == '>'… | |
- x->data[valuelen] = '\0'; | |
- if (x->xmlattr) | |
- x->xmlattr(x, x->tag, x->tagle… | |
- if (x->xmlattrend) | |
- x->xmlattrend(x, x->tag, x->ta… | |
- break; | |
- } | |
- } | |
- namelen = endname = valuestart = 0; | |
- } else if (namelen < sizeof(x->name) - 1) { | |
- x->name[namelen++] = c; | |
- } | |
- if (c == '>') { | |
- break; | |
- } else if (c == '/') { | |
- x->isshorttag = 1; | |
- x->name[0] = '\0'; | |
- namelen = 0; | |
- } | |
- } | |
-} | |
- | |
-static void | |
-xml_parsecomment(XMLParser *x) | |
-{ | |
- size_t datalen = 0, i = 0; | |
- int c; | |
- | |
- if (x->xmlcommentstart) | |
- x->xmlcommentstart(x); | |
- while ((c = GETNEXT()) != EOF) { | |
- if (c == '-' || c == '>') { | |
- if (x->xmlcomment && datalen) { | |
- x->data[datalen] = '\0'; | |
- x->xmlcomment(x, x->data, datalen); | |
- datalen = 0; | |
- } | |
- } | |
- | |
- if (c == '-') { | |
- if (++i > 2) { | |
- if (x->xmlcomment) | |
- for (; i > 2; i--) | |
- x->xmlcomment(x, "-", 1); | |
- i = 2; | |
- } | |
- continue; | |
- } else if (c == '>' && i == 2) { | |
- if (x->xmlcommentend) | |
- x->xmlcommentend(x); | |
- return; | |
- } else if (i) { | |
- if (x->xmlcomment) { | |
- for (; i > 0; i--) | |
- x->xmlcomment(x, "-", 1); | |
- } | |
- i = 0; | |
- } | |
- | |
- if (datalen < sizeof(x->data) - 1) { | |
- x->data[datalen++] = c; | |
- } else { | |
- x->data[datalen] = '\0'; | |
- if (x->xmlcomment) | |
- x->xmlcomment(x, x->data, datalen); | |
- x->data[0] = c; | |
- datalen = 1; | |
- } | |
- } | |
-} | |
- | |
-static void | |
-xml_parsecdata(XMLParser *x) | |
-{ | |
- size_t datalen = 0, i = 0; | |
- int c; | |
- | |
- if (x->xmlcdatastart) | |
- x->xmlcdatastart(x); | |
- while ((c = GETNEXT()) != EOF) { | |
- if (c == ']' || c == '>') { | |
- if (x->xmlcdata && datalen) { | |
- x->data[datalen] = '\0'; | |
- x->xmlcdata(x, x->data, datalen); | |
- datalen = 0; | |
- } | |
- } | |
- | |
- if (c == ']') { | |
- if (++i > 2) { | |
- if (x->xmlcdata) | |
- for (; i > 2; i--) | |
- x->xmlcdata(x, "]", 1); | |
- i = 2; | |
- } | |
- continue; | |
- } else if (c == '>' && i == 2) { | |
- if (x->xmlcdataend) | |
- x->xmlcdataend(x); | |
- return; | |
- } else if (i) { | |
- if (x->xmlcdata) | |
- for (; i > 0; i--) | |
- x->xmlcdata(x, "]", 1); | |
- i = 0; | |
- } | |
- | |
- if (datalen < sizeof(x->data) - 1) { | |
- x->data[datalen++] = c; | |
- } else { | |
- x->data[datalen] = '\0'; | |
- if (x->xmlcdata) | |
- x->xmlcdata(x, x->data, datalen); | |
- x->data[0] = c; | |
- datalen = 1; | |
- } | |
- } | |
-} | |
- | |
-static int | |
-codepointtoutf8(long r, char *s) | |
-{ | |
- if (r == 0) { | |
- return 0; /* NUL byte */ | |
- } else if (r <= 0x7F) { | |
- /* 1 byte: 0aaaaaaa */ | |
- s[0] = r; | |
- return 1; | |
- } else if (r <= 0x07FF) { | |
- /* 2 bytes: 00000aaa aabbbbbb */ | |
- s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */ | |
- s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */ | |
- return 2; | |
- } else if (r <= 0xFFFF) { | |
- /* 3 bytes: aaaabbbb bbcccccc */ | |
- s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */ | |
- s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */ | |
- s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */ | |
- return 3; | |
- } else { | |
- /* 4 bytes: 000aaabb bbbbcccc ccdddddd */ | |
- s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */ | |
- s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */ | |
- s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */ | |
- s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */ | |
- return 4; | |
- } | |
-} | |
- | |
-static int | |
-namedentitytostr(const char *e, char *buf, size_t bufsiz) | |
-{ | |
- static const struct { | |
- const char *entity; | |
- int c; | |
- } entities[] = { | |
- { "amp;", '&' }, | |
- { "lt;", '<' }, | |
- { "gt;", '>' }, | |
- { "apos;", '\'' }, | |
- { "quot;", '"' }, | |
- }; | |
- size_t i; | |
- | |
- /* buffer is too small */ | |
- if (bufsiz < 2) | |
- return -1; | |
- | |
- for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) { | |
- if (!strcmp(e, entities[i].entity)) { | |
- buf[0] = entities[i].c; | |
- buf[1] = '\0'; | |
- return 1; | |
- } | |
- } | |
- return -1; | |
-} | |
- | |
-static int | |
-numericentitytostr(const char *e, char *buf, size_t bufsiz) | |
-{ | |
- long l; | |
- int len; | |
- char *end; | |
- | |
- /* buffer is too small */ | |
- if (bufsiz < 5) | |
- return -1; | |
- | |
- errno = 0; | |
- /* hex (16) or decimal (10) */ | |
- if (*e == 'x') | |
- l = strtol(++e, &end, 16); | |
- else | |
- l = strtol(e, &end, 10); | |
- /* invalid value or not a well-formed entity or invalid code point */ | |
- if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff) | |
- return -1; | |
- len = codepointtoutf8(l, buf); | |
- buf[len] = '\0'; | |
- | |
- return len; | |
-} | |
- | |
-/* convert named- or numeric entity string to buffer string | |
- * returns byte-length of string or -1 on failure. */ | |
-int | |
-xml_entitytostr(const char *e, char *buf, size_t bufsiz) | |
-{ | |
- /* doesn't start with & */ | |
- if (e[0] != '&') | |
- return -1; | |
- /* numeric entity */ | |
- if (e[1] == '#') | |
- return numericentitytostr(e + 2, buf, bufsiz); | |
- else /* named entity */ | |
- return namedentitytostr(e + 1, buf, bufsiz); | |
-} | |
- | |
-void | |
-xml_parse(XMLParser *x) | |
-{ | |
- size_t datalen, tagdatalen; | |
- int c, isend; | |
- | |
- while ((c = GETNEXT()) != EOF && c != '<') | |
- ; /* skip until < */ | |
- | |
- while (c != EOF) { | |
- if (c == '<') { /* parse tag */ | |
- if ((c = GETNEXT()) == EOF) | |
- return; | |
- | |
- if (c == '!') { /* cdata and comments */ | |
- for (tagdatalen = 0; (c = GETNEXT()) != EOF;) { | |
- /* NOTE: sizeof(x->data) must be atlea… | |
- if (tagdatalen <= sizeof("[CDATA[") - … | |
- x->data[tagdatalen++] = c; | |
- if (c == '>') | |
- break; | |
- else if (c == '-' && tagdatalen == siz… | |
- (x->data[0] == '-')) { | |
- xml_parsecomment(x); | |
- break; | |
- } else if (c == '[') { | |
- if (tagdatalen == sizeof("[CDA… | |
- !strncmp(x->data, "[CDATA[… | |
- xml_parsecdata(x); | |
- break; | |
- } | |
- } | |
- } | |
- } else { | |
- /* normal tag (open, short open, close), proce… | |
- x->tag[0] = c; | |
- x->taglen = 1; | |
- x->isshorttag = isend = 0; | |
- | |
- /* treat processing instruction as shorttag, d… | |
- if (c == '?') { | |
- x->isshorttag = 1; | |
- } else if (c == '/') { | |
- if ((c = GETNEXT()) == EOF) | |
- return; | |
- x->tag[0] = c; | |
- isend = 1; | |
- } | |
- | |
- while ((c = GETNEXT()) != EOF) { | |
- if (c == '/') | |
- x->isshorttag = 1; /* short ta… | |
- else if (c == '>' || isspace(c)) { | |
- x->tag[x->taglen] = '\0'; | |
- if (isend) { /* end tag, start… | |
- if (x->xmltagend) | |
- x->xmltagend(x… | |
- x->tag[0] = '\0'; | |
- x->taglen = 0; | |
- } else { | |
- /* start tag */ | |
- if (x->xmltagstart) | |
- x->xmltagstart… | |
- if (isspace(c)) | |
- xml_parseattrs… | |
- if (x->xmltagstartpars… | |
- x->xmltagstart… | |
- } | |
- /* call tagend for shortform o… | |
- if (x->isshorttag) { | |
- if (x->xmltagend) | |
- x->xmltagend(x… | |
- x->tag[0] = '\0'; | |
- x->taglen = 0; | |
- } | |
- break; | |
- } else if (x->taglen < sizeof(x->tag) … | |
- x->tag[x->taglen++] = c; /* NO… | |
- } | |
- } | |
- } else { | |
- /* parse tag data */ | |
- datalen = 0; | |
- if (x->xmldatastart) | |
- x->xmldatastart(x); | |
- while ((c = GETNEXT()) != EOF) { | |
- if (c == '&') { | |
- if (datalen) { | |
- x->data[datalen] = '\0'; | |
- if (x->xmldata) | |
- x->xmldata(x, x->data,… | |
- } | |
- x->data[0] = c; | |
- datalen = 1; | |
- while ((c = GETNEXT()) != EOF) { | |
- if (c == '<') | |
- break; | |
- if (datalen < sizeof(x->data) … | |
- x->data[datalen++] = c; | |
- else { | |
- /* entity too long for… | |
- x->data[datalen] = '\0… | |
- if (x->xmldata) | |
- x->xmldata(x, … | |
- x->data[0] = c; | |
- datalen = 1; | |
- break; | |
- } | |
- if (c == ';') { | |
- x->data[datalen] = '\0… | |
- if (x->xmldataentity) | |
- x->xmldataenti… | |
- datalen = 0; | |
- break; | |
- } | |
- } | |
- } else if (c != '<') { | |
- if (datalen < sizeof(x->data) - 1) { | |
- x->data[datalen++] = c; | |
- } else { | |
- x->data[datalen] = '\0'; | |
- if (x->xmldata) | |
- x->xmldata(x, x->data,… | |
- x->data[0] = c; | |
- datalen = 1; | |
- } | |
- } | |
- if (c == '<') { | |
- x->data[datalen] = '\0'; | |
- if (x->xmldata && datalen) | |
- x->xmldata(x, x->data, datalen… | |
- if (x->xmldataend) | |
- x->xmldataend(x); | |
- break; | |
- } | |
- } | |
- } | |
- } | |
-} | |
diff --git a/xml.h b/xml.h | |
@@ -1,49 +0,0 @@ | |
-#ifndef _XML_H | |
-#define _XML_H | |
- | |
-#include <stdio.h> | |
- | |
-typedef struct xmlparser { | |
- /* handlers */ | |
- void (*xmlattr)(struct xmlparser *, const char *, size_t, | |
- const char *, size_t, const char *, size_t); | |
- void (*xmlattrend)(struct xmlparser *, const char *, size_t, | |
- const char *, size_t); | |
- void (*xmlattrstart)(struct xmlparser *, const char *, size_t, | |
- const char *, size_t); | |
- void (*xmlattrentity)(struct xmlparser *, const char *, size_t, | |
- const char *, size_t, const char *, size_t); | |
- void (*xmlcdatastart)(struct xmlparser *); | |
- void (*xmlcdata)(struct xmlparser *, const char *, size_t); | |
- void (*xmlcdataend)(struct xmlparser *); | |
- void (*xmlcommentstart)(struct xmlparser *); | |
- void (*xmlcomment)(struct xmlparser *, const char *, size_t); | |
- void (*xmlcommentend)(struct xmlparser *); | |
- void (*xmldata)(struct xmlparser *, const char *, size_t); | |
- void (*xmldataend)(struct xmlparser *); | |
- void (*xmldataentity)(struct xmlparser *, const char *, size_t); | |
- void (*xmldatastart)(struct xmlparser *); | |
- void (*xmltagend)(struct xmlparser *, const char *, size_t, int); | |
- void (*xmltagstart)(struct xmlparser *, const char *, size_t); | |
- void (*xmltagstartparsed)(struct xmlparser *, const char *, | |
- size_t, int); | |
- | |
-#ifndef GETNEXT | |
- #define GETNEXT (x)->getnext | |
- int (*getnext)(void); | |
-#endif | |
- | |
- /* current tag */ | |
- char tag[1024]; | |
- size_t taglen; | |
- /* current tag is in short form ? <tag /> */ | |
- int isshorttag; | |
- /* current attribute name */ | |
- char name[1024]; | |
- /* data buffer used for tag data, cdata and attribute data */ | |
- char data[BUFSIZ]; | |
-} XMLParser; | |
- | |
-int xml_entitytostr(const char *, char *, size_t); | |
-void xml_parse(XMLParser *); | |
-#endif |