separate parsing and formatting like sfeed - tscrape - twitter scraper | |
git clone git://git.codemadness.org/tscrape | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit f712b91a8db0fb66f7facf349ea859da07717dc7 | |
parent f0b8be83a871c59f1bd9a99f16bf20ce9df57c22 | |
Author: Hiltjo Posthuma <[email protected]> | |
Date: Sat, 12 Aug 2017 12:52:23 +0200 | |
separate parsing and formatting like sfeed | |
- remove formatted timestamp field. | |
- add tscrape_plain | |
Diffstat: | |
M Makefile | 6 ++++-- | |
M tscrape.c | 76 +----------------------------… | |
A tscrape_plain.c | 91 +++++++++++++++++++++++++++++… | |
A util.c | 146 +++++++++++++++++++++++++++++… | |
A util.h | 38 +++++++++++++++++++++++++++++… | |
5 files changed, 281 insertions(+), 76 deletions(-) | |
--- | |
diff --git a/Makefile b/Makefile | |
@@ -3,14 +3,16 @@ include config.mk | |
NAME = tscrape | |
VERSION = 0.1 | |
BIN = \ | |
- tscrape | |
+ tscrape\ | |
+ tscrape_plain | |
SRC = ${BIN:=.c} | |
LIBUTIL = libutil.a | |
LIBUTILSRC = \ | |
strlcat.c\ | |
- strlcpy.c | |
+ strlcpy.c\ | |
+ util.c | |
LIBUTILOBJ = ${LIBUTILSRC:.c=.o} | |
LIBXML = libxml.a | |
diff --git a/tscrape.c b/tscrape.c | |
@@ -11,11 +11,8 @@ | |
#include <time.h> | |
#include <unistd.h> | |
-#ifndef USE_PLEDGE | |
-#define pledge(p1,p2) 0 | |
-#endif | |
- | |
#include "xml.h" | |
+#include "util.h" | |
#define STRP(s) s,sizeof(s)-1 | |
@@ -30,13 +27,6 @@ enum { | |
Username = 64 | |
}; | |
-/* for compatibility with libc's that don't have strlcat or strlcpy. The | |
- * functions are synced from OpenBSD */ | |
-#undef strlcat | |
-size_t strlcat(char *, const char *, size_t); | |
-#undef strlcpy | |
-size_t strlcpy(char *, const char *, size_t); | |
- | |
/* data */ | |
static char fullname[1024]; | |
static char timestamp[16]; | |
@@ -52,75 +42,13 @@ static int state; | |
static XMLParser p; | |
static void | |
-printescape(const char *s) | |
-{ | |
- size_t i; | |
- const char *e; | |
- | |
- /* strip leading and trailing white-space */ | |
- for (; *s && isspace(*s); s++) | |
- ; | |
- for (e = s + strlen(s); e > s && isspace(*(e - 1)); e--) | |
- ; | |
- | |
- for (i = 0; *s && s < e; s++) { | |
- if (iscntrl(*s) || isspace(*s)) { | |
- i++; | |
- continue; | |
- } | |
- if (i) { | |
- i = 0; | |
- putchar(' '); | |
- } | |
- putchar(*s); | |
- } | |
-} | |
- | |
-/* Parse time to time_t, assumes time_t is signed. */ | |
-int | |
-strtotime(const char *s, time_t *t) | |
-{ | |
- long long l; | |
- char *e; | |
- | |
- errno = 0; | |
- l = strtoll(s, &e, 10); | |
- if (*s == '\0' || *e != '\0') | |
- return -1; | |
- if (t) | |
- *t = (time_t)l; | |
- | |
- return 0; | |
-} | |
- | |
-static int | |
-parsetime(const char *s, time_t *t, char *buf, size_t bufsiz) | |
-{ | |
- struct tm *tm; | |
- | |
- if (strtotime(s, t)) | |
- return -1; | |
- if (!(tm = localtime(t))) | |
- return -1; | |
- if (!strftime(buf, bufsiz, "%Y-%m-%d %H:%M", tm)) | |
- return -1; | |
- | |
- return 0; | |
-} | |
- | |
-static void | |
printtweet(void) | |
{ | |
char buf[32]; | |
time_t t; | |
- if (parsetime(timestamp, &t, buf, sizeof(buf)) != -1) { | |
+ if (parsetime(timestamp, &t, buf, sizeof(buf)) != -1) | |
printf("%lld", (long long)t); | |
- putchar('\t'); | |
- fputs(buf, stdout); | |
- } else { | |
- putchar('\t'); | |
- } | |
putchar('\t'); | |
printescape(text); | |
putchar('\t'); | |
diff --git a/tscrape_plain.c b/tscrape_plain.c | |
@@ -0,0 +1,91 @@ | |
+#include <ctype.h> | |
+#include <err.h> | |
+#include <locale.h> | |
+#include <stdio.h> | |
+#include <stdlib.h> | |
+#include <string.h> | |
+#include <time.h> | |
+ | |
+#include "util.h" | |
+ | |
+static time_t comparetime; | |
+static char *line; | |
+static size_t linesize; | |
+ | |
+static void | |
+printfeed(FILE *fp, const char *feedname) | |
+{ | |
+ char *fields[FieldLast]; | |
+ struct tm *tm; | |
+ time_t parsedtime; | |
+ ssize_t linelen; | |
+ | |
+ while ((linelen = getline(&line, &linesize, fp)) > 0) { | |
+ if (line[linelen - 1] == '\n') | |
+ line[--linelen] = '\0'; | |
+ if (!parseline(line, fields)) | |
+ break; | |
+ | |
+ parsedtime = 0; | |
+ strtotime(fields[FieldUnixTimestamp], &parsedtime); | |
+ if (!(tm = localtime(&parsedtime))) | |
+ err(1, "localtime"); | |
+ | |
+ if (parsedtime >= comparetime) | |
+ putchar('N'); | |
+ else | |
+ putchar(' '); | |
+ if (fields[FieldRetweetid][0]) | |
+ putchar('R'); | |
+ else | |
+ putchar(' '); | |
+ putchar(' '); | |
+ | |
+ if (feedname[0]) | |
+ printf("%-15.15s ", feedname); | |
+ | |
+ fprintf(stdout, "%04d-%02d-%02d %02d:%02d ", | |
+ tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, | |
+ tm->tm_hour, tm->tm_min); | |
+ | |
+ printutf8pad(stdout, fields[FieldFullname], 25, ' '); | |
+ printescape(fields[FieldText]); | |
+ putchar('\n'); | |
+ } | |
+} | |
+ | |
+int | |
+main(int argc, char *argv[]) | |
+{ | |
+ FILE *fp; | |
+ char *name; | |
+ int i; | |
+ | |
+ if (pledge("stdio rpath", NULL) == -1) | |
+ err(1, "pledge"); | |
+ | |
+ setlocale(LC_CTYPE, ""); | |
+ | |
+ if (pledge(argc == 1 ? "stdio" : "stdio rpath", NULL) == -1) | |
+ err(1, "pledge"); | |
+ | |
+ if ((comparetime = time(NULL)) == -1) | |
+ err(1, "time"); | |
+ /* 1 day is old news */ | |
+ comparetime -= 86400; | |
+ | |
+ if (argc == 1) { | |
+ printfeed(stdin, ""); | |
+ } else { | |
+ for (i = 1; i < argc; i++) { | |
+ if (!(fp = fopen(argv[i], "r"))) | |
+ err(1, "fopen: %s", argv[i]); | |
+ name = ((name = strrchr(argv[i], '/'))) ? name + 1 : argv[i]; | |
+ printfeed(fp, name); | |
+ if (ferror(fp)) | |
+ err(1, "ferror: %s", argv[i]); | |
+ fclose(fp); | |
+ } | |
+ } | |
+ return 0; | |
+} | |
diff --git a/util.c b/util.c | |
@@ -0,0 +1,146 @@ | |
+#include <sys/types.h> | |
+ | |
+#include <ctype.h> | |
+#include <err.h> | |
+#include <errno.h> | |
+#include <limits.h> | |
+#include <stdarg.h> | |
+#include <stdio.h> | |
+#include <stdint.h> | |
+#include <stdlib.h> | |
+#include <string.h> | |
+#include <time.h> | |
+#include <wchar.h> | |
+ | |
+#include "util.h" | |
+ | |
+/* Split the tab-separated 'line' in place into fields: each tab is | |
+ * overwritten with a NUL byte and 'fields' receives pointers into 'line'. | |
+ * 'fields' must have room for FieldLast entries (FieldLast must be >0). | |
+ * Tabs beyond the last field are left as-is and stay part of that field. | |
+ * Fields missing from 'line' are set to the empty string "". | |
+ * returns: the number of entries set in 'fields', always FieldLast. */ | |
+size_t | |
+parseline(char *line, char *fields[FieldLast]) | |
+{ | |
+ char *prev, *s; | |
+ size_t i; | |
+ | |
+ for (prev = line, i = 0; | |
+ (s = strchr(prev, '\t')) && i < FieldLast - 1; | |
+ i++) { | |
+ *s = '\0'; | |
+ fields[i] = prev; | |
+ prev = s + 1; | |
+ } | |
+ fields[i++] = prev; | |
+ /* make non-parsed fields empty. */ | |
+ for (; i < FieldLast; i++) | |
+ fields[i] = ""; | |
+ | |
+ return i; | |
+} | |
+ | |
+/* Parse a decimal timestamp to time_t, assumes time_t is signed. Fails (-1) on trailing text, e.g. fractions. */ | |
+int | |
+strtotime(const char *s, time_t *t) | |
+{ | |
+ long long l; | |
+ char *e; | |
+ | |
+ errno = 0; | |
+ l = strtoll(s, &e, 10); | |
+ if (errno || *s == '\0' || *e) | |
+ return -1; | |
+ /* NOTE: long long (at least 64-bit) is converted to time_t without a | |
+ range check; assumes time_t is wide enough to hold the value. */ | |
+ if (t) | |
+ *t = (time_t)l; | |
+ | |
+ return 0; | |
+} | |
+ | |
+/* Escape characters below as HTML 2.0 / XML 1.0. */ | |
+void | |
+xmlencode(const char *s, FILE *fp) | |
+{ | |
+ for (; *s; s++) { | |
+ switch(*s) { | |
+ case '<': fputs("&lt;", fp); break; | |
+ case '>': fputs("&gt;", fp); break; | |
+ case '\'': fputs("&#39;", fp); break; | |
+ case '&': fputs("&amp;", fp); break; | |
+ case '"': fputs("&quot;", fp); break; | |
+ default: fputc(*s, fp); | |
+ } | |
+ } | |
+} | |
+ | |
+/* Print the UTF-8 string `s' in at most `len' columns: a longer string is | |
+ * cut off with an ellipsis (U+2026), a shorter one is padded with `pad'. */ | |
+void | |
+printutf8pad(FILE *fp, const char *s, size_t len, int pad) | |
+{ | |
+ wchar_t w; | |
+ size_t col = 0, i, slen; | |
+ int rl, wc; | |
+ | |
+ if (!len) | |
+ return; | |
+ | |
+ slen = strlen(s); | |
+ for (i = 0; i < slen && col < len + 1; i += rl) { | |
+ if ((rl = mbtowc(&w, &s[i], slen - i < 4 ? slen - i : 4)) <= 0) | |
+ break; | |
+ if ((wc = wcwidth(w)) == -1) | |
+ wc = 1; | |
+ col += (size_t)wc; | |
+ if (col >= len && s[i + rl]) { | |
+ fputs("\xe2\x80\xa6", fp); | |
+ break; | |
+ } | |
+ fwrite(&s[i], 1, rl, fp); | |
+ } | |
+ for (; col < len; col++) | |
+ putc(pad, fp); | |
+} | |
+ | |
+void | |
+printescape(const char *s) | |
+{ | |
+ size_t i; | |
+ const char *e; | |
+ | |
+ /* strip leading and trailing white-space */ | |
+ for (; *s && isspace(*s); s++) | |
+ ; | |
+ for (e = s + strlen(s); e > s && isspace(*(e - 1)); e--) | |
+ ; | |
+ | |
+ for (i = 0; *s && s < e; s++) { | |
+ if (iscntrl(*s) || isspace(*s)) { | |
+ i++; | |
+ continue; | |
+ } | |
+ if (i) { | |
+ i = 0; | |
+ putchar(' '); | |
+ } | |
+ putchar(*s); | |
+ } | |
+} | |
+ | |
+int | |
+parsetime(const char *s, time_t *t, char *buf, size_t bufsiz) | |
+{ | |
+ struct tm *tm; | |
+ | |
+ if (strtotime(s, t)) | |
+ return -1; | |
+ if (!(tm = localtime(t))) | |
+ return -1; | |
+ if (!strftime(buf, bufsiz, "%Y-%m-%d %H:%M", tm)) | |
+ return -1; | |
+ | |
+ return 0; | |
+} | |
diff --git a/util.h b/util.h | |
@@ -0,0 +1,38 @@ | |
+#include <stdint.h> | |
+#include <time.h> | |
+#ifdef USE_PLEDGE | |
+#include <unistd.h> | |
+#else | |
+#define pledge(p1,p2) 0 | |
+#endif | |
+ | |
+#undef strlcat | |
+size_t strlcat(char *, const char *, size_t); | |
+#undef strlcpy | |
+size_t strlcpy(char *, const char *, size_t); | |
+ | |
+#define ISUTF8(c) (((c) & 0xc0) != 0x80) | |
+ | |
+/* feed info */ | |
+struct feed { | |
+ char * name; /* feed name */ | |
+ unsigned long totalnew; /* amount of new items per feed */ | |
+ unsigned long total; /* total items */ | |
+ time_t timenewest; | |
+ char timenewestformat[64]; | |
+}; | |
+ | |
+enum { | |
+ FieldUnixTimestamp = 0, | |
+ FieldText, FieldItemid, | |
+ FieldUsername, FieldFullname, | |
+ FieldRetweetid, FieldIspinned, | |
+ FieldLast | |
+}; | |
+ | |
+size_t parseline(char *, char *[FieldLast]); | |
+int parsetime(const char *, time_t *, char *, size_t); | |
+void printescape(const char *); | |
+void printutf8pad(FILE *, const char *, size_t, int); | |
+int strtotime(const char *, time_t *); | |
+void xmlencode(const char *, FILE *); |