Introduction
Introduction Statistics Contact Development Disclaimer Help
separate parsing and formatting like sfeed - tscrape - twitter scraper
git clone git://git.codemadness.org/tscrape
Log
Files
Refs
README
LICENSE
---
commit f712b91a8db0fb66f7facf349ea859da07717dc7
parent f0b8be83a871c59f1bd9a99f16bf20ce9df57c22
Author: Hiltjo Posthuma <[email protected]>
Date: Sat, 12 Aug 2017 12:52:23 +0200
separate parsing and formatting like sfeed
- remove formatted timestamp field.
- add tscrape_plain
Diffstat:
M Makefile | 6 ++++--
M tscrape.c | 76 +----------------------------…
A tscrape_plain.c | 91 +++++++++++++++++++++++++++++…
A util.c | 146 +++++++++++++++++++++++++++++…
A util.h | 38 +++++++++++++++++++++++++++++…
5 files changed, 281 insertions(+), 76 deletions(-)
---
diff --git a/Makefile b/Makefile
@@ -3,14 +3,16 @@ include config.mk
NAME = tscrape
VERSION = 0.1
BIN = \
- tscrape
+ tscrape\
+ tscrape_plain
SRC = ${BIN:=.c}
LIBUTIL = libutil.a
LIBUTILSRC = \
strlcat.c\
- strlcpy.c
+ strlcpy.c\
+ util.c
LIBUTILOBJ = ${LIBUTILSRC:.c=.o}
LIBXML = libxml.a
diff --git a/tscrape.c b/tscrape.c
@@ -11,11 +11,8 @@
#include <time.h>
#include <unistd.h>
-#ifndef USE_PLEDGE
-#define pledge(p1,p2) 0
-#endif
-
#include "xml.h"
+#include "util.h"
#define STRP(s) s,sizeof(s)-1
@@ -30,13 +27,6 @@ enum {
Username = 64
};
-/* for compatibility with libc's that don't have strlcat or strlcpy. The
- * functions are synced from OpenBSD */
-#undef strlcat
-size_t strlcat(char *, const char *, size_t);
-#undef strlcpy
-size_t strlcpy(char *, const char *, size_t);
-
/* data */
static char fullname[1024];
static char timestamp[16];
@@ -52,75 +42,13 @@ static int state;
static XMLParser p;
static void
-printescape(const char *s)
-{
- size_t i;
- const char *e;
-
- /* strip leading and trailing white-space */
- for (; *s && isspace(*s); s++)
- ;
- for (e = s + strlen(s); e > s && isspace(*(e - 1)); e--)
- ;
-
- for (i = 0; *s && s < e; s++) {
- if (iscntrl(*s) || isspace(*s)) {
- i++;
- continue;
- }
- if (i) {
- i = 0;
- putchar(' ');
- }
- putchar(*s);
- }
-}
-
-/* Parse time to time_t, assumes time_t is signed. */
-int
-strtotime(const char *s, time_t *t)
-{
- long long l;
- char *e;
-
- errno = 0;
- l = strtoll(s, &e, 10);
- if (*s == '\0' || *e != '\0')
- return -1;
- if (t)
- *t = (time_t)l;
-
- return 0;
-}
-
-static int
-parsetime(const char *s, time_t *t, char *buf, size_t bufsiz)
-{
- struct tm *tm;
-
- if (strtotime(s, t))
- return -1;
- if (!(tm = localtime(t)))
- return -1;
- if (!strftime(buf, bufsiz, "%Y-%m-%d %H:%M", tm))
- return -1;
-
- return 0;
-}
-
-static void
printtweet(void)
{
char buf[32];
time_t t;
- if (parsetime(timestamp, &t, buf, sizeof(buf)) != -1) {
+ if (parsetime(timestamp, &t, buf, sizeof(buf)) != -1)
printf("%lld", (long long)t);
- putchar('\t');
- fputs(buf, stdout);
- } else {
- putchar('\t');
- }
putchar('\t');
printescape(text);
putchar('\t');
diff --git a/tscrape_plain.c b/tscrape_plain.c
@@ -0,0 +1,91 @@
+#include <ctype.h>
+#include <err.h>
+#include <locale.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "util.h"
+
+/* Cutoff set in main(): lines with a timestamp at or after it get the 'N' flag. */
+static time_t comparetime;
+/* Line buffer and its allocated size, handed to getline() in printfeed(). */
+static char *line;
+static size_t linesize;
+
+/* Print every line read from `fp' as one row of plain-text output:
+ * an 'N' flag when the timestamp is not older than `comparetime', an 'R'
+ * flag when the retweet id field is non-empty, the feed name (only when it
+ * is non-empty), the local date and time, the padded full name and the
+ * text. A timestamp that fails to parse is treated as 0. */
+static void
+printfeed(FILE *fp, const char *feedname)
+{
+	char *fields[FieldLast];
+	struct tm *local;
+	time_t when;
+	ssize_t n;
+
+	for (;;) {
+		n = getline(&line, &linesize, fp);
+		if (n <= 0)
+			break;
+		/* drop the trailing newline, if there is one */
+		if (line[n - 1] == '\n')
+			line[n - 1] = '\0';
+		if (parseline(line, fields) == 0)
+			break;
+
+		when = 0;
+		strtotime(fields[FieldUnixTimestamp], &when);
+		local = localtime(&when);
+		if (local == NULL)
+			err(1, "localtime");
+
+		/* two flag columns followed by a separator */
+		putchar(when >= comparetime ? 'N' : ' ');
+		putchar(fields[FieldRetweetid][0] != '\0' ? 'R' : ' ');
+		putchar(' ');
+
+		if (feedname[0] != '\0')
+			printf("%-15.15s ", feedname);
+
+		printf("%04d-%02d-%02d %02d:%02d ",
+		       local->tm_year + 1900, local->tm_mon + 1, local->tm_mday,
+		       local->tm_hour, local->tm_min);
+
+		printutf8pad(stdout, fields[FieldFullname], 25, ' ');
+		printescape(fields[FieldText]);
+		putchar('\n');
+	}
+}
+
+/* Print the feed files named on the command line, or stdin when no file
+ * is given. Rows from a file are labelled with the part of its path after
+ * the last '/'; rows from stdin get no label.
+ * Exits through err() when pledge, time, opening a file or reading fails. */
+int
+main(int argc, char *argv[])
+{
+	FILE *fp;
+	char *name;
+	int i;
+
+	if (pledge("stdio rpath", NULL) == -1)
+		err(1, "pledge");
+
+	setlocale(LC_CTYPE, "");
+
+	/* without file arguments only stdin is read: drop "rpath" */
+	if (pledge(argc == 1 ? "stdio" : "stdio rpath", NULL) == -1)
+		err(1, "pledge");
+
+	if ((comparetime = time(NULL)) == -1)
+		err(1, "time");
+	/* 1 day is old news */
+	comparetime -= 86400;
+
+	if (argc == 1) {
+		printfeed(stdin, "");
+		/* same read-error check as for named files below */
+		if (ferror(stdin))
+			err(1, "ferror: <stdin>");
+	} else {
+		for (i = 1; i < argc; i++) {
+			if (!(fp = fopen(argv[i], "r")))
+				err(1, "fopen: %s", argv[i]);
+			/* label: text after the last '/', or the whole argument */
+			name = strrchr(argv[i], '/');
+			name = name ? name + 1 : argv[i];
+			printfeed(fp, name);
+			if (ferror(fp))
+				err(1, "ferror: %s", argv[i]);
+			fclose(fp);
+		}
+	}
+	return 0;
+}
diff --git a/util.c b/util.c
@@ -0,0 +1,146 @@
+#include <sys/types.h>
+
+#include <ctype.h>
+#include <err.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <wchar.h>
+
+#include "util.h"
+
+/* Split the tab-separated 'line' in place into 'fields'.
+ * Every tab that ends a field is overwritten with a NUL byte and the
+ * entries of 'fields' point into 'line', so 'line' must be writable and
+ * must stay valid for as long as 'fields' is used.
+ * At most FieldLast fields are split off; any further tabs remain part of
+ * the last field. Fields missing from a short line are set to "".
+ * returns: the number of fields found in 'line' (1 up to FieldLast). */
+size_t
+parseline(char *line, char *fields[FieldLast])
+{
+	char *prev, *s;
+	size_t i, nfields;
+
+	for (prev = line, i = 0;
+	    (s = strchr(prev, '\t')) && i < FieldLast - 1;
+	    i++) {
+		*s = '\0';
+		fields[i] = prev;
+		prev = s + 1;
+	}
+	fields[i++] = prev;
+	/* remember the count now: the padding loop below runs i up to
+	 * FieldLast, which would make every line look complete. */
+	nfields = i;
+	/* make non-parsed fields empty. */
+	for (; i < FieldLast; i++)
+		fields[i] = "";
+
+	return nfields;
+}
+
+/* Convert the decimal string 's' to a time_t, stored in '*t' when 't' is
+ * not NULL. The whole string has to be a number: empty input, trailing
+ * characters (this includes a fraction such as ".5") and values that
+ * strtoll() reports as out of range are rejected.
+ * The long long result is cast to time_t without a range check, so this
+ * assumes a signed time_t wide enough for the value.
+ * returns: 0 on success, -1 on failure ('*t' is left untouched). */
+int
+strtotime(const char *s, time_t *t)
+{
+	char *end;
+	long long v;
+
+	errno = 0;
+	v = strtoll(s, &end, 10);
+	if (errno != 0)
+		return -1;
+	if (s[0] == '\0' || end[0] != '\0')
+		return -1;
+	if (t != NULL)
+		*t = (time_t)v;
+
+	return 0;
+}
+
+/* Write 's' to 'fp', replacing the characters < > ' & " with their
+ * HTML 2.0 / XML 1.0 entities. Every other byte is written unchanged. */
+void
+xmlencode(const char *s, FILE *fp)
+{
+	const char *entity;
+
+	while (*s != '\0') {
+		entity = NULL;
+		if (*s == '<')
+			entity = "&lt;";
+		else if (*s == '>')
+			entity = "&gt;";
+		else if (*s == '\'')
+			entity = "&#39;";
+		else if (*s == '&')
+			entity = "&amp;";
+		else if (*s == '"')
+			entity = "&quot;";
+
+		if (entity != NULL)
+			fputs(entity, fp);
+		else
+			fputc(*s, fp);
+		s++;
+	}
+}
+
+/* Print at most `len' columns of the multibyte string `s' to `fp'. If the
+ * string is shorter pad the rest with characters `pad'.
+ * Characters are decoded with mbtowc(), so the result depends on the
+ * current LC_CTYPE locale. When more text follows the character that
+ * reaches the column limit, an ellipsis (U+2026, written as UTF-8) is
+ * printed in place of that character. Nothing is printed when `len' is 0. */
+void
+printutf8pad(FILE *fp, const char *s, size_t len, int pad)
+{
+ wchar_t w;
+ size_t col = 0, i, slen;
+ int rl, wc;
+
+ if (!len)
+ return;
+
+ slen = strlen(s);
+ for (i = 0; i < slen && col < len + 1; i += rl) {
+ /* decode one character, looking at no more than 4 bytes; stop at
+  * the first byte sequence mbtowc() does not accept */
+ if ((rl = mbtowc(&w, &s[i], slen - i < 4 ? slen - i : 4)) <= 0)
+ break;
+ /* a character wcwidth() cannot size is counted as one column */
+ if ((wc = wcwidth(w)) == -1)
+ wc = 1;
+ col += (size_t)wc;
+ /* limit reached and more text follows: truncate with U+2026 */
+ if (col >= len && s[i + rl]) {
+ fputs("\xe2\x80\xa6", fp);
+ break;
+ }
+ fwrite(&s[i], 1, rl, fp);
+ }
+ /* NOTE(review): `col' includes the width of the replaced character, not
+  * of the ellipsis, so a wide character at the limit may leave the row
+  * one column short - confirm. */
+ for (; col < len; col++)
+ putc(pad, fp);
+}
+
+/* Print `s' to stdout on a single line: leading and trailing white-space
+ * is stripped, and each run of white-space and/or control characters that
+ * is followed by more text is printed as one space. Such a run at the end
+ * of the text prints nothing. */
+void
+printescape(const char *s)
+{
+	size_t i;
+	const char *e;
+
+	/* The <ctype.h> functions are only defined for EOF and for values
+	 * representable as unsigned char. The text is UTF-8, so where char is
+	 * signed every byte >= 0x80 is negative and has to be converted. */
+
+	/* strip leading and trailing white-space */
+	for (; isspace((unsigned char)*s); s++)
+		;
+	for (e = s + strlen(s); e > s && isspace((unsigned char)e[-1]); e--)
+		;
+
+	/* i counts the pending white-space / control characters */
+	for (i = 0; s < e; s++) {
+		if (iscntrl((unsigned char)*s) || isspace((unsigned char)*s)) {
+			i++;
+			continue;
+		}
+		if (i) {
+			i = 0;
+			putchar(' ');
+		}
+		putchar(*s);
+	}
+}
+
+/* Parse the decimal timestamp `s' into `*t' and format it in local time
+ * as "%Y-%m-%d %H:%M" into `buf', which has room for `bufsiz' bytes.
+ * `t' is handed on to localtime(), so it has to point to valid storage.
+ * returns: 0 on success, -1 when strtotime(), localtime() or strftime()
+ * fails. */
+int
+parsetime(const char *s, time_t *t, char *buf, size_t bufsiz)
+{
+	struct tm *local;
+
+	if (strtotime(s, t) != 0)
+		return -1;
+
+	local = localtime(t);
+	if (local == NULL)
+		return -1;
+
+	if (strftime(buf, bufsiz, "%Y-%m-%d %H:%M", local) == 0)
+		return -1;
+
+	return 0;
+}
diff --git a/util.h b/util.h
@@ -0,0 +1,38 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <time.h>
+/* With USE_PLEDGE defined, <unistd.h> is included for pledge(); otherwise
+ * every call to it is replaced by the constant 0, so it never reports -1. */
+#ifdef USE_PLEDGE
+#include <unistd.h>
+#else
+#define pledge(p1,p2) 0
+#endif
+
+/* for compatibility with libc's that don't have strlcat or strlcpy. The
+ * functions are synced from OpenBSD */
+#undef strlcat
+size_t strlcat(char *, const char *, size_t);
+#undef strlcpy
+size_t strlcpy(char *, const char *, size_t);
+
+/* true unless the two top bits of `c' are 10 (a UTF-8 continuation byte) */
+#define ISUTF8(c) (((c) & 0xc0) != 0x80)
+
+/* feed info */
+/* NOTE(review): not referenced by the code visible in this change; the
+ * meaning of the uncommented members below is a guess - confirm. */
+struct feed {
+ char * name; /* feed name */
+ unsigned long totalnew; /* amount of new items per feed */
+ unsigned long total; /* total items */
+ time_t timenewest; /* presumably the timestamp of the newest item - confirm */
+ char timenewestformat[64]; /* presumably timenewest as formatted text - confirm */
+};
+
+/* Indices of the tab-separated columns of one line, in input order.
+ * FieldLast is the number of columns and sizes the array that
+ * parseline() fills. */
+enum {
+ FieldUnixTimestamp = 0,
+ FieldText, FieldItemid,
+ FieldUsername, FieldFullname,
+ FieldRetweetid, FieldIspinned,
+ FieldLast
+};
+
+/* split a tab-separated line in place into FieldLast fields */
+size_t parseline(char *, char *[FieldLast]);
+/* parse a decimal timestamp and format it as local "%Y-%m-%d %H:%M";
+ * 0 on success, -1 on failure */
+int parsetime(const char *, time_t *, char *, size_t);
+/* print a string to stdout, trimmed and with runs of white-space and
+ * control characters collapsed to one space */
+void printescape(const char *);
+/* print a multibyte string truncated or padded to a number of columns */
+void printutf8pad(FILE *, const char *, size_t, int);
+/* parse a decimal string to time_t; 0 on success, -1 on failure */
+int strtotime(const char *, time_t *);
+/* write a string with < > ' & " replaced by entities */
+void xmlencode(const char *, FILE *);
You are viewing proxied material from codemadness.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.