Introduction
Introduction Statistics Contact Development Disclaimer Help
tscrape.c - tscrape - twitter scraper (not working anymore)
git clone git://git.codemadness.org/tscrape
Log
Files
Refs
README
LICENSE
---
tscrape.c (11213B)
---
1 #include <sys/types.h>
2
3 #include <ctype.h>
4 #include <err.h>
5 #include <stdlib.h>
6 #include <stdio.h>
7 #include <string.h>
8 #include <strings.h>
9 #include <time.h>
10 #include <unistd.h>
11
12 #include "json.h"
13 #include "util.h"
14
/* expand to two comma-separated arguments: s itself and sizeof(s) minus one
   (for a string literal that is its length without the terminating NUL) */
#define STRP(s) s, sizeof(s) - 1
16
/* a single tweet; tweets are chained into a singly linked list */
struct tweet {
	char fullname[1024];     /* "name" of the tweet's "user" object */
	int ispinned;            /* set to 1 in main() when itemid is a pinned id */
	char itemusername[1024]; /* "screen_name" of the retweeted status' user */
	char itemfullname[1024]; /* "name" of the retweeted status' user */
	char full_text[4096];    /* tweet text; the retweeted text when present */
	char username[1024];     /* "screen_name" of the tweet's "user" object */
	time_t timestamp;        /* parsed "created_at", -1 when not set */
	char datatime[16];       /* not used by the code visible in this file */
	char itemid[64];         /* "id_str" of the tweet */
	char retweetid[64];      /* "id_str" of the retweeted status */

	struct tweet *next;      /* next tweet in the list or NULL */
};
32
/* a text substitution used when printing the tweet text: where `search`
   occurs in the text, `replace` is printed instead */
struct replacement {
	char search[256];         /* text to look for */
	size_t search_len;        /* strlen(search), stored when the entry is added */
	char replace[1024];       /* text that is printed instead */

	struct replacement *next; /* next entry in the list or NULL */
};
40
/* list of tweets: first entry and the entry that was added last */
static struct tweet *tweets;
static struct tweet *tc;

/* list of text replacements: first entry and the entry that was added last */
static struct replacement *reps;
static struct replacement *rc;

/* values of the url / media entry that is currently being read; they are
   cleared again once a replacement was registered for the entry */
static char url[256];
static char expanded_url[1024];
static char media_url[1024];

/* ids of pinned tweets: storage and number of ids stored so far */
#define MAX_PINNED 5
static char pinnedids[MAX_PINNED][64];
static size_t npinned;
48
/*
 * Convert a date and time to the number of seconds since 1970-01-01 00:00:00.
 * No timezone or leap second handling is done.
 *
 * year: years since 1900
 * mon:  months since January; a value outside 0-11 is carried over into the
 *       year instead of indexing outside the month table
 * day:  day of the month, starting at 1
 * hour, min, sec: time of the day; added as plain offsets without a range
 *       check
 */
long long
datetounix(long long year, int mon, int day, int hour, int min, int sec)
{
	/* seconds from the start of a non-leap year to the start of each month */
	static const int secs_through_month[] = {
		0, 31 * 86400, 59 * 86400, 90 * 86400,
		120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400,
		243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 };
	int is_leap = 0, cycles, centuries = 0, leaps = 0, rem, adj;
	long long t;

	/* normalize the month first: the leap year test depends on the year */
	if (mon < 0 || mon >= 12) {
		adj = mon / 12;
		mon %= 12;
		if (mon < 0) {
			adj--;
			mon += 12;
		}
		year += adj;
	}

	if (year - 2ULL <= 136) {
		/* 1902-2038: every 4th year is a leap year in this range */
		leaps = (year - 68) >> 2;
		if (!((year - 68) & 3)) {
			/* the leap day of the year itself is added with the month */
			leaps--;
			is_leap = 1;
		} else {
			is_leap = 0;
		}
		t = 31536000 * (year - 70) + 86400 * leaps;
	} else {
		/* general case: count in 400 year cycles relative to 2000 */
		cycles = (year - 100) / 400;
		rem = (year - 100) % 400;
		if (rem < 0) {
			cycles--;
			rem += 400;
		}
		if (!rem) {
			is_leap = 1;
		} else {
			if (rem >= 300) {
				centuries = 3;
				rem -= 300;
			} else if (rem >= 200) {
				centuries = 2;
				rem -= 200;
			} else if (rem >= 100) {
				centuries = 1;
				rem -= 100;
			}
			if (rem) {
				leaps = rem / 4U;
				rem %= 4U;
				is_leap = !rem;
			}
		}
		leaps += 97 * cycles + 24 * centuries - is_leap;
		/* 946684800 is 2000-01-01 00:00:00; the extra day is the leap day
		   of 2000, which the leap count above does not include */
		t = (year - 100) * 31536000LL + leaps * 86400LL + 946684800 + 86400;
	}
	t += secs_through_month[mon];
	if (is_leap && mon >= 2)
		t += 86400;
	t += 86400LL * (day - 1);
	t += 3600LL * hour;
	t += 60LL * min;
	t += sec;

	return t;
}
103
/*
 * Parse a time in the format "Wed May 27 04:12:34 +0000 2020".
 * The timezone field is read but not interpreted: the offset is assumed to
 * be "+0000".
 *
 * Returns 0 on success and stores the timestamp in *tp when tp is not NULL.
 * Returns -1 when the text does not match the format, the month name is
 * unknown or a field is out of range.
 */
static int
parsetime(const char *s, time_t *tp)
{
	static char *monthnames[] = {
		"Jan", "Feb", "Mar", "Apr", "May", "Jun",
		"Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
	};
	char wday[4], monname[4], tz[6];
	int year, mday, hour, min, sec;
	int mon, nfields;

	/* skip leading whitespace */
	while (*s && isspace((unsigned char)*s))
		s++;

	nfields = sscanf(s, "%3s %3s %02d %02d:%02d:%02d %5s %4d",
	                 wday, monname, &mday, &hour, &min, &sec, tz, &year);
	if (nfields != 8)
		return -1;

	/* month as the number of months since January, 12 when unknown */
	for (mon = 0; mon < 12; mon++) {
		if (!strcmp(monthnames[mon], monname))
			break;
	}
	if (mon == 12)
		return -1;

	/* every field must be in its valid range */
	if (!(year >= 0 && year <= 9999 &&
	      mday >= 1 && mday <= 31 &&
	      hour >= 0 && hour <= 23 &&
	      min >= 0 && min <= 59 &&
	      sec >= 0 && sec <= 59))
		return -1;

	if (tp)
		*tp = datetounix(year - 1900, mon, mday, hour, min, sec);
	return 0;
}
144
/* write s to stdout without its control characters */
static void
printescape(const char *s)
{
	int c;

	while ((c = (unsigned char)*s++) != '\0') {
		if (iscntrl(c))
			continue;
		putchar(c);
	}
}
153
154 /* print text and expand urls */
155 static void
156 printexpand(const char *s)
157 {
158 struct replacement *r;
159
160 for (; *s; s++) {
161 if (isspace((unsigned char)*s)) {
162 putchar(' ');
163 continue;
164 } else if (iscntrl((unsigned char)*s)) {
165 continue;
166 }
167 for (r = reps; r; r = r->next) {
168 if (!strncmp(s, r->search, r->search_len)) {
169 s += r->search_len - 1;
170 printescape(r->replace);
171 break;
172 }
173 }
174 if (!r)
175 putchar(*s);
176 }
177 }
178
179 static void
180 printtweet(struct tweet *t)
181 {
182 if (t->timestamp != -1)
183 printf("%lld", (long long)t->timestamp);
184 putchar('\t');
185 printescape(t->username);
186 putchar('\t');
187 printescape(t->fullname);
188 putchar('\t');
189 printexpand(t->full_text);
190 putchar('\t');
191 printescape(t->itemid);
192 putchar('\t');
193 if (t->itemusername[0])
194 printescape(t->itemusername);
195 else
196 printescape(t->username);
197 putchar('\t');
198 if (t->itemfullname[0])
199 printescape(t->itemfullname);
200 else
201 printescape(t->fullname);
202 putchar('\t');
203 printescape(t->retweetid);
204 putchar('\t');
205 printf("%d", t->ispinned);
206 putchar('\n');
207 }
208
209 void
210 addpinned(const char *str)
211 {
212 if (npinned + 1 >= MAX_PINNED)
213 return;
214 strlcpy(pinnedids[npinned], str, sizeof(pinnedids[0]));
215 npinned++;
216 }
217
218 void
219 addtweet(void)
220 {
221 struct tweet *t;
222
223 if (!(t = calloc(1, sizeof(*t))))
224 err(1, "calloc");
225 t->timestamp = -1;
226 if (tweets)
227 tc = tc->next = t;
228 else
229 tweets = tc = t;
230 }
231
232 void
233 addreplacement(const char *search, const char *replace)
234 {
235 struct replacement *r;
236
237 for (r = reps; r; r = r->next) {
238 if (!strncmp(search, r->search, r->search_len))
239 return;
240 }
241
242 if (!(r = calloc(1, sizeof(*r))))
243 err(1, "calloc");
244 strlcpy(r->search, search, sizeof(r->search));
245 r->search_len = strlen(r->search);
246 strlcpy(r->replace, replace, sizeof(r->replace));
247
248 if (reps)
249 rc = rc->next = r;
250 else
251 reps = rc = r;
252 }
253
254 void
255 processnodes(struct json_node *nodes, size_t depth, const char *str)
256 {
257 if (depth == 2 &&
258 nodes[0].type == JSON_TYPE_ARRAY &&
259 nodes[1].type == JSON_TYPE_OBJECT) {
260 addtweet();
261 }
262
263 if (tc) {
264 if (depth == 3 &&
265 nodes[0].type == JSON_TYPE_ARRAY &&
266 nodes[1].type == JSON_TYPE_OBJECT &&
267 nodes[2].type == JSON_TYPE_STRING) {
268 if (!strcmp(nodes[2].name, "created_at")) {
269 parsetime(str, &tc->timestamp);
270 } else if (!strcmp(nodes[2].name, "id_str")) {
271 strlcpy(tc->itemid, str, sizeof(tc->item…
272 } else if (!strcmp(nodes[2].name, "full_text")) {
273 /* if set by retweet text don't override…
274 if (!tc->full_text[0])
275 strlcpy(tc->full_text, str, size…
276 }
277 }
278 if (depth == 4 &&
279 nodes[0].type == JSON_TYPE_ARRAY &&
280 nodes[1].type == JSON_TYPE_OBJECT &&
281 nodes[2].type == JSON_TYPE_OBJECT &&
282 !strcmp(nodes[2].name, "user")) {
283 if (nodes[3].type == JSON_TYPE_STRING) {
284 if (!strcmp(nodes[3].name, "name")) {
285 strlcpy(tc->fullname, str, sizeo…
286 } else if (!strcmp(nodes[3].name, "scree…
287 strlcpy(tc->username, str, sizeo…
288 }
289 }
290 }
291
292 if (depth == 4 &&
293 nodes[0].type == JSON_TYPE_ARRAY &&
294 nodes[1].type == JSON_TYPE_OBJECT &&
295 nodes[2].type == JSON_TYPE_OBJECT &&
296 nodes[3].type == JSON_TYPE_STRING &&
297 !strcmp(nodes[2].name, "retweeted_status")) {
298 if (!strcmp(nodes[3].name, "id_str")) {
299 strlcpy(tc->retweetid, str, sizeof(tc->r…
300 } else if (!strcmp(nodes[3].name, "full_text")) {
301 strlcpy(tc->full_text, str, sizeof(tc->f…
302 }
303 }
304
305 if (depth == 5 &&
306 nodes[0].type == JSON_TYPE_ARRAY &&
307 nodes[1].type == JSON_TYPE_OBJECT &&
308 nodes[2].type == JSON_TYPE_OBJECT &&
309 nodes[3].type == JSON_TYPE_OBJECT &&
310 nodes[4].type == JSON_TYPE_STRING &&
311 !strcmp(nodes[2].name, "retweeted_status") &&
312 !strcmp(nodes[3].name, "user")) {
313 if (!strcmp(nodes[4].name, "name")) {
314 strlcpy(tc->itemfullname, str, sizeof(tc…
315 } else if (!strcmp(nodes[4].name, "screen_name")…
316 strlcpy(tc->itemusername, str, sizeof(tc…
317 }
318 }
319 }
320
321 if (depth == 5 &&
322 nodes[0].type == JSON_TYPE_ARRAY &&
323 nodes[1].type == JSON_TYPE_OBJECT &&
324 nodes[2].type == JSON_TYPE_OBJECT &&
325 !strcmp(nodes[2].name, "user")) {
326 if (nodes[3].type == JSON_TYPE_ARRAY &&
327 !strcmp(nodes[3].name, "pinned_tweet_ids")) {
328 if (nodes[4].type == JSON_TYPE_NUMBER) {
329 addpinned(str);
330 }
331 }
332 }
333
334 if (depth == 6 &&
335 nodes[0].type == JSON_TYPE_ARRAY &&
336 nodes[1].type == JSON_TYPE_OBJECT &&
337 nodes[2].type == JSON_TYPE_OBJECT &&
338 nodes[3].type == JSON_TYPE_ARRAY &&
339 nodes[4].type == JSON_TYPE_OBJECT &&
340 nodes[5].type == JSON_TYPE_STRING &&
341 !strcmp(nodes[2].name, "entities") &&
342 !strcmp(nodes[3].name, "urls")) {
343 if (!strcmp(nodes[5].name, "url")) {
344 strlcpy(url, str, sizeof(url));
345 } else if (!strcmp(nodes[5].name, "expanded_url")) {
346 /* assumes "expanded_url" is specified after "ur…
347 addreplacement(url, str);
348 url[0] = '\0';
349 }
350 }
351
352 /* [].extended_entities.media[].url */
353 if (depth == 6 &&
354 nodes[0].type == JSON_TYPE_ARRAY &&
355 nodes[1].type == JSON_TYPE_OBJECT &&
356 nodes[2].type == JSON_TYPE_OBJECT &&
357 nodes[3].type == JSON_TYPE_ARRAY &&
358 nodes[4].type == JSON_TYPE_OBJECT &&
359 nodes[5].type == JSON_TYPE_STRING &&
360 !strcmp(nodes[2].name, "extended_entities") &&
361 !strcmp(nodes[3].name, "media")) {
362 if (!strcmp(nodes[5].name, "media_url_https")) {
363 strlcpy(media_url, str, sizeof(media_url));
364 } else if (!strcmp(nodes[5].name, "url")) {
365 strlcpy(url, str, sizeof(url));
366 } else if (!strcmp(nodes[5].name, "expanded_url")) {
367 strlcpy(expanded_url, str, sizeof(expanded_url));
368 } else if (!strcmp(nodes[5].name, "type")) {
369 if (!strcmp(str, "photo")) {
370 addreplacement(url, media_url);
371 } else {
372 addreplacement(url, expanded_url);
373 }
374 media_url[0] = url[0] = expanded_url[0] = '\0';
375 }
376 }
377
378 if (depth == 7 &&
379 nodes[0].type == JSON_TYPE_ARRAY &&
380 nodes[1].type == JSON_TYPE_OBJECT &&
381 nodes[2].type == JSON_TYPE_OBJECT &&
382 nodes[3].type == JSON_TYPE_OBJECT &&
383 nodes[4].type == JSON_TYPE_ARRAY &&
384 nodes[5].type == JSON_TYPE_OBJECT &&
385 nodes[6].type == JSON_TYPE_STRING &&
386 !strcmp(nodes[2].name, "retweeted_status") &&
387 !strcmp(nodes[3].name, "entities") &&
388 !strcmp(nodes[4].name, "urls")) {
389 if (!strcmp(nodes[6].name, "url")) {
390 strlcpy(url, str, sizeof(url));
391 } else if (!strcmp(nodes[6].name, "expanded_url")) {
392 addreplacement(url, str);
393 url[0] = '\0';
394 }
395 }
396
397 /* [].retweeted_status.extended_entities.media[].url */
398 if (depth == 7 &&
399 nodes[0].type == JSON_TYPE_ARRAY &&
400 nodes[1].type == JSON_TYPE_OBJECT &&
401 nodes[2].type == JSON_TYPE_OBJECT &&
402 nodes[3].type == JSON_TYPE_OBJECT &&
403 nodes[4].type == JSON_TYPE_ARRAY &&
404 nodes[5].type == JSON_TYPE_OBJECT &&
405 nodes[6].type == JSON_TYPE_STRING &&
406 !strcmp(nodes[2].name, "retweeted_status") &&
407 !strcmp(nodes[3].name, "extended_entities") &&
408 !strcmp(nodes[4].name, "media")) {
409 if (!strcmp(nodes[6].name, "media_url_https")) {
410 strlcpy(media_url, str, sizeof(media_url));
411 } else if (!strcmp(nodes[6].name, "url")) {
412 strlcpy(url, str, sizeof(url));
413 } else if (!strcmp(nodes[6].name, "expanded_url")) {
414 strlcpy(expanded_url, str, sizeof(expanded_url));
415 } else if (!strcmp(nodes[6].name, "type")) {
416 if (!strcmp(str, "photo")) {
417 addreplacement(url, media_url);
418 } else {
419 addreplacement(url, expanded_url);
420 }
421 media_url[0] = url[0] = expanded_url[0] = '\0';
422 }
423 }
424 }
425
426 int
427 main(void)
428 {
429 struct tweet *t;
430 size_t i;
431
432 if (pledge("stdio", NULL) == -1)
433 err(1, "pledge");
434
435 if (parsejson(processnodes))
436 errx(2, "invalid JSON");
437
438 /* replace some HTML entities */
439 addreplacement("&lt;", "<");
440 addreplacement("&gt;", ">");
441 addreplacement("&amp;", "&");
442
443 for (t = tweets; t; t = t->next) {
444 /* check for pinned tweets */
445 for (i = 0; i < npinned; i++) {
446 if (!strcmp(t->itemid, pinnedids[i])) {
447 t->ispinned = 1;
448 break;
449 }
450 }
451 printtweet(t);
452 }
453
454 return 0;
455 }
You are viewing proxied material from codemadness.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.