tscrape.c - tscrape - twitter scraper (not working anymore) | |
git clone git://git.codemadness.org/tscrape | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
tscrape.c (11213B) | |
--- | |
1 #include <sys/types.h> | |
2 | |
3 #include <ctype.h> | |
4 #include <err.h> | |
5 #include <stdlib.h> | |
6 #include <stdio.h> | |
7 #include <string.h> | |
8 #include <strings.h> | |
9 #include <time.h> | |
10 #include <unistd.h> | |
11 | |
12 #include "json.h" | |
13 #include "util.h" | |
14 | |
15 #define STRP(s) s,sizeof(s)-1 | |
16 | |
/* a tweet: one entry of the singly linked list of parsed tweets */
struct tweet {
	/* display name of the timeline user ("user.name") */
	char fullname[1024];
	/* 1 when the item id is one of the collected pinned ids, else 0 */
	int ispinned;
	/* screen name of the original author when this is a retweet */
	char itemusername[1024];
	/* display name of the original author when this is a retweet */
	char itemfullname[1024];
	/* tweet text; the retweeted text is used when present */
	char full_text[4096];
	/* screen name of the timeline user ("user.screen_name") */
	char username[1024];
	/* parsed "created_at" as Unix time, -1 when not set */
	time_t timestamp;
	/* not referenced by the code in this file */
	char datatime[16];
	/* tweet id ("id_str") */
	char itemid[64];
	/* id of the retweeted tweet ("retweeted_status.id_str"), if any */
	char retweetid[64];

	/* next tweet in the list, NULL at the tail */
	struct tweet *next;
};
32 | |
/* a text substitution applied while printing tweet text */
struct replacement {
	/* text to look for, e.g. a shortened URL */
	char search[256];
	/* cached strlen(search) */
	size_t search_len;
	/* text printed instead of the search text */
	char replace[1024];

	/* next replacement in the list, NULL at the tail */
	struct replacement *next;
};
40 | |
/* tweet list: head and most recently appended entry */
static struct tweet *tweets;
static struct tweet *tc;
/* replacement list: head and most recently appended entry */
static struct replacement *reps;
static struct replacement *rc;
/* scratch buffers holding URL fields until a replacement can be registered */
static char expanded_url[1024];
static char media_url[1024];
static char url[256];

/* ids of pinned tweets collected while parsing */
#define MAX_PINNED 5
static char pinnedids[MAX_PINNED][64];
static size_t npinned;
48 | |
/*
 * Convert a broken-down UTC date and time to seconds since the Unix epoch.
 *
 * year: years since 1900 (120 means 2020)
 * mon:  month index, 0 = January; values outside 0-11 are folded into year
 * day:  day of the month, starting at 1
 * hour, min, sec: time of day
 *
 * Returns the number of seconds since 1970-01-01 00:00:00 UTC (negative for
 * earlier dates). day, hour, min and sec are not range checked.
 */
long long
datetounix(long long year, int mon, int day, int hour, int min, int sec)
{
	static const int secs_through_month[] = {
		0, 31 * 86400, 59 * 86400, 90 * 86400,
		120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400,
		243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 };
	int is_leap = 0, cycles, centuries = 0, leaps = 0, rem;
	long long t;

	/* mon indexes the table above: normalize it so an out-of-range
	   month carries into the year instead of reading out of bounds */
	if (mon < 0 || mon >= 12) {
		year += mon / 12;
		mon %= 12;
		if (mon < 0) {
			year--;
			mon += 12;
		}
	}

	if (year - 2ULL <= 136) {
		/* 1902..2038: every 4th year is a leap year in this range,
		   count leap days relative to 1968 */
		leaps = (year - 68) >> 2;
		if (!((year - 68) & 3)) {
			/* the leap day of the current year is added per month below */
			leaps--;
			is_leap = 1;
		} else {
			is_leap = 0;
		}
		t = 31536000 * (year - 70) + 86400 * leaps;
	} else {
		/* general case: count in 400-year cycles relative to 2000 */
		cycles = (year - 100) / 400;
		rem = (year - 100) % 400;
		if (rem < 0) {
			cycles--;
			rem += 400;
		}
		if (!rem) {
			is_leap = 1;
		} else {
			if (rem >= 300)
				centuries = 3, rem -= 300;
			else if (rem >= 200)
				centuries = 2, rem -= 200;
			else if (rem >= 100)
				centuries = 1, rem -= 100;
			if (rem) {
				leaps = rem / 4U;
				rem %= 4U;
				is_leap = !rem;
			}
		}
		leaps += 97 * cycles + 24 * centuries - is_leap;
		/* 946684800 is 2000-01-01 00:00:00 UTC, the extra day is the
		   leap day of the year 2000 itself */
		t = (year - 100) * 31536000LL + leaps * 86400LL + 946684800 + 86400;
	}
	t += secs_through_month[mon];
	if (is_leap && mon >= 2)
		t += 86400;
	t += 86400LL * (day - 1);
	t += 3600LL * hour;
	t += 60LL * min;
	t += sec;

	return t;
}
103 | |
/*
 * Parse a timestamp of the form "Wed May 27 04:12:34 +0000 2020".
 * Leading whitespace is skipped. The timezone field is read but not
 * interpreted: the offset is assumed to be "+0000".
 *
 * s:  string to parse
 * tp: when not NULL receives the parsed time as Unix time
 *
 * Returns 0 on success, -1 when the string does not match the format, the
 * month name is unknown or a field is out of range.
 */
static int
parsetime(const char *s, time_t *tp)
{
	static const char *const mons[] = {
		"Jan", "Feb", "Mar", "Apr", "May", "Jun",
		"Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
	};
	int year, mon = 0, mday, hour, min, sec;
	size_t i;
	char tzbuf[6], monbuf[4], wdaybuf[4];

	for (; *s && isspace((unsigned char)*s); s++)
		;
	/* field widths keep the three text fields within their buffers */
	if (sscanf(s, "%3s %3s %02d %02d:%02d:%02d %5s %4d",
	           wdaybuf, monbuf, &mday, &hour, &min, &sec, tzbuf, &year) != 8)
		return -1;
	/* month name to number (1-12), 0 means not found */
	for (i = 0; i < sizeof(mons) / sizeof(mons[0]); i++) {
		if (!strcmp(mons[i], monbuf)) {
			mon = (int)i + 1;
			break;
		}
	}
	if (mon == 0)
		return -1;

	/* invalid range */
	if (year < 0 || year > 9999 ||
	    mon < 1 || mon > 12 ||
	    mday < 1 || mday > 31 ||
	    hour < 0 || hour > 23 ||
	    min < 0 || min > 59 ||
	    sec < 0 || sec > 59)
		return -1;

	/* datetounix() takes years since 1900 and a 0-based month */
	if (tp)
		*tp = datetounix(year - 1900, mon - 1, mday, hour, min, sec);
	return 0;
}
144 | |
/* Write s to stdout, leaving out every control character. */
static void
printescape(const char *s)
{
	const unsigned char *p = (const unsigned char *)s;

	while (*p != '\0') {
		if (!iscntrl(*p))
			putchar(*p);
		p++;
	}
}
153 | |
154 /* print text and expand urls */ | |
155 static void | |
156 printexpand(const char *s) | |
157 { | |
158 struct replacement *r; | |
159 | |
160 for (; *s; s++) { | |
161 if (isspace((unsigned char)*s)) { | |
162 putchar(' '); | |
163 continue; | |
164 } else if (iscntrl((unsigned char)*s)) { | |
165 continue; | |
166 } | |
167 for (r = reps; r; r = r->next) { | |
168 if (!strncmp(s, r->search, r->search_len)) { | |
169 s += r->search_len - 1; | |
170 printescape(r->replace); | |
171 break; | |
172 } | |
173 } | |
174 if (!r) | |
175 putchar(*s); | |
176 } | |
177 } | |
178 | |
179 static void | |
180 printtweet(struct tweet *t) | |
181 { | |
182 if (t->timestamp != -1) | |
183 printf("%lld", (long long)t->timestamp); | |
184 putchar('\t'); | |
185 printescape(t->username); | |
186 putchar('\t'); | |
187 printescape(t->fullname); | |
188 putchar('\t'); | |
189 printexpand(t->full_text); | |
190 putchar('\t'); | |
191 printescape(t->itemid); | |
192 putchar('\t'); | |
193 if (t->itemusername[0]) | |
194 printescape(t->itemusername); | |
195 else | |
196 printescape(t->username); | |
197 putchar('\t'); | |
198 if (t->itemfullname[0]) | |
199 printescape(t->itemfullname); | |
200 else | |
201 printescape(t->fullname); | |
202 putchar('\t'); | |
203 printescape(t->retweetid); | |
204 putchar('\t'); | |
205 printf("%d", t->ispinned); | |
206 putchar('\n'); | |
207 } | |
208 | |
209 void | |
210 addpinned(const char *str) | |
211 { | |
212 if (npinned + 1 >= MAX_PINNED) | |
213 return; | |
214 strlcpy(pinnedids[npinned], str, sizeof(pinnedids[0])); | |
215 npinned++; | |
216 } | |
217 | |
218 void | |
219 addtweet(void) | |
220 { | |
221 struct tweet *t; | |
222 | |
223 if (!(t = calloc(1, sizeof(*t)))) | |
224 err(1, "calloc"); | |
225 t->timestamp = -1; | |
226 if (tweets) | |
227 tc = tc->next = t; | |
228 else | |
229 tweets = tc = t; | |
230 } | |
231 | |
232 void | |
233 addreplacement(const char *search, const char *replace) | |
234 { | |
235 struct replacement *r; | |
236 | |
237 for (r = reps; r; r = r->next) { | |
238 if (!strncmp(search, r->search, r->search_len)) | |
239 return; | |
240 } | |
241 | |
242 if (!(r = calloc(1, sizeof(*r)))) | |
243 err(1, "calloc"); | |
244 strlcpy(r->search, search, sizeof(r->search)); | |
245 r->search_len = strlen(r->search); | |
246 strlcpy(r->replace, replace, sizeof(r->replace)); | |
247 | |
248 if (reps) | |
249 rc = rc->next = r; | |
250 else | |
251 reps = rc = r; | |
252 } | |
253 | |
254 void | |
255 processnodes(struct json_node *nodes, size_t depth, const char *str) | |
256 { | |
257 if (depth == 2 && | |
258 nodes[0].type == JSON_TYPE_ARRAY && | |
259 nodes[1].type == JSON_TYPE_OBJECT) { | |
260 addtweet(); | |
261 } | |
262 | |
263 if (tc) { | |
264 if (depth == 3 && | |
265 nodes[0].type == JSON_TYPE_ARRAY && | |
266 nodes[1].type == JSON_TYPE_OBJECT && | |
267 nodes[2].type == JSON_TYPE_STRING) { | |
268 if (!strcmp(nodes[2].name, "created_at")) { | |
269 parsetime(str, &tc->timestamp); | |
270 } else if (!strcmp(nodes[2].name, "id_str")) { | |
271 strlcpy(tc->itemid, str, sizeof(tc->item… | |
272 } else if (!strcmp(nodes[2].name, "full_text")) { | |
273 /* if set by retweet text don't override… | |
274 if (!tc->full_text[0]) | |
275 strlcpy(tc->full_text, str, size… | |
276 } | |
277 } | |
278 if (depth == 4 && | |
279 nodes[0].type == JSON_TYPE_ARRAY && | |
280 nodes[1].type == JSON_TYPE_OBJECT && | |
281 nodes[2].type == JSON_TYPE_OBJECT && | |
282 !strcmp(nodes[2].name, "user")) { | |
283 if (nodes[3].type == JSON_TYPE_STRING) { | |
284 if (!strcmp(nodes[3].name, "name")) { | |
285 strlcpy(tc->fullname, str, sizeo… | |
286 } else if (!strcmp(nodes[3].name, "scree… | |
287 strlcpy(tc->username, str, sizeo… | |
288 } | |
289 } | |
290 } | |
291 | |
292 if (depth == 4 && | |
293 nodes[0].type == JSON_TYPE_ARRAY && | |
294 nodes[1].type == JSON_TYPE_OBJECT && | |
295 nodes[2].type == JSON_TYPE_OBJECT && | |
296 nodes[3].type == JSON_TYPE_STRING && | |
297 !strcmp(nodes[2].name, "retweeted_status")) { | |
298 if (!strcmp(nodes[3].name, "id_str")) { | |
299 strlcpy(tc->retweetid, str, sizeof(tc->r… | |
300 } else if (!strcmp(nodes[3].name, "full_text")) { | |
301 strlcpy(tc->full_text, str, sizeof(tc->f… | |
302 } | |
303 } | |
304 | |
305 if (depth == 5 && | |
306 nodes[0].type == JSON_TYPE_ARRAY && | |
307 nodes[1].type == JSON_TYPE_OBJECT && | |
308 nodes[2].type == JSON_TYPE_OBJECT && | |
309 nodes[3].type == JSON_TYPE_OBJECT && | |
310 nodes[4].type == JSON_TYPE_STRING && | |
311 !strcmp(nodes[2].name, "retweeted_status") && | |
312 !strcmp(nodes[3].name, "user")) { | |
313 if (!strcmp(nodes[4].name, "name")) { | |
314 strlcpy(tc->itemfullname, str, sizeof(tc… | |
315 } else if (!strcmp(nodes[4].name, "screen_name")… | |
316 strlcpy(tc->itemusername, str, sizeof(tc… | |
317 } | |
318 } | |
319 } | |
320 | |
321 if (depth == 5 && | |
322 nodes[0].type == JSON_TYPE_ARRAY && | |
323 nodes[1].type == JSON_TYPE_OBJECT && | |
324 nodes[2].type == JSON_TYPE_OBJECT && | |
325 !strcmp(nodes[2].name, "user")) { | |
326 if (nodes[3].type == JSON_TYPE_ARRAY && | |
327 !strcmp(nodes[3].name, "pinned_tweet_ids")) { | |
328 if (nodes[4].type == JSON_TYPE_NUMBER) { | |
329 addpinned(str); | |
330 } | |
331 } | |
332 } | |
333 | |
334 if (depth == 6 && | |
335 nodes[0].type == JSON_TYPE_ARRAY && | |
336 nodes[1].type == JSON_TYPE_OBJECT && | |
337 nodes[2].type == JSON_TYPE_OBJECT && | |
338 nodes[3].type == JSON_TYPE_ARRAY && | |
339 nodes[4].type == JSON_TYPE_OBJECT && | |
340 nodes[5].type == JSON_TYPE_STRING && | |
341 !strcmp(nodes[2].name, "entities") && | |
342 !strcmp(nodes[3].name, "urls")) { | |
343 if (!strcmp(nodes[5].name, "url")) { | |
344 strlcpy(url, str, sizeof(url)); | |
345 } else if (!strcmp(nodes[5].name, "expanded_url")) { | |
346 /* assumes "expanded_url" is specified after "ur… | |
347 addreplacement(url, str); | |
348 url[0] = '\0'; | |
349 } | |
350 } | |
351 | |
352 /* [].extended_entities.media[].url */ | |
353 if (depth == 6 && | |
354 nodes[0].type == JSON_TYPE_ARRAY && | |
355 nodes[1].type == JSON_TYPE_OBJECT && | |
356 nodes[2].type == JSON_TYPE_OBJECT && | |
357 nodes[3].type == JSON_TYPE_ARRAY && | |
358 nodes[4].type == JSON_TYPE_OBJECT && | |
359 nodes[5].type == JSON_TYPE_STRING && | |
360 !strcmp(nodes[2].name, "extended_entities") && | |
361 !strcmp(nodes[3].name, "media")) { | |
362 if (!strcmp(nodes[5].name, "media_url_https")) { | |
363 strlcpy(media_url, str, sizeof(media_url)); | |
364 } else if (!strcmp(nodes[5].name, "url")) { | |
365 strlcpy(url, str, sizeof(url)); | |
366 } else if (!strcmp(nodes[5].name, "expanded_url")) { | |
367 strlcpy(expanded_url, str, sizeof(expanded_url)); | |
368 } else if (!strcmp(nodes[5].name, "type")) { | |
369 if (!strcmp(str, "photo")) { | |
370 addreplacement(url, media_url); | |
371 } else { | |
372 addreplacement(url, expanded_url); | |
373 } | |
374 media_url[0] = url[0] = expanded_url[0] = '\0'; | |
375 } | |
376 } | |
377 | |
378 if (depth == 7 && | |
379 nodes[0].type == JSON_TYPE_ARRAY && | |
380 nodes[1].type == JSON_TYPE_OBJECT && | |
381 nodes[2].type == JSON_TYPE_OBJECT && | |
382 nodes[3].type == JSON_TYPE_OBJECT && | |
383 nodes[4].type == JSON_TYPE_ARRAY && | |
384 nodes[5].type == JSON_TYPE_OBJECT && | |
385 nodes[6].type == JSON_TYPE_STRING && | |
386 !strcmp(nodes[2].name, "retweeted_status") && | |
387 !strcmp(nodes[3].name, "entities") && | |
388 !strcmp(nodes[4].name, "urls")) { | |
389 if (!strcmp(nodes[6].name, "url")) { | |
390 strlcpy(url, str, sizeof(url)); | |
391 } else if (!strcmp(nodes[6].name, "expanded_url")) { | |
392 addreplacement(url, str); | |
393 url[0] = '\0'; | |
394 } | |
395 } | |
396 | |
397 /* [].retweeted_status.extended_entities.media[].url */ | |
398 if (depth == 7 && | |
399 nodes[0].type == JSON_TYPE_ARRAY && | |
400 nodes[1].type == JSON_TYPE_OBJECT && | |
401 nodes[2].type == JSON_TYPE_OBJECT && | |
402 nodes[3].type == JSON_TYPE_OBJECT && | |
403 nodes[4].type == JSON_TYPE_ARRAY && | |
404 nodes[5].type == JSON_TYPE_OBJECT && | |
405 nodes[6].type == JSON_TYPE_STRING && | |
406 !strcmp(nodes[2].name, "retweeted_status") && | |
407 !strcmp(nodes[3].name, "extended_entities") && | |
408 !strcmp(nodes[4].name, "media")) { | |
409 if (!strcmp(nodes[6].name, "media_url_https")) { | |
410 strlcpy(media_url, str, sizeof(media_url)); | |
411 } else if (!strcmp(nodes[6].name, "url")) { | |
412 strlcpy(url, str, sizeof(url)); | |
413 } else if (!strcmp(nodes[6].name, "expanded_url")) { | |
414 strlcpy(expanded_url, str, sizeof(expanded_url)); | |
415 } else if (!strcmp(nodes[6].name, "type")) { | |
416 if (!strcmp(str, "photo")) { | |
417 addreplacement(url, media_url); | |
418 } else { | |
419 addreplacement(url, expanded_url); | |
420 } | |
421 media_url[0] = url[0] = expanded_url[0] = '\0'; | |
422 } | |
423 } | |
424 } | |
425 | |
426 int | |
427 main(void) | |
428 { | |
429 struct tweet *t; | |
430 size_t i; | |
431 | |
432 if (pledge("stdio", NULL) == -1) | |
433 err(1, "pledge"); | |
434 | |
435 if (parsejson(processnodes)) | |
436 errx(2, "invalid JSON"); | |
437 | |
438 /* replace some HTML entities */ | |
439 addreplacement("<", "<"); | |
440 addreplacement(">", ">"); | |
441 addreplacement("&", "&"); | |
442 | |
443 for (t = tweets; t; t = t->next) { | |
444 /* check for pinned tweets */ | |
445 for (i = 0; i < npinned; i++) { | |
446 if (!strcmp(t->itemid, pinnedids[i])) { | |
447 t->ispinned = 1; | |
448 break; | |
449 } | |
450 } | |
451 printtweet(t); | |
452 } | |
453 | |
454 return 0; | |
455 } |