jf2sfeed.c - jfconvert - JSON Feed (subset) to sfeed or Atom converter | |
git clone git://git.codemadness.org/jfconvert | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
jf2sfeed.c (14382B) | |
--- | |
1 #include <errno.h> | |
2 #include <stdarg.h> | |
3 #include <stdint.h> | |
4 #include <stdio.h> | |
5 #include <stdlib.h> | |
6 #include <string.h> | |
7 | |
8 #ifdef __OpenBSD__ | |
9 #include <unistd.h> | |
10 #else | |
11 #define pledge(a,b) 0 | |
12 #endif | |
13 | |
14 #include "json.h" | |
15 | |
16 /* hint for compilers and static analyzers that a function exits */ | |
17 #ifndef __dead | |
18 #define __dead | |
19 #endif | |
20 | |
/* ctype-like macros, but always compatible with ASCII / UTF-8.
   The argument is parenthesized before it is cast, so a compound expression
   is classified as a whole instead of the cast binding to its first operand.
   ISCNTRL and ISSPACE evaluate their argument more than once: do not pass
   expressions with side effects. */
#define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
#define ISCNTRL(c) ((c) < ' ' || (c) == 0x7f)
#define ISDIGIT(c) (((unsigned)(c)) - '0' < 10)
#define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))

/* compare attributes case-sensitively */
#define attrcmp strcmp
29 | |
/* Indexes into fields[], one for each value collected for an item.
   FeedFieldLast is the number of fields, not a field itself. */
enum {
	FeedFieldTime = 0,
	FeedFieldTitle,
	FeedFieldLink,
	FeedFieldContent,
	FeedFieldId,
	FeedFieldAuthor,
	FeedFieldEnclosure,
	FeedFieldCategory,
	FeedFieldLast
};
35 | |
/* Kind of content stored for an item. The values are also used as indexes
   into contenttypes[]. */
enum ContentType {
	ContentTypeNone  = 0,
	ContentTypePlain = 1,
	ContentTypeHTML  = 2
};
/* Text written to the output for each ContentType value. */
static const char *contenttypes[] = {
	[ContentTypeNone]  = "",
	[ContentTypePlain] = "plain",
	[ContentTypeHTML]  = "html"
};
42 | |
/* String data / memory pool: a growable byte buffer with its current length
   and capacity. */
typedef struct string {
	char   *data;   /* string bytes */
	size_t  len;    /* length of the string in bytes */
	size_t  bufsiz; /* number of bytes allocated for data */
} String;
49 | |
50 static String fields[FeedFieldLast]; /* data for current item */ | |
51 static enum ContentType contenttype; /* content-type for item */ | |
52 static int itemisopen = 0; | |
53 | |
54 static const int FieldSeparator = '\t'; | |
55 /* separator for multiple values in a field, separator should be 1 byte … | |
56 static const char FieldMultiSeparator[] = "|"; | |
57 | |
58 /* print to stderr, print error message of errno and exit(). | |
59 Unlike BSD err() it does not prefix __progname */ | |
60 __dead void | |
61 err(int exitstatus, const char *fmt, ...) | |
62 { | |
63 va_list ap; | |
64 int saved_errno; | |
65 | |
66 saved_errno = errno; | |
67 | |
68 if (fmt) { | |
69 va_start(ap, fmt); | |
70 vfprintf(stderr, fmt, ap); | |
71 va_end(ap); | |
72 fputs(": ", stderr); | |
73 } | |
74 fprintf(stderr, "%s\n", strerror(saved_errno)); | |
75 | |
76 exit(exitstatus); | |
77 } | |
78 | |
79 /* print to stderr and exit(). | |
80 Unlike BSD errx() it does not prefix __progname */ | |
81 __dead void | |
82 errx(int exitstatus, const char *fmt, ...) | |
83 { | |
84 va_list ap; | |
85 | |
86 if (fmt) { | |
87 va_start(ap, fmt); | |
88 vfprintf(stderr, fmt, ap); | |
89 va_end(ap); | |
90 } | |
91 fputs("\n", stderr); | |
92 | |
93 exit(exitstatus); | |
94 } | |
95 | |
/* Convert time fields to a UNIX timestamp: seconds since
   1970-01-01 00:00:00, negative for earlier dates. No timezone is applied.
   Returns a signed (at least) 64-bit value.
   Parameters should be passed as they are in a struct tm:
   that is: year = year - 1900, month = month - 1.
   `mon` must be in the range 0-11 because it indexes a lookup table. None of
   the fields are range-checked here: the caller has to validate them. */
static long long
datetounix(long long year, int mon, int day, int hour, int min, int sec)
{
	/* seconds elapsed before the start of each month in a regular
	   (non-leap) year */
	static const long secs_through_month[] = {
		0, 31 * 86400, 59 * 86400, 90 * 86400,
		120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400,
		243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 };
	int is_leap = 0, cycles, centuries = 0, leaps = 0, rem;
	long long t;

	/* optimization: handle common range year 1902 up to and including 2038 */
	if (year - 2ULL <= 136) {
		/* Years relative to 1968 (a leap year), split into whole
		   4-year periods and a remainder in the range 0-3.
		   This is negative for years before 1968, so floor division
		   is spelled out instead of using ">> 2" and "& 3": the result
		   of right-shifting a negative value is implementation-defined. */
		int since1968 = (int)(year - 68);

		leaps = since1968 / 4;
		rem = since1968 % 4;
		if (rem < 0) {
			leaps--;
			rem += 4;
		}
		/* amount of leap days relative to 1970: every 4 years.
		   The leap day of a leap year itself is added below, and only
		   when the date is past February. */
		if (!rem) {
			leaps--;
			is_leap = 1;
		}
		t = 31536000 * (year - 70) + (86400 * leaps); /* 365 * 86400 = 31536000 */
	} else {
		/* general leap year calculation, relative to the year 2000:
		   leap years occur mostly every 4 years but every 100 years
		   a leap year is skipped unless the year is divisible by 400 */
		cycles = (year - 100) / 400;
		rem = (year - 100) % 400;
		if (rem < 0) {
			/* make the remainder non-negative (floor division) */
			cycles--;
			rem += 400;
		}
		if (!rem) {
			is_leap = 1;
		} else {
			if (rem >= 300) {
				centuries = 3;
				rem -= 300;
			} else if (rem >= 200) {
				centuries = 2;
				rem -= 200;
			} else if (rem >= 100) {
				centuries = 1;
				rem -= 100;
			}
			if (rem) {
				leaps = rem / 4U;
				rem %= 4U;
				is_leap = !rem;
			}
		}
		/* 97 leap days per 400 years, 24 per (non-leap) century */
		leaps += (97 * cycles) + (24 * centuries) - is_leap;

		/* adjust 8 leap days from 1970 up to and including 2000:
		   ((30 * 365) + 8) * 86400 = 946771200 */
		t = ((year - 100) * 31536000LL) + (leaps * 86400LL) + 946771200LL;
	}
	t += secs_through_month[mon];
	/* the leap day only counts from March onwards */
	if (is_leap && mon >= 2)
		t += 86400;
	t += 86400LL * (day - 1);
	t += 3600LL * hour;
	t += 60LL * min;
	t += sec;

	return t;
}
166 | |
/* Get the timezone offset from string `s`, as seconds from UTC.
   Leading whitespace is skipped. A numeric offset starts with '+' or '-',
   followed by up to two hour digits, an optional ':' and up to two minute
   digits, for example "+01:00", "-0530" or "+02". The result is negative
   for '-'. Anything else, such as a timezone name, gives 0. */
static long
gettzoffset(const char *s)
{
	long hours = 0, minutes = 0, sign;
	const char *cur;
	int ndigits;

	while (ISSPACE((unsigned char)*s))
		s++;

	/* only numeric offsets are handled, not timezone names */
	if (*s != '-' && *s != '+')
		return 0;
	sign = (*s == '-') ? -1 : 1;
	cur = s + 1;

	for (ndigits = 0; ndigits < 2 && ISDIGIT((unsigned char)*cur); ndigits++, cur++)
		hours = (hours * 10) + (*cur - '0');
	if (*cur == ':')
		cur++;
	for (ndigits = 0; ndigits < 2 && ISDIGIT((unsigned char)*cur); ndigits++, cur++)
		minutes = (minutes * 10) + (*cur - '0');

	return ((hours * 3600) + (minutes * 60)) * sign;
}
192 | |
/* Parse time string `s` and store it as a UNIX timestamp in `tp`.
   Accepted formats are "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" and
   "%Y%m%d%H%M%S"; the separators are optional. The string must start (after
   optional whitespace) with a 4-digit year. Parts that are missing at the
   end stay 0. A fraction of a second ("." followed by digits) is skipped and
   a numeric timezone offset that follows is applied.
   Returns 0 on success or -1 on failure, in which case `tp` is not written. */
static int
parsetime(const char *s, long long *tp)
{
	/* year, month, day, hour, minute, second */
	int parts[6] = { 0 };
	int idx, maxdigits, ndigits, num;

	while (ISSPACE((unsigned char)*s))
		s++;

	/* a 4-digit year is required */
	for (idx = 0; idx < 4; idx++) {
		if (!ISDIGIT((unsigned char)s[idx]))
			return -1;
	}

	/* parse date and time parts: 4 digits for the year, 2 for the others */
	for (idx = 0; *s != '\0' && idx < 6; idx++) {
		maxdigits = (idx == 0) ? 4 : 2;
		num = 0;
		for (ndigits = 0; ndigits < maxdigits &&
		     ISDIGIT((unsigned char)*s); ndigits++, s++)
			num = (num * 10) + (*s - '0');
		parts[idx] = num;

		/* skip one separator if present: '-' inside the date, 'T' or
		   whitespace after the date and ':' inside the time */
		if (idx < 2) {
			if (*s == '-')
				s++;
		} else if (idx == 2) {
			if (*s == 'T' || ISSPACE((unsigned char)*s))
				s++;
		} else if (*s == ':') {
			s++;
		}
	}

	/* skip milliseconds in for example: "%Y-%m-%dT%H:%M:%S.000Z" */
	if (*s == '.') {
		do {
			s++;
		} while (ISDIGIT((unsigned char)*s));
	}

	/* invalid range */
	if (parts[0] < 0 || parts[0] > 9999 ||
	    parts[1] < 1 || parts[1] > 12 ||
	    parts[2] < 1 || parts[2] > 31 ||
	    parts[3] < 0 || parts[3] > 23 ||
	    parts[4] < 0 || parts[4] > 59 ||
	    parts[5] < 0 || parts[5] > 60) /* allow leap second */
		return -1;

	/* the parsed time is local to the given offset: subtract it for UTC */
	*tp = datetounix(parts[0] - 1900, parts[1] - 1, parts[2],
	                 parts[3], parts[4], parts[5]) -
	      gettzoffset(s);

	return 0;
}
246 | |
/* Handle read or write errors for a FILE * stream.
   mode 'r': exit with an error if the stream has its error indicator set.
   mode 'w': flush the stream and exit with an error if flushing fails or
   the error indicator is set.
   Any other mode does nothing. `name` is only used in the error message. */
static void
checkfileerror(FILE *fp, const char *name, int mode)
{
	switch (mode) {
	case 'r':
		if (ferror(fp))
			errx(1, "read error: %s", name);
		break;
	case 'w':
		if (fflush(fp) || ferror(fp))
			errx(1, "write error: %s", name);
		break;
	default:
		break;
	}
}
256 | |
257 /* Clear string only; don't free, prevents unnecessary reallocation. */ | |
258 static void | |
259 string_clear(String *s) | |
260 { | |
261 if (s->data) | |
262 s->data[0] = '\0'; | |
263 s->len = 0; | |
264 } | |
265 | |
266 static void | |
267 string_buffer_realloc(String *s, size_t newlen) | |
268 { | |
269 size_t alloclen; | |
270 | |
271 if (newlen > SIZE_MAX / 2) { | |
272 alloclen = SIZE_MAX; | |
273 } else { | |
274 for (alloclen = 64; alloclen <= newlen; alloclen *= 2) | |
275 ; | |
276 } | |
277 if (!(s->data = realloc(s->data, alloclen))) | |
278 err(1, "realloc"); | |
279 s->bufsiz = alloclen; | |
280 } | |
281 | |
282 /* Append data to String, s->data and data may not overlap. */ | |
283 static void | |
284 string_append(String *s, const char *data, size_t len) | |
285 { | |
286 if (!len) | |
287 return; | |
288 | |
289 if (s->len >= SIZE_MAX - len) { | |
290 errno = ENOMEM; | |
291 err(1, "realloc"); | |
292 } | |
293 | |
294 /* check if allocation is necessary, never shrink the buffer. */ | |
295 if (s->len + len >= s->bufsiz) | |
296 string_buffer_realloc(s, s->len + len + 1); | |
297 memcpy(s->data + s->len, data, len); | |
298 s->len += len; | |
299 s->data[s->len] = '\0'; | |
300 } | |
301 | |
302 /* Clear and append string */ | |
303 static void | |
304 string_set(String *s, const char *data, size_t len) | |
305 { | |
306 string_clear(s); | |
307 string_append(s, data, len); | |
308 } | |
309 | |
310 /* Print text, encode TABs, newlines and '\', remove other whitespace. | |
311 * Remove leading and trailing whitespace. */ | |
312 static void | |
313 string_print_encoded(String *s) | |
314 { | |
315 const char *p, *e; | |
316 | |
317 if (!s->data || !s->len) | |
318 return; | |
319 | |
320 p = s->data; | |
321 e = p + s->len; | |
322 | |
323 for (; *p && p != e; p++) { | |
324 switch (*p) { | |
325 case '\n': putchar('\\'); putchar('n'); break; | |
326 case '\\': putchar('\\'); putchar('\\'); break; | |
327 case '\t': putchar('\\'); putchar('t'); break; | |
328 default: | |
329 /* ignore control chars */ | |
330 if (!ISCNTRL((unsigned char)*p)) | |
331 putchar(*p); | |
332 break; | |
333 } | |
334 } | |
335 } | |
336 | |
337 /* Print text, replace TABs, carriage return and other whitespace with '… | |
338 * Other control chars are removed. Remove leading and trailing whitespa… | |
339 static void | |
340 string_print(String *s) | |
341 { | |
342 const char *p, *e; | |
343 | |
344 if (!s->data || !s->len) | |
345 return; | |
346 | |
347 p = s->data; | |
348 e = s->data + s->len; | |
349 for (; *p && p != e; p++) { | |
350 if (ISSPACE((unsigned char)*p)) | |
351 putchar(' '); /* any whitespace to space */ | |
352 else if (!ISCNTRL((unsigned char)*p)) | |
353 /* ignore other control chars */ | |
354 putchar(*p); | |
355 } | |
356 } | |
357 | |
358 /* Print as UNIX timestamp, print nothing if the time is empty or invali… | |
359 static void | |
360 string_print_timestamp(String *s) | |
361 { | |
362 long long t; | |
363 | |
364 if (!s->data || !s->len) | |
365 return; | |
366 | |
367 if (parsetime(s->data, &t) != -1) | |
368 printf("%lld", t); | |
369 } | |
370 | |
371 static void | |
372 printfields(void) | |
373 { | |
374 string_print_timestamp(&fields[FeedFieldTime]); | |
375 putchar(FieldSeparator); | |
376 string_print(&fields[FeedFieldTitle]); | |
377 putchar(FieldSeparator); | |
378 string_print(&fields[FeedFieldLink]); | |
379 putchar(FieldSeparator); | |
380 string_print_encoded(&fields[FeedFieldContent]); | |
381 putchar(FieldSeparator); | |
382 fputs(contenttypes[contenttype], stdout); | |
383 putchar(FieldSeparator); | |
384 string_print(&fields[FeedFieldId]); | |
385 putchar(FieldSeparator); | |
386 string_print(&fields[FeedFieldAuthor]); | |
387 putchar(FieldSeparator); | |
388 string_print(&fields[FeedFieldEnclosure]); | |
389 putchar(FieldSeparator); | |
390 string_print(&fields[FeedFieldCategory]); | |
391 putchar('\n'); | |
392 | |
393 if (ferror(stdout)) /* check for errors but do not flush */ | |
394 checkfileerror(stdout, "<stdout>", 'w'); | |
395 } | |
396 | |
397 static void | |
398 newitem(void) | |
399 { | |
400 size_t i; | |
401 | |
402 contenttype = ContentTypeNone; | |
403 for (i = 0; i < FeedFieldLast; i++) | |
404 string_clear(&fields[i]); | |
405 | |
406 } | |
407 | |
408 static void | |
409 processnode(struct json_node *nodes, size_t depth, const char *value, si… | |
410 { | |
411 /* item */ | |
412 if (depth == 3) { | |
413 if (nodes[0].type == JSON_TYPE_OBJECT && | |
414 nodes[1].type == JSON_TYPE_ARRAY && | |
415 nodes[2].type == JSON_TYPE_OBJECT && | |
416 !attrcmp(nodes[1].name, "items")) { | |
417 if (itemisopen) | |
418 printfields(); | |
419 newitem(); | |
420 itemisopen = 1; | |
421 } | |
422 } | |
423 | |
424 /* item attributes */ | |
425 if (depth == 4) { | |
426 if (nodes[0].type == JSON_TYPE_OBJECT && | |
427 nodes[1].type == JSON_TYPE_ARRAY && | |
428 nodes[2].type == JSON_TYPE_OBJECT && | |
429 !attrcmp(nodes[1].name, "items")) { | |
430 if (!attrcmp(nodes[3].name, "content_html")) { | |
431 string_set(&fields[FeedFieldContent], va… | |
432 contenttype = ContentTypeHTML; | |
433 } else if (!attrcmp(nodes[3].name, "content_text… | |
434 /* prefer HTML, if summary text is set o… | |
435 if (!fields[FeedFieldContent].len && con… | |
436 string_set(&fields[FeedFieldCont… | |
437 contenttype = ContentTypePlain; | |
438 } | |
439 } else if (!attrcmp(nodes[3].name, "date_publish… | |
440 /* published has higher priority than up… | |
441 string_set(&fields[FeedFieldTime], value… | |
442 } else if (!attrcmp(nodes[3].name, "date_modifie… | |
443 if (!fields[FeedFieldTime].len) | |
444 string_append(&fields[FeedFieldT… | |
445 } else if (!attrcmp(nodes[3].name, "id")) { | |
446 if (!fields[FeedFieldId].len) | |
447 string_append(&fields[FeedFieldI… | |
448 } else if (!attrcmp(nodes[3].name, "summary")) { | |
449 /* only if content_html or content_text … | |
450 if (!fields[FeedFieldContent].len) { | |
451 string_append(&fields[FeedFieldC… | |
452 contenttype = ContentTypePlain; | |
453 } | |
454 } else if (!attrcmp(nodes[3].name, "title")) { | |
455 if (!fields[FeedFieldTitle].len) | |
456 string_set(&fields[FeedFieldTitl… | |
457 } else if (!attrcmp(nodes[3].name, "url")) { | |
458 if (!fields[FeedFieldLink].len) | |
459 string_append(&fields[FeedFieldL… | |
460 } | |
461 } | |
462 } | |
463 | |
464 if (depth == 5) { | |
465 /* 1.0 author name */ | |
466 if (nodes[0].type == JSON_TYPE_OBJECT && | |
467 nodes[1].type == JSON_TYPE_ARRAY && | |
468 nodes[2].type == JSON_TYPE_OBJECT && | |
469 nodes[3].type == JSON_TYPE_OBJECT && | |
470 nodes[4].type == JSON_TYPE_STRING && | |
471 !attrcmp(nodes[1].name, "items") && | |
472 !attrcmp(nodes[3].name, "author") && | |
473 !attrcmp(nodes[4].name, "name")) { | |
474 if (!fields[FeedFieldAuthor].len) | |
475 string_append(&fields[FeedFieldAuthor], … | |
476 } | |
477 | |
478 /* tags / categories */ | |
479 if (nodes[0].type == JSON_TYPE_OBJECT && | |
480 nodes[1].type == JSON_TYPE_ARRAY && | |
481 nodes[2].type == JSON_TYPE_OBJECT && | |
482 nodes[3].type == JSON_TYPE_ARRAY && | |
483 nodes[4].type == JSON_TYPE_STRING && | |
484 !attrcmp(nodes[1].name, "items") && | |
485 !attrcmp(nodes[3].name, "tags")) { | |
486 if (fields[FeedFieldCategory].len) | |
487 string_append(&fields[FeedFieldCategory]… | |
488 sizeof(FieldMultiSeparator… | |
489 string_append(&fields[FeedFieldCategory], value,… | |
490 } | |
491 } | |
492 | |
493 if (depth == 6) { | |
494 /* 1.1 author name */ | |
495 if (nodes[0].type == JSON_TYPE_OBJECT && | |
496 nodes[1].type == JSON_TYPE_ARRAY && | |
497 nodes[2].type == JSON_TYPE_OBJECT && | |
498 nodes[3].type == JSON_TYPE_ARRAY && | |
499 nodes[4].type == JSON_TYPE_OBJECT && | |
500 nodes[5].type == JSON_TYPE_STRING && | |
501 !attrcmp(nodes[1].name, "items") && | |
502 !attrcmp(nodes[3].name, "authors") && | |
503 !attrcmp(nodes[5].name, "name")) { | |
504 if (!fields[FeedFieldAuthor].len) | |
505 string_append(&fields[FeedFieldAuthor], … | |
506 } | |
507 | |
508 /* enclosure attributes */ | |
509 if (nodes[0].type == JSON_TYPE_OBJECT && | |
510 nodes[1].type == JSON_TYPE_ARRAY && | |
511 nodes[2].type == JSON_TYPE_OBJECT && | |
512 nodes[3].type == JSON_TYPE_ARRAY && | |
513 nodes[4].type == JSON_TYPE_OBJECT && | |
514 (nodes[5].type == JSON_TYPE_STRING || nodes[5].type … | |
515 !attrcmp(nodes[1].name, "items") && | |
516 !attrcmp(nodes[3].name, "attachments") && | |
517 !attrcmp(nodes[5].name, "url")) { | |
518 if (!fields[FeedFieldEnclosure].len) | |
519 string_append(&fields[FeedFieldEnclosure… | |
520 } | |
521 } | |
522 | |
523 if (ferror(stdout)) { | |
524 fprintf(stderr, "write error: <stdout>\n"); | |
525 exit(2); | |
526 } | |
527 } | |
528 | |
529 int | |
530 main(int argc, char *argv[]) | |
531 { | |
532 if (pledge("stdio", NULL) == -1) | |
533 err(1, "pledge"); | |
534 | |
535 switch (parsejson(processnode)) { | |
536 case JSON_ERROR_MEM: | |
537 errx(2, "error: cannot allocate enough memory"); | |
538 case JSON_ERROR_INVALID: | |
539 errx(1, "error: invalid JSON"); | |
540 } | |
541 | |
542 if (itemisopen) | |
543 printfields(); | |
544 | |
545 if (ferror(stdin)) | |
546 errx(2, "read error: <stdin>"); | |
547 if (fflush(stdout) || ferror(stdout)) | |
548 errx(2, "write error: <stdout>"); | |
549 | |
550 return 0; | |
551 } |