sfeed.c - sfeed - RSS and Atom parser | |
git clone git://git.codemadness.org/sfeed | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
sfeed.c (30076B) | |
--- | |
1 #include <errno.h> | |
2 #include <stdint.h> | |
3 #include <stdio.h> | |
4 #include <stdlib.h> | |
5 #include <string.h> | |
6 #include <strings.h> | |
7 | |
8 #include "util.h" | |
9 #include "xml.h" | |
10 | |
/* Parser is inside the body of a content element: `iscontent` is set and
 * `iscontenttag` is clear. */
#define ISINCONTENT(ctx)  (!((ctx).iscontenttag) && (ctx).iscontent)
/* Parser is on the start tag of a content element (its attributes are still
 * being read): `iscontenttag` is set and `iscontent` is clear. */
#define ISCONTENTTAG(ctx) ((ctx).iscontenttag && !((ctx).iscontent))

/* Fields that accumulate several separated values instead of a single one. */
#define ISFEEDFIELDMULTI(t) (FeedFieldCategory == (t))

/* Expand a string literal to two arguments: the literal and its byte length
 * (without the terminating NUL). */
#define STRP(s)           s, sizeof(s) - 1
19 | |
/* Feed flavour of the item being parsed; None while outside any item/entry. */
enum FeedType {
	FeedTypeNone, /* 0 */
	FeedTypeRSS,  /* 1 */
	FeedTypeAtom  /* 2 */
};
25 | |
/* Content type of the content field of the current item. */
enum ContentType {
	ContentTypeNone,  /* 0 */
	ContentTypePlain, /* 1 */
	ContentTypeHTML   /* 2 */
};
/* Names written to the output, indexed by enum ContentType. */
static const char *contenttypes[] = {
	[ContentTypeNone]  = "",
	[ContentTypePlain] = "plain",
	[ContentTypeHTML]  = "html"
};
32 | |
/* Growable NUL-terminated byte string. */
struct string {
	char  *data;   /* buffer, NULL until first allocation */
	size_t len;    /* bytes in use, excluding the terminating NUL */
	size_t bufsiz; /* bytes allocated for `data` */
};
typedef struct string String;
39 | |
/* Identifier for every recognised tag.
 * NOTE: the numeric order matters: among tags that map to the same field
 * (content, date, author, ...) a tag with a higher value has a higher
 * priority and replaces data from a tag with a lower value. */
enum TagId {
	TagUnknown = 0,

	/* RSS: dates first, creation date has the higher priority */
	RSSTagDcdate,
	RSSTagPubdate,
	RSSTagTitle,
	RSSTagMediaDescription,
	RSSTagDescription,
	RSSTagContentEncoded,
	RSSTagGuid,
	RSSTagGuidPermalinkFalse,
	RSSTagGuidPermalinkTrue,
	/* must stay after the GUID ids: a permalink GUID can act as the link
	 * and a real <link> has to win over it */
	RSSTagLink,
	RSSTagEnclosure,
	RSSTagAuthor,
	RSSTagDccreator,
	RSSTagCategory,

	/* Atom: dates first, creation date has the higher priority */
	AtomTagModified,
	AtomTagUpdated,
	AtomTagIssued,
	AtomTagPublished,
	AtomTagTitle,
	AtomTagMediaDescription,
	AtomTagSummary,
	AtomTagContent,
	AtomTagId,
	AtomTagLink,
	AtomTagLinkAlternate,
	AtomTagLinkEnclosure,
	AtomTagAuthor,
	AtomTagAuthorName,
	AtomTagCategory,

	TagLast /* number of ids, not a tag */
};
69 | |
70 typedef struct feedtag { | |
71 char *name; /* name of tag to match */ | |
72 size_t len; /* len of `name` */ | |
73 enum TagId id; /* unique ID */ | |
74 } FeedTag; | |
75 | |
76 typedef struct field { | |
77 String str; | |
78 enum TagId tagid; /* tagid set previously, used for tag priority… | |
79 } FeedField; | |
80 | |
/* Indices into FeedContext.fields[]. */
enum {
	FeedFieldTime = 0,
	FeedFieldTitle,
	FeedFieldLink,
	FeedFieldContent,
	FeedFieldId,
	FeedFieldAuthor,
	FeedFieldEnclosure,
	FeedFieldCategory,
	FeedFieldLast /* number of fields */
};
86 | |
87 typedef struct feedcontext { | |
88 String *field; /* current FeedItem field String … | |
89 FeedField fields[FeedFieldLast]; /* data for current item… | |
90 FeedTag tag; /* unique current parsed tag */ | |
91 int iscontent; /* in content data */ | |
92 int iscontenttag; /* in content tag */ | |
93 enum ContentType contenttype; /* content-type for item */ | |
94 enum FeedType feedtype; | |
95 int attrcount; /* count item HTML element attrib… | |
96 } FeedContext; | |
97 | |
98 static long long datetounix(long long, int, int, int, int, int); | |
99 static FeedTag * gettag(enum FeedType, const char *, size_t); | |
100 static long gettzoffset(const char *); | |
101 static int isattr(const char *, size_t, const char *, size_t); | |
102 static int istag(const char *, size_t, const char *, size_t); | |
103 static int parsetime(const char *, long long *); | |
104 static void printfields(void); | |
105 static void string_append(String *, const char *, size_t); | |
106 static void string_buffer_realloc(String *, size_t); | |
107 static void string_clear(String *); | |
108 static void string_print_encoded(String *); | |
109 static void string_print_timestamp(String *); | |
110 static void string_print_trimmed(String *); | |
111 static void string_print_trimmed_multi(String *); | |
112 static void string_print_uri(String *); | |
113 static void xmlattr(XMLParser *, const char *, size_t, const char *, siz… | |
114 const char *, size_t); | |
115 static void xmlattrentity(XMLParser *, const char *, size_t, const char … | |
116 size_t, const char *, size_t); | |
117 static void xmlattrend(XMLParser *, const char *, size_t, const char *, | |
118 size_t); | |
119 static void xmlattrstart(XMLParser *, const char *, size_t, const char *, | |
120 size_t); | |
121 static void xmldata(XMLParser *, const char *, size_t); | |
122 static void xmldataentity(XMLParser *, const char *, size_t); | |
123 static void xmltagend(XMLParser *, const char *, size_t, int); | |
124 static void xmltagstart(XMLParser *, const char *, size_t); | |
125 static void xmltagstartparsed(XMLParser *, const char *, size_t, int); | |
126 | |
127 /* map tag name to TagId type */ | |
128 /* RSS, keep this in alphabetical order */ | |
129 static const FeedTag rsstags[] = { | |
130 { STRP("author"), RSSTagAuthor }, | |
131 { STRP("category"), RSSTagCategory }, | |
132 { STRP("content:encoded"), RSSTagContentEncoded }, | |
133 { STRP("dc:creator"), RSSTagDccreator }, | |
134 { STRP("dc:date"), RSSTagDcdate }, | |
135 { STRP("description"), RSSTagDescription }, | |
136 /* RSS: <enclosure url="" />, Atom has <link rel="enclosure" /> … | |
137 { STRP("enclosure"), RSSTagEnclosure }, | |
138 { STRP("guid"), RSSTagGuid }, | |
139 { STRP("link"), RSSTagLink }, | |
140 { STRP("media:description"), RSSTagMediaDescription }, | |
141 { STRP("pubdate"), RSSTagPubdate }, | |
142 { STRP("title"), RSSTagTitle } | |
143 }; | |
144 | |
145 /* Atom, keep this in alphabetical order */ | |
146 static const FeedTag atomtags[] = { | |
147 { STRP("author"), AtomTagAuthor }, | |
148 { STRP("category"), AtomTagCategory }, | |
149 { STRP("content"), AtomTagContent }, | |
150 { STRP("id"), AtomTagId }, | |
151 { STRP("issued"), AtomTagIssued }, /* Atom … | |
152 /* Atom: <link href="" />, RSS has <link></link> */ | |
153 { STRP("link"), AtomTagLink }, | |
154 { STRP("media:description"), AtomTagMediaDescription }, | |
155 { STRP("modified"), AtomTagModified }, /* Atom … | |
156 { STRP("published"), AtomTagPublished }, | |
157 { STRP("summary"), AtomTagSummary }, | |
158 { STRP("title"), AtomTagTitle }, | |
159 { STRP("updated"), AtomTagUpdated } | |
160 }; | |
161 | |
162 /* special case: nested <author><name> */ | |
163 static const FeedTag atomtagauthor = { STRP("author"), AtomTagAuthor }; | |
164 static const FeedTag atomtagauthorname = { STRP("name"), AtomTagAuthorNa… | |
165 | |
166 /* reference to no / unknown tag */ | |
167 static const FeedTag notag = { STRP(""), TagUnknown }; | |
168 | |
169 /* map TagId type to RSS/Atom field, all tags must be defined */ | |
170 static const int fieldmap[TagLast] = { | |
171 [TagUnknown] = -1, | |
172 /* RSS */ | |
173 [RSSTagDcdate] = FeedFieldTime, | |
174 [RSSTagPubdate] = FeedFieldTime, | |
175 [RSSTagTitle] = FeedFieldTitle, | |
176 [RSSTagMediaDescription] = FeedFieldContent, | |
177 [RSSTagDescription] = FeedFieldContent, | |
178 [RSSTagContentEncoded] = FeedFieldContent, | |
179 [RSSTagGuid] = -1, | |
180 [RSSTagGuidPermalinkFalse] = FeedFieldId, | |
181 [RSSTagGuidPermalinkTrue] = FeedFieldId, /* special-case: both … | |
182 [RSSTagLink] = FeedFieldLink, | |
183 [RSSTagEnclosure] = FeedFieldEnclosure, | |
184 [RSSTagAuthor] = FeedFieldAuthor, | |
185 [RSSTagDccreator] = FeedFieldAuthor, | |
186 [RSSTagCategory] = FeedFieldCategory, | |
187 /* Atom */ | |
188 [AtomTagModified] = FeedFieldTime, | |
189 [AtomTagUpdated] = FeedFieldTime, | |
190 [AtomTagIssued] = FeedFieldTime, | |
191 [AtomTagPublished] = FeedFieldTime, | |
192 [AtomTagTitle] = FeedFieldTitle, | |
193 [AtomTagMediaDescription] = FeedFieldContent, | |
194 [AtomTagSummary] = FeedFieldContent, | |
195 [AtomTagContent] = FeedFieldContent, | |
196 [AtomTagId] = FeedFieldId, | |
197 [AtomTagLink] = -1, | |
198 [AtomTagLinkAlternate] = FeedFieldLink, | |
199 [AtomTagLinkEnclosure] = FeedFieldEnclosure, | |
200 [AtomTagAuthor] = -1, | |
201 [AtomTagAuthorName] = FeedFieldAuthor, | |
202 [AtomTagCategory] = FeedFieldCategory | |
203 }; | |
204 | |
205 static const int FieldSeparator = '\t'; | |
206 /* separator for multiple values in a field, separator should be 1 byte … | |
207 static const char FieldMultiSeparator[] = "|"; | |
208 static struct uri baseuri; | |
209 static const char *baseurl; | |
210 | |
211 static FeedContext ctx; | |
212 static XMLParser parser; /* XML parser state */ | |
213 static String attrispermalink, attrrel, attrtype, tmpstr; | |
214 | |
215 /* Unique tag(id) for parsed tag name. */ | |
216 static FeedTag * | |
217 gettag(enum FeedType feedtype, const char *name, size_t namelen) | |
218 { | |
219 FeedTag *r; | |
220 size_t i; | |
221 | |
222 switch (feedtype) { | |
223 case FeedTypeRSS: | |
224 for (i = 0; i < sizeof(rsstags) / sizeof(rsstags[0]); i+… | |
225 r = (FeedTag *)&rsstags[i]; | |
226 if (r->len == namelen && !strcasecmp(r->name, na… | |
227 return r; | |
228 } | |
229 break; | |
230 case FeedTypeAtom: | |
231 for (i = 0; i < sizeof(atomtags) / sizeof(atomtags[0]); … | |
232 r = (FeedTag *)&atomtags[i]; | |
233 if (r->len == namelen && !strcasecmp(r->name, na… | |
234 return r; | |
235 } | |
236 break; | |
237 default: | |
238 break; | |
239 } | |
240 | |
241 return NULL; | |
242 } | |
243 | |
/* Return a pointer to the first non-whitespace byte of `s` (possibly its
 * terminating NUL). */
static char *
ltrim(const char *s)
{
	while (ISSPACE((unsigned char)*s))
		s++;
	return (char *)s;
}
251 | |
/* Return a pointer just past the last non-whitespace byte of `s`; the string
 * itself is not modified. */
static char *
rtrim(const char *s)
{
	const char *end = s + strlen(s);

	while (end > s && ISSPACE((unsigned char)end[-1]))
		end--;
	return (char *)end;
}
261 | |
262 /* Clear string only; don't free, prevents unnecessary reallocation. */ | |
263 static void | |
264 string_clear(String *s) | |
265 { | |
266 if (s->data) | |
267 s->data[0] = '\0'; | |
268 s->len = 0; | |
269 } | |
270 | |
271 static void | |
272 string_buffer_realloc(String *s, size_t newlen) | |
273 { | |
274 size_t alloclen; | |
275 | |
276 if (newlen > SIZE_MAX / 2) { | |
277 alloclen = SIZE_MAX; | |
278 } else { | |
279 for (alloclen = 64; alloclen <= newlen; alloclen *= 2) | |
280 ; | |
281 } | |
282 if (!(s->data = realloc(s->data, alloclen))) | |
283 err(1, "realloc"); | |
284 s->bufsiz = alloclen; | |
285 } | |
286 | |
287 /* Append data to String, s->data and data may not overlap. */ | |
288 static void | |
289 string_append(String *s, const char *data, size_t len) | |
290 { | |
291 if (!len) | |
292 return; | |
293 | |
294 if (s->len >= SIZE_MAX - len) { | |
295 errno = ENOMEM; | |
296 err(1, "realloc"); | |
297 } | |
298 | |
299 /* check if allocation is necessary, never shrink the buffer. */ | |
300 if (s->len + len >= s->bufsiz) | |
301 string_buffer_realloc(s, s->len + len + 1); | |
302 memcpy(s->data + s->len, data, len); | |
303 s->len += len; | |
304 s->data[s->len] = '\0'; | |
305 } | |
306 | |
307 /* Print text, encode TABs, newlines and '\', remove other whitespace. | |
308 * Remove leading and trailing whitespace. */ | |
309 static void | |
310 string_print_encoded(String *s) | |
311 { | |
312 const char *p, *e; | |
313 | |
314 if (!s->data || !s->len) | |
315 return; | |
316 | |
317 p = ltrim(s->data); | |
318 e = rtrim(p); | |
319 | |
320 for (; *p && p != e; p++) { | |
321 switch (*p) { | |
322 case '\n': putchar('\\'); putchar('n'); break; | |
323 case '\\': putchar('\\'); putchar('\\'); break; | |
324 case '\t': putchar('\\'); putchar('t'); break; | |
325 default: | |
326 /* ignore control chars */ | |
327 if (!ISCNTRL((unsigned char)*p)) | |
328 putchar(*p); | |
329 break; | |
330 } | |
331 } | |
332 } | |
333 | |
/* Print `s` without leading and trailing whitespace. Inner whitespace of any
 * kind becomes a single space character each, other control characters are
 * dropped. */
static void
printtrimmed(const char *s)
{
	const char *cur = ltrim(s);
	const char *end = rtrim(cur);

	for (; *cur != '\0' && cur != end; cur++) {
		if (ISSPACE((unsigned char)*cur)) {
			putchar(' ');
			continue;
		}
		if (ISCNTRL((unsigned char)*cur))
			continue;
		putchar(*cur);
	}
}
349 | |
350 /* Print text, replace TABs, carriage return and other whitespace with '… | |
351 * Other control chars are removed. Remove leading and trailing whitespa… | |
352 static void | |
353 string_print_trimmed(String *s) | |
354 { | |
355 if (!s->data || !s->len) | |
356 return; | |
357 | |
358 printtrimmed(s->data); | |
359 } | |
360 | |
361 /* Print each field with trimmed whitespace, separated by '|'. */ | |
362 static void | |
363 string_print_trimmed_multi(String *s) | |
364 { | |
365 char *p, *e; | |
366 int c; | |
367 | |
368 if (!s->data || !s->len) | |
369 return; | |
370 | |
371 for (p = s->data; ; p = e + 1) { | |
372 if ((e = strstr(p, FieldMultiSeparator))) { | |
373 c = *e; | |
374 *e = '\0'; | |
375 printtrimmed(p); | |
376 *e = c; /* restore NUL byte to original characte… | |
377 fputs(FieldMultiSeparator, stdout); | |
378 } else { | |
379 printtrimmed(p); | |
380 break; | |
381 } | |
382 } | |
383 } | |
384 | |
385 /* Print URL, if it is a relative URL then it uses the global `baseurl`.… | |
386 static void | |
387 printuri(char *s) | |
388 { | |
389 char link[4096], *p, *e; | |
390 struct uri newuri, olduri; | |
391 int c, r = -1; | |
392 | |
393 p = ltrim(s); | |
394 e = rtrim(p); | |
395 c = *e; | |
396 *e = '\0'; | |
397 | |
398 if (baseurl && !uri_hasscheme(p) && | |
399 uri_parse(p, &olduri) != -1 && !olduri.proto[0] && | |
400 uri_makeabs(&newuri, &olduri, &baseuri) != -1 && newuri.prot… | |
401 r = uri_format(link, sizeof(link), &newuri); | |
402 | |
403 if (r >= 0 && (size_t)r < sizeof(link)) | |
404 printtrimmed(link); | |
405 else | |
406 printtrimmed(p); | |
407 | |
408 *e = c; /* restore NUL byte to original character */ | |
409 } | |
410 | |
411 /* Print URL, if it is a relative URL then it uses the global `baseurl`.… | |
412 static void | |
413 string_print_uri(String *s) | |
414 { | |
415 if (!s->data || !s->len) | |
416 return; | |
417 | |
418 printuri(s->data); | |
419 } | |
420 | |
421 /* Print as UNIX timestamp, print nothing if the time is empty or invali… | |
422 static void | |
423 string_print_timestamp(String *s) | |
424 { | |
425 long long t; | |
426 | |
427 if (!s->data || !s->len) | |
428 return; | |
429 | |
430 if (parsetime(s->data, &t) != -1) | |
431 printf("%lld", t); | |
432 } | |
433 | |
/* Convert broken-down UTC time fields to a signed (at least 64-bit) UNIX
 * timestamp. Parameters follow struct tm conventions: `year` is the year
 * minus 1900 and `mon` is 0-11. `mon` indexes a 12-entry table and is not
 * range-checked here. */
static long long
datetounix(long long year, int mon, int day, int hour, int min, int sec)
{
	/* seconds elapsed before the start of each month in a non-leap year */
	static const long month_start[] = {
		  0 * 86400,  31 * 86400,  59 * 86400,  90 * 86400,
		120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400,
		243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 };
	int leapyear = 0, quads, cents = 0, leapdays = 0, r;
	long long secs;

	if (year - 2ULL <= 136) {
		/* fast path for 1902 up to and including 2038: within this
		 * range every 4th year is a leap year. `leapdays` is counted
		 * relative to 1970 and excludes the leap day of `year`
		 * itself, which is added further below. */
		leapdays = (year - 68) >> 2;
		leapyear = !((year - 68) & 3);
		if (leapyear)
			leapdays--;
		secs = 31536000 * (year - 70) + (86400 * leapdays);
	} else {
		/* general rule, computed relative to the year 2000: a leap
		 * year every 4 years, except every 100 years, except every
		 * 400 years */
		quads = (year - 100) / 400;
		r = (year - 100) % 400;
		if (r < 0) {
			quads--;
			r += 400;
		}
		if (r == 0) {
			leapyear = 1;
		} else {
			cents = r / 100;
			r %= 100;
			if (r != 0) {
				leapdays = r / 4U;
				r %= 4U;
				leapyear = !r;
			}
		}
		leapdays += (97 * quads) + (24 * cents) - leapyear;

		/* offset for the 30 years and 8 leap days from 1970 up to
		 * and including 2000: ((30 * 365) + 8) * 86400 = 946771200 */
		secs = ((year - 100) * 31536000LL) + (leapdays * 86400LL) +
		       946771200LL;
	}

	secs += month_start[mon];
	if (leapyear && mon >= 2)
		secs += 86400; /* past February in a leap year */
	secs += 86400LL * (day - 1);
	secs += 3600LL * hour;
	secs += 60LL * min;
	secs += sec;

	return secs;
}
504 | |
/* Parse a timezone at the start of `s` (after optional whitespace) and return
 * its offset from UTC in seconds. Accepts a numeric offset ("+hhmm",
 * "-hh:mm") or one of the RFC 822 zone names listed below; anything else
 * yields 0.
 * NOTE: many other timezone names are ambiguous anyway. The ANSI and military
 * zones are defined wrong in RFC 822 and are unsupported, see the note on
 * RFC 2822 4.3 page 32. */
static long
gettzoffset(const char *s)
{
	static const struct {
		char *name;
		int offset; /* seconds relative to UTC */
	} zones[] = {
		{ "CDT", -5 * 3600 },
		{ "CST", -6 * 3600 },
		{ "EDT", -4 * 3600 },
		{ "EST", -5 * 3600 },
		{ "MDT", -6 * 3600 },
		{ "MST", -7 * 3600 },
		{ "PDT", -7 * 3600 },
		{ "PST", -8 * 3600 },
	};
	const char *p;
	long hours = 0, mins = 0;
	size_t i;

	while (ISSPACE((unsigned char)*s))
		s++;

	if (*s == '-' || *s == '+') {
		/* numeric offset: up to 2 digits for hours, optional ':',
		 * up to 2 digits for minutes */
		p = s + 1;
		for (i = 0; i < 2 && ISDIGIT((unsigned char)*p); i++, p++)
			hours = (hours * 10) + (*p - '0');
		if (*p == ':')
			p++;
		for (i = 0; i < 2 && ISDIGIT((unsigned char)*p); i++, p++)
			mins = (mins * 10) + (*p - '0');
		return ((hours * 3600) + (mins * 60)) * (*s == '-' ? -1 : 1);
	}

	/* zone name: must be exactly 3 letters */
	i = 0;
	while (ISALPHA((unsigned char)s[i]))
		i++;
	if (i != 3)
		return 0;

	for (i = 0; i < sizeof(zones) / sizeof(*zones); i++) {
		if (memcmp(s, zones[i].name, 3) == 0)
			return zones[i].offset;
	}

	return 0;
}
555 | |
556 /* Parse time string `s` into the UNIX timestamp `tp`. | |
557 * Returns 0 on success or -1 on failure. */ | |
558 static int | |
559 parsetime(const char *s, long long *tp) | |
560 { | |
561 static const struct { | |
562 char *name; | |
563 int len; | |
564 } mons[] = { | |
565 { STRP("January"), }, | |
566 { STRP("February"), }, | |
567 { STRP("March"), }, | |
568 { STRP("April"), }, | |
569 { STRP("May"), }, | |
570 { STRP("June"), }, | |
571 { STRP("July"), }, | |
572 { STRP("August"), }, | |
573 { STRP("September"), }, | |
574 { STRP("October"), }, | |
575 { STRP("November"), }, | |
576 { STRP("December"), }, | |
577 }; | |
578 int va[6] = { 0 }, i, j, v, vi; | |
579 size_t m; | |
580 | |
581 for (; ISSPACE((unsigned char)*s); s++) | |
582 ; | |
583 if (!ISDIGIT((unsigned char)*s) && !ISALPHA((unsigned char)*s)) | |
584 return -1; | |
585 | |
586 if (ISDIGIT((unsigned char)s[0]) && | |
587 ISDIGIT((unsigned char)s[1]) && | |
588 ISDIGIT((unsigned char)s[2]) && | |
589 ISDIGIT((unsigned char)s[3])) { | |
590 /* formats "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or "… | |
591 vi = 0; | |
592 } else { | |
593 /* format: "[%a, ]%d %b %Y %H:%M:%S" */ | |
594 /* parse "[%a, ]%d %b %Y " part, then use time parsing a… | |
595 for (; ISALPHA((unsigned char)*s); s++) | |
596 ; | |
597 for (; ISSPACE((unsigned char)*s); s++) | |
598 ; | |
599 if (*s == ',') | |
600 s++; | |
601 for (; ISSPACE((unsigned char)*s); s++) | |
602 ; | |
603 for (v = 0, i = 0; i < 2 && ISDIGIT((unsigned char)*s); … | |
604 v = (v * 10) + (*s - '0'); | |
605 va[2] = v; /* day */ | |
606 for (; ISSPACE((unsigned char)*s); s++) | |
607 ; | |
608 /* end of word month */ | |
609 for (j = 0; ISALPHA((unsigned char)s[j]); j++) | |
610 ; | |
611 /* check month name */ | |
612 if (j < 3 || j > 9) | |
613 return -1; /* month cannot match */ | |
614 for (m = 0; m < sizeof(mons) / sizeof(*mons); m++) { | |
615 /* abbreviation (3 length) or long name */ | |
616 if ((j == 3 || j == mons[m].len) && | |
617 !strncasecmp(mons[m].name, s, j)) { | |
618 va[1] = m + 1; | |
619 s += j; | |
620 break; | |
621 } | |
622 } | |
623 if (m >= 12) | |
624 return -1; /* no month found */ | |
625 for (; ISSPACE((unsigned char)*s); s++) | |
626 ; | |
627 for (v = 0, i = 0; i < 4 && ISDIGIT((unsigned char)*s); … | |
628 v = (v * 10) + (*s - '0'); | |
629 /* obsolete short year: RFC 2822 4.3 */ | |
630 if (i == 2 || i == 3) | |
631 v += (i == 2 && v >= 0 && v <= 49) ? 2000 : 1900; | |
632 va[0] = v; /* year */ | |
633 for (; ISSPACE((unsigned char)*s); s++) | |
634 ; | |
635 /* parse only regular time part, see below */ | |
636 vi = 3; | |
637 } | |
638 | |
639 /* parse time parts (and possibly remaining date parts) */ | |
640 for (; *s && vi < 6; vi++) { | |
641 for (i = 0, v = 0; i < ((vi == 0) ? 4 : 2) && | |
642 ISDIGIT((unsigned char)*s); s++, i++)… | |
643 v = (v * 10) + (*s - '0'); | |
644 } | |
645 va[vi] = v; | |
646 | |
647 if ((vi < 2 && (*s == '-' || *s == '/')) || | |
648 (vi == 2 && (*s == 'T' || *s == 't' || ISSPACE((unsi… | |
649 (vi > 2 && *s == ':')) | |
650 s++; | |
651 } | |
652 | |
653 /* skip milliseconds in for example: "%Y-%m-%dT%H:%M:%S.000Z" */ | |
654 if (*s == '.' || *s == ',') { | |
655 for (s++; ISDIGIT((unsigned char)*s); s++) | |
656 ; | |
657 } | |
658 | |
659 /* invalid range */ | |
660 if (va[0] < 0 || va[0] > 9999 || | |
661 va[1] < 1 || va[1] > 12 || | |
662 va[2] < 1 || va[2] > 31 || | |
663 va[3] < 0 || va[3] > 23 || | |
664 va[4] < 0 || va[4] > 59 || | |
665 va[5] < 0 || va[5] > 60) /* allow leap second */ | |
666 return -1; | |
667 | |
668 *tp = datetounix(va[0] - 1900, va[1] - 1, va[2], va[3], va[4], v… | |
669 gettzoffset(s); | |
670 | |
671 return 0; | |
672 } | |
673 | |
674 static void | |
675 printfields(void) | |
676 { | |
677 string_print_timestamp(&ctx.fields[FeedFieldTime].str); | |
678 putchar(FieldSeparator); | |
679 string_print_trimmed(&ctx.fields[FeedFieldTitle].str); | |
680 putchar(FieldSeparator); | |
681 string_print_uri(&ctx.fields[FeedFieldLink].str); | |
682 putchar(FieldSeparator); | |
683 string_print_encoded(&ctx.fields[FeedFieldContent].str); | |
684 putchar(FieldSeparator); | |
685 fputs(contenttypes[ctx.contenttype], stdout); | |
686 putchar(FieldSeparator); | |
687 string_print_trimmed(&ctx.fields[FeedFieldId].str); | |
688 putchar(FieldSeparator); | |
689 string_print_trimmed(&ctx.fields[FeedFieldAuthor].str); | |
690 putchar(FieldSeparator); | |
691 string_print_uri(&ctx.fields[FeedFieldEnclosure].str); | |
692 putchar(FieldSeparator); | |
693 string_print_trimmed_multi(&ctx.fields[FeedFieldCategory].str); | |
694 putchar('\n'); | |
695 | |
696 if (ferror(stdout)) /* check for errors but do not flush */ | |
697 checkfileerror(stdout, "<stdout>", 'w'); | |
698 } | |
699 | |
/* Return 1 when the two tag names have the same length and are equal
 * ignoring case, else 0. Both names must be NUL-terminated. */
static int
istag(const char *name, size_t len, const char *name2, size_t len2)
{
	if (len != len2)
		return 0;
	return strcasecmp(name, name2) == 0;
}
705 | |
/* Return 1 when the two attribute names (or values) have the same length and
 * are equal ignoring case, else 0. Both must be NUL-terminated. */
static int
isattr(const char *name, size_t len, const char *name2, size_t len2)
{
	return (len2 == len) ? (strcasecmp(name, name2) == 0) : 0;
}
711 | |
712 static void | |
713 xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl, | |
714 const char *v, size_t vl) | |
715 { | |
716 /* handles transforming inline XML to data */ | |
717 if (ISINCONTENT(ctx)) { | |
718 if (ctx.contenttype == ContentTypeHTML) | |
719 xmldata(p, v, vl); | |
720 return; | |
721 } | |
722 | |
723 if (!ctx.tag.id) | |
724 return; | |
725 | |
726 /* content-type may be for Atom: text, xhtml, html or a mime-typ… | |
727 * for MRSS (media:description): plain, html. */ | |
728 if (ISCONTENTTAG(ctx)) { | |
729 if (isattr(n, nl, STRP("type"))) | |
730 string_append(&attrtype, v, vl); | |
731 return; | |
732 } | |
733 | |
734 if (ctx.feedtype == FeedTypeRSS) { | |
735 if (ctx.tag.id == RSSTagEnclosure && | |
736 isattr(n, nl, STRP("url"))) { | |
737 string_append(&tmpstr, v, vl); | |
738 } else if (ctx.tag.id == RSSTagGuid && | |
739 isattr(n, nl, STRP("ispermalink"))) { | |
740 string_append(&attrispermalink, v, vl); | |
741 } | |
742 } else if (ctx.feedtype == FeedTypeAtom) { | |
743 if (ctx.tag.id == AtomTagLink) { | |
744 if (isattr(n, nl, STRP("rel"))) { | |
745 string_append(&attrrel, v, vl); | |
746 } else if (isattr(n, nl, STRP("href"))) { | |
747 string_append(&tmpstr, v, vl); | |
748 } | |
749 } else if (ctx.tag.id == AtomTagCategory && | |
750 isattr(n, nl, STRP("term"))) { | |
751 string_append(&tmpstr, v, vl); | |
752 } | |
753 } | |
754 } | |
755 | |
756 static void | |
757 xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, siz… | |
758 const char *data, size_t datalen) | |
759 { | |
760 char buf[8]; | |
761 int len; | |
762 | |
763 /* handles transforming inline XML to data */ | |
764 if (ISINCONTENT(ctx)) { | |
765 if (ctx.contenttype == ContentTypeHTML) | |
766 xmldata(p, data, datalen); | |
767 return; | |
768 } | |
769 | |
770 if (!ctx.tag.id) | |
771 return; | |
772 | |
773 /* try to translate entity, else just pass as data to | |
774 * xmlattr handler. */ | |
775 if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0) | |
776 xmlattr(p, t, tl, n, nl, buf, (size_t)len); | |
777 else | |
778 xmlattr(p, t, tl, n, nl, data, datalen); | |
779 } | |
780 | |
781 static void | |
782 xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n, size_t… | |
783 { | |
784 if (ISINCONTENT(ctx)) { | |
785 if (ctx.contenttype == ContentTypeHTML) { | |
786 /* handles transforming inline XML to data */ | |
787 xmldata(p, "\"", 1); | |
788 ctx.attrcount = 0; | |
789 } | |
790 return; | |
791 } | |
792 } | |
793 | |
794 static void | |
795 xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, size… | |
796 { | |
797 if (ISINCONTENT(ctx)) { | |
798 if (ctx.contenttype == ContentTypeHTML) { | |
799 /* handles transforming inline XML to data */ | |
800 if (!ctx.attrcount) | |
801 xmldata(p, " ", 1); | |
802 ctx.attrcount++; | |
803 xmldata(p, n, nl); | |
804 xmldata(p, "=\"", 2); | |
805 } | |
806 return; | |
807 } | |
808 | |
809 if (attrispermalink.len && isattr(n, nl, STRP("ispermalink"))) | |
810 string_clear(&attrispermalink); | |
811 else if (attrrel.len && isattr(n, nl, STRP("rel"))) | |
812 string_clear(&attrrel); | |
813 else if (attrtype.len && isattr(n, nl, STRP("type"))) | |
814 string_clear(&attrtype); | |
815 else if (tmpstr.len && | |
816 (isattr(n, nl, STRP("href")) || | |
817 isattr(n, nl, STRP("term")) || | |
818 isattr(n, nl, STRP("url")))) | |
819 string_clear(&tmpstr); /* use the last value for multipl… | |
820 } | |
821 | |
822 static void | |
823 xmldata(XMLParser *p, const char *s, size_t len) | |
824 { | |
825 if (!ctx.field) | |
826 return; | |
827 | |
828 if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) | |
829 string_append(&tmpstr, s, len); | |
830 else | |
831 string_append(ctx.field, s, len); | |
832 } | |
833 | |
834 static void | |
835 xmldataentity(XMLParser *p, const char *data, size_t datalen) | |
836 { | |
837 char buf[8]; | |
838 int len; | |
839 | |
840 if (!ctx.field) | |
841 return; | |
842 | |
843 /* try to translate entity, else just pass as data to | |
844 * xmldata handler. */ | |
845 if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0) | |
846 xmldata(p, buf, (size_t)len); | |
847 else | |
848 xmldata(p, data, datalen); | |
849 } | |
850 | |
851 static void | |
852 xmltagstart(XMLParser *p, const char *t, size_t tl) | |
853 { | |
854 const FeedTag *f; | |
855 | |
856 if (ISINCONTENT(ctx)) { | |
857 if (ctx.contenttype == ContentTypeHTML) { | |
858 ctx.attrcount = 0; | |
859 xmldata(p, "<", 1); | |
860 xmldata(p, t, tl); | |
861 } | |
862 return; | |
863 } | |
864 | |
865 /* start of RSS or Atom item / entry */ | |
866 if (ctx.feedtype == FeedTypeNone) { | |
867 if (istag(t, tl, STRP("entry"))) | |
868 ctx.feedtype = FeedTypeAtom; | |
869 else if (istag(t, tl, STRP("item"))) | |
870 ctx.feedtype = FeedTypeRSS; | |
871 return; | |
872 } | |
873 | |
874 /* field tagid already set or nested tags. */ | |
875 if (ctx.tag.id) { | |
876 /* nested <author><name> for Atom */ | |
877 if (ctx.tag.id == AtomTagAuthor && | |
878 istag(t, tl, STRP("name"))) { | |
879 memcpy(&(ctx.tag), &atomtagauthorname, sizeof(ct… | |
880 } else { | |
881 return; /* other nested tags are not allowed: re… | |
882 } | |
883 } | |
884 | |
885 /* in item */ | |
886 if (ctx.tag.id == TagUnknown) { | |
887 if (!(f = gettag(ctx.feedtype, t, tl))) | |
888 f = ¬ag; | |
889 memcpy(&(ctx.tag), f, sizeof(ctx.tag)); | |
890 } | |
891 | |
892 ctx.iscontenttag = (fieldmap[ctx.tag.id] == FeedFieldContent); | |
893 string_clear(&attrispermalink); | |
894 string_clear(&attrrel); | |
895 string_clear(&attrtype); | |
896 } | |
897 | |
/* XML callback, registered as parser.xmltagstartparsed in main(); presumably
 * invoked once a start tag and its attributes were read (TODO confirm against
 * xml.h). `isshort` selects between writing "/>" and ">" below.
 *
 * Steps, in order:
 *  1. Inside HTML content: close the tag that was written back as content
 *     data and return.
 *  2. Resolve tags whose meaning depends on an attribute: an RSS guid becomes
 *     RSSTagGuidPermalinkTrue / RSSTagGuidPermalinkFalse depending on the
 *     collected `attrispermalink`, an Atom link becomes AtomTagLinkAlternate /
 *     AtomTagLinkEnclosure depending on `attrrel` (else it stays AtomTagLink).
 *  3. Return when the tag maps to no field (fieldmap is -1) or, for a
 *     single-value field, when its id is not higher than the id of the tag
 *     that already set the field.
 *  4. For a content tag: switch from "on the content tag" to "in content"
 *     and choose ctx.contenttype from the collected `attrtype`, or from a
 *     default based on the tag when there is no type attribute.
 *  5. Make the mapped field the current one (ctx.field), record the tag id
 *     and clear the old value unless the field is a multi-value field.
 *
 * NOTE(review): several lines of this function are cut off in this copy
 * (marked with an ellipsis), so the exact attribute values compared in steps
 * 2 and 4 cannot be read here. */
898 static void | |
899 xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) | |
900 { | |
901 enum TagId tagid; | |
902 | |
903 if (ISINCONTENT(ctx)) { | |
904 if (ctx.contenttype == ContentTypeHTML) { | |
905 if (isshort) | |
906 xmldata(p, "/>", 2); | |
907 else | |
908 xmldata(p, ">", 1); | |
909 } | |
910 return; | |
911 } | |
912 | |
913 /* set tag type based on its attribute value */ | |
914 if (ctx.tag.id == RSSTagGuid) { | |
915 /* if empty the default is "true" */ | |
916 if (!attrispermalink.len || | |
917 isattr(attrispermalink.data, attrispermalink.len, ST… | |
918 ctx.tag.id = RSSTagGuidPermalinkTrue; | |
919 else | |
920 ctx.tag.id = RSSTagGuidPermalinkFalse; | |
921 } else if (ctx.tag.id == AtomTagLink) { | |
922 /* empty or "alternate": other types could be | |
923 * "enclosure", "related", "self" or "via" */ | |
924 if (!attrrel.len || isattr(attrrel.data, attrrel.len, ST… | |
925 ctx.tag.id = AtomTagLinkAlternate; | |
926 else if (isattr(attrrel.data, attrrel.len, STRP("enclosu… | |
927 ctx.tag.id = AtomTagLinkEnclosure; | |
928 else | |
929 ctx.tag.id = AtomTagLink; /* unknown */ | |
930 } | |
931 | |
932 tagid = ctx.tag.id; | |
933 | |
934 /* map tag type to field: unknown or lesser priority is ignored, | |
935 * when tags of the same type are repeated only the first is use… | |
936 if (fieldmap[tagid] == -1 || | |
937 (!ISFEEDFIELDMULTI(fieldmap[tagid]) && | |
938 tagid <= ctx.fields[fieldmap[tagid]].tagid)) { | |
939 return; | |
940 } | |
941 | |
942 if (ctx.iscontenttag) { | |
943 ctx.iscontent = 1; | |
944 ctx.iscontenttag = 0; | |
945 | |
946 /* detect content-type based on type attribute */ | |
947 if (attrtype.len) { | |
948 if (isattr(attrtype.data, attrtype.len, STRP("ht… | |
949 isattr(attrtype.data, attrtype.len, STRP("xh… | |
950 isattr(attrtype.data, attrtype.len, STRP("te… | |
951 isattr(attrtype.data, attrtype.len, STRP("te… | |
952 isattr(attrtype.data, attrtype.len, STRP("ap… | |
953 ctx.contenttype = ContentTypeHTML; | |
954 else /* unknown: handle as base64 text data */ | |
955 ctx.contenttype = ContentTypePlain; | |
956 } else { | |
957 /* default content-type */ | |
958 if (tagid == RSSTagContentEncoded || tagid == RS… | |
959 ctx.contenttype = ContentTypeHTML; | |
960 else | |
961 ctx.contenttype = ContentTypePlain; | |
962 } | |
963 } | |
964 | |
965 ctx.field = &(ctx.fields[fieldmap[tagid]].str); | |
966 ctx.fields[fieldmap[tagid]].tagid = tagid; | |
967 | |
968 /* clear field if it is overwritten (with a priority order) for … | |
969 * value, if the field can have multiple values then do not clea… | |
970 if (!ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) | |
971 string_clear(ctx.field); | |
972 } | |
973 | |
974 static void | |
975 xmltagend(XMLParser *p, const char *t, size_t tl, int isshort) | |
976 { | |
977 size_t i; | |
978 | |
979 if (ctx.feedtype == FeedTypeNone) | |
980 return; | |
981 | |
982 if (ISINCONTENT(ctx)) { | |
983 /* not a closed content field */ | |
984 if (!istag(ctx.tag.name, ctx.tag.len, t, tl)) { | |
985 if (!isshort && ctx.contenttype == ContentTypeHT… | |
986 xmldata(p, "</", 2); | |
987 xmldata(p, t, tl); | |
988 xmldata(p, ">", 1); | |
989 } | |
990 return; | |
991 } | |
992 } else if (ctx.tag.id && istag(ctx.tag.name, ctx.tag.len, t, tl)… | |
993 /* matched tag end: close it. | |
994 * copy also to the link field if the attribute isPermaL… | |
995 * and it is not set by a tag with higher priority. */ | |
996 if (ctx.tag.id == RSSTagGuidPermalinkTrue && ctx.field && | |
997 ctx.tag.id > ctx.fields[FeedFieldLink].tagid) { | |
998 string_clear(&ctx.fields[FeedFieldLink].str); | |
999 string_append(&ctx.fields[FeedFieldLink].str, | |
1000 ctx.field->data, ctx.field->len); | |
1001 ctx.fields[FeedFieldLink].tagid = ctx.tag.id; | |
1002 } | |
1003 } else if (!ctx.tag.id && ((ctx.feedtype == FeedTypeAtom && | |
1004 istag(t, tl, STRP("entry"))) || /* Atom */ | |
1005 (ctx.feedtype == FeedTypeRSS && | |
1006 istag(t, tl, STRP("item"))))) /* RSS */ | |
1007 { | |
1008 /* end of RSS or Atom entry / item */ | |
1009 printfields(); | |
1010 | |
1011 /* clear strings */ | |
1012 for (i = 0; i < FeedFieldLast; i++) { | |
1013 string_clear(&ctx.fields[i].str); | |
1014 ctx.fields[i].tagid = TagUnknown; | |
1015 } | |
1016 ctx.contenttype = ContentTypeNone; | |
1017 /* allow parsing of Atom and RSS concatenated in one XML… | |
1018 ctx.feedtype = FeedTypeNone; | |
1019 } else { | |
1020 return; /* not end of field */ | |
1021 } | |
1022 | |
1023 /* temporary string: for fields that cannot be processed | |
1024 * directly and need more context, for example by its tag | |
1025 * attributes, like the Atom link rel="alternate|enclosure". */ | |
1026 if (tmpstr.len && ctx.field) { | |
1027 if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) { | |
1028 if (ctx.field->len) | |
1029 string_append(ctx.field, FieldMultiSepar… | |
1030 string_append(ctx.field, tmpstr.data, tmpstr.len… | |
1031 } else { | |
1032 string_clear(ctx.field); | |
1033 string_append(ctx.field, tmpstr.data, tmpstr.len… | |
1034 } | |
1035 } | |
1036 | |
1037 /* close field */ | |
1038 string_clear(&tmpstr); /* reuse and clear temporary string */ | |
1039 | |
1040 if (ctx.tag.id == AtomTagAuthorName) | |
1041 memcpy(&(ctx.tag), &atomtagauthor, sizeof(ctx.tag)); /* … | |
1042 else | |
1043 memcpy(&(ctx.tag), ¬ag, sizeof(ctx.tag)); | |
1044 | |
1045 ctx.iscontent = 0; | |
1046 ctx.field = NULL; | |
1047 } | |
1048 | |
1049 int | |
1050 main(int argc, char *argv[]) | |
1051 { | |
1052 if (pledge("stdio", NULL) == -1) | |
1053 err(1, "pledge"); | |
1054 | |
1055 if (argc > 1) { | |
1056 if (uri_parse(argv[1], &baseuri) != -1 && baseuri.proto[… | |
1057 baseurl = argv[1]; | |
1058 else | |
1059 errx(1, "baseurl incorrect or too long"); | |
1060 } | |
1061 | |
1062 memcpy(&(ctx.tag), ¬ag, sizeof(ctx.tag)); | |
1063 | |
1064 parser.xmlattr = xmlattr; | |
1065 parser.xmlattrentity = xmlattrentity; | |
1066 parser.xmlattrend = xmlattrend; | |
1067 parser.xmlattrstart = xmlattrstart; | |
1068 parser.xmlcdata = xmldata; | |
1069 parser.xmldata = xmldata; | |
1070 parser.xmldataentity = xmldataentity; | |
1071 parser.xmltagend = xmltagend; | |
1072 parser.xmltagstart = xmltagstart; | |
1073 parser.xmltagstartparsed = xmltagstartparsed; | |
1074 | |
1075 /* NOTE: GETNEXT is defined in xml.h for inline optimization */ | |
1076 xml_parse(&parser); | |
1077 | |
1078 checkfileerror(stdin, "<stdin>", 'r'); | |
1079 checkfileerror(stdout, "<stdout>", 'w'); | |
1080 | |
1081 return 0; | |
1082 } |