Introduction
Introduction Statistics Contact Development Disclaimer Help
sfeed.c - sfeed - RSS and Atom parser
git clone git://git.codemadness.org/sfeed
Log
Files
Refs
README
LICENSE
---
sfeed.c (30076B)
---
1 #include <errno.h>
2 #include <stdint.h>
3 #include <stdio.h>
4 #include <stdlib.h>
5 #include <string.h>
6 #include <strings.h>
7
8 #include "util.h"
9 #include "xml.h"
10
11 #define ISINCONTENT(ctx) ((ctx).iscontent && !((ctx).iscontenttag))
12 #define ISCONTENTTAG(ctx) (!((ctx).iscontent) && (ctx).iscontenttag)
13
14 /* these feed fields support multiple separated values */
15 #define ISFEEDFIELDMULTI(t) ((t) == FeedFieldCategory)
16
17 /* string and byte-length */
18 #define STRP(s) s,sizeof(s)-1
19
20 enum FeedType {
21 FeedTypeNone = 0,
22 FeedTypeRSS = 1,
23 FeedTypeAtom = 2
24 };
25
26 enum ContentType {
27 ContentTypeNone = 0,
28 ContentTypePlain = 1,
29 ContentTypeHTML = 2
30 };
31 static const char *contenttypes[] = { "", "plain", "html" };
32
33 /* String data / memory pool */
34 typedef struct string {
35 char *data; /* data */
36 size_t len; /* string length */
37 size_t bufsiz; /* allocated size */
38 } String;
39
40 /* NOTE: the order of these fields (content, date, author) indicate the
41 * priority to use them, from least important to high. */
42 enum TagId {
43 TagUnknown = 0,
44 /* RSS */
45 RSSTagDcdate, RSSTagPubdate, /* creation date has higher priorit…
46 RSSTagTitle,
47 RSSTagMediaDescription, RSSTagDescription, RSSTagContentEncoded,
48 RSSTagGuid,
49 RSSTagGuidPermalinkFalse,
50 RSSTagGuidPermalinkTrue,
51 /* must be defined after GUID, because it can be a link (isPerma…
52 RSSTagLink,
53 RSSTagEnclosure,
54 RSSTagAuthor, RSSTagDccreator,
55 RSSTagCategory,
56 /* Atom */
57 /* creation date has higher priority */
58 AtomTagModified, AtomTagUpdated, AtomTagIssued, AtomTagPublished,
59 AtomTagTitle,
60 AtomTagMediaDescription, AtomTagSummary, AtomTagContent,
61 AtomTagId,
62 AtomTagLink,
63 AtomTagLinkAlternate,
64 AtomTagLinkEnclosure,
65 AtomTagAuthor, AtomTagAuthorName,
66 AtomTagCategory,
67 TagLast
68 };
69
70 typedef struct feedtag {
71 char *name; /* name of tag to match */
72 size_t len; /* len of `name` */
73 enum TagId id; /* unique ID */
74 } FeedTag;
75
76 typedef struct field {
77 String str;
78 enum TagId tagid; /* tagid set previously, used for tag priority…
79 } FeedField;
80
81 enum {
82 FeedFieldTime = 0, FeedFieldTitle, FeedFieldLink, FeedFieldConte…
83 FeedFieldId, FeedFieldAuthor, FeedFieldEnclosure, FeedFieldCateg…
84 FeedFieldLast
85 };
86
87 typedef struct feedcontext {
88 String *field; /* current FeedItem field String …
89 FeedField fields[FeedFieldLast]; /* data for current item…
90 FeedTag tag; /* unique current parsed tag */
91 int iscontent; /* in content data */
92 int iscontenttag; /* in content tag */
93 enum ContentType contenttype; /* content-type for item */
94 enum FeedType feedtype;
95 int attrcount; /* count item HTML element attrib…
96 } FeedContext;
97
98 static long long datetounix(long long, int, int, int, int, int);
99 static FeedTag * gettag(enum FeedType, const char *, size_t);
100 static long gettzoffset(const char *);
101 static int isattr(const char *, size_t, const char *, size_t);
102 static int istag(const char *, size_t, const char *, size_t);
103 static int parsetime(const char *, long long *);
104 static void printfields(void);
105 static void string_append(String *, const char *, size_t);
106 static void string_buffer_realloc(String *, size_t);
107 static void string_clear(String *);
108 static void string_print_encoded(String *);
109 static void string_print_timestamp(String *);
110 static void string_print_trimmed(String *);
111 static void string_print_trimmed_multi(String *);
112 static void string_print_uri(String *);
113 static void xmlattr(XMLParser *, const char *, size_t, const char *, siz…
114 const char *, size_t);
115 static void xmlattrentity(XMLParser *, const char *, size_t, const char …
116 size_t, const char *, size_t);
117 static void xmlattrend(XMLParser *, const char *, size_t, const char *,
118 size_t);
119 static void xmlattrstart(XMLParser *, const char *, size_t, const char *,
120 size_t);
121 static void xmldata(XMLParser *, const char *, size_t);
122 static void xmldataentity(XMLParser *, const char *, size_t);
123 static void xmltagend(XMLParser *, const char *, size_t, int);
124 static void xmltagstart(XMLParser *, const char *, size_t);
125 static void xmltagstartparsed(XMLParser *, const char *, size_t, int);
126
127 /* map tag name to TagId type */
128 /* RSS, keep this in alphabetical order */
129 static const FeedTag rsstags[] = {
130 { STRP("author"), RSSTagAuthor },
131 { STRP("category"), RSSTagCategory },
132 { STRP("content:encoded"), RSSTagContentEncoded },
133 { STRP("dc:creator"), RSSTagDccreator },
134 { STRP("dc:date"), RSSTagDcdate },
135 { STRP("description"), RSSTagDescription },
136 /* RSS: <enclosure url="" />, Atom has <link rel="enclosure" /> …
137 { STRP("enclosure"), RSSTagEnclosure },
138 { STRP("guid"), RSSTagGuid },
139 { STRP("link"), RSSTagLink },
140 { STRP("media:description"), RSSTagMediaDescription },
141 { STRP("pubdate"), RSSTagPubdate },
142 { STRP("title"), RSSTagTitle }
143 };
144
145 /* Atom, keep this in alphabetical order */
146 static const FeedTag atomtags[] = {
147 { STRP("author"), AtomTagAuthor },
148 { STRP("category"), AtomTagCategory },
149 { STRP("content"), AtomTagContent },
150 { STRP("id"), AtomTagId },
151 { STRP("issued"), AtomTagIssued }, /* Atom …
152 /* Atom: <link href="" />, RSS has <link></link> */
153 { STRP("link"), AtomTagLink },
154 { STRP("media:description"), AtomTagMediaDescription },
155 { STRP("modified"), AtomTagModified }, /* Atom …
156 { STRP("published"), AtomTagPublished },
157 { STRP("summary"), AtomTagSummary },
158 { STRP("title"), AtomTagTitle },
159 { STRP("updated"), AtomTagUpdated }
160 };
161
162 /* special case: nested <author><name> */
163 static const FeedTag atomtagauthor = { STRP("author"), AtomTagAuthor };
164 static const FeedTag atomtagauthorname = { STRP("name"), AtomTagAuthorNa…
165
166 /* reference to no / unknown tag */
167 static const FeedTag notag = { STRP(""), TagUnknown };
168
169 /* map TagId type to RSS/Atom field, all tags must be defined */
170 static const int fieldmap[TagLast] = {
171 [TagUnknown] = -1,
172 /* RSS */
173 [RSSTagDcdate] = FeedFieldTime,
174 [RSSTagPubdate] = FeedFieldTime,
175 [RSSTagTitle] = FeedFieldTitle,
176 [RSSTagMediaDescription] = FeedFieldContent,
177 [RSSTagDescription] = FeedFieldContent,
178 [RSSTagContentEncoded] = FeedFieldContent,
179 [RSSTagGuid] = -1,
180 [RSSTagGuidPermalinkFalse] = FeedFieldId,
181 [RSSTagGuidPermalinkTrue] = FeedFieldId, /* special-case: both …
182 [RSSTagLink] = FeedFieldLink,
183 [RSSTagEnclosure] = FeedFieldEnclosure,
184 [RSSTagAuthor] = FeedFieldAuthor,
185 [RSSTagDccreator] = FeedFieldAuthor,
186 [RSSTagCategory] = FeedFieldCategory,
187 /* Atom */
188 [AtomTagModified] = FeedFieldTime,
189 [AtomTagUpdated] = FeedFieldTime,
190 [AtomTagIssued] = FeedFieldTime,
191 [AtomTagPublished] = FeedFieldTime,
192 [AtomTagTitle] = FeedFieldTitle,
193 [AtomTagMediaDescription] = FeedFieldContent,
194 [AtomTagSummary] = FeedFieldContent,
195 [AtomTagContent] = FeedFieldContent,
196 [AtomTagId] = FeedFieldId,
197 [AtomTagLink] = -1,
198 [AtomTagLinkAlternate] = FeedFieldLink,
199 [AtomTagLinkEnclosure] = FeedFieldEnclosure,
200 [AtomTagAuthor] = -1,
201 [AtomTagAuthorName] = FeedFieldAuthor,
202 [AtomTagCategory] = FeedFieldCategory
203 };
204
205 static const int FieldSeparator = '\t';
206 /* separator for multiple values in a field, separator should be 1 byte …
207 static const char FieldMultiSeparator[] = "|";
208 static struct uri baseuri;
209 static const char *baseurl;
210
211 static FeedContext ctx;
212 static XMLParser parser; /* XML parser state */
213 static String attrispermalink, attrrel, attrtype, tmpstr;
214
215 /* Unique tag(id) for parsed tag name. */
216 static FeedTag *
217 gettag(enum FeedType feedtype, const char *name, size_t namelen)
218 {
219 FeedTag *r;
220 size_t i;
221
222 switch (feedtype) {
223 case FeedTypeRSS:
224 for (i = 0; i < sizeof(rsstags) / sizeof(rsstags[0]); i+…
225 r = (FeedTag *)&rsstags[i];
226 if (r->len == namelen && !strcasecmp(r->name, na…
227 return r;
228 }
229 break;
230 case FeedTypeAtom:
231 for (i = 0; i < sizeof(atomtags) / sizeof(atomtags[0]); …
232 r = (FeedTag *)&atomtags[i];
233 if (r->len == namelen && !strcasecmp(r->name, na…
234 return r;
235 }
236 break;
237 default:
238 break;
239 }
240
241 return NULL;
242 }
243
/* Skip leading whitespace: returns a pointer to the first byte of `s` for
 * which ISSPACE() is false (this may be the terminating NUL byte). */
static char *
ltrim(const char *s)
{
	while (ISSPACE((unsigned char)*s))
		s++;

	return (char *)s;
}
251
/* Find the end of `s` without trailing whitespace: returns a pointer just
 * past the last byte for which ISSPACE() is false, or `s` itself when the
 * string is empty or consists only of whitespace. The string is not
 * modified. */
static char *
rtrim(const char *s)
{
	const char *e = s + strlen(s);

	while (e > s && ISSPACE((unsigned char)e[-1]))
		e--;

	return (char *)e;
}
261
262 /* Clear string only; don't free, prevents unnecessary reallocation. */
263 static void
264 string_clear(String *s)
265 {
266 if (s->data)
267 s->data[0] = '\0';
268 s->len = 0;
269 }
270
271 static void
272 string_buffer_realloc(String *s, size_t newlen)
273 {
274 size_t alloclen;
275
276 if (newlen > SIZE_MAX / 2) {
277 alloclen = SIZE_MAX;
278 } else {
279 for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
280 ;
281 }
282 if (!(s->data = realloc(s->data, alloclen)))
283 err(1, "realloc");
284 s->bufsiz = alloclen;
285 }
286
287 /* Append data to String, s->data and data may not overlap. */
288 static void
289 string_append(String *s, const char *data, size_t len)
290 {
291 if (!len)
292 return;
293
294 if (s->len >= SIZE_MAX - len) {
295 errno = ENOMEM;
296 err(1, "realloc");
297 }
298
299 /* check if allocation is necessary, never shrink the buffer. */
300 if (s->len + len >= s->bufsiz)
301 string_buffer_realloc(s, s->len + len + 1);
302 memcpy(s->data + s->len, data, len);
303 s->len += len;
304 s->data[s->len] = '\0';
305 }
306
307 /* Print text, encode TABs, newlines and '\', remove other whitespace.
308 * Remove leading and trailing whitespace. */
309 static void
310 string_print_encoded(String *s)
311 {
312 const char *p, *e;
313
314 if (!s->data || !s->len)
315 return;
316
317 p = ltrim(s->data);
318 e = rtrim(p);
319
320 for (; *p && p != e; p++) {
321 switch (*p) {
322 case '\n': putchar('\\'); putchar('n'); break;
323 case '\\': putchar('\\'); putchar('\\'); break;
324 case '\t': putchar('\\'); putchar('t'); break;
325 default:
326 /* ignore control chars */
327 if (!ISCNTRL((unsigned char)*p))
328 putchar(*p);
329 break;
330 }
331 }
332 }
333
/* Print `s` to stdout without leading and trailing whitespace. Inside the
 * text every whitespace byte is printed as a single space and every other
 * byte for which ISCNTRL() is true is dropped. */
static void
printtrimmed(const char *s)
{
	const char *cur, *end;

	cur = ltrim(s);
	end = rtrim(cur);

	while (*cur && cur != end) {
		if (ISSPACE((unsigned char)*cur))
			putchar(' ');
		else if (!ISCNTRL((unsigned char)*cur))
			putchar(*cur);
		cur++;
	}
}
349
350 /* Print text, replace TABs, carriage return and other whitespace with '…
351 * Other control chars are removed. Remove leading and trailing whitespa…
352 static void
353 string_print_trimmed(String *s)
354 {
355 if (!s->data || !s->len)
356 return;
357
358 printtrimmed(s->data);
359 }
360
361 /* Print each field with trimmed whitespace, separated by '|'. */
362 static void
363 string_print_trimmed_multi(String *s)
364 {
365 char *p, *e;
366 int c;
367
368 if (!s->data || !s->len)
369 return;
370
371 for (p = s->data; ; p = e + 1) {
372 if ((e = strstr(p, FieldMultiSeparator))) {
373 c = *e;
374 *e = '\0';
375 printtrimmed(p);
376 *e = c; /* restore NUL byte to original characte…
377 fputs(FieldMultiSeparator, stdout);
378 } else {
379 printtrimmed(p);
380 break;
381 }
382 }
383 }
384
385 /* Print URL, if it is a relative URL then it uses the global `baseurl`.…
386 static void
387 printuri(char *s)
388 {
389 char link[4096], *p, *e;
390 struct uri newuri, olduri;
391 int c, r = -1;
392
393 p = ltrim(s);
394 e = rtrim(p);
395 c = *e;
396 *e = '\0';
397
398 if (baseurl && !uri_hasscheme(p) &&
399 uri_parse(p, &olduri) != -1 && !olduri.proto[0] &&
400 uri_makeabs(&newuri, &olduri, &baseuri) != -1 && newuri.prot…
401 r = uri_format(link, sizeof(link), &newuri);
402
403 if (r >= 0 && (size_t)r < sizeof(link))
404 printtrimmed(link);
405 else
406 printtrimmed(p);
407
408 *e = c; /* restore NUL byte to original character */
409 }
410
411 /* Print URL, if it is a relative URL then it uses the global `baseurl`.…
412 static void
413 string_print_uri(String *s)
414 {
415 if (!s->data || !s->len)
416 return;
417
418 printuri(s->data);
419 }
420
421 /* Print as UNIX timestamp, print nothing if the time is empty or invali…
422 static void
423 string_print_timestamp(String *s)
424 {
425 long long t;
426
427 if (!s->data || !s->len)
428 return;
429
430 if (parsetime(s->data, &t) != -1)
431 printf("%lld", t);
432 }
433
/* Convert time fields to a signed (at least) 64-bit UNIX timestamp.
 * Parameters should be passed as they are in a struct tm:
 * that is: year = year - 1900, month = month - 1; `day` is the day of the
 * month starting at 1.
 * `mon` is used unchecked as a table index (0-11), the caller must validate
 * the fields beforehand.
 * Returns the seconds since 1970-01-01 00:00:00, negative for earlier
 * dates. No timezone adjustment is done here. */
static long long
datetounix(long long year, int mon, int day, int hour, int min, int sec)
{
	/* seconds elapsed before the start of each month in a regular
	 * (non-leap) year */
	static const long secs_through_month[] = {
		0, 31 * 86400, 59 * 86400, 90 * 86400,
		120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400,
		243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 };
	int is_leap = 0, cycles, centuries = 0, leaps = 0, rem;
	long long t;

	/* optimization: handle common range year 1902 up to and including
	 * 2038. The comparison is unsigned, so values below 2 wrap around to
	 * a huge number and take the general path. */
	if (year - 2ULL <= 136) {
		/* amount of leap days relative to 1970: every 4 years.
		 * This is floor((year - 68) / 4), written as (year / 4) - 17
		 * because 68 = 4 * 17 and year >= 2 here: no negative value
		 * is shifted or divided, which keeps the result portable. */
		leaps = (int)(year / 4) - 17;
		if (!(year % 4)) {
			/* the leap day of this year is added per month below */
			leaps--;
			is_leap = 1;
		}
		t = 31536000 * (year - 70) + (86400 * leaps); /* 365 * 86400 = 31536000 */
	} else {
		/* general leap year calculation:
		 * leap years occur mostly every 4 years but every 100 years
		 * a leap year is skipped unless the year is divisible by 400.
		 * Years are counted relative to 2000 (year - 100). */
		cycles = (year - 100) / 400;
		rem = (year - 100) % 400;
		if (rem < 0) {
			/* round towards negative infinity */
			cycles--;
			rem += 400;
		}
		if (!rem) {
			is_leap = 1;
		} else {
			if (rem >= 300) {
				centuries = 3;
				rem -= 300;
			} else if (rem >= 200) {
				centuries = 2;
				rem -= 200;
			} else if (rem >= 100) {
				centuries = 1;
				rem -= 100;
			}
			if (rem) {
				leaps = rem / 4U;
				rem %= 4U;
				is_leap = !rem;
			}
		}
		/* 97 leap days per 400 years, 24 per remaining century */
		leaps += (97 * cycles) + (24 * centuries) - is_leap;

		/* adjust 8 leap days from 1970 up to and including 2000:
		 * ((30 * 365) + 8) * 86400 = 946771200 */
		t = ((year - 100) * 31536000LL) + (leaps * 86400LL) + 946771200LL;
	}
	t += secs_through_month[mon];
	if (is_leap && mon >= 2)
		t += 86400; /* leap day has passed: March or later */
	t += 86400LL * (day - 1);
	t += 3600LL * hour;
	t += 60LL * min;
	t += sec;

	return t;
}
504
/* Get timezone from string, return time offset in seconds from UTC.
 * Leading whitespace is skipped. Two forms are handled:
 * - a numeric offset: '+' or '-', up to 2 hour digits, an optional ':' and
 *   up to 2 minute digits, for example "+0100" or "-05:30".
 * - a 3-letter timezone name from the table below (matched exactly,
 *   case-sensitive).
 * Anything else returns 0.
 * NOTE: only parses timezones in RFC 822, many other timezone names are
 * ambiguous anyway.
 * ANSI and military zones are defined wrong in RFC 822 and are unsupported,
 * see note on RFC 2822 4.3 page 32. */
static long
gettzoffset(const char *s)
{
	static const struct {
		const char *name;
		int offset; /* offset from UTC in seconds */
	} tzones[] = {
		{ "CDT", -5 * 3600 },
		{ "CST", -6 * 3600 },
		{ "EDT", -4 * 3600 },
		{ "EST", -5 * 3600 },
		{ "MDT", -6 * 3600 },
		{ "MST", -7 * 3600 },
		{ "PDT", -7 * 3600 },
		{ "PST", -8 * 3600 },
	};
	const char *p;
	long tzhour = 0, tzmin = 0;
	size_t i;

	for (; ISSPACE((unsigned char)*s); s++)
		;
	switch (*s) {
	case '-': /* offset */
	case '+':
		for (i = 0, p = s + 1; i < 2 && ISDIGIT((unsigned char)*p); i++, p++)
			tzhour = (tzhour * 10) + (*p - '0');
		if (*p == ':')
			p++;
		for (i = 0; i < 2 && ISDIGIT((unsigned char)*p); i++, p++)
			tzmin = (tzmin * 10) + (*p - '0');
		return ((tzhour * 3600) + (tzmin * 60)) * (s[0] == '-' ? -1 : 1);
	default: /* timezone name */
		for (i = 0; ISALPHA((unsigned char)s[i]); i++)
			;
		if (i != 3)
			return 0; /* only names of exactly 3 letters are known */
		/* compare timezone and adjust offset relative to UTC */
		for (i = 0; i < sizeof(tzones) / sizeof(*tzones); i++) {
			if (!memcmp(s, tzones[i].name, 3))
				return tzones[i].offset;
		}
	}
	return 0;
}
555
556 /* Parse time string `s` into the UNIX timestamp `tp`.
557 * Returns 0 on success or -1 on failure. */
558 static int
559 parsetime(const char *s, long long *tp)
560 {
561 static const struct {
562 char *name;
563 int len;
564 } mons[] = {
565 { STRP("January"), },
566 { STRP("February"), },
567 { STRP("March"), },
568 { STRP("April"), },
569 { STRP("May"), },
570 { STRP("June"), },
571 { STRP("July"), },
572 { STRP("August"), },
573 { STRP("September"), },
574 { STRP("October"), },
575 { STRP("November"), },
576 { STRP("December"), },
577 };
578 int va[6] = { 0 }, i, j, v, vi;
579 size_t m;
580
581 for (; ISSPACE((unsigned char)*s); s++)
582 ;
583 if (!ISDIGIT((unsigned char)*s) && !ISALPHA((unsigned char)*s))
584 return -1;
585
586 if (ISDIGIT((unsigned char)s[0]) &&
587 ISDIGIT((unsigned char)s[1]) &&
588 ISDIGIT((unsigned char)s[2]) &&
589 ISDIGIT((unsigned char)s[3])) {
590 /* formats "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or "…
591 vi = 0;
592 } else {
593 /* format: "[%a, ]%d %b %Y %H:%M:%S" */
594 /* parse "[%a, ]%d %b %Y " part, then use time parsing a…
595 for (; ISALPHA((unsigned char)*s); s++)
596 ;
597 for (; ISSPACE((unsigned char)*s); s++)
598 ;
599 if (*s == ',')
600 s++;
601 for (; ISSPACE((unsigned char)*s); s++)
602 ;
603 for (v = 0, i = 0; i < 2 && ISDIGIT((unsigned char)*s); …
604 v = (v * 10) + (*s - '0');
605 va[2] = v; /* day */
606 for (; ISSPACE((unsigned char)*s); s++)
607 ;
608 /* end of word month */
609 for (j = 0; ISALPHA((unsigned char)s[j]); j++)
610 ;
611 /* check month name */
612 if (j < 3 || j > 9)
613 return -1; /* month cannot match */
614 for (m = 0; m < sizeof(mons) / sizeof(*mons); m++) {
615 /* abbreviation (3 length) or long name */
616 if ((j == 3 || j == mons[m].len) &&
617 !strncasecmp(mons[m].name, s, j)) {
618 va[1] = m + 1;
619 s += j;
620 break;
621 }
622 }
623 if (m >= 12)
624 return -1; /* no month found */
625 for (; ISSPACE((unsigned char)*s); s++)
626 ;
627 for (v = 0, i = 0; i < 4 && ISDIGIT((unsigned char)*s); …
628 v = (v * 10) + (*s - '0');
629 /* obsolete short year: RFC 2822 4.3 */
630 if (i == 2 || i == 3)
631 v += (i == 2 && v >= 0 && v <= 49) ? 2000 : 1900;
632 va[0] = v; /* year */
633 for (; ISSPACE((unsigned char)*s); s++)
634 ;
635 /* parse only regular time part, see below */
636 vi = 3;
637 }
638
639 /* parse time parts (and possibly remaining date parts) */
640 for (; *s && vi < 6; vi++) {
641 for (i = 0, v = 0; i < ((vi == 0) ? 4 : 2) &&
642 ISDIGIT((unsigned char)*s); s++, i++)…
643 v = (v * 10) + (*s - '0');
644 }
645 va[vi] = v;
646
647 if ((vi < 2 && (*s == '-' || *s == '/')) ||
648 (vi == 2 && (*s == 'T' || *s == 't' || ISSPACE((unsi…
649 (vi > 2 && *s == ':'))
650 s++;
651 }
652
653 /* skip milliseconds in for example: "%Y-%m-%dT%H:%M:%S.000Z" */
654 if (*s == '.' || *s == ',') {
655 for (s++; ISDIGIT((unsigned char)*s); s++)
656 ;
657 }
658
659 /* invalid range */
660 if (va[0] < 0 || va[0] > 9999 ||
661 va[1] < 1 || va[1] > 12 ||
662 va[2] < 1 || va[2] > 31 ||
663 va[3] < 0 || va[3] > 23 ||
664 va[4] < 0 || va[4] > 59 ||
665 va[5] < 0 || va[5] > 60) /* allow leap second */
666 return -1;
667
668 *tp = datetounix(va[0] - 1900, va[1] - 1, va[2], va[3], va[4], v…
669 gettzoffset(s);
670
671 return 0;
672 }
673
674 static void
675 printfields(void)
676 {
677 string_print_timestamp(&ctx.fields[FeedFieldTime].str);
678 putchar(FieldSeparator);
679 string_print_trimmed(&ctx.fields[FeedFieldTitle].str);
680 putchar(FieldSeparator);
681 string_print_uri(&ctx.fields[FeedFieldLink].str);
682 putchar(FieldSeparator);
683 string_print_encoded(&ctx.fields[FeedFieldContent].str);
684 putchar(FieldSeparator);
685 fputs(contenttypes[ctx.contenttype], stdout);
686 putchar(FieldSeparator);
687 string_print_trimmed(&ctx.fields[FeedFieldId].str);
688 putchar(FieldSeparator);
689 string_print_trimmed(&ctx.fields[FeedFieldAuthor].str);
690 putchar(FieldSeparator);
691 string_print_uri(&ctx.fields[FeedFieldEnclosure].str);
692 putchar(FieldSeparator);
693 string_print_trimmed_multi(&ctx.fields[FeedFieldCategory].str);
694 putchar('\n');
695
696 if (ferror(stdout)) /* check for errors but do not flush */
697 checkfileerror(stdout, "<stdout>", 'w');
698 }
699
/* Compare two tag names, ignoring case.
 * Returns non-zero when both names have the same length and their first
 * `len` bytes are equal ignoring case, otherwise 0. The comparison is
 * bounded by the given length, so it does not depend on where the NUL
 * terminators of the strings are. */
static int
istag(const char *name, size_t len, const char *name2, size_t len2)
{
	return (len == len2 && !strncasecmp(name, name2, len));
}
705
/* Compare two attribute names (or values), ignoring case.
 * Returns non-zero when both have the same length and their first `len`
 * bytes are equal ignoring case, otherwise 0. The comparison is bounded by
 * the given length, so it does not depend on where the NUL terminators of
 * the strings are. */
static int
isattr(const char *name, size_t len, const char *name2, size_t len2)
{
	return (len == len2 && !strncasecmp(name, name2, len));
}
711
712 static void
713 xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
714 const char *v, size_t vl)
715 {
716 /* handles transforming inline XML to data */
717 if (ISINCONTENT(ctx)) {
718 if (ctx.contenttype == ContentTypeHTML)
719 xmldata(p, v, vl);
720 return;
721 }
722
723 if (!ctx.tag.id)
724 return;
725
726 /* content-type may be for Atom: text, xhtml, html or a mime-typ…
727 * for MRSS (media:description): plain, html. */
728 if (ISCONTENTTAG(ctx)) {
729 if (isattr(n, nl, STRP("type")))
730 string_append(&attrtype, v, vl);
731 return;
732 }
733
734 if (ctx.feedtype == FeedTypeRSS) {
735 if (ctx.tag.id == RSSTagEnclosure &&
736 isattr(n, nl, STRP("url"))) {
737 string_append(&tmpstr, v, vl);
738 } else if (ctx.tag.id == RSSTagGuid &&
739 isattr(n, nl, STRP("ispermalink"))) {
740 string_append(&attrispermalink, v, vl);
741 }
742 } else if (ctx.feedtype == FeedTypeAtom) {
743 if (ctx.tag.id == AtomTagLink) {
744 if (isattr(n, nl, STRP("rel"))) {
745 string_append(&attrrel, v, vl);
746 } else if (isattr(n, nl, STRP("href"))) {
747 string_append(&tmpstr, v, vl);
748 }
749 } else if (ctx.tag.id == AtomTagCategory &&
750 isattr(n, nl, STRP("term"))) {
751 string_append(&tmpstr, v, vl);
752 }
753 }
754 }
755
756 static void
757 xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, siz…
758 const char *data, size_t datalen)
759 {
760 char buf[8];
761 int len;
762
763 /* handles transforming inline XML to data */
764 if (ISINCONTENT(ctx)) {
765 if (ctx.contenttype == ContentTypeHTML)
766 xmldata(p, data, datalen);
767 return;
768 }
769
770 if (!ctx.tag.id)
771 return;
772
773 /* try to translate entity, else just pass as data to
774 * xmlattr handler. */
775 if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
776 xmlattr(p, t, tl, n, nl, buf, (size_t)len);
777 else
778 xmlattr(p, t, tl, n, nl, data, datalen);
779 }
780
781 static void
782 xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n, size_t…
783 {
784 if (ISINCONTENT(ctx)) {
785 if (ctx.contenttype == ContentTypeHTML) {
786 /* handles transforming inline XML to data */
787 xmldata(p, "\"", 1);
788 ctx.attrcount = 0;
789 }
790 return;
791 }
792 }
793
794 static void
795 xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, size…
796 {
797 if (ISINCONTENT(ctx)) {
798 if (ctx.contenttype == ContentTypeHTML) {
799 /* handles transforming inline XML to data */
800 if (!ctx.attrcount)
801 xmldata(p, " ", 1);
802 ctx.attrcount++;
803 xmldata(p, n, nl);
804 xmldata(p, "=\"", 2);
805 }
806 return;
807 }
808
809 if (attrispermalink.len && isattr(n, nl, STRP("ispermalink")))
810 string_clear(&attrispermalink);
811 else if (attrrel.len && isattr(n, nl, STRP("rel")))
812 string_clear(&attrrel);
813 else if (attrtype.len && isattr(n, nl, STRP("type")))
814 string_clear(&attrtype);
815 else if (tmpstr.len &&
816 (isattr(n, nl, STRP("href")) ||
817 isattr(n, nl, STRP("term")) ||
818 isattr(n, nl, STRP("url"))))
819 string_clear(&tmpstr); /* use the last value for multipl…
820 }
821
822 static void
823 xmldata(XMLParser *p, const char *s, size_t len)
824 {
825 if (!ctx.field)
826 return;
827
828 if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
829 string_append(&tmpstr, s, len);
830 else
831 string_append(ctx.field, s, len);
832 }
833
834 static void
835 xmldataentity(XMLParser *p, const char *data, size_t datalen)
836 {
837 char buf[8];
838 int len;
839
840 if (!ctx.field)
841 return;
842
843 /* try to translate entity, else just pass as data to
844 * xmldata handler. */
845 if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
846 xmldata(p, buf, (size_t)len);
847 else
848 xmldata(p, data, datalen);
849 }
850
851 static void
852 xmltagstart(XMLParser *p, const char *t, size_t tl)
853 {
854 const FeedTag *f;
855
856 if (ISINCONTENT(ctx)) {
857 if (ctx.contenttype == ContentTypeHTML) {
858 ctx.attrcount = 0;
859 xmldata(p, "<", 1);
860 xmldata(p, t, tl);
861 }
862 return;
863 }
864
865 /* start of RSS or Atom item / entry */
866 if (ctx.feedtype == FeedTypeNone) {
867 if (istag(t, tl, STRP("entry")))
868 ctx.feedtype = FeedTypeAtom;
869 else if (istag(t, tl, STRP("item")))
870 ctx.feedtype = FeedTypeRSS;
871 return;
872 }
873
874 /* field tagid already set or nested tags. */
875 if (ctx.tag.id) {
876 /* nested <author><name> for Atom */
877 if (ctx.tag.id == AtomTagAuthor &&
878 istag(t, tl, STRP("name"))) {
879 memcpy(&(ctx.tag), &atomtagauthorname, sizeof(ct…
880 } else {
881 return; /* other nested tags are not allowed: re…
882 }
883 }
884
885 /* in item */
886 if (ctx.tag.id == TagUnknown) {
887 if (!(f = gettag(ctx.feedtype, t, tl)))
888 f = &notag;
889 memcpy(&(ctx.tag), f, sizeof(ctx.tag));
890 }
891
892 ctx.iscontenttag = (fieldmap[ctx.tag.id] == FeedFieldContent);
893 string_clear(&attrispermalink);
894 string_clear(&attrrel);
895 string_clear(&attrtype);
896 }
897
898 static void
899 xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
900 {
901 enum TagId tagid;
902
903 if (ISINCONTENT(ctx)) {
904 if (ctx.contenttype == ContentTypeHTML) {
905 if (isshort)
906 xmldata(p, "/>", 2);
907 else
908 xmldata(p, ">", 1);
909 }
910 return;
911 }
912
913 /* set tag type based on its attribute value */
914 if (ctx.tag.id == RSSTagGuid) {
915 /* if empty the default is "true" */
916 if (!attrispermalink.len ||
917 isattr(attrispermalink.data, attrispermalink.len, ST…
918 ctx.tag.id = RSSTagGuidPermalinkTrue;
919 else
920 ctx.tag.id = RSSTagGuidPermalinkFalse;
921 } else if (ctx.tag.id == AtomTagLink) {
922 /* empty or "alternate": other types could be
923 * "enclosure", "related", "self" or "via" */
924 if (!attrrel.len || isattr(attrrel.data, attrrel.len, ST…
925 ctx.tag.id = AtomTagLinkAlternate;
926 else if (isattr(attrrel.data, attrrel.len, STRP("enclosu…
927 ctx.tag.id = AtomTagLinkEnclosure;
928 else
929 ctx.tag.id = AtomTagLink; /* unknown */
930 }
931
932 tagid = ctx.tag.id;
933
934 /* map tag type to field: unknown or lesser priority is ignored,
935 * when tags of the same type are repeated only the first is use…
936 if (fieldmap[tagid] == -1 ||
937 (!ISFEEDFIELDMULTI(fieldmap[tagid]) &&
938 tagid <= ctx.fields[fieldmap[tagid]].tagid)) {
939 return;
940 }
941
942 if (ctx.iscontenttag) {
943 ctx.iscontent = 1;
944 ctx.iscontenttag = 0;
945
946 /* detect content-type based on type attribute */
947 if (attrtype.len) {
948 if (isattr(attrtype.data, attrtype.len, STRP("ht…
949 isattr(attrtype.data, attrtype.len, STRP("xh…
950 isattr(attrtype.data, attrtype.len, STRP("te…
951 isattr(attrtype.data, attrtype.len, STRP("te…
952 isattr(attrtype.data, attrtype.len, STRP("ap…
953 ctx.contenttype = ContentTypeHTML;
954 else /* unknown: handle as base64 text data */
955 ctx.contenttype = ContentTypePlain;
956 } else {
957 /* default content-type */
958 if (tagid == RSSTagContentEncoded || tagid == RS…
959 ctx.contenttype = ContentTypeHTML;
960 else
961 ctx.contenttype = ContentTypePlain;
962 }
963 }
964
965 ctx.field = &(ctx.fields[fieldmap[tagid]].str);
966 ctx.fields[fieldmap[tagid]].tagid = tagid;
967
968 /* clear field if it is overwritten (with a priority order) for …
969 * value, if the field can have multiple values then do not clea…
970 if (!ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
971 string_clear(ctx.field);
972 }
973
974 static void
975 xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
976 {
977 size_t i;
978
979 if (ctx.feedtype == FeedTypeNone)
980 return;
981
982 if (ISINCONTENT(ctx)) {
983 /* not a closed content field */
984 if (!istag(ctx.tag.name, ctx.tag.len, t, tl)) {
985 if (!isshort && ctx.contenttype == ContentTypeHT…
986 xmldata(p, "</", 2);
987 xmldata(p, t, tl);
988 xmldata(p, ">", 1);
989 }
990 return;
991 }
992 } else if (ctx.tag.id && istag(ctx.tag.name, ctx.tag.len, t, tl)…
993 /* matched tag end: close it.
994 * copy also to the link field if the attribute isPermaL…
995 * and it is not set by a tag with higher priority. */
996 if (ctx.tag.id == RSSTagGuidPermalinkTrue && ctx.field &&
997 ctx.tag.id > ctx.fields[FeedFieldLink].tagid) {
998 string_clear(&ctx.fields[FeedFieldLink].str);
999 string_append(&ctx.fields[FeedFieldLink].str,
1000 ctx.field->data, ctx.field->len);
1001 ctx.fields[FeedFieldLink].tagid = ctx.tag.id;
1002 }
1003 } else if (!ctx.tag.id && ((ctx.feedtype == FeedTypeAtom &&
1004 istag(t, tl, STRP("entry"))) || /* Atom */
1005 (ctx.feedtype == FeedTypeRSS &&
1006 istag(t, tl, STRP("item"))))) /* RSS */
1007 {
1008 /* end of RSS or Atom entry / item */
1009 printfields();
1010
1011 /* clear strings */
1012 for (i = 0; i < FeedFieldLast; i++) {
1013 string_clear(&ctx.fields[i].str);
1014 ctx.fields[i].tagid = TagUnknown;
1015 }
1016 ctx.contenttype = ContentTypeNone;
1017 /* allow parsing of Atom and RSS concatenated in one XML…
1018 ctx.feedtype = FeedTypeNone;
1019 } else {
1020 return; /* not end of field */
1021 }
1022
1023 /* temporary string: for fields that cannot be processed
1024 * directly and need more context, for example by its tag
1025 * attributes, like the Atom link rel="alternate|enclosure". */
1026 if (tmpstr.len && ctx.field) {
1027 if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) {
1028 if (ctx.field->len)
1029 string_append(ctx.field, FieldMultiSepar…
1030 string_append(ctx.field, tmpstr.data, tmpstr.len…
1031 } else {
1032 string_clear(ctx.field);
1033 string_append(ctx.field, tmpstr.data, tmpstr.len…
1034 }
1035 }
1036
1037 /* close field */
1038 string_clear(&tmpstr); /* reuse and clear temporary string */
1039
1040 if (ctx.tag.id == AtomTagAuthorName)
1041 memcpy(&(ctx.tag), &atomtagauthor, sizeof(ctx.tag)); /* …
1042 else
1043 memcpy(&(ctx.tag), &notag, sizeof(ctx.tag));
1044
1045 ctx.iscontent = 0;
1046 ctx.field = NULL;
1047 }
1048
1049 int
1050 main(int argc, char *argv[])
1051 {
1052 if (pledge("stdio", NULL) == -1)
1053 err(1, "pledge");
1054
1055 if (argc > 1) {
1056 if (uri_parse(argv[1], &baseuri) != -1 && baseuri.proto[…
1057 baseurl = argv[1];
1058 else
1059 errx(1, "baseurl incorrect or too long");
1060 }
1061
1062 memcpy(&(ctx.tag), &notag, sizeof(ctx.tag));
1063
1064 parser.xmlattr = xmlattr;
1065 parser.xmlattrentity = xmlattrentity;
1066 parser.xmlattrend = xmlattrend;
1067 parser.xmlattrstart = xmlattrstart;
1068 parser.xmlcdata = xmldata;
1069 parser.xmldata = xmldata;
1070 parser.xmldataentity = xmldataentity;
1071 parser.xmltagend = xmltagend;
1072 parser.xmltagstart = xmltagstart;
1073 parser.xmltagstartparsed = xmltagstartparsed;
1074
1075 /* NOTE: GETNEXT is defined in xml.h for inline optimization */
1076 xml_parse(&parser);
1077
1078 checkfileerror(stdin, "<stdin>", 'r');
1079 checkfileerror(stdout, "<stdout>", 'w');
1080
1081 return 0;
1082 }
You are viewing proxied material from codemadness.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.