GopherProxy

	sfeed.c - sfeed - RSS and Atom parser
	git clone git://git.codemadness.org/sfeed
	Log
	Files
	Refs
	README
	LICENSE
	---
	sfeed.c (30076B)
	---
	1 #include <errno.h>
	2 #include <stdint.h>
	3 #include <stdio.h>
	4 #include <stdlib.h>
	5 #include <string.h>
	6 #include <strings.h>
	7
	8 #include "util.h"
	9 #include "xml.h"
	10
	/* True while inside the data of a content field: the `iscontent` flag is
	 * set and the `iscontenttag` flag is clear. */
	#define ISINCONTENT(ctx) ((ctx).iscontent && !((ctx).iscontenttag))
	/* True while inside the start tag of a content field: the `iscontenttag`
	 * flag is set and the `iscontent` flag is clear. */
	#define ISCONTENTTAG(ctx) (!((ctx).iscontent) && (ctx).iscontenttag)

	/* these feed fields support multiple separated values */
	#define ISFEEDFIELDMULTI(t) ((t) == FeedFieldCategory)

	/* string and byte-length: expands to TWO comma-separated arguments, so `s`
	 * must be a string literal (or char array) for sizeof() to give its size */
	#define STRP(s) s,sizeof(s)-1
	19
	/* Kind of feed the parser is currently inside of; None until an <item> or
	 * <entry> start tag is seen. */
	enum FeedType {
		FeedTypeNone = 0,
		FeedTypeRSS  = 1,
		FeedTypeAtom = 2
	};

	/* Content-type of the content field of the current item. */
	enum ContentType {
		ContentTypeNone  = 0,
		ContentTypePlain = 1,
		ContentTypeHTML  = 2
	};
	/* printable names, indexed by enum ContentType */
	static const char *contenttypes[] = { "", "plain", "html" };
	32
	/* String data / memory pool: a growable, NUL-terminated byte buffer. */
	typedef struct string {
		char   *data;   /* data (NULL until the first allocation) */
		size_t  len;    /* string length */
		size_t  bufsiz; /* allocated size */
	} String;
	39
	/* Unique id per recognized tag (or tag variant).
	 * NOTE: the order of these fields (content, date, author) indicate the
	 *       priority to use them, from least important to high: when several
	 *       tags map to the same field the one with the larger id wins. */
	enum TagId {
		TagUnknown = 0,
		/* RSS */
		RSSTagDcdate, RSSTagPubdate, /* creation date has higher priority */
		RSSTagTitle,
		RSSTagMediaDescription, RSSTagDescription, RSSTagContentEncoded,
		RSSTagGuid,
		RSSTagGuidPermalinkFalse,
		RSSTagGuidPermalinkTrue,
		/* must be defined after GUID, because it can be a link (isPermaLink) */
		RSSTagLink,
		RSSTagEnclosure,
		RSSTagAuthor, RSSTagDccreator,
		RSSTagCategory,
		/* Atom */
		/* creation date has higher priority */
		AtomTagModified, AtomTagUpdated, AtomTagIssued, AtomTagPublished,
		AtomTagTitle,
		AtomTagMediaDescription, AtomTagSummary, AtomTagContent,
		AtomTagId,
		AtomTagLink,
		AtomTagLinkAlternate,
		AtomTagLinkEnclosure,
		AtomTagAuthor, AtomTagAuthorName,
		AtomTagCategory,
		TagLast /* number of ids, used to size lookup tables */
	};
	69
	70 typedef struct feedtag {
	71 char name; / name of tag to match */
	72 size_t len; /* len of `name` */
	73 enum TagId id; /* unique ID */
	74 } FeedTag;
	75
	76 typedef struct field {
	77 String str;
	78 enum TagId tagid; /* tagid set previously, used for tag priority…
	79 } FeedField;
	80
	/* Indices of the fields of an item; FeedFieldLast is the field count. */
	enum {
		FeedFieldTime = 0, FeedFieldTitle, FeedFieldLink, FeedFieldContent,
		FeedFieldId, FeedFieldAuthor, FeedFieldEnclosure, FeedFieldCategory,
		FeedFieldLast
	};
	86
	87 typedef struct feedcontext {
	88 String field; / current FeedItem field String …
	89 FeedField fields[FeedFieldLast]; /* data for current item…
	90 FeedTag tag; /* unique current parsed tag */
	91 int iscontent; /* in content data */
	92 int iscontenttag; /* in content tag */
	93 enum ContentType contenttype; /* content-type for item */
	94 enum FeedType feedtype;
	95 int attrcount; /* count item HTML element attrib…
	96 } FeedContext;
	97
	98 static long long datetounix(long long, int, int, int, int, int);
	99 static FeedTag * gettag(enum FeedType, const char *, size_t);
	100 static long gettzoffset(const char *);
	101 static int isattr(const char , size_t, const char , size_t);
	102 static int istag(const char , size_t, const char , size_t);
	103 static int parsetime(const char , long long );
	104 static void printfields(void);
	105 static void string_append(String , const char , size_t);
	106 static void string_buffer_realloc(String *, size_t);
	107 static void string_clear(String *);
	108 static void string_print_encoded(String *);
	109 static void string_print_timestamp(String *);
	110 static void string_print_trimmed(String *);
	111 static void string_print_trimmed_multi(String *);
	112 static void string_print_uri(String *);
	113 static void xmlattr(XMLParser , const char , size_t, const char *, siz…
	114 const char *, size_t);
	115 static void xmlattrentity(XMLParser , const char , size_t, const char …
	116 size_t, const char *, size_t);
	117 static void xmlattrend(XMLParser , const char , size_t, const char *,
	118 size_t);
	119 static void xmlattrstart(XMLParser , const char , size_t, const char *,
	120 size_t);
	121 static void xmldata(XMLParser , const char , size_t);
	122 static void xmldataentity(XMLParser , const char , size_t);
	123 static void xmltagend(XMLParser , const char , size_t, int);
	124 static void xmltagstart(XMLParser , const char , size_t);
	125 static void xmltagstartparsed(XMLParser , const char , size_t, int);
	126
	127 /* map tag name to TagId type */
	128 /* RSS, keep this in alphabetical order */
	129 static const FeedTag rsstags[] = {
	130 { STRP("author"), RSSTagAuthor },
	131 { STRP("category"), RSSTagCategory },
	132 { STRP("content:encoded"), RSSTagContentEncoded },
	133 { STRP("dc:creator"), RSSTagDccreator },
	134 { STRP("dc:date"), RSSTagDcdate },
	135 { STRP("description"), RSSTagDescription },
	136 /* RSS: <enclosure url="" />, Atom has <link rel="enclosure" /> …
	137 { STRP("enclosure"), RSSTagEnclosure },
	138 { STRP("guid"), RSSTagGuid },
	139 { STRP("link"), RSSTagLink },
	140 { STRP("media:description"), RSSTagMediaDescription },
	141 { STRP("pubdate"), RSSTagPubdate },
	142 { STRP("title"), RSSTagTitle }
	143 };
	144
	145 /* Atom, keep this in alphabetical order */
	146 static const FeedTag atomtags[] = {
	147 { STRP("author"), AtomTagAuthor },
	148 { STRP("category"), AtomTagCategory },
	149 { STRP("content"), AtomTagContent },
	150 { STRP("id"), AtomTagId },
	151 { STRP("issued"), AtomTagIssued }, /* Atom …
	152 /* Atom: <link href="" />, RSS has <link></link> */
	153 { STRP("link"), AtomTagLink },
	154 { STRP("media:description"), AtomTagMediaDescription },
	155 { STRP("modified"), AtomTagModified }, /* Atom …
	156 { STRP("published"), AtomTagPublished },
	157 { STRP("summary"), AtomTagSummary },
	158 { STRP("title"), AtomTagTitle },
	159 { STRP("updated"), AtomTagUpdated }
	160 };
	161
	162 /* special case: nested <author><name> */
	163 static const FeedTag atomtagauthor = { STRP("author"), AtomTagAuthor };
	164 static const FeedTag atomtagauthorname = { STRP("name"), AtomTagAuthorNa…
	165
	166 /* reference to no / unknown tag */
	167 static const FeedTag notag = { STRP(""), TagUnknown };
	168
	169 /* map TagId type to RSS/Atom field, all tags must be defined */
	170 static const int fieldmap[TagLast] = {
	171 [TagUnknown] = -1,
	172 /* RSS */
	173 [RSSTagDcdate] = FeedFieldTime,
	174 [RSSTagPubdate] = FeedFieldTime,
	175 [RSSTagTitle] = FeedFieldTitle,
	176 [RSSTagMediaDescription] = FeedFieldContent,
	177 [RSSTagDescription] = FeedFieldContent,
	178 [RSSTagContentEncoded] = FeedFieldContent,
	179 [RSSTagGuid] = -1,
	180 [RSSTagGuidPermalinkFalse] = FeedFieldId,
	181 [RSSTagGuidPermalinkTrue] = FeedFieldId, /* special-case: both …
	182 [RSSTagLink] = FeedFieldLink,
	183 [RSSTagEnclosure] = FeedFieldEnclosure,
	184 [RSSTagAuthor] = FeedFieldAuthor,
	185 [RSSTagDccreator] = FeedFieldAuthor,
	186 [RSSTagCategory] = FeedFieldCategory,
	187 /* Atom */
	188 [AtomTagModified] = FeedFieldTime,
	189 [AtomTagUpdated] = FeedFieldTime,
	190 [AtomTagIssued] = FeedFieldTime,
	191 [AtomTagPublished] = FeedFieldTime,
	192 [AtomTagTitle] = FeedFieldTitle,
	193 [AtomTagMediaDescription] = FeedFieldContent,
	194 [AtomTagSummary] = FeedFieldContent,
	195 [AtomTagContent] = FeedFieldContent,
	196 [AtomTagId] = FeedFieldId,
	197 [AtomTagLink] = -1,
	198 [AtomTagLinkAlternate] = FeedFieldLink,
	199 [AtomTagLinkEnclosure] = FeedFieldEnclosure,
	200 [AtomTagAuthor] = -1,
	201 [AtomTagAuthorName] = FeedFieldAuthor,
	202 [AtomTagCategory] = FeedFieldCategory
	203 };
	204
	205 static const int FieldSeparator = '\t';
	206 /* separator for multiple values in a field, separator should be 1 byte …
	207 static const char FieldMultiSeparator[] = "\|";
	208 static struct uri baseuri;
	209 static const char *baseurl;
	210
	211 static FeedContext ctx;
	212 static XMLParser parser; /* XML parser state */
	213 static String attrispermalink, attrrel, attrtype, tmpstr;
	214
	215 /* Unique tag(id) for parsed tag name. */
	216 static FeedTag *
	217 gettag(enum FeedType feedtype, const char *name, size_t namelen)
	218 {
	219 FeedTag *r;
	220 size_t i;
	221
	222 switch (feedtype) {
	223 case FeedTypeRSS:
	224 for (i = 0; i < sizeof(rsstags) / sizeof(rsstags[0]); i+…
	225 r = (FeedTag *)&rsstags[i];
	226 if (r->len == namelen && !strcasecmp(r->name, na…
	227 return r;
	228 }
	229 break;
	230 case FeedTypeAtom:
	231 for (i = 0; i < sizeof(atomtags) / sizeof(atomtags[0]); …
	232 r = (FeedTag *)&atomtags[i];
	233 if (r->len == namelen && !strcasecmp(r->name, na…
	234 return r;
	235 }
	236 break;
	237 default:
	238 break;
	239 }
	240
	241 return NULL;
	242 }
	243
	/* Skip leading whitespace (as classified by ISSPACE).
	 * Returns a pointer into `s` at the first non-whitespace byte, which is
	 * the terminating NUL byte if `s` holds only whitespace. */
	static char *
	ltrim(const char *s)
	{
		for (; ISSPACE((unsigned char)*s); s++)
			;
		return (char *)s;
	}
	251
	/* Find the end of `s` without trailing whitespace (as classified by
	 * ISSPACE). Nothing is modified.
	 * Returns a pointer one past the last non-whitespace byte, or `s` itself
	 * if the string is empty or holds only whitespace. */
	static char *
	rtrim(const char *s)
	{
		const char *e;

		for (e = s + strlen(s); e > s && ISSPACE((unsigned char)*(e - 1)); e--)
			;
		return (char *)e;
	}
	261
	262 /* Clear string only; don't free, prevents unnecessary reallocation. */
	263 static void
	264 string_clear(String *s)
	265 {
	266 if (s->data)
	267 s->data[0] = '\0';
	268 s->len = 0;
	269 }
	270
	271 static void
	272 string_buffer_realloc(String *s, size_t newlen)
	273 {
	274 size_t alloclen;
	275
	276 if (newlen > SIZE_MAX / 2) {
	277 alloclen = SIZE_MAX;
	278 } else {
	279 for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
	280 ;
	281 }
	282 if (!(s->data = realloc(s->data, alloclen)))
	283 err(1, "realloc");
	284 s->bufsiz = alloclen;
	285 }
	286
	287 /* Append data to String, s->data and data may not overlap. */
	288 static void
	289 string_append(String s, const char data, size_t len)
	290 {
	291 if (!len)
	292 return;
	293
	294 if (s->len >= SIZE_MAX - len) {
	295 errno = ENOMEM;
	296 err(1, "realloc");
	297 }
	298
	299 /* check if allocation is necessary, never shrink the buffer. */
	300 if (s->len + len >= s->bufsiz)
	301 string_buffer_realloc(s, s->len + len + 1);
	302 memcpy(s->data + s->len, data, len);
	303 s->len += len;
	304 s->data[s->len] = '\0';
	305 }
	306
	307 /* Print text, encode TABs, newlines and '\', remove other whitespace.
	308 * Remove leading and trailing whitespace. */
	309 static void
	310 string_print_encoded(String *s)
	311 {
	312 const char p, e;
	313
	314 if (!s->data \|\| !s->len)
	315 return;
	316
	317 p = ltrim(s->data);
	318 e = rtrim(p);
	319
	320 for (; *p && p != e; p++) {
	321 switch (*p) {
	322 case '\n': putchar('\\'); putchar('n'); break;
	323 case '\\': putchar('\\'); putchar('\\'); break;
	324 case '\t': putchar('\\'); putchar('t'); break;
	325 default:
	326 /* ignore control chars */
	327 if (!ISCNTRL((unsigned char)*p))
	328 putchar(*p);
	329 break;
	330 }
	331 }
	332 }
	333
	/* Print `s` to stdout without leading and trailing whitespace.
	 * Every whitespace byte in between is printed as a single space, other
	 * control characters are dropped. */
	static void
	printtrimmed(const char *s)
	{
		char *p, *e;

		p = ltrim(s);
		e = rtrim(p);
		for (; *p && p != e; p++) {
			if (ISSPACE((unsigned char)*p))
				putchar(' '); /* any whitespace to space */
			else if (!ISCNTRL((unsigned char)*p))
				/* ignore other control chars */
				putchar(*p);
		}
	}
	349
	350 /* Print text, replace TABs, carriage return and other whitespace with '…
	351 * Other control chars are removed. Remove leading and trailing whitespa…
	352 static void
	353 string_print_trimmed(String *s)
	354 {
	355 if (!s->data \|\| !s->len)
	356 return;
	357
	358 printtrimmed(s->data);
	359 }
	360
	361 /* Print each field with trimmed whitespace, separated by '\|'. */
	362 static void
	363 string_print_trimmed_multi(String *s)
	364 {
	365 char p, e;
	366 int c;
	367
	368 if (!s->data \|\| !s->len)
	369 return;
	370
	371 for (p = s->data; ; p = e + 1) {
	372 if ((e = strstr(p, FieldMultiSeparator))) {
	373 c = *e;
	374 *e = '\0';
	375 printtrimmed(p);
	376 e = c; / restore NUL byte to original characte…
	377 fputs(FieldMultiSeparator, stdout);
	378 } else {
	379 printtrimmed(p);
	380 break;
	381 }
	382 }
	383 }
	384
	385 /* Print URL, if it is a relative URL then it uses the global `baseurl`.…
	386 static void
	387 printuri(char *s)
	388 {
	389 char link[4096], p, e;
	390 struct uri newuri, olduri;
	391 int c, r = -1;
	392
	393 p = ltrim(s);
	394 e = rtrim(p);
	395 c = *e;
	396 *e = '\0';
	397
	398 if (baseurl && !uri_hasscheme(p) &&
	399 uri_parse(p, &olduri) != -1 && !olduri.proto[0] &&
	400 uri_makeabs(&newuri, &olduri, &baseuri) != -1 && newuri.prot…
	401 r = uri_format(link, sizeof(link), &newuri);
	402
	403 if (r >= 0 && (size_t)r < sizeof(link))
	404 printtrimmed(link);
	405 else
	406 printtrimmed(p);
	407
	408 e = c; / restore NUL byte to original character */
	409 }
	410
	411 /* Print URL, if it is a relative URL then it uses the global `baseurl`.…
	412 static void
	413 string_print_uri(String *s)
	414 {
	415 if (!s->data \|\| !s->len)
	416 return;
	417
	418 printuri(s->data);
	419 }
	420
	421 /* Print as UNIX timestamp, print nothing if the time is empty or invali…
	422 static void
	423 string_print_timestamp(String *s)
	424 {
	425 long long t;
	426
	427 if (!s->data \|\| !s->len)
	428 return;
	429
	430 if (parsetime(s->data, &t) != -1)
	431 printf("%lld", t);
	432 }
	433
	/* Convert time fields. Returns a signed (at least) 64-bit UNIX timestamp.
	 * Parameters should be passed as they are in a struct tm:
	 * that is: year = year - 1900, month = month - 1.
	 * The values are not range-checked here: `mon` indexes a 12-entry table,
	 * so the caller must pass 0..11. */
	static long long
	datetounix(long long year, int mon, int day, int hour, int min, int sec)
	{
		/* seconds in a month in a regular (non-leap) year */
		static const long secs_through_month[] = {
			0, 31 * 86400, 59 * 86400, 90 * 86400,
			120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400,
			243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 };
		int is_leap = 0, cycles, centuries = 0, leaps = 0, rem;
		long long t;

		/* optimization: handle common range year 1902 up to and including 2038.
		 * the unsigned subtraction makes values below 2 wrap to a huge number
		 * so one comparison checks both bounds. */
		if (year - 2ULL <= 136) {
			/* amount of leap days relative to 1970: every 4 years */
			leaps = (year - 68) >> 2;
			if (!((year - 68) & 3)) {
				/* the leap day of `year` itself is added below, per month */
				leaps--;
				is_leap = 1;
			} else {
				is_leap = 0;
			}
			t = 31536000 * (year - 70) + (86400 * leaps); /* 365 * 86400 = 31536000 */
		} else {
			/* general leap year calculation:
			 * leap years occur mostly every 4 years but every 100 years
			 * a leap year is skipped unless the year is divisible by 400 */
			cycles = (year - 100) / 400;
			rem = (year - 100) % 400;
			if (rem < 0) {
				cycles--;
				rem += 400;
			}
			if (!rem) {
				is_leap = 1;
			} else {
				if (rem >= 300) {
					centuries = 3;
					rem -= 300;
				} else if (rem >= 200) {
					centuries = 2;
					rem -= 200;
				} else if (rem >= 100) {
					centuries = 1;
					rem -= 100;
				}
				if (rem) {
					leaps = rem / 4U;
					rem %= 4U;
					is_leap = !rem;
				}
			}
			leaps += (97 * cycles) + (24 * centuries) - is_leap;

			/* adjust 8 leap days from 1970 up to and including 2000:
			 * ((30 * 365) + 8) * 86400 = 946771200 */
			t = ((year - 100) * 31536000LL) + (leaps * 86400LL) + 946771200LL;
		}
		t += secs_through_month[mon];
		/* from March on, a leap year is one day further along */
		if (is_leap && mon >= 2)
			t += 86400;
		t += 86400LL * (day - 1);
		t += 3600LL * hour;
		t += 60LL * min;
		t += sec;

		return t;
	}
	504
	/* Get timezone from string, return time offset in seconds from UTC.
	 * Accepts, after optional leading whitespace, either a numeric offset
	 * ("+HHMM", "-HHMM", "+HH:MM", ...) or one of the 3-letter names in the
	 * table below. Anything else yields 0.
	 * NOTE: only parses timezones in RFC 822, many other timezone names are
	 *       ambiguous anyway.
	 *       ANSI and military zones are defined wrong in RFC 822 and are
	 *       unsupported, see note on RFC 2822 4.3 page 32. */
	static long
	gettzoffset(const char *s)
	{
		static const struct {
			char *name;
			int offsec; /* offset from UTC in seconds */
		} tzones[] = {
			{ "CDT", -5 * 3600 },
			{ "CST", -6 * 3600 },
			{ "EDT", -4 * 3600 },
			{ "EST", -5 * 3600 },
			{ "MDT", -6 * 3600 },
			{ "MST", -7 * 3600 },
			{ "PDT", -7 * 3600 },
			{ "PST", -8 * 3600 },
		};
		const char *p;
		long tzhour = 0, tzmin = 0;
		size_t i;

		for (; ISSPACE((unsigned char)*s); s++)
			;
		switch (*s) {
		case '-': /* offset */
		case '+':
			/* at most 2 digits for the hours ... */
			for (i = 0, p = s + 1; i < 2 && ISDIGIT((unsigned char)*p); i++, p++)
				tzhour = (tzhour * 10) + (*p - '0');
			if (*p == ':')
				p++;
			/* ... and at most 2 digits for the minutes */
			for (i = 0; i < 2 && ISDIGIT((unsigned char)*p); i++, p++)
				tzmin = (tzmin * 10) + (*p - '0');
			return ((tzhour * 3600) + (tzmin * 60)) * (s[0] == '-' ? -1 : 1);
		default: /* timezone name */
			for (i = 0; ISALPHA((unsigned char)s[i]); i++)
				;
			if (i != 3)
				return 0;
			/* compare timezone and adjust offset relative to UTC */
			for (i = 0; i < sizeof(tzones) / sizeof(*tzones); i++) {
				if (!memcmp(s, tzones[i].name, 3))
					return tzones[i].offsec;
			}
		}
		return 0;
	}
	555
	556 /* Parse time string `s` into the UNIX timestamp `tp`.
	557 * Returns 0 on success or -1 on failure. */
	558 static int
	559 parsetime(const char s, long long tp)
	560 {
	561 static const struct {
	562 char *name;
	563 int len;
	564 } mons[] = {
	565 { STRP("January"), },
	566 { STRP("February"), },
	567 { STRP("March"), },
	568 { STRP("April"), },
	569 { STRP("May"), },
	570 { STRP("June"), },
	571 { STRP("July"), },
	572 { STRP("August"), },
	573 { STRP("September"), },
	574 { STRP("October"), },
	575 { STRP("November"), },
	576 { STRP("December"), },
	577 };
	578 int va[6] = { 0 }, i, j, v, vi;
	579 size_t m;
	580
	581 for (; ISSPACE((unsigned char)*s); s++)
	582 ;
	583 if (!ISDIGIT((unsigned char)s) && !ISALPHA((unsigned char)s))
	584 return -1;
	585
	586 if (ISDIGIT((unsigned char)s[0]) &&
	587 ISDIGIT((unsigned char)s[1]) &&
	588 ISDIGIT((unsigned char)s[2]) &&
	589 ISDIGIT((unsigned char)s[3])) {
	590 /* formats "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or "…
	591 vi = 0;
	592 } else {
	593 /* format: "[%a, ]%d %b %Y %H:%M:%S" */
	594 /* parse "[%a, ]%d %b %Y " part, then use time parsing a…
	595 for (; ISALPHA((unsigned char)*s); s++)
	596 ;
	597 for (; ISSPACE((unsigned char)*s); s++)
	598 ;
	599 if (*s == ',')
	600 s++;
	601 for (; ISSPACE((unsigned char)*s); s++)
	602 ;
	603 for (v = 0, i = 0; i < 2 && ISDIGIT((unsigned char)*s); …
	604 v = (v * 10) + (*s - '0');
	605 va[2] = v; /* day */
	606 for (; ISSPACE((unsigned char)*s); s++)
	607 ;
	608 /* end of word month */
	609 for (j = 0; ISALPHA((unsigned char)s[j]); j++)
	610 ;
	611 /* check month name */
	612 if (j < 3 \|\| j > 9)
	613 return -1; /* month cannot match */
	614 for (m = 0; m < sizeof(mons) / sizeof(*mons); m++) {
	615 /* abbreviation (3 length) or long name */
	616 if ((j == 3 \|\| j == mons[m].len) &&
	617 !strncasecmp(mons[m].name, s, j)) {
	618 va[1] = m + 1;
	619 s += j;
	620 break;
	621 }
	622 }
	623 if (m >= 12)
	624 return -1; /* no month found */
	625 for (; ISSPACE((unsigned char)*s); s++)
	626 ;
	627 for (v = 0, i = 0; i < 4 && ISDIGIT((unsigned char)*s); …
	628 v = (v * 10) + (*s - '0');
	629 /* obsolete short year: RFC 2822 4.3 */
	630 if (i == 2 \|\| i == 3)
	631 v += (i == 2 && v >= 0 && v <= 49) ? 2000 : 1900;
	632 va[0] = v; /* year */
	633 for (; ISSPACE((unsigned char)*s); s++)
	634 ;
	635 /* parse only regular time part, see below */
	636 vi = 3;
	637 }
	638
	639 /* parse time parts (and possibly remaining date parts) */
	640 for (; *s && vi < 6; vi++) {
	641 for (i = 0, v = 0; i < ((vi == 0) ? 4 : 2) &&
	642 ISDIGIT((unsigned char)*s); s++, i++)…
	643 v = (v * 10) + (*s - '0');
	644 }
	645 va[vi] = v;
	646
	647 if ((vi < 2 && (s == '-' \|\| s == '/')) \|\|
	648 (vi == 2 && (s == 'T' \|\| s == 't' \|\| ISSPACE((unsi…
	649 (vi > 2 && *s == ':'))
	650 s++;
	651 }
	652
	653 /* skip milliseconds in for example: "%Y-%m-%dT%H:%M:%S.000Z" */
	654 if (s == '.' \|\| s == ',') {
	655 for (s++; ISDIGIT((unsigned char)*s); s++)
	656 ;
	657 }
	658
	659 /* invalid range */
	660 if (va[0] < 0 \|\| va[0] > 9999 \|\|
	661 va[1] < 1 \|\| va[1] > 12 \|\|
	662 va[2] < 1 \|\| va[2] > 31 \|\|
	663 va[3] < 0 \|\| va[3] > 23 \|\|
	664 va[4] < 0 \|\| va[4] > 59 \|\|
	665 va[5] < 0 \|\| va[5] > 60) /* allow leap second */
	666 return -1;
	667
	668 *tp = datetounix(va[0] - 1900, va[1] - 1, va[2], va[3], va[4], v…
	669 gettzoffset(s);
	670
	671 return 0;
	672 }
	673
	674 static void
	675 printfields(void)
	676 {
	677 string_print_timestamp(&ctx.fields[FeedFieldTime].str);
	678 putchar(FieldSeparator);
	679 string_print_trimmed(&ctx.fields[FeedFieldTitle].str);
	680 putchar(FieldSeparator);
	681 string_print_uri(&ctx.fields[FeedFieldLink].str);
	682 putchar(FieldSeparator);
	683 string_print_encoded(&ctx.fields[FeedFieldContent].str);
	684 putchar(FieldSeparator);
	685 fputs(contenttypes[ctx.contenttype], stdout);
	686 putchar(FieldSeparator);
	687 string_print_trimmed(&ctx.fields[FeedFieldId].str);
	688 putchar(FieldSeparator);
	689 string_print_trimmed(&ctx.fields[FeedFieldAuthor].str);
	690 putchar(FieldSeparator);
	691 string_print_uri(&ctx.fields[FeedFieldEnclosure].str);
	692 putchar(FieldSeparator);
	693 string_print_trimmed_multi(&ctx.fields[FeedFieldCategory].str);
	694 putchar('\n');
	695
	696 if (ferror(stdout)) /* check for errors but do not flush */
	697 checkfileerror(stdout, "<stdout>", 'w');
	698 }
	699
	/* Compare two tag names: non-zero if the lengths are equal and the
	 * NUL-terminated names match case-insensitively, 0 otherwise. */
	static int
	istag(const char *name, size_t len, const char *name2, size_t len2)
	{
		return (len == len2 && !strcasecmp(name, name2));
	}
	705
	/* Compare two attribute names (or values): non-zero if the lengths are
	 * equal and the NUL-terminated strings match case-insensitively,
	 * 0 otherwise. */
	static int
	isattr(const char *name, size_t len, const char *name2, size_t len2)
	{
		return (len == len2 && !strcasecmp(name, name2));
	}
	711
	712 static void
	713 xmlattr(XMLParser p, const char t, size_t tl, const char *n, size_t nl,
	714 const char *v, size_t vl)
	715 {
	716 /* handles transforming inline XML to data */
	717 if (ISINCONTENT(ctx)) {
	718 if (ctx.contenttype == ContentTypeHTML)
	719 xmldata(p, v, vl);
	720 return;
	721 }
	722
	723 if (!ctx.tag.id)
	724 return;
	725
	726 /* content-type may be for Atom: text, xhtml, html or a mime-typ…
	727 * for MRSS (media:description): plain, html. */
	728 if (ISCONTENTTAG(ctx)) {
	729 if (isattr(n, nl, STRP("type")))
	730 string_append(&attrtype, v, vl);
	731 return;
	732 }
	733
	734 if (ctx.feedtype == FeedTypeRSS) {
	735 if (ctx.tag.id == RSSTagEnclosure &&
	736 isattr(n, nl, STRP("url"))) {
	737 string_append(&tmpstr, v, vl);
	738 } else if (ctx.tag.id == RSSTagGuid &&
	739 isattr(n, nl, STRP("ispermalink"))) {
	740 string_append(&attrispermalink, v, vl);
	741 }
	742 } else if (ctx.feedtype == FeedTypeAtom) {
	743 if (ctx.tag.id == AtomTagLink) {
	744 if (isattr(n, nl, STRP("rel"))) {
	745 string_append(&attrrel, v, vl);
	746 } else if (isattr(n, nl, STRP("href"))) {
	747 string_append(&tmpstr, v, vl);
	748 }
	749 } else if (ctx.tag.id == AtomTagCategory &&
	750 isattr(n, nl, STRP("term"))) {
	751 string_append(&tmpstr, v, vl);
	752 }
	753 }
	754 }
	755
	756 static void
	757 xmlattrentity(XMLParser p, const char t, size_t tl, const char *n, siz…
	758 const char *data, size_t datalen)
	759 {
	760 char buf[8];
	761 int len;
	762
	763 /* handles transforming inline XML to data */
	764 if (ISINCONTENT(ctx)) {
	765 if (ctx.contenttype == ContentTypeHTML)
	766 xmldata(p, data, datalen);
	767 return;
	768 }
	769
	770 if (!ctx.tag.id)
	771 return;
	772
	773 /* try to translate entity, else just pass as data to
	774 * xmlattr handler. */
	775 if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
	776 xmlattr(p, t, tl, n, nl, buf, (size_t)len);
	777 else
	778 xmlattr(p, t, tl, n, nl, data, datalen);
	779 }
	780
	781 static void
	782 xmlattrend(XMLParser p, const char t, size_t tl, const char *n, size_t…
	783 {
	784 if (ISINCONTENT(ctx)) {
	785 if (ctx.contenttype == ContentTypeHTML) {
	786 /* handles transforming inline XML to data */
	787 xmldata(p, "\"", 1);
	788 ctx.attrcount = 0;
	789 }
	790 return;
	791 }
	792 }
	793
	794 static void
	795 xmlattrstart(XMLParser p, const char t, size_t tl, const char *n, size…
	796 {
	797 if (ISINCONTENT(ctx)) {
	798 if (ctx.contenttype == ContentTypeHTML) {
	799 /* handles transforming inline XML to data */
	800 if (!ctx.attrcount)
	801 xmldata(p, " ", 1);
	802 ctx.attrcount++;
	803 xmldata(p, n, nl);
	804 xmldata(p, "=\"", 2);
	805 }
	806 return;
	807 }
	808
	809 if (attrispermalink.len && isattr(n, nl, STRP("ispermalink")))
	810 string_clear(&attrispermalink);
	811 else if (attrrel.len && isattr(n, nl, STRP("rel")))
	812 string_clear(&attrrel);
	813 else if (attrtype.len && isattr(n, nl, STRP("type")))
	814 string_clear(&attrtype);
	815 else if (tmpstr.len &&
	816 (isattr(n, nl, STRP("href")) \|\|
	817 isattr(n, nl, STRP("term")) \|\|
	818 isattr(n, nl, STRP("url"))))
	819 string_clear(&tmpstr); /* use the last value for multipl…
	820 }
	821
	822 static void
	823 xmldata(XMLParser p, const char s, size_t len)
	824 {
	825 if (!ctx.field)
	826 return;
	827
	828 if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
	829 string_append(&tmpstr, s, len);
	830 else
	831 string_append(ctx.field, s, len);
	832 }
	833
	834 static void
	835 xmldataentity(XMLParser p, const char data, size_t datalen)
	836 {
	837 char buf[8];
	838 int len;
	839
	840 if (!ctx.field)
	841 return;
	842
	843 /* try to translate entity, else just pass as data to
	844 * xmldata handler. */
	845 if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
	846 xmldata(p, buf, (size_t)len);
	847 else
	848 xmldata(p, data, datalen);
	849 }
	850
	851 static void
	852 xmltagstart(XMLParser p, const char t, size_t tl)
	853 {
	854 const FeedTag *f;
	855
	856 if (ISINCONTENT(ctx)) {
	857 if (ctx.contenttype == ContentTypeHTML) {
	858 ctx.attrcount = 0;
	859 xmldata(p, "<", 1);
	860 xmldata(p, t, tl);
	861 }
	862 return;
	863 }
	864
	865 /* start of RSS or Atom item / entry */
	866 if (ctx.feedtype == FeedTypeNone) {
	867 if (istag(t, tl, STRP("entry")))
	868 ctx.feedtype = FeedTypeAtom;
	869 else if (istag(t, tl, STRP("item")))
	870 ctx.feedtype = FeedTypeRSS;
	871 return;
	872 }
	873
	874 /* field tagid already set or nested tags. */
	875 if (ctx.tag.id) {
	876 /* nested <author><name> for Atom */
	877 if (ctx.tag.id == AtomTagAuthor &&
	878 istag(t, tl, STRP("name"))) {
	879 memcpy(&(ctx.tag), &atomtagauthorname, sizeof(ct…
	880 } else {
	881 return; /* other nested tags are not allowed: re…
	882 }
	883 }
	884
	885 /* in item */
	886 if (ctx.tag.id == TagUnknown) {
	887 if (!(f = gettag(ctx.feedtype, t, tl)))
	888 f = &notag;
	889 memcpy(&(ctx.tag), f, sizeof(ctx.tag));
	890 }
	891
	892 ctx.iscontenttag = (fieldmap[ctx.tag.id] == FeedFieldContent);
	893 string_clear(&attrispermalink);
	894 string_clear(&attrrel);
	895 string_clear(&attrtype);
	896 }
	897
	898 static void
	899 xmltagstartparsed(XMLParser p, const char t, size_t tl, int isshort)
	900 {
	901 enum TagId tagid;
	902
	903 if (ISINCONTENT(ctx)) {
	904 if (ctx.contenttype == ContentTypeHTML) {
	905 if (isshort)
	906 xmldata(p, "/>", 2);
	907 else
	908 xmldata(p, ">", 1);
	909 }
	910 return;
	911 }
	912
	913 /* set tag type based on its attribute value */
	914 if (ctx.tag.id == RSSTagGuid) {
	915 /* if empty the default is "true" */
	916 if (!attrispermalink.len \|\|
	917 isattr(attrispermalink.data, attrispermalink.len, ST…
	918 ctx.tag.id = RSSTagGuidPermalinkTrue;
	919 else
	920 ctx.tag.id = RSSTagGuidPermalinkFalse;
	921 } else if (ctx.tag.id == AtomTagLink) {
	922 /* empty or "alternate": other types could be
	923 * "enclosure", "related", "self" or "via" */
	924 if (!attrrel.len \|\| isattr(attrrel.data, attrrel.len, ST…
	925 ctx.tag.id = AtomTagLinkAlternate;
	926 else if (isattr(attrrel.data, attrrel.len, STRP("enclosu…
	927 ctx.tag.id = AtomTagLinkEnclosure;
	928 else
	929 ctx.tag.id = AtomTagLink; /* unknown */
	930 }
	931
	932 tagid = ctx.tag.id;
	933
	934 /* map tag type to field: unknown or lesser priority is ignored,
	935 * when tags of the same type are repeated only the first is use…
	936 if (fieldmap[tagid] == -1 \|\|
	937 (!ISFEEDFIELDMULTI(fieldmap[tagid]) &&
	938 tagid <= ctx.fields[fieldmap[tagid]].tagid)) {
	939 return;
	940 }
	941
	942 if (ctx.iscontenttag) {
	943 ctx.iscontent = 1;
	944 ctx.iscontenttag = 0;
	945
	946 /* detect content-type based on type attribute */
	947 if (attrtype.len) {
	948 if (isattr(attrtype.data, attrtype.len, STRP("ht…
	949 isattr(attrtype.data, attrtype.len, STRP("xh…
	950 isattr(attrtype.data, attrtype.len, STRP("te…
	951 isattr(attrtype.data, attrtype.len, STRP("te…
	952 isattr(attrtype.data, attrtype.len, STRP("ap…
	953 ctx.contenttype = ContentTypeHTML;
	954 else /* unknown: handle as base64 text data */
	955 ctx.contenttype = ContentTypePlain;
	956 } else {
	957 /* default content-type */
	958 if (tagid == RSSTagContentEncoded \|\| tagid == RS…
	959 ctx.contenttype = ContentTypeHTML;
	960 else
	961 ctx.contenttype = ContentTypePlain;
	962 }
	963 }
	964
	965 ctx.field = &(ctx.fields[fieldmap[tagid]].str);
	966 ctx.fields[fieldmap[tagid]].tagid = tagid;
	967
	968 /* clear field if it is overwritten (with a priority order) for …
	969 * value, if the field can have multiple values then do not clea…
	970 if (!ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
	971 string_clear(ctx.field);
	972 }
	973
	974 static void
	975 xmltagend(XMLParser p, const char t, size_t tl, int isshort)
	976 {
	977 size_t i;
	978
	979 if (ctx.feedtype == FeedTypeNone)
	980 return;
	981
	982 if (ISINCONTENT(ctx)) {
	983 /* not a closed content field */
	984 if (!istag(ctx.tag.name, ctx.tag.len, t, tl)) {
	985 if (!isshort && ctx.contenttype == ContentTypeHT…
	986 xmldata(p, "</", 2);
	987 xmldata(p, t, tl);
	988 xmldata(p, ">", 1);
	989 }
	990 return;
	991 }
	992 } else if (ctx.tag.id && istag(ctx.tag.name, ctx.tag.len, t, tl)…
	993 /* matched tag end: close it.
	994 * copy also to the link field if the attribute isPermaL…
	995 * and it is not set by a tag with higher priority. */
	996 if (ctx.tag.id == RSSTagGuidPermalinkTrue && ctx.field &&
	997 ctx.tag.id > ctx.fields[FeedFieldLink].tagid) {
	998 string_clear(&ctx.fields[FeedFieldLink].str);
	999 string_append(&ctx.fields[FeedFieldLink].str,
	1000 ctx.field->data, ctx.field->len);
	1001 ctx.fields[FeedFieldLink].tagid = ctx.tag.id;
	1002 }
	1003 } else if (!ctx.tag.id && ((ctx.feedtype == FeedTypeAtom &&
	1004 istag(t, tl, STRP("entry"))) \|\| /* Atom */
	1005 (ctx.feedtype == FeedTypeRSS &&
	1006 istag(t, tl, STRP("item"))))) /* RSS */
	1007 {
	1008 /* end of RSS or Atom entry / item */
	1009 printfields();
	1010
	1011 /* clear strings */
	1012 for (i = 0; i < FeedFieldLast; i++) {
	1013 string_clear(&ctx.fields[i].str);
	1014 ctx.fields[i].tagid = TagUnknown;
	1015 }
	1016 ctx.contenttype = ContentTypeNone;
	1017 /* allow parsing of Atom and RSS concatenated in one XML…
	1018 ctx.feedtype = FeedTypeNone;
	1019 } else {
	1020 return; /* not end of field */
	1021 }
	1022
	1023 /* temporary string: for fields that cannot be processed
	1024 * directly and need more context, for example by its tag
	1025 * attributes, like the Atom link rel="alternate\|enclosure". */
	1026 if (tmpstr.len && ctx.field) {
	1027 if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) {
	1028 if (ctx.field->len)
	1029 string_append(ctx.field, FieldMultiSepar…
	1030 string_append(ctx.field, tmpstr.data, tmpstr.len…
	1031 } else {
	1032 string_clear(ctx.field);
	1033 string_append(ctx.field, tmpstr.data, tmpstr.len…
	1034 }
	1035 }
	1036
	1037 /* close field */
	1038 string_clear(&tmpstr); /* reuse and clear temporary string */
	1039
	1040 if (ctx.tag.id == AtomTagAuthorName)
	1041 memcpy(&(ctx.tag), &atomtagauthor, sizeof(ctx.tag)); /* …
	1042 else
	1043 memcpy(&(ctx.tag), &notag, sizeof(ctx.tag));
	1044
	1045 ctx.iscontent = 0;
	1046 ctx.field = NULL;
	1047 }
	1048
	1049 int
	1050 main(int argc, char *argv[])
	1051 {
	1052 if (pledge("stdio", NULL) == -1)
	1053 err(1, "pledge");
	1054
	1055 if (argc > 1) {
	1056 if (uri_parse(argv[1], &baseuri) != -1 && baseuri.proto[…
	1057 baseurl = argv[1];
	1058 else
	1059 errx(1, "baseurl incorrect or too long");
	1060 }
	1061
	1062 memcpy(&(ctx.tag), &notag, sizeof(ctx.tag));
	1063
	1064 parser.xmlattr = xmlattr;
	1065 parser.xmlattrentity = xmlattrentity;
	1066 parser.xmlattrend = xmlattrend;
	1067 parser.xmlattrstart = xmlattrstart;
	1068 parser.xmlcdata = xmldata;
	1069 parser.xmldata = xmldata;
	1070 parser.xmldataentity = xmldataentity;
	1071 parser.xmltagend = xmltagend;
	1072 parser.xmltagstart = xmltagstart;
	1073 parser.xmltagstartparsed = xmltagstartparsed;
	1074
	1075 /* NOTE: GETNEXT is defined in xml.h for inline optimization */
	1076 xml_parse(&parser);
	1077
	1078 checkfileerror(stdin, "<stdin>", 'r');
	1079 checkfileerror(stdout, "<stdout>", 'w');
	1080
	1081 return 0;
	1082 }