sfeed

simple feed reader - forked from git.codemadness.org/sfeed
git clone git://src.gearsix.net/sfeed
Log | Files | Refs | Atom | README | LICENSE

sfeed.c (29354B)


      1 #include <errno.h>
      2 #include <stdint.h>
      3 #include <stdio.h>
      4 #include <stdlib.h>
      5 #include <string.h>
      6 #include <strings.h>
      7 
      8 #include "util.h"
      9 #include "xml.h"
     10 
/* True when the context is flagged as being in content data and not in a
 * content tag (see the `iscontent` / `iscontenttag` members). */
#define ISINCONTENT(ctx)  ((ctx).iscontent && !((ctx).iscontenttag))
/* True when the context is flagged as being in a content tag and not (yet)
 * in its content data. */
#define ISCONTENTTAG(ctx) (!((ctx).iscontent) && (ctx).iscontenttag)

/* these feed fields support multiple separated values */
#define ISFEEDFIELDMULTI(t) ((t) == FeedFieldCategory)

/* string and byte-length: expands to TWO comma-separated items, the argument
 * itself and sizeof(argument) - 1, so it is only meaningful for string
 * literals / arrays and never for a plain pointer. */
#define STRP(s)           s,sizeof(s)-1
     19 
     20 enum FeedType {
     21 	FeedTypeNone = 0,
     22 	FeedTypeRSS  = 1,
     23 	FeedTypeAtom = 2
     24 };
     25 
     26 enum ContentType {
     27 	ContentTypeNone  = 0,
     28 	ContentTypePlain = 1,
     29 	ContentTypeHTML  = 2
     30 };
     31 static const char *contenttypes[] = { "", "plain", "html" };
     32 
     33 /* String data / memory pool */
     34 typedef struct string {
     35 	char   *data;   /* data */
     36 	size_t  len;    /* string length */
     37 	size_t  bufsiz; /* allocated size */
     38 } String;
     39 
     40 /* NOTE: the order of these fields (content, date, author) indicate the
     41  *       priority to use them, from least important to high. */
     42 enum TagId {
     43 	TagUnknown = 0,
     44 	/* RSS */
     45 	RSSTagDcdate, RSSTagPubdate, /* creation date has higher priority */
     46 	RSSTagTitle,
     47 	RSSTagMediaDescription, RSSTagDescription, RSSTagContentEncoded,
     48 	RSSTagGuid,
     49 	RSSTagGuidPermalinkFalse,
     50 	RSSTagGuidPermalinkTrue,
     51 	/* must be defined after GUID, because it can be a link (isPermaLink) */
     52 	RSSTagLink,
     53 	RSSTagEnclosure,
     54 	RSSTagAuthor, RSSTagDccreator,
     55 	RSSTagCategory,
     56 	/* Atom */
     57 	/* creation date has higher priority */
     58 	AtomTagModified, AtomTagUpdated, AtomTagIssued, AtomTagPublished,
     59 	AtomTagTitle,
     60 	AtomTagMediaDescription, AtomTagSummary, AtomTagContent,
     61 	AtomTagId,
     62 	AtomTagLink,
     63 	AtomTagLinkAlternate,
     64 	AtomTagLinkEnclosure,
     65 	AtomTagAuthor, AtomTagAuthorName,
     66 	AtomTagCategory,
     67 	TagLast
     68 };
     69 
     70 typedef struct feedtag {
     71 	char       *name; /* name of tag to match */
     72 	size_t      len;  /* len of `name` */
     73 	enum TagId  id;   /* unique ID */
     74 } FeedTag;
     75 
     76 typedef struct field {
     77 	String     str;
     78 	enum TagId tagid; /* tagid set previously, used for tag priority */
     79 } FeedField;
     80 
     81 enum {
     82 	FeedFieldTime = 0, FeedFieldTitle, FeedFieldLink, FeedFieldContent,
     83 	FeedFieldId, FeedFieldAuthor, FeedFieldEnclosure, FeedFieldCategory,
     84 	FeedFieldLast
     85 };
     86 
     87 typedef struct feedcontext {
     88 	String          *field;        /* current FeedItem field String */
     89 	FeedField        fields[FeedFieldLast]; /* data for current item */
     90 	FeedTag          tag;          /* unique current parsed tag */
     91 	int              iscontent;    /* in content data */
     92 	int              iscontenttag; /* in content tag */
     93 	enum ContentType contenttype;  /* content-type for item */
     94 	enum FeedType    feedtype;
     95 	int              attrcount;    /* count item HTML element attributes */
     96 } FeedContext;
     97 
     98 static long long datetounix(long long, int, int, int, int, int);
     99 static FeedTag * gettag(enum FeedType, const char *, size_t);
    100 static long gettzoffset(const char *);
    101 static int  isattr(const char *, size_t, const char *, size_t);
    102 static int  istag(const char *, size_t, const char *, size_t);
    103 static int  parsetime(const char *, long long *);
    104 static void printfields(void);
    105 static void string_append(String *, const char *, size_t);
    106 static void string_buffer_realloc(String *, size_t);
    107 static void string_clear(String *);
    108 static void string_print_encoded(String *);
    109 static void string_print_timestamp(String *);
    110 static void string_print_trimmed(String *);
    111 static void string_print_trimmed_multi(String *);
    112 static void string_print_uri(String *);
    113 static void xmlattr(XMLParser *, const char *, size_t, const char *, size_t,
    114                     const char *, size_t);
    115 static void xmlattrentity(XMLParser *, const char *, size_t, const char *,
    116                           size_t, const char *, size_t);
    117 static void xmlattrend(XMLParser *, const char *, size_t, const char *,
    118                        size_t);
    119 static void xmlattrstart(XMLParser *, const char *, size_t, const char *,
    120                          size_t);
    121 static void xmldata(XMLParser *, const char *, size_t);
    122 static void xmldataentity(XMLParser *, const char *, size_t);
    123 static void xmltagend(XMLParser *, const char *, size_t, int);
    124 static void xmltagstart(XMLParser *, const char *, size_t);
    125 static void xmltagstartparsed(XMLParser *, const char *, size_t, int);
    126 
/* map tag name to TagId type */
/* RSS, must be alphabetical order: gettag() looks names up in this table with
 * bsearch(3) and a case-insensitive compare, which requires sorted entries. */
static const FeedTag rsstags[] = {
	{ STRP("author"),            RSSTagAuthor            },
	{ STRP("category"),          RSSTagCategory          },
	{ STRP("content:encoded"),   RSSTagContentEncoded    },
	{ STRP("dc:creator"),        RSSTagDccreator         },
	{ STRP("dc:date"),           RSSTagDcdate            },
	{ STRP("description"),       RSSTagDescription       },
	/* RSS: <enclosure url="" />, Atom has <link rel="enclosure" /> */
	{ STRP("enclosure"),         RSSTagEnclosure         },
	{ STRP("guid"),              RSSTagGuid              },
	{ STRP("link"),              RSSTagLink              },
	{ STRP("media:description"), RSSTagMediaDescription  },
	{ STRP("pubdate"),           RSSTagPubdate           },
	{ STRP("title"),             RSSTagTitle             }
};
    144 
/* Atom, must be alphabetical order: gettag() looks names up in this table
 * with bsearch(3) and a case-insensitive compare, which requires sorted
 * entries. */
static const FeedTag atomtags[] = {
	{ STRP("author"),            AtomTagAuthor           },
	{ STRP("category"),          AtomTagCategory         },
	{ STRP("content"),           AtomTagContent          },
	{ STRP("id"),                AtomTagId               },
	{ STRP("issued"),            AtomTagIssued           }, /* Atom 0.3 */
	/* Atom: <link href="" />, RSS has <link></link> */
	{ STRP("link"),              AtomTagLink             },
	{ STRP("media:description"), AtomTagMediaDescription },
	{ STRP("modified"),          AtomTagModified         }, /* Atom 0.3 */
	{ STRP("published"),         AtomTagPublished        },
	{ STRP("summary"),           AtomTagSummary          },
	{ STRP("title"),             AtomTagTitle            },
	{ STRP("updated"),           AtomTagUpdated          }
};
    161 
/* special case: nested <author><name>; xmltagstart() switches the current tag
 * to `atomtagauthorname` when a "name" tag starts inside an Atom author tag */
static const FeedTag atomtagauthor = { STRP("author"), AtomTagAuthor };
static const FeedTag atomtagauthorname = { STRP("name"), AtomTagAuthorName };

/* reference to no / unknown tag: used as the current tag when gettag() does
 * not find the name */
static const FeedTag notag = { STRP(""), TagUnknown };
    168 
/* map TagId type to RSS/Atom field, all tags must be defined.
 * The value is an index into FeedContext.fields (FeedField*), or -1 for tags
 * that do not store data in any field by themselves. */
static const int fieldmap[TagLast] = {
	[TagUnknown]               = -1,
	/* RSS */
	[RSSTagDcdate]             = FeedFieldTime,
	[RSSTagPubdate]            = FeedFieldTime,
	[RSSTagTitle]              = FeedFieldTitle,
	[RSSTagMediaDescription]   = FeedFieldContent,
	[RSSTagDescription]        = FeedFieldContent,
	[RSSTagContentEncoded]     = FeedFieldContent,
	[RSSTagGuid]               = -1, /* resolved to a Permalink* ID once attributes are parsed */
	[RSSTagGuidPermalinkFalse] = FeedFieldId,
	[RSSTagGuidPermalinkTrue]  = FeedFieldId, /* special-case: both a link and an id */
	[RSSTagLink]               = FeedFieldLink,
	[RSSTagEnclosure]          = FeedFieldEnclosure,
	[RSSTagAuthor]             = FeedFieldAuthor,
	[RSSTagDccreator]          = FeedFieldAuthor,
	[RSSTagCategory]           = FeedFieldCategory,
	/* Atom */
	[AtomTagModified]          = FeedFieldTime,
	[AtomTagUpdated]           = FeedFieldTime,
	[AtomTagIssued]            = FeedFieldTime,
	[AtomTagPublished]         = FeedFieldTime,
	[AtomTagTitle]             = FeedFieldTitle,
	[AtomTagMediaDescription]  = FeedFieldContent,
	[AtomTagSummary]           = FeedFieldContent,
	[AtomTagContent]           = FeedFieldContent,
	[AtomTagId]                = FeedFieldId,
	[AtomTagLink]              = -1, /* resolved to Alternate/Enclosure by its rel attribute */
	[AtomTagLinkAlternate]     = FeedFieldLink,
	[AtomTagLinkEnclosure]     = FeedFieldEnclosure,
	[AtomTagAuthor]            = -1, /* data comes from the nested <name> tag */
	[AtomTagAuthorName]        = FeedFieldAuthor,
	[AtomTagCategory]          = FeedFieldCategory
};
    204 
    205 static const int FieldSeparator = '\t';
    206 /* separator for multiple values in a field, separator should be 1 byte */
    207 static const char FieldMultiSeparator[] = "|";
    208 static struct uri baseuri;
    209 static const char *baseurl;
    210 
    211 static FeedContext ctx;
    212 static XMLParser parser; /* XML parser state */
    213 static String attrispermalink, attrrel, attrtype, tmpstr;
    214 
    215 static int
    216 tagcmp(const void *v1, const void *v2)
    217 {
    218 	return strcasecmp(((FeedTag *)v1)->name, ((FeedTag *)v2)->name);
    219 }
    220 
    221 /* Unique tagid for parsed tag name. */
    222 static FeedTag *
    223 gettag(enum FeedType feedtype, const char *name, size_t namelen)
    224 {
    225 	FeedTag f, *r = NULL;
    226 
    227 	f.name = (char *)name;
    228 
    229 	switch (feedtype) {
    230 	case FeedTypeRSS:
    231 		r = bsearch(&f, rsstags, sizeof(rsstags) / sizeof(rsstags[0]),
    232 		        sizeof(rsstags[0]), tagcmp);
    233 		break;
    234 	case FeedTypeAtom:
    235 		r = bsearch(&f, atomtags, sizeof(atomtags) / sizeof(atomtags[0]),
    236 		        sizeof(atomtags[0]), tagcmp);
    237 		break;
    238 	default:
    239 		break;
    240 	}
    241 
    242 	return r;
    243 }
    244 
    245 static char *
    246 ltrim(const char *s)
    247 {
    248 	for (; ISSPACE((unsigned char)*s); s++)
    249 		;
    250 	return (char *)s;
    251 }
    252 
    253 static char *
    254 rtrim(const char *s)
    255 {
    256 	const char *e;
    257 
    258 	for (e = s + strlen(s); e > s && ISSPACE((unsigned char)*(e - 1)); e--)
    259 		;
    260 	return (char *)e;
    261 }
    262 
    263 /* Clear string only; don't free, prevents unnecessary reallocation. */
    264 static void
    265 string_clear(String *s)
    266 {
    267 	if (s->data)
    268 		s->data[0] = '\0';
    269 	s->len = 0;
    270 }
    271 
    272 static void
    273 string_buffer_realloc(String *s, size_t newlen)
    274 {
    275 	size_t alloclen;
    276 
    277 	if (newlen > SIZE_MAX / 2) {
    278 		alloclen = SIZE_MAX;
    279 	} else {
    280 		for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
    281 			;
    282 	}
    283 	if (!(s->data = realloc(s->data, alloclen)))
    284 		err(1, "realloc");
    285 	s->bufsiz = alloclen;
    286 }
    287 
    288 /* Append data to String, s->data and data may not overlap. */
    289 static void
    290 string_append(String *s, const char *data, size_t len)
    291 {
    292 	if (!len)
    293 		return;
    294 
    295 	if (s->len >= SIZE_MAX - len) {
    296 		errno = EOVERFLOW;
    297 		err(1, "realloc");
    298 	}
    299 
    300 	/* check if allocation is necessary, never shrink the buffer. */
    301 	if (s->len + len >= s->bufsiz)
    302 		string_buffer_realloc(s, s->len + len + 1);
    303 	memcpy(s->data + s->len, data, len);
    304 	s->len += len;
    305 	s->data[s->len] = '\0';
    306 }
    307 
    308 /* Print text, encode TABs, newlines and '\', remove other whitespace.
    309  * Remove leading and trailing whitespace. */
    310 static void
    311 string_print_encoded(String *s)
    312 {
    313 	const char *p, *e;
    314 
    315 	if (!s->data || !s->len)
    316 		return;
    317 
    318 	p = ltrim(s->data);
    319 	e = rtrim(p);
    320 
    321 	for (; *p && p != e; p++) {
    322 		switch (*p) {
    323 		case '\n': putchar('\\'); putchar('n'); break;
    324 		case '\\': putchar('\\'); putchar('\\'); break;
    325 		case '\t': putchar('\\'); putchar('t'); break;
    326 		default:
    327 			/* ignore control chars */
    328 			if (!ISCNTRL((unsigned char)*p))
    329 				putchar(*p);
    330 			break;
    331 		}
    332 	}
    333 }
    334 
    335 static void
    336 printtrimmed(const char *s)
    337 {
    338 	char *p, *e;
    339 
    340 	p = ltrim(s);
    341 	e = rtrim(p);
    342 	for (; *p && p != e; p++) {
    343 		if (ISSPACE((unsigned char)*p))
    344 			putchar(' '); /* any whitespace to space */
    345 		else if (!ISCNTRL((unsigned char)*p))
    346 			/* ignore other control chars */
    347 			putchar(*p);
    348 	}
    349 }
    350 
    351 /* Print text, replace TABs, carriage return and other whitespace with ' '.
    352  * Other control chars are removed. Remove leading and trailing whitespace. */
    353 static void
    354 string_print_trimmed(String *s)
    355 {
    356 	if (!s->data || !s->len)
    357 		return;
    358 
    359 	printtrimmed(s->data);
    360 }
    361 
    362 /* Print each field with trimmed whitespace, separated by '|'. */
    363 static void
    364 string_print_trimmed_multi(String *s)
    365 {
    366 	char *p, *e;
    367 	int c;
    368 
    369 	if (!s->data || !s->len)
    370 		return;
    371 
    372 	for (p = s->data; ; p = e + 1) {
    373 		if ((e = strstr(p, FieldMultiSeparator))) {
    374 			c = *e;
    375 			*e = '\0';
    376 			printtrimmed(p);
    377 			*e = c; /* restore NUL byte to original character */
    378 			fputs(FieldMultiSeparator, stdout);
    379 		} else {
    380 			printtrimmed(p);
    381 			break;
    382 		}
    383 	}
    384 }
    385 
    386 /* Print URL, if it's a relative URL then it uses the global `baseurl`. */
    387 static void
    388 printuri(char *s)
    389 {
    390 	char link[4096], *p, *e;
    391 	struct uri newuri, olduri;
    392 	int c, r = -1;
    393 
    394 	p = ltrim(s);
    395 	e = rtrim(p);
    396 	c = *e;
    397 	*e = '\0';
    398 
    399 	if (baseurl && !uri_hasscheme(p) &&
    400 	    uri_parse(p, &olduri) != -1 && !olduri.proto[0] &&
    401 	    uri_makeabs(&newuri, &olduri, &baseuri) != -1 && newuri.proto[0])
    402 		r = uri_format(link, sizeof(link), &newuri);
    403 
    404 	if (r >= 0 && (size_t)r < sizeof(link))
    405 		printtrimmed(link);
    406 	else
    407 		printtrimmed(p);
    408 
    409 	*e = c; /* restore NUL byte to original character */
    410 }
    411 
    412 /* Print URL, if it's a relative URL then it uses the global `baseurl`. */
    413 static void
    414 string_print_uri(String *s)
    415 {
    416 	if (!s->data || !s->len)
    417 		return;
    418 
    419 	printuri(s->data);
    420 }
    421 
    422 /* Print as UNIX timestamp, print nothing if the time is empty or invalid. */
    423 static void
    424 string_print_timestamp(String *s)
    425 {
    426 	long long t;
    427 
    428 	if (!s->data || !s->len)
    429 		return;
    430 
    431 	if (parsetime(s->data, &t) != -1)
    432 		printf("%lld", t);
    433 }
    434 
    435 /* Convert time fields. Returns a UNIX timestamp. */
    436 static long long
    437 datetounix(long long year, int mon, int day, int hour, int min, int sec)
    438 {
    439 	static const int secs_through_month[] = {
    440 		0, 31 * 86400, 59 * 86400, 90 * 86400,
    441 		120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400,
    442 		243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 };
    443 	int is_leap = 0, cycles, centuries = 0, leaps = 0, rem;
    444 	long long t;
    445 
    446 	if (year - 2ULL <= 136) {
    447 		leaps = (year - 68) >> 2;
    448 		if (!((year - 68) & 3)) {
    449 			leaps--;
    450 			is_leap = 1;
    451 		} else {
    452 			is_leap = 0;
    453 		}
    454 		t = 31536000 * (year - 70) + 86400 * leaps;
    455 	} else {
    456 		cycles = (year - 100) / 400;
    457 		rem = (year - 100) % 400;
    458 		if (rem < 0) {
    459 			cycles--;
    460 			rem += 400;
    461 		}
    462 		if (!rem) {
    463 			is_leap = 1;
    464 		} else {
    465 			if (rem >= 300)
    466 				centuries = 3, rem -= 300;
    467 			else if (rem >= 200)
    468 				centuries = 2, rem -= 200;
    469 			else if (rem >= 100)
    470 				centuries = 1, rem -= 100;
    471 			if (rem) {
    472 				leaps = rem / 4U;
    473 				rem %= 4U;
    474 				is_leap = !rem;
    475 			}
    476 		}
    477 		leaps += 97 * cycles + 24 * centuries - is_leap;
    478 		t = (year - 100) * 31536000LL + leaps * 86400LL + 946684800 + 86400;
    479 	}
    480 	t += secs_through_month[mon];
    481 	if (is_leap && mon >= 2)
    482 		t += 86400;
    483 	t += 86400LL * (day - 1);
    484 	t += 3600LL * hour;
    485 	t += 60LL * min;
    486 	t += sec;
    487 
    488 	return t;
    489 }
    490 
    491 /* Get timezone from string, return time offset in seconds from UTC.
    492  * NOTE: only parses timezones in RFC-822, many other timezone names are
    493  * ambiguous anyway.
    494  * ANSI and military zones are defined wrong in RFC822 and are unsupported,
    495  * see note on RFC2822 4.3 page 32. */
    496 static long
    497 gettzoffset(const char *s)
    498 {
    499 	static const struct {
    500 		char *name;
    501 		int offhour;
    502 	} tzones[] = {
    503 		{ "CDT", -5 * 3600 },
    504 		{ "CST", -6 * 3600 },
    505 		{ "EDT", -4 * 3600 },
    506 		{ "EST", -5 * 3600 },
    507 		{ "MDT", -6 * 3600 },
    508 		{ "MST", -7 * 3600 },
    509 		{ "PDT", -7 * 3600 },
    510 		{ "PST", -8 * 3600 },
    511 	};
    512 	const char *p;
    513 	long tzhour = 0, tzmin = 0;
    514 	size_t i;
    515 
    516 	for (; ISSPACE((unsigned char)*s); s++)
    517 		;
    518 	switch (*s) {
    519 	case '-': /* offset */
    520 	case '+':
    521 		for (i = 0, p = s + 1; i < 2 && ISDIGIT((unsigned char)*p); i++, p++)
    522 			tzhour = (tzhour * 10) + (*p - '0');
    523 		if (*p == ':')
    524 			p++;
    525 		for (i = 0; i < 2 && ISDIGIT((unsigned char)*p); i++, p++)
    526 			tzmin = (tzmin * 10) + (*p - '0');
    527 		return ((tzhour * 3600) + (tzmin * 60)) * (s[0] == '-' ? -1 : 1);
    528 	default: /* timezone name */
    529 		for (i = 0; ISALPHA((unsigned char)s[i]); i++)
    530 			;
    531 		if (i != 3)
    532 			return 0;
    533 		/* compare timezone and adjust offset relative to UTC */
    534 		for (i = 0; i < sizeof(tzones) / sizeof(*tzones); i++) {
    535 			if (!memcmp(s, tzones[i].name, 3))
    536 				return tzones[i].offhour;
    537 		}
    538 	}
    539 	return 0;
    540 }
    541 
/* Parse time string `s` into the UNIX timestamp `tp`.
   Two families of formats are recognized:
   - starting with 4 digits: "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or
     "%Y%m%d%H%M%S"
   - otherwise: "[%a, ]%d %b %Y %H:%M:%S" with an abbreviated (3 letter) or
     full English month name
   Whatever follows the time (after optional fractional seconds) is passed to
   gettzoffset() and the resulting offset is subtracted.
   Returns 0 on success or -1 on failure; `tp` is only written on success. */
static int
parsetime(const char *s, long long *tp)
{
	/* full month names with their length; index + 1 is the month number */
	static const struct {
		char *name;
		int len;
	} mons[] = {
		{ STRP("January"),   },
		{ STRP("February"),  },
		{ STRP("March"),     },
		{ STRP("April"),     },
		{ STRP("May"),       },
		{ STRP("June"),      },
		{ STRP("July"),      },
		{ STRP("August"),    },
		{ STRP("September"), },
		{ STRP("October"),   },
		{ STRP("November"),  },
		{ STRP("December"),  },
	};
	/* va: year, month, day, hour, minute, second; vi: next index to parse */
	int va[6] = { 0 }, i, j, v, vi;
	size_t m;

	for (; ISSPACE((unsigned char)*s); s++)
		;
	if (!ISDIGIT((unsigned char)*s) && !ISALPHA((unsigned char)*s))
		return -1;

	if (ISDIGIT((unsigned char)s[0]) &&
	    ISDIGIT((unsigned char)s[1]) &&
	    ISDIGIT((unsigned char)s[2]) &&
	    ISDIGIT((unsigned char)s[3])) {
		/* formats "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or "%Y%m%d%H%M%S" */
		vi = 0;
	} else {
		/* format: "[%a, ]%d %b %Y %H:%M:%S" */
		/* parse "[%a, ]%d %b %Y " part, then use time parsing as above */
		/* skip the optional weekday name and the comma after it */
		for (; ISALPHA((unsigned char)*s); s++)
			;
		for (; ISSPACE((unsigned char)*s); s++)
			;
		if (*s == ',')
			s++;
		for (; ISSPACE((unsigned char)*s); s++)
			;
		for (v = 0, i = 0; i < 2 && ISDIGIT((unsigned char)*s); s++, i++)
			v = (v * 10) + (*s - '0');
		va[2] = v; /* day */
		for (; ISSPACE((unsigned char)*s); s++)
			;
		/* end of word month */
		for (j = 0; ISALPHA((unsigned char)s[j]); j++)
			;
		/* check month name */
		if (j < 3 || j > 9)
			return -1; /* month cannot match */
		for (m = 0; m < sizeof(mons) / sizeof(*mons); m++) {
			/* abbreviation (3 length) or long name */
			if ((j == 3 || j == mons[m].len) &&
			    !strncasecmp(mons[m].name, s, j)) {
				va[1] = m + 1;
				s += j;
				break;
			}
		}
		if (m >= 12)
			return -1; /* no month found */
		for (; ISSPACE((unsigned char)*s); s++)
			;
		for (v = 0, i = 0; i < 4 && ISDIGIT((unsigned char)*s); s++, i++)
			v = (v * 10) + (*s - '0');
		/* obsolete short year: RFC2822 4.3 */
		/* fewer than 4 digits: 0-49 is taken as 20xx, anything else has
		   1900 added */
		if (i <= 3)
			v += (v >= 0 && v <= 49) ? 2000 : 1900;
		va[0] = v; /* year */
		for (; ISSPACE((unsigned char)*s); s++)
			;
		/* parse only regular time part, see below */
		vi = 3;
	}

	/* parse time parts (and possibly remaining date parts) */
	for (; *s && vi < 6; vi++) {
		/* the year takes up to 4 digits, every other part up to 2 */
		for (i = 0, v = 0; i < ((vi == 0) ? 4 : 2) &&
		                   ISDIGIT((unsigned char)*s); s++, i++) {
			v = (v * 10) + (*s - '0');
		}
		va[vi] = v;

		/* skip one separator: '-' in the date, 'T' or whitespace between
		   date and time, ':' in the time */
		if ((vi < 2 && *s == '-') ||
		    (vi == 2 && (*s == 'T' || ISSPACE((unsigned char)*s))) ||
		    (vi > 2 && *s == ':'))
			s++;
	}

	/* skip milliseconds in for example: "%Y-%m-%dT%H:%M:%S.000Z" */
	if (*s == '.') {
		for (s++; ISDIGIT((unsigned char)*s); s++)
			;
	}

	/* invalid range */
	if (va[0] < 0 || va[0] > 9999 ||
	    va[1] < 1 || va[1] > 12 ||
	    va[2] < 1 || va[2] > 31 ||
	    va[3] < 0 || va[3] > 23 ||
	    va[4] < 0 || va[4] > 59 ||
	    va[5] < 0 || va[5] > 60) /* allow leap second */
		return -1;

	/* datetounix() takes years since 1900 and a 0-based month; `s` now
	   points at the remainder, which is parsed as the timezone */
	*tp = datetounix(va[0] - 1900, va[1] - 1, va[2], va[3], va[4], va[5]) -
	      gettzoffset(s);

	return 0;
}
    659 
    660 static void
    661 printfields(void)
    662 {
    663 	string_print_timestamp(&ctx.fields[FeedFieldTime].str);
    664 	putchar(FieldSeparator);
    665 	string_print_trimmed(&ctx.fields[FeedFieldTitle].str);
    666 	putchar(FieldSeparator);
    667 	string_print_uri(&ctx.fields[FeedFieldLink].str);
    668 	putchar(FieldSeparator);
    669 	string_print_encoded(&ctx.fields[FeedFieldContent].str);
    670 	putchar(FieldSeparator);
    671 	fputs(contenttypes[ctx.contenttype], stdout);
    672 	putchar(FieldSeparator);
    673 	string_print_trimmed(&ctx.fields[FeedFieldId].str);
    674 	putchar(FieldSeparator);
    675 	string_print_trimmed(&ctx.fields[FeedFieldAuthor].str);
    676 	putchar(FieldSeparator);
    677 	string_print_uri(&ctx.fields[FeedFieldEnclosure].str);
    678 	putchar(FieldSeparator);
    679 	string_print_trimmed_multi(&ctx.fields[FeedFieldCategory].str);
    680 	putchar('\n');
    681 
    682 	if (ferror(stdout)) /* check for errors but do not flush */
    683 		checkfileerror(stdout, "<stdout>", 'w');
    684 }
    685 
    686 static int
    687 istag(const char *name, size_t len, const char *name2, size_t len2)
    688 {
    689 	return (len == len2 && !strcasecmp(name, name2));
    690 }
    691 
    692 static int
    693 isattr(const char *name, size_t len, const char *name2, size_t len2)
    694 {
    695 	return (len == len2 && !strcasecmp(name, name2));
    696 }
    697 
    698 static void
    699 xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
    700 	const char *v, size_t vl)
    701 {
    702 	/* handles transforming inline XML to data */
    703 	if (ISINCONTENT(ctx)) {
    704 		if (ctx.contenttype == ContentTypeHTML)
    705 			xmldata(p, v, vl);
    706 		return;
    707 	}
    708 
    709 	if (!ctx.tag.id)
    710 		return;
    711 
    712 	/* content-type may be: Atom: text, xhtml, html or mime-type.
    713 	   MRSS (media:description): plain, html. */
    714 	if (ISCONTENTTAG(ctx)) {
    715 		if (isattr(n, nl, STRP("type")))
    716 			string_append(&attrtype, v, vl);
    717 		return;
    718 	}
    719 
    720 	if (ctx.feedtype == FeedTypeRSS) {
    721 		if (ctx.tag.id == RSSTagEnclosure &&
    722 		    isattr(n, nl, STRP("url"))) {
    723 			string_append(&tmpstr, v, vl);
    724 		} else if (ctx.tag.id == RSSTagGuid &&
    725 		           isattr(n, nl, STRP("ispermalink"))) {
    726 			string_append(&attrispermalink, v, vl);
    727 		}
    728 	} else if (ctx.feedtype == FeedTypeAtom) {
    729 		if (ctx.tag.id == AtomTagLink) {
    730 			if (isattr(n, nl, STRP("rel"))) {
    731 				string_append(&attrrel, v, vl);
    732 			} else if (isattr(n, nl, STRP("href"))) {
    733 				string_append(&tmpstr, v, vl);
    734 			}
    735 		} else if (ctx.tag.id == AtomTagCategory &&
    736 			   isattr(n, nl, STRP("term"))) {
    737 			string_append(&tmpstr, v, vl);
    738 		}
    739 	}
    740 }
    741 
    742 static void
    743 xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
    744               const char *data, size_t datalen)
    745 {
    746 	char buf[16];
    747 	int len;
    748 
    749 	/* handles transforming inline XML to data */
    750 	if (ISINCONTENT(ctx)) {
    751 		if (ctx.contenttype == ContentTypeHTML)
    752 			xmldata(p, data, datalen);
    753 		return;
    754 	}
    755 
    756 	if (!ctx.tag.id)
    757 		return;
    758 
    759 	/* try to translate entity, else just pass as data to
    760 	 * xmlattr handler. */
    761 	if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
    762 		xmlattr(p, t, tl, n, nl, buf, (size_t)len);
    763 	else
    764 		xmlattr(p, t, tl, n, nl, data, datalen);
    765 }
    766 
    767 static void
    768 xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl)
    769 {
    770 	if (ISINCONTENT(ctx)) {
    771 		if (ctx.contenttype == ContentTypeHTML) {
    772 			/* handles transforming inline XML to data */
    773 			xmldata(p, "\"", 1);
    774 			ctx.attrcount = 0;
    775 		}
    776 		return;
    777 	}
    778 }
    779 
    780 static void
    781 xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl)
    782 {
    783 	if (ISINCONTENT(ctx)) {
    784 		if (ctx.contenttype == ContentTypeHTML) {
    785 			/* handles transforming inline XML to data */
    786 			if (!ctx.attrcount)
    787 				xmldata(p, " ", 1);
    788 			ctx.attrcount++;
    789 			xmldata(p, n, nl);
    790 			xmldata(p, "=\"", 2);
    791 		}
    792 		return;
    793 	}
    794 
    795 	if (attrispermalink.len && isattr(n, nl, STRP("ispermalink")))
    796 		string_clear(&attrispermalink);
    797 	else if (attrrel.len && isattr(n, nl, STRP("rel")))
    798 		string_clear(&attrrel);
    799 	else if (attrtype.len && isattr(n, nl, STRP("type")))
    800 		string_clear(&attrtype);
    801 	else if (tmpstr.len &&
    802 	    (isattr(n, nl, STRP("href")) ||
    803 	     isattr(n, nl, STRP("term")) ||
    804 	     isattr(n, nl, STRP("url"))))
    805 		string_clear(&tmpstr); /* use the last value for multiple attribute values */
    806 }
    807 
    808 static void
    809 xmldata(XMLParser *p, const char *s, size_t len)
    810 {
    811 	if (!ctx.field)
    812 		return;
    813 
    814 	if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
    815 		string_append(&tmpstr, s, len);
    816 	else
    817 		string_append(ctx.field, s, len);
    818 }
    819 
    820 static void
    821 xmldataentity(XMLParser *p, const char *data, size_t datalen)
    822 {
    823 	char buf[16];
    824 	int len;
    825 
    826 	if (!ctx.field)
    827 		return;
    828 
    829 	/* try to translate entity, else just pass as data to
    830 	 * xmldata handler. */
    831 	if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
    832 		xmldata(p, buf, (size_t)len);
    833 	else
    834 		xmldata(p, data, datalen);
    835 }
    836 
    837 static void
    838 xmltagstart(XMLParser *p, const char *t, size_t tl)
    839 {
    840 	const FeedTag *f;
    841 
    842 	if (ISINCONTENT(ctx)) {
    843 		if (ctx.contenttype == ContentTypeHTML) {
    844 			ctx.attrcount = 0;
    845 			xmldata(p, "<", 1);
    846 			xmldata(p, t, tl);
    847 		}
    848 		return;
    849 	}
    850 
    851 	/* start of RSS or Atom item / entry */
    852 	if (ctx.feedtype == FeedTypeNone) {
    853 		if (istag(t, tl, STRP("entry")))
    854 			ctx.feedtype = FeedTypeAtom;
    855 		else if (istag(t, tl, STRP("item")))
    856 			ctx.feedtype = FeedTypeRSS;
    857 		return;
    858 	}
    859 
    860 	/* field tagid already set or nested tags. */
    861 	if (ctx.tag.id) {
    862 		/* nested <author><name> for Atom */
    863 		if (ctx.tag.id == AtomTagAuthor &&
    864 		    istag(t, tl, STRP("name"))) {
    865 			memcpy(&(ctx.tag), &atomtagauthorname, sizeof(ctx.tag));
    866 		} else {
    867 			return; /* other nested tags are not allowed: return */
    868 		}
    869 	}
    870 
    871 	/* in item */
    872 	if (ctx.tag.id == TagUnknown) {
    873 		if (!(f = gettag(ctx.feedtype, t, tl)))
    874 			f = &notag;
    875 		memcpy(&(ctx.tag), f, sizeof(ctx.tag));
    876 	}
    877 
    878 	ctx.iscontenttag = (fieldmap[ctx.tag.id] == FeedFieldContent);
    879 	string_clear(&attrispermalink);
    880 	string_clear(&attrrel);
    881 	string_clear(&attrtype);
    882 }
    883 
/* Callback for a start tag whose attributes have all been parsed; `isshort`
 * is non-zero for a self-closing tag.
 * Inside HTML content it only closes the tag that is written back as data.
 * Otherwise it resolves the final tag ID from the collected attribute values,
 * decides by priority whether this tag may set its field, determines the
 * content-type for content tags and selects (and, unless the field is
 * multi-value, clears) the field that following data is appended to. */
static void
xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
{
	enum TagId tagid;

	if (ISINCONTENT(ctx)) {
		if (ctx.contenttype == ContentTypeHTML) {
			if (isshort)
				xmldata(p, "/>", 2);
			else
				xmldata(p, ">", 1);
		}
		return;
	}

	/* set tag type based on its attribute value */
	if (ctx.tag.id == RSSTagGuid) {
		/* if empty the default is "true" */
		if (!attrispermalink.len ||
		    isattr(attrispermalink.data, attrispermalink.len, STRP("true")))
			ctx.tag.id = RSSTagGuidPermalinkTrue;
		else
			ctx.tag.id = RSSTagGuidPermalinkFalse;
	} else if (ctx.tag.id == AtomTagLink) {
		/* empty or "alternate": other types could be
		   "enclosure", "related", "self" or "via" */
		if (!attrrel.len || isattr(attrrel.data, attrrel.len, STRP("alternate")))
			ctx.tag.id = AtomTagLinkAlternate;
		else if (isattr(attrrel.data, attrrel.len, STRP("enclosure")))
			ctx.tag.id = AtomTagLinkEnclosure;
		else
			ctx.tag.id = AtomTagLink; /* unknown */
	}

	tagid = ctx.tag.id;

	/* map tag type to field: unknown or lesser priority is ignored,
	   when tags of the same type are repeated only the first is used. */
	/* a tag ID not greater than the one that set the field before loses;
	   multi-value fields accept every tag */
	if (fieldmap[tagid] == -1 ||
	    (!ISFEEDFIELDMULTI(fieldmap[tagid]) &&
	     tagid <= ctx.fields[fieldmap[tagid]].tagid)) {
		return;
	}

	if (ctx.iscontenttag) {
		/* switch from "in content tag" to "in content data" */
		ctx.iscontent = 1;
		ctx.iscontenttag = 0;

		/* detect content-type based on type attribute */
		if (attrtype.len) {
			if (isattr(attrtype.data, attrtype.len, STRP("html")) ||
			    isattr(attrtype.data, attrtype.len, STRP("xhtml")) ||
			    isattr(attrtype.data, attrtype.len, STRP("text/html")) ||
			    isattr(attrtype.data, attrtype.len, STRP("text/xhtml")) ||
			    isattr(attrtype.data, attrtype.len, STRP("application/xhtml+xml")))
				ctx.contenttype = ContentTypeHTML;
			else /* unknown: handle as base64 text data */
				ctx.contenttype = ContentTypePlain;
		} else {
			/* default content-type */
			if (tagid == RSSTagContentEncoded || tagid == RSSTagDescription)
				ctx.contenttype = ContentTypeHTML;
			else
				ctx.contenttype = ContentTypePlain;
		}
	}

	/* select the field for the following data and remember which tag set
	   it, for the priority check above */
	ctx.field = &(ctx.fields[fieldmap[tagid]].str);
	ctx.fields[fieldmap[tagid]].tagid = tagid;

	/* clear field if it is overwritten (with a priority order) for the new
	   value, if the field can have multiple values then do not clear it. */
	if (!ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
		string_clear(ctx.field);
}
    959 
    960 static void
    961 xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
    962 {
    963 	size_t i;
    964 
    965 	if (ctx.feedtype == FeedTypeNone)
    966 		return;
    967 
    968 	if (ISINCONTENT(ctx)) {
    969 		/* not a closed content field */
    970 		if (!istag(ctx.tag.name, ctx.tag.len, t, tl)) {
    971 			if (!isshort && ctx.contenttype == ContentTypeHTML) {
    972 				xmldata(p, "</", 2);
    973 				xmldata(p, t, tl);
    974 				xmldata(p, ">", 1);
    975 			}
    976 			return;
    977 		}
    978 	} else if (ctx.tag.id && istag(ctx.tag.name, ctx.tag.len, t, tl)) {
    979 		/* matched tag end: close it */
    980 		/* copy also to the link field if the attribute isPermaLink="true"
    981 		   and it is not set by a tag with higher priority. */
    982 		if (ctx.tag.id == RSSTagGuidPermalinkTrue && ctx.field &&
    983 		    ctx.tag.id > ctx.fields[FeedFieldLink].tagid) {
    984 			string_clear(&ctx.fields[FeedFieldLink].str);
    985 			string_append(&ctx.fields[FeedFieldLink].str,
    986 			              ctx.field->data, ctx.field->len);
    987 			ctx.fields[FeedFieldLink].tagid = ctx.tag.id;
    988 		}
    989 	} else if (!ctx.tag.id && ((ctx.feedtype == FeedTypeAtom &&
    990 	   istag(t, tl, STRP("entry"))) || /* Atom */
    991 	   (ctx.feedtype == FeedTypeRSS &&
    992 	   istag(t, tl, STRP("item"))))) /* RSS */
    993 	{
    994 		/* end of RSS or Atom entry / item */
    995 		printfields();
    996 
    997 		/* clear strings */
    998 		for (i = 0; i < FeedFieldLast; i++) {
    999 			string_clear(&ctx.fields[i].str);
   1000 			ctx.fields[i].tagid = TagUnknown;
   1001 		}
   1002 		ctx.contenttype = ContentTypeNone;
   1003 		/* allow parsing of Atom and RSS concatenated in one XML stream. */
   1004 		ctx.feedtype = FeedTypeNone;
   1005 	} else {
   1006 		return; /* not end of field */
   1007 	}
   1008 
   1009 	/* temporary string: for fields that cannot be processed
   1010 	   directly and need more context, for example by it's tag
   1011 	   attributes, like the Atom link rel="alternate|enclosure". */
   1012 	if (tmpstr.len && ctx.field) {
   1013 		if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) {
   1014 			if (ctx.field->len)
   1015 				string_append(ctx.field, FieldMultiSeparator, 1);
   1016 			string_append(ctx.field, tmpstr.data, tmpstr.len);
   1017 		} else {
   1018 			string_clear(ctx.field);
   1019 			string_append(ctx.field, tmpstr.data, tmpstr.len);
   1020 		}
   1021 	}
   1022 
   1023 	/* close field */
   1024 	string_clear(&tmpstr); /* reuse and clear temporary string */
   1025 
   1026 	if (ctx.tag.id == AtomTagAuthorName)
   1027 		memcpy(&(ctx.tag), &atomtagauthor, sizeof(ctx.tag)); /* outer tag */
   1028 	else
   1029 		memcpy(&(ctx.tag), &notag, sizeof(ctx.tag));
   1030 
   1031 	ctx.iscontent = 0;
   1032 	ctx.field = NULL;
   1033 }
   1034 
   1035 int
   1036 main(int argc, char *argv[])
   1037 {
   1038 	if (pledge("stdio", NULL) == -1)
   1039 		err(1, "pledge");
   1040 
   1041 	if (argc > 1) {
   1042 		if (uri_parse(argv[1], &baseuri) != -1 && baseuri.proto[0])
   1043 			baseurl = argv[1];
   1044 		else
   1045 			errx(1, "baseurl incorrect or too long");
   1046 	}
   1047 
   1048 	memcpy(&(ctx.tag), &notag, sizeof(ctx.tag));
   1049 
   1050 	parser.xmlattr = xmlattr;
   1051 	parser.xmlattrentity = xmlattrentity;
   1052 	parser.xmlattrend = xmlattrend;
   1053 	parser.xmlattrstart = xmlattrstart;
   1054 	parser.xmlcdata = xmldata;
   1055 	parser.xmldata = xmldata;
   1056 	parser.xmldataentity = xmldataentity;
   1057 	parser.xmltagend = xmltagend;
   1058 	parser.xmltagstart = xmltagstart;
   1059 	parser.xmltagstartparsed = xmltagstartparsed;
   1060 
   1061 	/* NOTE: getnext is defined in xml.h for inline optimization */
   1062 	xml_parse(&parser);
   1063 
   1064 	checkfileerror(stdin, "<stdin>", 'r');
   1065 	checkfileerror(stdout, "<stdout>", 'w');
   1066 
   1067 	return 0;
   1068 }