sfeed

simple feed reader - forked from git.codemadness.org/sfeed
git clone git://src.gearsix.net/sfeed (sfeed.zip)
Log | Files | Refs | Atom | README | LICENSE

sfeed.c (raw) (30076B)


   1 #include <errno.h>
   2 #include <stdint.h>
   3 #include <stdio.h>
   4 #include <stdlib.h>
   5 #include <string.h>
   6 #include <strings.h>
   7 
   8 #include "util.h"
   9 #include "xml.h"
  10 
  11 #define ISINCONTENT(ctx)  ((ctx).iscontent && !((ctx).iscontenttag))
  12 #define ISCONTENTTAG(ctx) (!((ctx).iscontent) && (ctx).iscontenttag)
  13 
  14 /* these feed fields support multiple separated values */
  15 #define ISFEEDFIELDMULTI(t) ((t) == FeedFieldCategory)
  16 
  17 /* string and byte-length */
  18 #define STRP(s)           s,sizeof(s)-1
  19 
  20 enum FeedType {
  21 	FeedTypeNone = 0,
  22 	FeedTypeRSS  = 1,
  23 	FeedTypeAtom = 2
  24 };
  25 
  26 enum ContentType {
  27 	ContentTypeNone  = 0,
  28 	ContentTypePlain = 1,
  29 	ContentTypeHTML  = 2
  30 };
  31 static const char *contenttypes[] = { "", "plain", "html" };
  32 
  33 /* String data / memory pool */
  34 typedef struct string {
  35 	char   *data;   /* data */
  36 	size_t  len;    /* string length */
  37 	size_t  bufsiz; /* allocated size */
  38 } String;
  39 
  40 /* NOTE: the order of these fields (content, date, author) indicate the
  41  *       priority to use them, from least important to high. */
  42 enum TagId {
  43 	TagUnknown = 0,
  44 	/* RSS */
  45 	RSSTagDcdate, RSSTagPubdate, /* creation date has higher priority */
  46 	RSSTagTitle,
  47 	RSSTagMediaDescription, RSSTagDescription, RSSTagContentEncoded,
  48 	RSSTagGuid,
  49 	RSSTagGuidPermalinkFalse,
  50 	RSSTagGuidPermalinkTrue,
  51 	/* must be defined after GUID, because it can be a link (isPermaLink) */
  52 	RSSTagLink,
  53 	RSSTagEnclosure,
  54 	RSSTagAuthor, RSSTagDccreator,
  55 	RSSTagCategory,
  56 	/* Atom */
  57 	/* creation date has higher priority */
  58 	AtomTagModified, AtomTagUpdated, AtomTagIssued, AtomTagPublished,
  59 	AtomTagTitle,
  60 	AtomTagMediaDescription, AtomTagSummary, AtomTagContent,
  61 	AtomTagId,
  62 	AtomTagLink,
  63 	AtomTagLinkAlternate,
  64 	AtomTagLinkEnclosure,
  65 	AtomTagAuthor, AtomTagAuthorName,
  66 	AtomTagCategory,
  67 	TagLast
  68 };
  69 
  70 typedef struct feedtag {
  71 	char       *name; /* name of tag to match */
  72 	size_t      len;  /* len of `name` */
  73 	enum TagId  id;   /* unique ID */
  74 } FeedTag;
  75 
  76 typedef struct field {
  77 	String     str;
  78 	enum TagId tagid; /* tagid set previously, used for tag priority */
  79 } FeedField;
  80 
  81 enum {
  82 	FeedFieldTime = 0, FeedFieldTitle, FeedFieldLink, FeedFieldContent,
  83 	FeedFieldId, FeedFieldAuthor, FeedFieldEnclosure, FeedFieldCategory,
  84 	FeedFieldLast
  85 };
  86 
  87 typedef struct feedcontext {
  88 	String          *field;        /* current FeedItem field String */
  89 	FeedField        fields[FeedFieldLast]; /* data for current item */
  90 	FeedTag          tag;          /* unique current parsed tag */
  91 	int              iscontent;    /* in content data */
  92 	int              iscontenttag; /* in content tag */
  93 	enum ContentType contenttype;  /* content-type for item */
  94 	enum FeedType    feedtype;
  95 	int              attrcount;    /* count item HTML element attributes */
  96 } FeedContext;
  97 
  98 static long long datetounix(long long, int, int, int, int, int);
  99 static FeedTag * gettag(enum FeedType, const char *, size_t);
 100 static long gettzoffset(const char *);
 101 static int  isattr(const char *, size_t, const char *, size_t);
 102 static int  istag(const char *, size_t, const char *, size_t);
 103 static int  parsetime(const char *, long long *);
 104 static void printfields(void);
 105 static void string_append(String *, const char *, size_t);
 106 static void string_buffer_realloc(String *, size_t);
 107 static void string_clear(String *);
 108 static void string_print_encoded(String *);
 109 static void string_print_timestamp(String *);
 110 static void string_print_trimmed(String *);
 111 static void string_print_trimmed_multi(String *);
 112 static void string_print_uri(String *);
 113 static void xmlattr(XMLParser *, const char *, size_t, const char *, size_t,
 114                     const char *, size_t);
 115 static void xmlattrentity(XMLParser *, const char *, size_t, const char *,
 116                           size_t, const char *, size_t);
 117 static void xmlattrend(XMLParser *, const char *, size_t, const char *,
 118                        size_t);
 119 static void xmlattrstart(XMLParser *, const char *, size_t, const char *,
 120                          size_t);
 121 static void xmldata(XMLParser *, const char *, size_t);
 122 static void xmldataentity(XMLParser *, const char *, size_t);
 123 static void xmltagend(XMLParser *, const char *, size_t, int);
 124 static void xmltagstart(XMLParser *, const char *, size_t);
 125 static void xmltagstartparsed(XMLParser *, const char *, size_t, int);
 126 
 127 /* map tag name to TagId type */
 128 /* RSS, keep this in alphabetical order */
 129 static const FeedTag rsstags[] = {
 130 	{ STRP("author"),            RSSTagAuthor            },
 131 	{ STRP("category"),          RSSTagCategory          },
 132 	{ STRP("content:encoded"),   RSSTagContentEncoded    },
 133 	{ STRP("dc:creator"),        RSSTagDccreator         },
 134 	{ STRP("dc:date"),           RSSTagDcdate            },
 135 	{ STRP("description"),       RSSTagDescription       },
 136 	/* RSS: <enclosure url="" />, Atom has <link rel="enclosure" /> */
 137 	{ STRP("enclosure"),         RSSTagEnclosure         },
 138 	{ STRP("guid"),              RSSTagGuid              },
 139 	{ STRP("link"),              RSSTagLink              },
 140 	{ STRP("media:description"), RSSTagMediaDescription  },
 141 	{ STRP("pubdate"),           RSSTagPubdate           },
 142 	{ STRP("title"),             RSSTagTitle             }
 143 };
 144 
 145 /* Atom, keep this in alphabetical order */
 146 static const FeedTag atomtags[] = {
 147 	{ STRP("author"),            AtomTagAuthor           },
 148 	{ STRP("category"),          AtomTagCategory         },
 149 	{ STRP("content"),           AtomTagContent          },
 150 	{ STRP("id"),                AtomTagId               },
 151 	{ STRP("issued"),            AtomTagIssued           }, /* Atom 0.3 */
 152 	/* Atom: <link href="" />, RSS has <link></link> */
 153 	{ STRP("link"),              AtomTagLink             },
 154 	{ STRP("media:description"), AtomTagMediaDescription },
 155 	{ STRP("modified"),          AtomTagModified         }, /* Atom 0.3 */
 156 	{ STRP("published"),         AtomTagPublished        },
 157 	{ STRP("summary"),           AtomTagSummary          },
 158 	{ STRP("title"),             AtomTagTitle            },
 159 	{ STRP("updated"),           AtomTagUpdated          }
 160 };
 161 
 162 /* special case: nested <author><name> */
 163 static const FeedTag atomtagauthor = { STRP("author"), AtomTagAuthor };
 164 static const FeedTag atomtagauthorname = { STRP("name"), AtomTagAuthorName };
 165 
 166 /* reference to no / unknown tag */
 167 static const FeedTag notag = { STRP(""), TagUnknown };
 168 
 169 /* map TagId type to RSS/Atom field, all tags must be defined */
 170 static const int fieldmap[TagLast] = {
 171 	[TagUnknown]               = -1,
 172 	/* RSS */
 173 	[RSSTagDcdate]             = FeedFieldTime,
 174 	[RSSTagPubdate]            = FeedFieldTime,
 175 	[RSSTagTitle]              = FeedFieldTitle,
 176 	[RSSTagMediaDescription]   = FeedFieldContent,
 177 	[RSSTagDescription]        = FeedFieldContent,
 178 	[RSSTagContentEncoded]     = FeedFieldContent,
 179 	[RSSTagGuid]               = -1,
 180 	[RSSTagGuidPermalinkFalse] = FeedFieldId,
 181 	[RSSTagGuidPermalinkTrue]  = FeedFieldId, /* special-case: both a link and an id */
 182 	[RSSTagLink]               = FeedFieldLink,
 183 	[RSSTagEnclosure]          = FeedFieldEnclosure,
 184 	[RSSTagAuthor]             = FeedFieldAuthor,
 185 	[RSSTagDccreator]          = FeedFieldAuthor,
 186 	[RSSTagCategory]           = FeedFieldCategory,
 187 	/* Atom */
 188 	[AtomTagModified]          = FeedFieldTime,
 189 	[AtomTagUpdated]           = FeedFieldTime,
 190 	[AtomTagIssued]            = FeedFieldTime,
 191 	[AtomTagPublished]         = FeedFieldTime,
 192 	[AtomTagTitle]             = FeedFieldTitle,
 193 	[AtomTagMediaDescription]  = FeedFieldContent,
 194 	[AtomTagSummary]           = FeedFieldContent,
 195 	[AtomTagContent]           = FeedFieldContent,
 196 	[AtomTagId]                = FeedFieldId,
 197 	[AtomTagLink]              = -1,
 198 	[AtomTagLinkAlternate]     = FeedFieldLink,
 199 	[AtomTagLinkEnclosure]     = FeedFieldEnclosure,
 200 	[AtomTagAuthor]            = -1,
 201 	[AtomTagAuthorName]        = FeedFieldAuthor,
 202 	[AtomTagCategory]          = FeedFieldCategory
 203 };
 204 
 205 static const int FieldSeparator = '\t';
 206 /* separator for multiple values in a field, separator should be 1 byte */
 207 static const char FieldMultiSeparator[] = "|";
 208 static struct uri baseuri;
 209 static const char *baseurl;
 210 
 211 static FeedContext ctx;
 212 static XMLParser parser; /* XML parser state */
 213 static String attrispermalink, attrrel, attrtype, tmpstr;
 214 
 215 /* Unique tag(id) for parsed tag name. */
 216 static FeedTag *
 217 gettag(enum FeedType feedtype, const char *name, size_t namelen)
 218 {
 219 	FeedTag *r;
 220 	size_t i;
 221 
 222 	switch (feedtype) {
 223 	case FeedTypeRSS:
 224 		for (i = 0; i < sizeof(rsstags) / sizeof(rsstags[0]); i++) {
 225 			r = (FeedTag *)&rsstags[i];
 226 			if (r->len == namelen && !strcasecmp(r->name, name))
 227 				return r;
 228 		}
 229 		break;
 230 	case FeedTypeAtom:
 231 		for (i = 0; i < sizeof(atomtags) / sizeof(atomtags[0]); i++) {
 232 			r = (FeedTag *)&atomtags[i];
 233 			if (r->len == namelen && !strcasecmp(r->name, name))
 234 				return r;
 235 		}
 236 		break;
 237 	default:
 238 		break;
 239 	}
 240 
 241 	return NULL;
 242 }
 243 
 244 static char *
 245 ltrim(const char *s)
 246 {
 247 	for (; ISSPACE((unsigned char)*s); s++)
 248 		;
 249 	return (char *)s;
 250 }
 251 
 252 static char *
 253 rtrim(const char *s)
 254 {
 255 	const char *e;
 256 
 257 	for (e = s + strlen(s); e > s && ISSPACE((unsigned char)*(e - 1)); e--)
 258 		;
 259 	return (char *)e;
 260 }
 261 
 262 /* Clear string only; don't free, prevents unnecessary reallocation. */
 263 static void
 264 string_clear(String *s)
 265 {
 266 	if (s->data)
 267 		s->data[0] = '\0';
 268 	s->len = 0;
 269 }
 270 
 271 static void
 272 string_buffer_realloc(String *s, size_t newlen)
 273 {
 274 	size_t alloclen;
 275 
 276 	if (newlen > SIZE_MAX / 2) {
 277 		alloclen = SIZE_MAX;
 278 	} else {
 279 		for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
 280 			;
 281 	}
 282 	if (!(s->data = realloc(s->data, alloclen)))
 283 		err(1, "realloc");
 284 	s->bufsiz = alloclen;
 285 }
 286 
 287 /* Append data to String, s->data and data may not overlap. */
 288 static void
 289 string_append(String *s, const char *data, size_t len)
 290 {
 291 	if (!len)
 292 		return;
 293 
 294 	if (s->len >= SIZE_MAX - len) {
 295 		errno = ENOMEM;
 296 		err(1, "realloc");
 297 	}
 298 
 299 	/* check if allocation is necessary, never shrink the buffer. */
 300 	if (s->len + len >= s->bufsiz)
 301 		string_buffer_realloc(s, s->len + len + 1);
 302 	memcpy(s->data + s->len, data, len);
 303 	s->len += len;
 304 	s->data[s->len] = '\0';
 305 }
 306 
 307 /* Print text, encode TABs, newlines and '\', remove other whitespace.
 308  * Remove leading and trailing whitespace. */
 309 static void
 310 string_print_encoded(String *s)
 311 {
 312 	const char *p, *e;
 313 
 314 	if (!s->data || !s->len)
 315 		return;
 316 
 317 	p = ltrim(s->data);
 318 	e = rtrim(p);
 319 
 320 	for (; *p && p != e; p++) {
 321 		switch (*p) {
 322 		case '\n': putchar('\\'); putchar('n'); break;
 323 		case '\\': putchar('\\'); putchar('\\'); break;
 324 		case '\t': putchar('\\'); putchar('t'); break;
 325 		default:
 326 			/* ignore control chars */
 327 			if (!ISCNTRL((unsigned char)*p))
 328 				putchar(*p);
 329 			break;
 330 		}
 331 	}
 332 }
 333 
 334 static void
 335 printtrimmed(const char *s)
 336 {
 337 	char *p, *e;
 338 
 339 	p = ltrim(s);
 340 	e = rtrim(p);
 341 	for (; *p && p != e; p++) {
 342 		if (ISSPACE((unsigned char)*p))
 343 			putchar(' '); /* any whitespace to space */
 344 		else if (!ISCNTRL((unsigned char)*p))
 345 			/* ignore other control chars */
 346 			putchar(*p);
 347 	}
 348 }
 349 
 350 /* Print text, replace TABs, carriage return and other whitespace with ' '.
 351  * Other control chars are removed. Remove leading and trailing whitespace. */
 352 static void
 353 string_print_trimmed(String *s)
 354 {
 355 	if (!s->data || !s->len)
 356 		return;
 357 
 358 	printtrimmed(s->data);
 359 }
 360 
 361 /* Print each field with trimmed whitespace, separated by '|'. */
 362 static void
 363 string_print_trimmed_multi(String *s)
 364 {
 365 	char *p, *e;
 366 	int c;
 367 
 368 	if (!s->data || !s->len)
 369 		return;
 370 
 371 	for (p = s->data; ; p = e + 1) {
 372 		if ((e = strstr(p, FieldMultiSeparator))) {
 373 			c = *e;
 374 			*e = '\0';
 375 			printtrimmed(p);
 376 			*e = c; /* restore NUL byte to original character */
 377 			fputs(FieldMultiSeparator, stdout);
 378 		} else {
 379 			printtrimmed(p);
 380 			break;
 381 		}
 382 	}
 383 }
 384 
 385 /* Print URL, if it is a relative URL then it uses the global `baseurl`. */
 386 static void
 387 printuri(char *s)
 388 {
 389 	char link[4096], *p, *e;
 390 	struct uri newuri, olduri;
 391 	int c, r = -1;
 392 
 393 	p = ltrim(s);
 394 	e = rtrim(p);
 395 	c = *e;
 396 	*e = '\0';
 397 
 398 	if (baseurl && !uri_hasscheme(p) &&
 399 	    uri_parse(p, &olduri) != -1 && !olduri.proto[0] &&
 400 	    uri_makeabs(&newuri, &olduri, &baseuri) != -1 && newuri.proto[0])
 401 		r = uri_format(link, sizeof(link), &newuri);
 402 
 403 	if (r >= 0 && (size_t)r < sizeof(link))
 404 		printtrimmed(link);
 405 	else
 406 		printtrimmed(p);
 407 
 408 	*e = c; /* restore NUL byte to original character */
 409 }
 410 
 411 /* Print URL, if it is a relative URL then it uses the global `baseurl`. */
 412 static void
 413 string_print_uri(String *s)
 414 {
 415 	if (!s->data || !s->len)
 416 		return;
 417 
 418 	printuri(s->data);
 419 }
 420 
 421 /* Print as UNIX timestamp, print nothing if the time is empty or invalid. */
 422 static void
 423 string_print_timestamp(String *s)
 424 {
 425 	long long t;
 426 
 427 	if (!s->data || !s->len)
 428 		return;
 429 
 430 	if (parsetime(s->data, &t) != -1)
 431 		printf("%lld", t);
 432 }
 433 
 434 /* Convert time fields. Returns a signed (at least) 64-bit UNIX timestamp.
 435  * Parameters should be passed as they are in a struct tm:
 436  * that is: year = year - 1900, month = month - 1. */
 437 static long long
 438 datetounix(long long year, int mon, int day, int hour, int min, int sec)
 439 {
 440 	/* seconds in a month in a regular (non-leap) year */
 441 	static const long secs_through_month[] = {
 442 		0, 31 * 86400, 59 * 86400, 90 * 86400,
 443 		120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400,
 444 		243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 };
 445 	int is_leap = 0, cycles, centuries = 0, leaps = 0, rem;
 446 	long long t;
 447 
 448 	/* optimization: handle common range year 1902 up to and including 2038 */
 449 	if (year - 2ULL <= 136) {
 450 		/* amount of leap days relative to 1970: every 4 years */
 451 		leaps = (year - 68) >> 2;
 452 		if (!((year - 68) & 3)) {
 453 			leaps--;
 454 			is_leap = 1;
 455 		} else {
 456 			is_leap = 0;
 457 		}
 458 		t = 31536000 * (year - 70) + (86400 * leaps); /* 365 * 86400 = 31536000 */
 459 	} else {
 460 		/* general leap year calculation:
 461 		 * leap years occur mostly every 4 years but every 100 years
 462 		 * a leap year is skipped unless the year is divisible by 400 */
 463 		cycles = (year - 100) / 400;
 464 		rem = (year - 100) % 400;
 465 		if (rem < 0) {
 466 			cycles--;
 467 			rem += 400;
 468 		}
 469 		if (!rem) {
 470 			is_leap = 1;
 471 		} else {
 472 			if (rem >= 300) {
 473 				centuries = 3;
 474 				rem -= 300;
 475 			} else if (rem >= 200) {
 476 				centuries = 2;
 477 				rem -= 200;
 478 			} else if (rem >= 100) {
 479 				centuries = 1;
 480 				rem -= 100;
 481 			}
 482 			if (rem) {
 483 				leaps = rem / 4U;
 484 				rem %= 4U;
 485 				is_leap = !rem;
 486 			}
 487 		}
 488 		leaps += (97 * cycles) + (24 * centuries) - is_leap;
 489 
 490 		/* adjust 8 leap days from 1970 up to and including 2000:
 491 		 * ((30 * 365) + 8) * 86400 = 946771200 */
 492 		t = ((year - 100) * 31536000LL) + (leaps * 86400LL) + 946771200LL;
 493 	}
 494 	t += secs_through_month[mon];
 495 	if (is_leap && mon >= 2)
 496 		t += 86400;
 497 	t += 86400LL * (day - 1);
 498 	t += 3600LL * hour;
 499 	t += 60LL * min;
 500 	t += sec;
 501 
 502 	return t;
 503 }
 504 
 505 /* Get timezone from string, return time offset in seconds from UTC.
 506  * NOTE: only parses timezones in RFC 822, many other timezone names are
 507  * ambiguous anyway.
 508  * ANSI and military zones are defined wrong in RFC 822 and are unsupported,
 509  * see note on RFC 2822 4.3 page 32. */
 510 static long
 511 gettzoffset(const char *s)
 512 {
 513 	static const struct {
 514 		char *name;
 515 		int offhour;
 516 	} tzones[] = {
 517 		{ "CDT", -5 * 3600 },
 518 		{ "CST", -6 * 3600 },
 519 		{ "EDT", -4 * 3600 },
 520 		{ "EST", -5 * 3600 },
 521 		{ "MDT", -6 * 3600 },
 522 		{ "MST", -7 * 3600 },
 523 		{ "PDT", -7 * 3600 },
 524 		{ "PST", -8 * 3600 },
 525 	};
 526 	const char *p;
 527 	long tzhour = 0, tzmin = 0;
 528 	size_t i;
 529 
 530 	for (; ISSPACE((unsigned char)*s); s++)
 531 		;
 532 	switch (*s) {
 533 	case '-': /* offset */
 534 	case '+':
 535 		for (i = 0, p = s + 1; i < 2 && ISDIGIT((unsigned char)*p); i++, p++)
 536 			tzhour = (tzhour * 10) + (*p - '0');
 537 		if (*p == ':')
 538 			p++;
 539 		for (i = 0; i < 2 && ISDIGIT((unsigned char)*p); i++, p++)
 540 			tzmin = (tzmin * 10) + (*p - '0');
 541 		return ((tzhour * 3600) + (tzmin * 60)) * (s[0] == '-' ? -1 : 1);
 542 	default: /* timezone name */
 543 		for (i = 0; ISALPHA((unsigned char)s[i]); i++)
 544 			;
 545 		if (i != 3)
 546 			return 0;
 547 		/* compare timezone and adjust offset relative to UTC */
 548 		for (i = 0; i < sizeof(tzones) / sizeof(*tzones); i++) {
 549 			if (!memcmp(s, tzones[i].name, 3))
 550 				return tzones[i].offhour;
 551 		}
 552 	}
 553 	return 0;
 554 }
 555 
 556 /* Parse time string `s` into the UNIX timestamp `tp`.
 557  * Returns 0 on success or -1 on failure. */
 558 static int
 559 parsetime(const char *s, long long *tp)
 560 {
 561 	static const struct {
 562 		char *name;
 563 		int len;
 564 	} mons[] = {
 565 		{ STRP("January"),   },
 566 		{ STRP("February"),  },
 567 		{ STRP("March"),     },
 568 		{ STRP("April"),     },
 569 		{ STRP("May"),       },
 570 		{ STRP("June"),      },
 571 		{ STRP("July"),      },
 572 		{ STRP("August"),    },
 573 		{ STRP("September"), },
 574 		{ STRP("October"),   },
 575 		{ STRP("November"),  },
 576 		{ STRP("December"),  },
 577 	};
 578 	int va[6] = { 0 }, i, j, v, vi;
 579 	size_t m;
 580 
 581 	for (; ISSPACE((unsigned char)*s); s++)
 582 		;
 583 	if (!ISDIGIT((unsigned char)*s) && !ISALPHA((unsigned char)*s))
 584 		return -1;
 585 
 586 	if (ISDIGIT((unsigned char)s[0]) &&
 587 	    ISDIGIT((unsigned char)s[1]) &&
 588 	    ISDIGIT((unsigned char)s[2]) &&
 589 	    ISDIGIT((unsigned char)s[3])) {
 590 		/* formats "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or "%Y%m%d%H%M%S" */
 591 		vi = 0;
 592 	} else {
 593 		/* format: "[%a, ]%d %b %Y %H:%M:%S" */
 594 		/* parse "[%a, ]%d %b %Y " part, then use time parsing as above */
 595 		for (; ISALPHA((unsigned char)*s); s++)
 596 			;
 597 		for (; ISSPACE((unsigned char)*s); s++)
 598 			;
 599 		if (*s == ',')
 600 			s++;
 601 		for (; ISSPACE((unsigned char)*s); s++)
 602 			;
 603 		for (v = 0, i = 0; i < 2 && ISDIGIT((unsigned char)*s); s++, i++)
 604 			v = (v * 10) + (*s - '0');
 605 		va[2] = v; /* day */
 606 		for (; ISSPACE((unsigned char)*s); s++)
 607 			;
 608 		/* end of word month */
 609 		for (j = 0; ISALPHA((unsigned char)s[j]); j++)
 610 			;
 611 		/* check month name */
 612 		if (j < 3 || j > 9)
 613 			return -1; /* month cannot match */
 614 		for (m = 0; m < sizeof(mons) / sizeof(*mons); m++) {
 615 			/* abbreviation (3 length) or long name */
 616 			if ((j == 3 || j == mons[m].len) &&
 617 			    !strncasecmp(mons[m].name, s, j)) {
 618 				va[1] = m + 1;
 619 				s += j;
 620 				break;
 621 			}
 622 		}
 623 		if (m >= 12)
 624 			return -1; /* no month found */
 625 		for (; ISSPACE((unsigned char)*s); s++)
 626 			;
 627 		for (v = 0, i = 0; i < 4 && ISDIGIT((unsigned char)*s); s++, i++)
 628 			v = (v * 10) + (*s - '0');
 629 		/* obsolete short year: RFC 2822 4.3 */
 630 		if (i == 2 || i == 3)
 631 			v += (i == 2 && v >= 0 && v <= 49) ? 2000 : 1900;
 632 		va[0] = v; /* year */
 633 		for (; ISSPACE((unsigned char)*s); s++)
 634 			;
 635 		/* parse only regular time part, see below */
 636 		vi = 3;
 637 	}
 638 
 639 	/* parse time parts (and possibly remaining date parts) */
 640 	for (; *s && vi < 6; vi++) {
 641 		for (i = 0, v = 0; i < ((vi == 0) ? 4 : 2) &&
 642 		                   ISDIGIT((unsigned char)*s); s++, i++) {
 643 			v = (v * 10) + (*s - '0');
 644 		}
 645 		va[vi] = v;
 646 
 647 		if ((vi < 2 && (*s == '-' || *s == '/')) ||
 648 		    (vi == 2 && (*s == 'T' || *s == 't' || ISSPACE((unsigned char)*s))) ||
 649 		    (vi > 2 && *s == ':'))
 650 			s++;
 651 	}
 652 
 653 	/* skip milliseconds in for example: "%Y-%m-%dT%H:%M:%S.000Z" */
 654 	if (*s == '.' || *s == ',') {
 655 		for (s++; ISDIGIT((unsigned char)*s); s++)
 656 			;
 657 	}
 658 
 659 	/* invalid range */
 660 	if (va[0] < 0 || va[0] > 9999 ||
 661 	    va[1] < 1 || va[1] > 12 ||
 662 	    va[2] < 1 || va[2] > 31 ||
 663 	    va[3] < 0 || va[3] > 23 ||
 664 	    va[4] < 0 || va[4] > 59 ||
 665 	    va[5] < 0 || va[5] > 60) /* allow leap second */
 666 		return -1;
 667 
 668 	*tp = datetounix(va[0] - 1900, va[1] - 1, va[2], va[3], va[4], va[5]) -
 669 	      gettzoffset(s);
 670 
 671 	return 0;
 672 }
 673 
 674 static void
 675 printfields(void)
 676 {
 677 	string_print_timestamp(&ctx.fields[FeedFieldTime].str);
 678 	putchar(FieldSeparator);
 679 	string_print_trimmed(&ctx.fields[FeedFieldTitle].str);
 680 	putchar(FieldSeparator);
 681 	string_print_uri(&ctx.fields[FeedFieldLink].str);
 682 	putchar(FieldSeparator);
 683 	string_print_encoded(&ctx.fields[FeedFieldContent].str);
 684 	putchar(FieldSeparator);
 685 	fputs(contenttypes[ctx.contenttype], stdout);
 686 	putchar(FieldSeparator);
 687 	string_print_trimmed(&ctx.fields[FeedFieldId].str);
 688 	putchar(FieldSeparator);
 689 	string_print_trimmed(&ctx.fields[FeedFieldAuthor].str);
 690 	putchar(FieldSeparator);
 691 	string_print_uri(&ctx.fields[FeedFieldEnclosure].str);
 692 	putchar(FieldSeparator);
 693 	string_print_trimmed_multi(&ctx.fields[FeedFieldCategory].str);
 694 	putchar('\n');
 695 
 696 	if (ferror(stdout)) /* check for errors but do not flush */
 697 		checkfileerror(stdout, "<stdout>", 'w');
 698 }
 699 
 700 static int
 701 istag(const char *name, size_t len, const char *name2, size_t len2)
 702 {
 703 	return (len == len2 && !strcasecmp(name, name2));
 704 }
 705 
 706 static int
 707 isattr(const char *name, size_t len, const char *name2, size_t len2)
 708 {
 709 	return (len == len2 && !strcasecmp(name, name2));
 710 }
 711 
 712 static void
 713 xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
 714 	const char *v, size_t vl)
 715 {
 716 	/* handles transforming inline XML to data */
 717 	if (ISINCONTENT(ctx)) {
 718 		if (ctx.contenttype == ContentTypeHTML)
 719 			xmldata(p, v, vl);
 720 		return;
 721 	}
 722 
 723 	if (!ctx.tag.id)
 724 		return;
 725 
 726 	/* content-type may be for Atom: text, xhtml, html or a mime-type.
 727 	 * for MRSS (media:description): plain, html. */
 728 	if (ISCONTENTTAG(ctx)) {
 729 		if (isattr(n, nl, STRP("type")))
 730 			string_append(&attrtype, v, vl);
 731 		return;
 732 	}
 733 
 734 	if (ctx.feedtype == FeedTypeRSS) {
 735 		if (ctx.tag.id == RSSTagEnclosure &&
 736 		    isattr(n, nl, STRP("url"))) {
 737 			string_append(&tmpstr, v, vl);
 738 		} else if (ctx.tag.id == RSSTagGuid &&
 739 		           isattr(n, nl, STRP("ispermalink"))) {
 740 			string_append(&attrispermalink, v, vl);
 741 		}
 742 	} else if (ctx.feedtype == FeedTypeAtom) {
 743 		if (ctx.tag.id == AtomTagLink) {
 744 			if (isattr(n, nl, STRP("rel"))) {
 745 				string_append(&attrrel, v, vl);
 746 			} else if (isattr(n, nl, STRP("href"))) {
 747 				string_append(&tmpstr, v, vl);
 748 			}
 749 		} else if (ctx.tag.id == AtomTagCategory &&
 750 			   isattr(n, nl, STRP("term"))) {
 751 			string_append(&tmpstr, v, vl);
 752 		}
 753 	}
 754 }
 755 
 756 static void
 757 xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
 758               const char *data, size_t datalen)
 759 {
 760 	char buf[8];
 761 	int len;
 762 
 763 	/* handles transforming inline XML to data */
 764 	if (ISINCONTENT(ctx)) {
 765 		if (ctx.contenttype == ContentTypeHTML)
 766 			xmldata(p, data, datalen);
 767 		return;
 768 	}
 769 
 770 	if (!ctx.tag.id)
 771 		return;
 772 
 773 	/* try to translate entity, else just pass as data to
 774 	 * xmlattr handler. */
 775 	if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
 776 		xmlattr(p, t, tl, n, nl, buf, (size_t)len);
 777 	else
 778 		xmlattr(p, t, tl, n, nl, data, datalen);
 779 }
 780 
 781 static void
 782 xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl)
 783 {
 784 	if (ISINCONTENT(ctx)) {
 785 		if (ctx.contenttype == ContentTypeHTML) {
 786 			/* handles transforming inline XML to data */
 787 			xmldata(p, "\"", 1);
 788 			ctx.attrcount = 0;
 789 		}
 790 		return;
 791 	}
 792 }
 793 
 794 static void
 795 xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl)
 796 {
 797 	if (ISINCONTENT(ctx)) {
 798 		if (ctx.contenttype == ContentTypeHTML) {
 799 			/* handles transforming inline XML to data */
 800 			if (!ctx.attrcount)
 801 				xmldata(p, " ", 1);
 802 			ctx.attrcount++;
 803 			xmldata(p, n, nl);
 804 			xmldata(p, "=\"", 2);
 805 		}
 806 		return;
 807 	}
 808 
 809 	if (attrispermalink.len && isattr(n, nl, STRP("ispermalink")))
 810 		string_clear(&attrispermalink);
 811 	else if (attrrel.len && isattr(n, nl, STRP("rel")))
 812 		string_clear(&attrrel);
 813 	else if (attrtype.len && isattr(n, nl, STRP("type")))
 814 		string_clear(&attrtype);
 815 	else if (tmpstr.len &&
 816 	    (isattr(n, nl, STRP("href")) ||
 817 	     isattr(n, nl, STRP("term")) ||
 818 	     isattr(n, nl, STRP("url"))))
 819 		string_clear(&tmpstr); /* use the last value for multiple attribute values */
 820 }
 821 
 822 static void
 823 xmldata(XMLParser *p, const char *s, size_t len)
 824 {
 825 	if (!ctx.field)
 826 		return;
 827 
 828 	if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
 829 		string_append(&tmpstr, s, len);
 830 	else
 831 		string_append(ctx.field, s, len);
 832 }
 833 
 834 static void
 835 xmldataentity(XMLParser *p, const char *data, size_t datalen)
 836 {
 837 	char buf[8];
 838 	int len;
 839 
 840 	if (!ctx.field)
 841 		return;
 842 
 843 	/* try to translate entity, else just pass as data to
 844 	 * xmldata handler. */
 845 	if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
 846 		xmldata(p, buf, (size_t)len);
 847 	else
 848 		xmldata(p, data, datalen);
 849 }
 850 
 851 static void
 852 xmltagstart(XMLParser *p, const char *t, size_t tl)
 853 {
 854 	const FeedTag *f;
 855 
 856 	if (ISINCONTENT(ctx)) {
 857 		if (ctx.contenttype == ContentTypeHTML) {
 858 			ctx.attrcount = 0;
 859 			xmldata(p, "<", 1);
 860 			xmldata(p, t, tl);
 861 		}
 862 		return;
 863 	}
 864 
 865 	/* start of RSS or Atom item / entry */
 866 	if (ctx.feedtype == FeedTypeNone) {
 867 		if (istag(t, tl, STRP("entry")))
 868 			ctx.feedtype = FeedTypeAtom;
 869 		else if (istag(t, tl, STRP("item")))
 870 			ctx.feedtype = FeedTypeRSS;
 871 		return;
 872 	}
 873 
 874 	/* field tagid already set or nested tags. */
 875 	if (ctx.tag.id) {
 876 		/* nested <author><name> for Atom */
 877 		if (ctx.tag.id == AtomTagAuthor &&
 878 		    istag(t, tl, STRP("name"))) {
 879 			memcpy(&(ctx.tag), &atomtagauthorname, sizeof(ctx.tag));
 880 		} else {
 881 			return; /* other nested tags are not allowed: return */
 882 		}
 883 	}
 884 
 885 	/* in item */
 886 	if (ctx.tag.id == TagUnknown) {
 887 		if (!(f = gettag(ctx.feedtype, t, tl)))
 888 			f = &notag;
 889 		memcpy(&(ctx.tag), f, sizeof(ctx.tag));
 890 	}
 891 
 892 	ctx.iscontenttag = (fieldmap[ctx.tag.id] == FeedFieldContent);
 893 	string_clear(&attrispermalink);
 894 	string_clear(&attrrel);
 895 	string_clear(&attrtype);
 896 }
 897 
 898 static void
 899 xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
 900 {
 901 	enum TagId tagid;
 902 
 903 	if (ISINCONTENT(ctx)) {
 904 		if (ctx.contenttype == ContentTypeHTML) {
 905 			if (isshort)
 906 				xmldata(p, "/>", 2);
 907 			else
 908 				xmldata(p, ">", 1);
 909 		}
 910 		return;
 911 	}
 912 
 913 	/* set tag type based on its attribute value */
 914 	if (ctx.tag.id == RSSTagGuid) {
 915 		/* if empty the default is "true" */
 916 		if (!attrispermalink.len ||
 917 		    isattr(attrispermalink.data, attrispermalink.len, STRP("true")))
 918 			ctx.tag.id = RSSTagGuidPermalinkTrue;
 919 		else
 920 			ctx.tag.id = RSSTagGuidPermalinkFalse;
 921 	} else if (ctx.tag.id == AtomTagLink) {
 922 		/* empty or "alternate": other types could be
 923 		 * "enclosure", "related", "self" or "via" */
 924 		if (!attrrel.len || isattr(attrrel.data, attrrel.len, STRP("alternate")))
 925 			ctx.tag.id = AtomTagLinkAlternate;
 926 		else if (isattr(attrrel.data, attrrel.len, STRP("enclosure")))
 927 			ctx.tag.id = AtomTagLinkEnclosure;
 928 		else
 929 			ctx.tag.id = AtomTagLink; /* unknown */
 930 	}
 931 
 932 	tagid = ctx.tag.id;
 933 
 934 	/* map tag type to field: unknown or lesser priority is ignored,
 935 	 * when tags of the same type are repeated only the first is used. */
 936 	if (fieldmap[tagid] == -1 ||
 937 	    (!ISFEEDFIELDMULTI(fieldmap[tagid]) &&
 938 	     tagid <= ctx.fields[fieldmap[tagid]].tagid)) {
 939 		return;
 940 	}
 941 
 942 	if (ctx.iscontenttag) {
 943 		ctx.iscontent = 1;
 944 		ctx.iscontenttag = 0;
 945 
 946 		/* detect content-type based on type attribute */
 947 		if (attrtype.len) {
 948 			if (isattr(attrtype.data, attrtype.len, STRP("html")) ||
 949 			    isattr(attrtype.data, attrtype.len, STRP("xhtml")) ||
 950 			    isattr(attrtype.data, attrtype.len, STRP("text/html")) ||
 951 			    isattr(attrtype.data, attrtype.len, STRP("text/xhtml")) ||
 952 			    isattr(attrtype.data, attrtype.len, STRP("application/xhtml+xml")))
 953 				ctx.contenttype = ContentTypeHTML;
 954 			else /* unknown: handle as base64 text data */
 955 				ctx.contenttype = ContentTypePlain;
 956 		} else {
 957 			/* default content-type */
 958 			if (tagid == RSSTagContentEncoded || tagid == RSSTagDescription)
 959 				ctx.contenttype = ContentTypeHTML;
 960 			else
 961 				ctx.contenttype = ContentTypePlain;
 962 		}
 963 	}
 964 
 965 	ctx.field = &(ctx.fields[fieldmap[tagid]].str);
 966 	ctx.fields[fieldmap[tagid]].tagid = tagid;
 967 
 968 	/* clear field if it is overwritten (with a priority order) for the new
 969 	 * value, if the field can have multiple values then do not clear it. */
 970 	if (!ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
 971 		string_clear(ctx.field);
 972 }
 973 
 974 static void
 975 xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
 976 {
 977 	size_t i;
 978 
 979 	if (ctx.feedtype == FeedTypeNone)
 980 		return;
 981 
 982 	if (ISINCONTENT(ctx)) {
 983 		/* not a closed content field */
 984 		if (!istag(ctx.tag.name, ctx.tag.len, t, tl)) {
 985 			if (!isshort && ctx.contenttype == ContentTypeHTML) {
 986 				xmldata(p, "</", 2);
 987 				xmldata(p, t, tl);
 988 				xmldata(p, ">", 1);
 989 			}
 990 			return;
 991 		}
 992 	} else if (ctx.tag.id && istag(ctx.tag.name, ctx.tag.len, t, tl)) {
 993 		/* matched tag end: close it.
 994 		 * copy also to the link field if the attribute isPermaLink="true"
 995 		 * and it is not set by a tag with higher priority. */
 996 		if (ctx.tag.id == RSSTagGuidPermalinkTrue && ctx.field &&
 997 		    ctx.tag.id > ctx.fields[FeedFieldLink].tagid) {
 998 			string_clear(&ctx.fields[FeedFieldLink].str);
 999 			string_append(&ctx.fields[FeedFieldLink].str,
1000 			              ctx.field->data, ctx.field->len);
1001 			ctx.fields[FeedFieldLink].tagid = ctx.tag.id;
1002 		}
1003 	} else if (!ctx.tag.id && ((ctx.feedtype == FeedTypeAtom &&
1004 	   istag(t, tl, STRP("entry"))) || /* Atom */
1005 	   (ctx.feedtype == FeedTypeRSS &&
1006 	   istag(t, tl, STRP("item"))))) /* RSS */
1007 	{
1008 		/* end of RSS or Atom entry / item */
1009 		printfields();
1010 
1011 		/* clear strings */
1012 		for (i = 0; i < FeedFieldLast; i++) {
1013 			string_clear(&ctx.fields[i].str);
1014 			ctx.fields[i].tagid = TagUnknown;
1015 		}
1016 		ctx.contenttype = ContentTypeNone;
1017 		/* allow parsing of Atom and RSS concatenated in one XML stream. */
1018 		ctx.feedtype = FeedTypeNone;
1019 	} else {
1020 		return; /* not end of field */
1021 	}
1022 
1023 	/* temporary string: for fields that cannot be processed
1024 	 * directly and need more context, for example by its tag
1025 	 * attributes, like the Atom link rel="alternate|enclosure". */
1026 	if (tmpstr.len && ctx.field) {
1027 		if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) {
1028 			if (ctx.field->len)
1029 				string_append(ctx.field, FieldMultiSeparator, 1);
1030 			string_append(ctx.field, tmpstr.data, tmpstr.len);
1031 		} else {
1032 			string_clear(ctx.field);
1033 			string_append(ctx.field, tmpstr.data, tmpstr.len);
1034 		}
1035 	}
1036 
1037 	/* close field */
1038 	string_clear(&tmpstr); /* reuse and clear temporary string */
1039 
1040 	if (ctx.tag.id == AtomTagAuthorName)
1041 		memcpy(&(ctx.tag), &atomtagauthor, sizeof(ctx.tag)); /* outer tag */
1042 	else
1043 		memcpy(&(ctx.tag), &notag, sizeof(ctx.tag));
1044 
1045 	ctx.iscontent = 0;
1046 	ctx.field = NULL;
1047 }
1048 
1049 int
1050 main(int argc, char *argv[])
1051 {
1052 	if (pledge("stdio", NULL) == -1)
1053 		err(1, "pledge");
1054 
1055 	if (argc > 1) {
1056 		if (uri_parse(argv[1], &baseuri) != -1 && baseuri.proto[0])
1057 			baseurl = argv[1];
1058 		else
1059 			errx(1, "baseurl incorrect or too long");
1060 	}
1061 
1062 	memcpy(&(ctx.tag), &notag, sizeof(ctx.tag));
1063 
1064 	parser.xmlattr = xmlattr;
1065 	parser.xmlattrentity = xmlattrentity;
1066 	parser.xmlattrend = xmlattrend;
1067 	parser.xmlattrstart = xmlattrstart;
1068 	parser.xmlcdata = xmldata;
1069 	parser.xmldata = xmldata;
1070 	parser.xmldataentity = xmldataentity;
1071 	parser.xmltagend = xmltagend;
1072 	parser.xmltagstart = xmltagstart;
1073 	parser.xmltagstartparsed = xmltagstartparsed;
1074 
1075 	/* NOTE: GETNEXT is defined in xml.h for inline optimization */
1076 	xml_parse(&parser);
1077 
1078 	checkfileerror(stdin, "<stdin>", 'r');
1079 	checkfileerror(stdout, "<stdout>", 'w');
1080 
1081 	return 0;
1082 }