/* sfeed.c (raw) (30076B) */
1 #include <errno.h> 2 #include <stdint.h> 3 #include <stdio.h> 4 #include <stdlib.h> 5 #include <string.h> 6 #include <strings.h> 7 8 #include "util.h" 9 #include "xml.h" 10 11 #define ISINCONTENT(ctx) ((ctx).iscontent && !((ctx).iscontenttag)) 12 #define ISCONTENTTAG(ctx) (!((ctx).iscontent) && (ctx).iscontenttag) 13 14 /* these feed fields support multiple separated values */ 15 #define ISFEEDFIELDMULTI(t) ((t) == FeedFieldCategory) 16 17 /* string and byte-length */ 18 #define STRP(s) s,sizeof(s)-1 19 20 enum FeedType { 21 FeedTypeNone = 0, 22 FeedTypeRSS = 1, 23 FeedTypeAtom = 2 24 }; 25 26 enum ContentType { 27 ContentTypeNone = 0, 28 ContentTypePlain = 1, 29 ContentTypeHTML = 2 30 }; 31 static const char *contenttypes[] = { "", "plain", "html" }; 32 33 /* String data / memory pool */ 34 typedef struct string { 35 char *data; /* data */ 36 size_t len; /* string length */ 37 size_t bufsiz; /* allocated size */ 38 } String; 39 40 /* NOTE: the order of these fields (content, date, author) indicate the 41 * priority to use them, from least important to high. 
*/ 42 enum TagId { 43 TagUnknown = 0, 44 /* RSS */ 45 RSSTagDcdate, RSSTagPubdate, /* creation date has higher priority */ 46 RSSTagTitle, 47 RSSTagMediaDescription, RSSTagDescription, RSSTagContentEncoded, 48 RSSTagGuid, 49 RSSTagGuidPermalinkFalse, 50 RSSTagGuidPermalinkTrue, 51 /* must be defined after GUID, because it can be a link (isPermaLink) */ 52 RSSTagLink, 53 RSSTagEnclosure, 54 RSSTagAuthor, RSSTagDccreator, 55 RSSTagCategory, 56 /* Atom */ 57 /* creation date has higher priority */ 58 AtomTagModified, AtomTagUpdated, AtomTagIssued, AtomTagPublished, 59 AtomTagTitle, 60 AtomTagMediaDescription, AtomTagSummary, AtomTagContent, 61 AtomTagId, 62 AtomTagLink, 63 AtomTagLinkAlternate, 64 AtomTagLinkEnclosure, 65 AtomTagAuthor, AtomTagAuthorName, 66 AtomTagCategory, 67 TagLast 68 }; 69 70 typedef struct feedtag { 71 char *name; /* name of tag to match */ 72 size_t len; /* len of `name` */ 73 enum TagId id; /* unique ID */ 74 } FeedTag; 75 76 typedef struct field { 77 String str; 78 enum TagId tagid; /* tagid set previously, used for tag priority */ 79 } FeedField; 80 81 enum { 82 FeedFieldTime = 0, FeedFieldTitle, FeedFieldLink, FeedFieldContent, 83 FeedFieldId, FeedFieldAuthor, FeedFieldEnclosure, FeedFieldCategory, 84 FeedFieldLast 85 }; 86 87 typedef struct feedcontext { 88 String *field; /* current FeedItem field String */ 89 FeedField fields[FeedFieldLast]; /* data for current item */ 90 FeedTag tag; /* unique current parsed tag */ 91 int iscontent; /* in content data */ 92 int iscontenttag; /* in content tag */ 93 enum ContentType contenttype; /* content-type for item */ 94 enum FeedType feedtype; 95 int attrcount; /* count item HTML element attributes */ 96 } FeedContext; 97 98 static long long datetounix(long long, int, int, int, int, int); 99 static FeedTag * gettag(enum FeedType, const char *, size_t); 100 static long gettzoffset(const char *); 101 static int isattr(const char *, size_t, const char *, size_t); 102 static int istag(const char *, 
size_t, const char *, size_t); 103 static int parsetime(const char *, long long *); 104 static void printfields(void); 105 static void string_append(String *, const char *, size_t); 106 static void string_buffer_realloc(String *, size_t); 107 static void string_clear(String *); 108 static void string_print_encoded(String *); 109 static void string_print_timestamp(String *); 110 static void string_print_trimmed(String *); 111 static void string_print_trimmed_multi(String *); 112 static void string_print_uri(String *); 113 static void xmlattr(XMLParser *, const char *, size_t, const char *, size_t, 114 const char *, size_t); 115 static void xmlattrentity(XMLParser *, const char *, size_t, const char *, 116 size_t, const char *, size_t); 117 static void xmlattrend(XMLParser *, const char *, size_t, const char *, 118 size_t); 119 static void xmlattrstart(XMLParser *, const char *, size_t, const char *, 120 size_t); 121 static void xmldata(XMLParser *, const char *, size_t); 122 static void xmldataentity(XMLParser *, const char *, size_t); 123 static void xmltagend(XMLParser *, const char *, size_t, int); 124 static void xmltagstart(XMLParser *, const char *, size_t); 125 static void xmltagstartparsed(XMLParser *, const char *, size_t, int); 126 127 /* map tag name to TagId type */ 128 /* RSS, keep this in alphabetical order */ 129 static const FeedTag rsstags[] = { 130 { STRP("author"), RSSTagAuthor }, 131 { STRP("category"), RSSTagCategory }, 132 { STRP("content:encoded"), RSSTagContentEncoded }, 133 { STRP("dc:creator"), RSSTagDccreator }, 134 { STRP("dc:date"), RSSTagDcdate }, 135 { STRP("description"), RSSTagDescription }, 136 /* RSS: <enclosure url="" />, Atom has <link rel="enclosure" /> */ 137 { STRP("enclosure"), RSSTagEnclosure }, 138 { STRP("guid"), RSSTagGuid }, 139 { STRP("link"), RSSTagLink }, 140 { STRP("media:description"), RSSTagMediaDescription }, 141 { STRP("pubdate"), RSSTagPubdate }, 142 { STRP("title"), RSSTagTitle } 143 }; 144 145 /* Atom, keep 
this in alphabetical order */ 146 static const FeedTag atomtags[] = { 147 { STRP("author"), AtomTagAuthor }, 148 { STRP("category"), AtomTagCategory }, 149 { STRP("content"), AtomTagContent }, 150 { STRP("id"), AtomTagId }, 151 { STRP("issued"), AtomTagIssued }, /* Atom 0.3 */ 152 /* Atom: <link href="" />, RSS has <link></link> */ 153 { STRP("link"), AtomTagLink }, 154 { STRP("media:description"), AtomTagMediaDescription }, 155 { STRP("modified"), AtomTagModified }, /* Atom 0.3 */ 156 { STRP("published"), AtomTagPublished }, 157 { STRP("summary"), AtomTagSummary }, 158 { STRP("title"), AtomTagTitle }, 159 { STRP("updated"), AtomTagUpdated } 160 }; 161 162 /* special case: nested <author><name> */ 163 static const FeedTag atomtagauthor = { STRP("author"), AtomTagAuthor }; 164 static const FeedTag atomtagauthorname = { STRP("name"), AtomTagAuthorName }; 165 166 /* reference to no / unknown tag */ 167 static const FeedTag notag = { STRP(""), TagUnknown }; 168 169 /* map TagId type to RSS/Atom field, all tags must be defined */ 170 static const int fieldmap[TagLast] = { 171 [TagUnknown] = -1, 172 /* RSS */ 173 [RSSTagDcdate] = FeedFieldTime, 174 [RSSTagPubdate] = FeedFieldTime, 175 [RSSTagTitle] = FeedFieldTitle, 176 [RSSTagMediaDescription] = FeedFieldContent, 177 [RSSTagDescription] = FeedFieldContent, 178 [RSSTagContentEncoded] = FeedFieldContent, 179 [RSSTagGuid] = -1, 180 [RSSTagGuidPermalinkFalse] = FeedFieldId, 181 [RSSTagGuidPermalinkTrue] = FeedFieldId, /* special-case: both a link and an id */ 182 [RSSTagLink] = FeedFieldLink, 183 [RSSTagEnclosure] = FeedFieldEnclosure, 184 [RSSTagAuthor] = FeedFieldAuthor, 185 [RSSTagDccreator] = FeedFieldAuthor, 186 [RSSTagCategory] = FeedFieldCategory, 187 /* Atom */ 188 [AtomTagModified] = FeedFieldTime, 189 [AtomTagUpdated] = FeedFieldTime, 190 [AtomTagIssued] = FeedFieldTime, 191 [AtomTagPublished] = FeedFieldTime, 192 [AtomTagTitle] = FeedFieldTitle, 193 [AtomTagMediaDescription] = FeedFieldContent, 194 
[AtomTagSummary] = FeedFieldContent, 195 [AtomTagContent] = FeedFieldContent, 196 [AtomTagId] = FeedFieldId, 197 [AtomTagLink] = -1, 198 [AtomTagLinkAlternate] = FeedFieldLink, 199 [AtomTagLinkEnclosure] = FeedFieldEnclosure, 200 [AtomTagAuthor] = -1, 201 [AtomTagAuthorName] = FeedFieldAuthor, 202 [AtomTagCategory] = FeedFieldCategory 203 }; 204 205 static const int FieldSeparator = '\t'; 206 /* separator for multiple values in a field, separator should be 1 byte */ 207 static const char FieldMultiSeparator[] = "|"; 208 static struct uri baseuri; 209 static const char *baseurl; 210 211 static FeedContext ctx; 212 static XMLParser parser; /* XML parser state */ 213 static String attrispermalink, attrrel, attrtype, tmpstr; 214 215 /* Unique tag(id) for parsed tag name. */ 216 static FeedTag * 217 gettag(enum FeedType feedtype, const char *name, size_t namelen) 218 { 219 FeedTag *r; 220 size_t i; 221 222 switch (feedtype) { 223 case FeedTypeRSS: 224 for (i = 0; i < sizeof(rsstags) / sizeof(rsstags[0]); i++) { 225 r = (FeedTag *)&rsstags[i]; 226 if (r->len == namelen && !strcasecmp(r->name, name)) 227 return r; 228 } 229 break; 230 case FeedTypeAtom: 231 for (i = 0; i < sizeof(atomtags) / sizeof(atomtags[0]); i++) { 232 r = (FeedTag *)&atomtags[i]; 233 if (r->len == namelen && !strcasecmp(r->name, name)) 234 return r; 235 } 236 break; 237 default: 238 break; 239 } 240 241 return NULL; 242 } 243 244 static char * 245 ltrim(const char *s) 246 { 247 for (; ISSPACE((unsigned char)*s); s++) 248 ; 249 return (char *)s; 250 } 251 252 static char * 253 rtrim(const char *s) 254 { 255 const char *e; 256 257 for (e = s + strlen(s); e > s && ISSPACE((unsigned char)*(e - 1)); e--) 258 ; 259 return (char *)e; 260 } 261 262 /* Clear string only; don't free, prevents unnecessary reallocation. 
*/ 263 static void 264 string_clear(String *s) 265 { 266 if (s->data) 267 s->data[0] = '\0'; 268 s->len = 0; 269 } 270 271 static void 272 string_buffer_realloc(String *s, size_t newlen) 273 { 274 size_t alloclen; 275 276 if (newlen > SIZE_MAX / 2) { 277 alloclen = SIZE_MAX; 278 } else { 279 for (alloclen = 64; alloclen <= newlen; alloclen *= 2) 280 ; 281 } 282 if (!(s->data = realloc(s->data, alloclen))) 283 err(1, "realloc"); 284 s->bufsiz = alloclen; 285 } 286 287 /* Append data to String, s->data and data may not overlap. */ 288 static void 289 string_append(String *s, const char *data, size_t len) 290 { 291 if (!len) 292 return; 293 294 if (s->len >= SIZE_MAX - len) { 295 errno = ENOMEM; 296 err(1, "realloc"); 297 } 298 299 /* check if allocation is necessary, never shrink the buffer. */ 300 if (s->len + len >= s->bufsiz) 301 string_buffer_realloc(s, s->len + len + 1); 302 memcpy(s->data + s->len, data, len); 303 s->len += len; 304 s->data[s->len] = '\0'; 305 } 306 307 /* Print text, encode TABs, newlines and '\', remove other whitespace. 308 * Remove leading and trailing whitespace. 
*/ 309 static void 310 string_print_encoded(String *s) 311 { 312 const char *p, *e; 313 314 if (!s->data || !s->len) 315 return; 316 317 p = ltrim(s->data); 318 e = rtrim(p); 319 320 for (; *p && p != e; p++) { 321 switch (*p) { 322 case '\n': putchar('\\'); putchar('n'); break; 323 case '\\': putchar('\\'); putchar('\\'); break; 324 case '\t': putchar('\\'); putchar('t'); break; 325 default: 326 /* ignore control chars */ 327 if (!ISCNTRL((unsigned char)*p)) 328 putchar(*p); 329 break; 330 } 331 } 332 } 333 334 static void 335 printtrimmed(const char *s) 336 { 337 char *p, *e; 338 339 p = ltrim(s); 340 e = rtrim(p); 341 for (; *p && p != e; p++) { 342 if (ISSPACE((unsigned char)*p)) 343 putchar(' '); /* any whitespace to space */ 344 else if (!ISCNTRL((unsigned char)*p)) 345 /* ignore other control chars */ 346 putchar(*p); 347 } 348 } 349 350 /* Print text, replace TABs, carriage return and other whitespace with ' '. 351 * Other control chars are removed. Remove leading and trailing whitespace. */ 352 static void 353 string_print_trimmed(String *s) 354 { 355 if (!s->data || !s->len) 356 return; 357 358 printtrimmed(s->data); 359 } 360 361 /* Print each field with trimmed whitespace, separated by '|'. */ 362 static void 363 string_print_trimmed_multi(String *s) 364 { 365 char *p, *e; 366 int c; 367 368 if (!s->data || !s->len) 369 return; 370 371 for (p = s->data; ; p = e + 1) { 372 if ((e = strstr(p, FieldMultiSeparator))) { 373 c = *e; 374 *e = '\0'; 375 printtrimmed(p); 376 *e = c; /* restore NUL byte to original character */ 377 fputs(FieldMultiSeparator, stdout); 378 } else { 379 printtrimmed(p); 380 break; 381 } 382 } 383 } 384 385 /* Print URL, if it is a relative URL then it uses the global `baseurl`. 
*/ 386 static void 387 printuri(char *s) 388 { 389 char link[4096], *p, *e; 390 struct uri newuri, olduri; 391 int c, r = -1; 392 393 p = ltrim(s); 394 e = rtrim(p); 395 c = *e; 396 *e = '\0'; 397 398 if (baseurl && !uri_hasscheme(p) && 399 uri_parse(p, &olduri) != -1 && !olduri.proto[0] && 400 uri_makeabs(&newuri, &olduri, &baseuri) != -1 && newuri.proto[0]) 401 r = uri_format(link, sizeof(link), &newuri); 402 403 if (r >= 0 && (size_t)r < sizeof(link)) 404 printtrimmed(link); 405 else 406 printtrimmed(p); 407 408 *e = c; /* restore NUL byte to original character */ 409 } 410 411 /* Print URL, if it is a relative URL then it uses the global `baseurl`. */ 412 static void 413 string_print_uri(String *s) 414 { 415 if (!s->data || !s->len) 416 return; 417 418 printuri(s->data); 419 } 420 421 /* Print as UNIX timestamp, print nothing if the time is empty or invalid. */ 422 static void 423 string_print_timestamp(String *s) 424 { 425 long long t; 426 427 if (!s->data || !s->len) 428 return; 429 430 if (parsetime(s->data, &t) != -1) 431 printf("%lld", t); 432 } 433 434 /* Convert time fields. Returns a signed (at least) 64-bit UNIX timestamp. 435 * Parameters should be passed as they are in a struct tm: 436 * that is: year = year - 1900, month = month - 1. 
*/ 437 static long long 438 datetounix(long long year, int mon, int day, int hour, int min, int sec) 439 { 440 /* seconds in a month in a regular (non-leap) year */ 441 static const long secs_through_month[] = { 442 0, 31 * 86400, 59 * 86400, 90 * 86400, 443 120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400, 444 243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 }; 445 int is_leap = 0, cycles, centuries = 0, leaps = 0, rem; 446 long long t; 447 448 /* optimization: handle common range year 1902 up to and including 2038 */ 449 if (year - 2ULL <= 136) { 450 /* amount of leap days relative to 1970: every 4 years */ 451 leaps = (year - 68) >> 2; 452 if (!((year - 68) & 3)) { 453 leaps--; 454 is_leap = 1; 455 } else { 456 is_leap = 0; 457 } 458 t = 31536000 * (year - 70) + (86400 * leaps); /* 365 * 86400 = 31536000 */ 459 } else { 460 /* general leap year calculation: 461 * leap years occur mostly every 4 years but every 100 years 462 * a leap year is skipped unless the year is divisible by 400 */ 463 cycles = (year - 100) / 400; 464 rem = (year - 100) % 400; 465 if (rem < 0) { 466 cycles--; 467 rem += 400; 468 } 469 if (!rem) { 470 is_leap = 1; 471 } else { 472 if (rem >= 300) { 473 centuries = 3; 474 rem -= 300; 475 } else if (rem >= 200) { 476 centuries = 2; 477 rem -= 200; 478 } else if (rem >= 100) { 479 centuries = 1; 480 rem -= 100; 481 } 482 if (rem) { 483 leaps = rem / 4U; 484 rem %= 4U; 485 is_leap = !rem; 486 } 487 } 488 leaps += (97 * cycles) + (24 * centuries) - is_leap; 489 490 /* adjust 8 leap days from 1970 up to and including 2000: 491 * ((30 * 365) + 8) * 86400 = 946771200 */ 492 t = ((year - 100) * 31536000LL) + (leaps * 86400LL) + 946771200LL; 493 } 494 t += secs_through_month[mon]; 495 if (is_leap && mon >= 2) 496 t += 86400; 497 t += 86400LL * (day - 1); 498 t += 3600LL * hour; 499 t += 60LL * min; 500 t += sec; 501 502 return t; 503 } 504 505 /* Get timezone from string, return time offset in seconds from UTC. 
506 * NOTE: only parses timezones in RFC 822, many other timezone names are 507 * ambiguous anyway. 508 * ANSI and military zones are defined wrong in RFC 822 and are unsupported, 509 * see note on RFC 2822 4.3 page 32. */ 510 static long 511 gettzoffset(const char *s) 512 { 513 static const struct { 514 char *name; 515 int offhour; 516 } tzones[] = { 517 { "CDT", -5 * 3600 }, 518 { "CST", -6 * 3600 }, 519 { "EDT", -4 * 3600 }, 520 { "EST", -5 * 3600 }, 521 { "MDT", -6 * 3600 }, 522 { "MST", -7 * 3600 }, 523 { "PDT", -7 * 3600 }, 524 { "PST", -8 * 3600 }, 525 }; 526 const char *p; 527 long tzhour = 0, tzmin = 0; 528 size_t i; 529 530 for (; ISSPACE((unsigned char)*s); s++) 531 ; 532 switch (*s) { 533 case '-': /* offset */ 534 case '+': 535 for (i = 0, p = s + 1; i < 2 && ISDIGIT((unsigned char)*p); i++, p++) 536 tzhour = (tzhour * 10) + (*p - '0'); 537 if (*p == ':') 538 p++; 539 for (i = 0; i < 2 && ISDIGIT((unsigned char)*p); i++, p++) 540 tzmin = (tzmin * 10) + (*p - '0'); 541 return ((tzhour * 3600) + (tzmin * 60)) * (s[0] == '-' ? -1 : 1); 542 default: /* timezone name */ 543 for (i = 0; ISALPHA((unsigned char)s[i]); i++) 544 ; 545 if (i != 3) 546 return 0; 547 /* compare timezone and adjust offset relative to UTC */ 548 for (i = 0; i < sizeof(tzones) / sizeof(*tzones); i++) { 549 if (!memcmp(s, tzones[i].name, 3)) 550 return tzones[i].offhour; 551 } 552 } 553 return 0; 554 } 555 556 /* Parse time string `s` into the UNIX timestamp `tp`. 557 * Returns 0 on success or -1 on failure. 
*/ 558 static int 559 parsetime(const char *s, long long *tp) 560 { 561 static const struct { 562 char *name; 563 int len; 564 } mons[] = { 565 { STRP("January"), }, 566 { STRP("February"), }, 567 { STRP("March"), }, 568 { STRP("April"), }, 569 { STRP("May"), }, 570 { STRP("June"), }, 571 { STRP("July"), }, 572 { STRP("August"), }, 573 { STRP("September"), }, 574 { STRP("October"), }, 575 { STRP("November"), }, 576 { STRP("December"), }, 577 }; 578 int va[6] = { 0 }, i, j, v, vi; 579 size_t m; 580 581 for (; ISSPACE((unsigned char)*s); s++) 582 ; 583 if (!ISDIGIT((unsigned char)*s) && !ISALPHA((unsigned char)*s)) 584 return -1; 585 586 if (ISDIGIT((unsigned char)s[0]) && 587 ISDIGIT((unsigned char)s[1]) && 588 ISDIGIT((unsigned char)s[2]) && 589 ISDIGIT((unsigned char)s[3])) { 590 /* formats "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or "%Y%m%d%H%M%S" */ 591 vi = 0; 592 } else { 593 /* format: "[%a, ]%d %b %Y %H:%M:%S" */ 594 /* parse "[%a, ]%d %b %Y " part, then use time parsing as above */ 595 for (; ISALPHA((unsigned char)*s); s++) 596 ; 597 for (; ISSPACE((unsigned char)*s); s++) 598 ; 599 if (*s == ',') 600 s++; 601 for (; ISSPACE((unsigned char)*s); s++) 602 ; 603 for (v = 0, i = 0; i < 2 && ISDIGIT((unsigned char)*s); s++, i++) 604 v = (v * 10) + (*s - '0'); 605 va[2] = v; /* day */ 606 for (; ISSPACE((unsigned char)*s); s++) 607 ; 608 /* end of word month */ 609 for (j = 0; ISALPHA((unsigned char)s[j]); j++) 610 ; 611 /* check month name */ 612 if (j < 3 || j > 9) 613 return -1; /* month cannot match */ 614 for (m = 0; m < sizeof(mons) / sizeof(*mons); m++) { 615 /* abbreviation (3 length) or long name */ 616 if ((j == 3 || j == mons[m].len) && 617 !strncasecmp(mons[m].name, s, j)) { 618 va[1] = m + 1; 619 s += j; 620 break; 621 } 622 } 623 if (m >= 12) 624 return -1; /* no month found */ 625 for (; ISSPACE((unsigned char)*s); s++) 626 ; 627 for (v = 0, i = 0; i < 4 && ISDIGIT((unsigned char)*s); s++, i++) 628 v = (v * 10) + (*s - '0'); 629 /* obsolete short 
year: RFC 2822 4.3 */ 630 if (i == 2 || i == 3) 631 v += (i == 2 && v >= 0 && v <= 49) ? 2000 : 1900; 632 va[0] = v; /* year */ 633 for (; ISSPACE((unsigned char)*s); s++) 634 ; 635 /* parse only regular time part, see below */ 636 vi = 3; 637 } 638 639 /* parse time parts (and possibly remaining date parts) */ 640 for (; *s && vi < 6; vi++) { 641 for (i = 0, v = 0; i < ((vi == 0) ? 4 : 2) && 642 ISDIGIT((unsigned char)*s); s++, i++) { 643 v = (v * 10) + (*s - '0'); 644 } 645 va[vi] = v; 646 647 if ((vi < 2 && (*s == '-' || *s == '/')) || 648 (vi == 2 && (*s == 'T' || *s == 't' || ISSPACE((unsigned char)*s))) || 649 (vi > 2 && *s == ':')) 650 s++; 651 } 652 653 /* skip milliseconds in for example: "%Y-%m-%dT%H:%M:%S.000Z" */ 654 if (*s == '.' || *s == ',') { 655 for (s++; ISDIGIT((unsigned char)*s); s++) 656 ; 657 } 658 659 /* invalid range */ 660 if (va[0] < 0 || va[0] > 9999 || 661 va[1] < 1 || va[1] > 12 || 662 va[2] < 1 || va[2] > 31 || 663 va[3] < 0 || va[3] > 23 || 664 va[4] < 0 || va[4] > 59 || 665 va[5] < 0 || va[5] > 60) /* allow leap second */ 666 return -1; 667 668 *tp = datetounix(va[0] - 1900, va[1] - 1, va[2], va[3], va[4], va[5]) - 669 gettzoffset(s); 670 671 return 0; 672 } 673 674 static void 675 printfields(void) 676 { 677 string_print_timestamp(&ctx.fields[FeedFieldTime].str); 678 putchar(FieldSeparator); 679 string_print_trimmed(&ctx.fields[FeedFieldTitle].str); 680 putchar(FieldSeparator); 681 string_print_uri(&ctx.fields[FeedFieldLink].str); 682 putchar(FieldSeparator); 683 string_print_encoded(&ctx.fields[FeedFieldContent].str); 684 putchar(FieldSeparator); 685 fputs(contenttypes[ctx.contenttype], stdout); 686 putchar(FieldSeparator); 687 string_print_trimmed(&ctx.fields[FeedFieldId].str); 688 putchar(FieldSeparator); 689 string_print_trimmed(&ctx.fields[FeedFieldAuthor].str); 690 putchar(FieldSeparator); 691 string_print_uri(&ctx.fields[FeedFieldEnclosure].str); 692 putchar(FieldSeparator); 693 
string_print_trimmed_multi(&ctx.fields[FeedFieldCategory].str); 694 putchar('\n'); 695 696 if (ferror(stdout)) /* check for errors but do not flush */ 697 checkfileerror(stdout, "<stdout>", 'w'); 698 } 699 700 static int 701 istag(const char *name, size_t len, const char *name2, size_t len2) 702 { 703 return (len == len2 && !strcasecmp(name, name2)); 704 } 705 706 static int 707 isattr(const char *name, size_t len, const char *name2, size_t len2) 708 { 709 return (len == len2 && !strcasecmp(name, name2)); 710 } 711 712 static void 713 xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl, 714 const char *v, size_t vl) 715 { 716 /* handles transforming inline XML to data */ 717 if (ISINCONTENT(ctx)) { 718 if (ctx.contenttype == ContentTypeHTML) 719 xmldata(p, v, vl); 720 return; 721 } 722 723 if (!ctx.tag.id) 724 return; 725 726 /* content-type may be for Atom: text, xhtml, html or a mime-type. 727 * for MRSS (media:description): plain, html. */ 728 if (ISCONTENTTAG(ctx)) { 729 if (isattr(n, nl, STRP("type"))) 730 string_append(&attrtype, v, vl); 731 return; 732 } 733 734 if (ctx.feedtype == FeedTypeRSS) { 735 if (ctx.tag.id == RSSTagEnclosure && 736 isattr(n, nl, STRP("url"))) { 737 string_append(&tmpstr, v, vl); 738 } else if (ctx.tag.id == RSSTagGuid && 739 isattr(n, nl, STRP("ispermalink"))) { 740 string_append(&attrispermalink, v, vl); 741 } 742 } else if (ctx.feedtype == FeedTypeAtom) { 743 if (ctx.tag.id == AtomTagLink) { 744 if (isattr(n, nl, STRP("rel"))) { 745 string_append(&attrrel, v, vl); 746 } else if (isattr(n, nl, STRP("href"))) { 747 string_append(&tmpstr, v, vl); 748 } 749 } else if (ctx.tag.id == AtomTagCategory && 750 isattr(n, nl, STRP("term"))) { 751 string_append(&tmpstr, v, vl); 752 } 753 } 754 } 755 756 static void 757 xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl, 758 const char *data, size_t datalen) 759 { 760 char buf[8]; 761 int len; 762 763 /* handles transforming inline XML to data */ 
764 if (ISINCONTENT(ctx)) { 765 if (ctx.contenttype == ContentTypeHTML) 766 xmldata(p, data, datalen); 767 return; 768 } 769 770 if (!ctx.tag.id) 771 return; 772 773 /* try to translate entity, else just pass as data to 774 * xmlattr handler. */ 775 if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0) 776 xmlattr(p, t, tl, n, nl, buf, (size_t)len); 777 else 778 xmlattr(p, t, tl, n, nl, data, datalen); 779 } 780 781 static void 782 xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl) 783 { 784 if (ISINCONTENT(ctx)) { 785 if (ctx.contenttype == ContentTypeHTML) { 786 /* handles transforming inline XML to data */ 787 xmldata(p, "\"", 1); 788 ctx.attrcount = 0; 789 } 790 return; 791 } 792 } 793 794 static void 795 xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl) 796 { 797 if (ISINCONTENT(ctx)) { 798 if (ctx.contenttype == ContentTypeHTML) { 799 /* handles transforming inline XML to data */ 800 if (!ctx.attrcount) 801 xmldata(p, " ", 1); 802 ctx.attrcount++; 803 xmldata(p, n, nl); 804 xmldata(p, "=\"", 2); 805 } 806 return; 807 } 808 809 if (attrispermalink.len && isattr(n, nl, STRP("ispermalink"))) 810 string_clear(&attrispermalink); 811 else if (attrrel.len && isattr(n, nl, STRP("rel"))) 812 string_clear(&attrrel); 813 else if (attrtype.len && isattr(n, nl, STRP("type"))) 814 string_clear(&attrtype); 815 else if (tmpstr.len && 816 (isattr(n, nl, STRP("href")) || 817 isattr(n, nl, STRP("term")) || 818 isattr(n, nl, STRP("url")))) 819 string_clear(&tmpstr); /* use the last value for multiple attribute values */ 820 } 821 822 static void 823 xmldata(XMLParser *p, const char *s, size_t len) 824 { 825 if (!ctx.field) 826 return; 827 828 if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) 829 string_append(&tmpstr, s, len); 830 else 831 string_append(ctx.field, s, len); 832 } 833 834 static void 835 xmldataentity(XMLParser *p, const char *data, size_t datalen) 836 { 837 char buf[8]; 838 int len; 839 840 if (!ctx.field) 841 
return; 842 843 /* try to translate entity, else just pass as data to 844 * xmldata handler. */ 845 if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0) 846 xmldata(p, buf, (size_t)len); 847 else 848 xmldata(p, data, datalen); 849 } 850 851 static void 852 xmltagstart(XMLParser *p, const char *t, size_t tl) 853 { 854 const FeedTag *f; 855 856 if (ISINCONTENT(ctx)) { 857 if (ctx.contenttype == ContentTypeHTML) { 858 ctx.attrcount = 0; 859 xmldata(p, "<", 1); 860 xmldata(p, t, tl); 861 } 862 return; 863 } 864 865 /* start of RSS or Atom item / entry */ 866 if (ctx.feedtype == FeedTypeNone) { 867 if (istag(t, tl, STRP("entry"))) 868 ctx.feedtype = FeedTypeAtom; 869 else if (istag(t, tl, STRP("item"))) 870 ctx.feedtype = FeedTypeRSS; 871 return; 872 } 873 874 /* field tagid already set or nested tags. */ 875 if (ctx.tag.id) { 876 /* nested <author><name> for Atom */ 877 if (ctx.tag.id == AtomTagAuthor && 878 istag(t, tl, STRP("name"))) { 879 memcpy(&(ctx.tag), &atomtagauthorname, sizeof(ctx.tag)); 880 } else { 881 return; /* other nested tags are not allowed: return */ 882 } 883 } 884 885 /* in item */ 886 if (ctx.tag.id == TagUnknown) { 887 if (!(f = gettag(ctx.feedtype, t, tl))) 888 f = ¬ag; 889 memcpy(&(ctx.tag), f, sizeof(ctx.tag)); 890 } 891 892 ctx.iscontenttag = (fieldmap[ctx.tag.id] == FeedFieldContent); 893 string_clear(&attrispermalink); 894 string_clear(&attrrel); 895 string_clear(&attrtype); 896 } 897 898 static void 899 xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) 900 { 901 enum TagId tagid; 902 903 if (ISINCONTENT(ctx)) { 904 if (ctx.contenttype == ContentTypeHTML) { 905 if (isshort) 906 xmldata(p, "/>", 2); 907 else 908 xmldata(p, ">", 1); 909 } 910 return; 911 } 912 913 /* set tag type based on its attribute value */ 914 if (ctx.tag.id == RSSTagGuid) { 915 /* if empty the default is "true" */ 916 if (!attrispermalink.len || 917 isattr(attrispermalink.data, attrispermalink.len, STRP("true"))) 918 ctx.tag.id = 
RSSTagGuidPermalinkTrue; 919 else 920 ctx.tag.id = RSSTagGuidPermalinkFalse; 921 } else if (ctx.tag.id == AtomTagLink) { 922 /* empty or "alternate": other types could be 923 * "enclosure", "related", "self" or "via" */ 924 if (!attrrel.len || isattr(attrrel.data, attrrel.len, STRP("alternate"))) 925 ctx.tag.id = AtomTagLinkAlternate; 926 else if (isattr(attrrel.data, attrrel.len, STRP("enclosure"))) 927 ctx.tag.id = AtomTagLinkEnclosure; 928 else 929 ctx.tag.id = AtomTagLink; /* unknown */ 930 } 931 932 tagid = ctx.tag.id; 933 934 /* map tag type to field: unknown or lesser priority is ignored, 935 * when tags of the same type are repeated only the first is used. */ 936 if (fieldmap[tagid] == -1 || 937 (!ISFEEDFIELDMULTI(fieldmap[tagid]) && 938 tagid <= ctx.fields[fieldmap[tagid]].tagid)) { 939 return; 940 } 941 942 if (ctx.iscontenttag) { 943 ctx.iscontent = 1; 944 ctx.iscontenttag = 0; 945 946 /* detect content-type based on type attribute */ 947 if (attrtype.len) { 948 if (isattr(attrtype.data, attrtype.len, STRP("html")) || 949 isattr(attrtype.data, attrtype.len, STRP("xhtml")) || 950 isattr(attrtype.data, attrtype.len, STRP("text/html")) || 951 isattr(attrtype.data, attrtype.len, STRP("text/xhtml")) || 952 isattr(attrtype.data, attrtype.len, STRP("application/xhtml+xml"))) 953 ctx.contenttype = ContentTypeHTML; 954 else /* unknown: handle as base64 text data */ 955 ctx.contenttype = ContentTypePlain; 956 } else { 957 /* default content-type */ 958 if (tagid == RSSTagContentEncoded || tagid == RSSTagDescription) 959 ctx.contenttype = ContentTypeHTML; 960 else 961 ctx.contenttype = ContentTypePlain; 962 } 963 } 964 965 ctx.field = &(ctx.fields[fieldmap[tagid]].str); 966 ctx.fields[fieldmap[tagid]].tagid = tagid; 967 968 /* clear field if it is overwritten (with a priority order) for the new 969 * value, if the field can have multiple values then do not clear it. 
*/ 970 if (!ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) 971 string_clear(ctx.field); 972 } 973 974 static void 975 xmltagend(XMLParser *p, const char *t, size_t tl, int isshort) 976 { 977 size_t i; 978 979 if (ctx.feedtype == FeedTypeNone) 980 return; 981 982 if (ISINCONTENT(ctx)) { 983 /* not a closed content field */ 984 if (!istag(ctx.tag.name, ctx.tag.len, t, tl)) { 985 if (!isshort && ctx.contenttype == ContentTypeHTML) { 986 xmldata(p, "</", 2); 987 xmldata(p, t, tl); 988 xmldata(p, ">", 1); 989 } 990 return; 991 } 992 } else if (ctx.tag.id && istag(ctx.tag.name, ctx.tag.len, t, tl)) { 993 /* matched tag end: close it. 994 * copy also to the link field if the attribute isPermaLink="true" 995 * and it is not set by a tag with higher priority. */ 996 if (ctx.tag.id == RSSTagGuidPermalinkTrue && ctx.field && 997 ctx.tag.id > ctx.fields[FeedFieldLink].tagid) { 998 string_clear(&ctx.fields[FeedFieldLink].str); 999 string_append(&ctx.fields[FeedFieldLink].str, 1000 ctx.field->data, ctx.field->len); 1001 ctx.fields[FeedFieldLink].tagid = ctx.tag.id; 1002 } 1003 } else if (!ctx.tag.id && ((ctx.feedtype == FeedTypeAtom && 1004 istag(t, tl, STRP("entry"))) || /* Atom */ 1005 (ctx.feedtype == FeedTypeRSS && 1006 istag(t, tl, STRP("item"))))) /* RSS */ 1007 { 1008 /* end of RSS or Atom entry / item */ 1009 printfields(); 1010 1011 /* clear strings */ 1012 for (i = 0; i < FeedFieldLast; i++) { 1013 string_clear(&ctx.fields[i].str); 1014 ctx.fields[i].tagid = TagUnknown; 1015 } 1016 ctx.contenttype = ContentTypeNone; 1017 /* allow parsing of Atom and RSS concatenated in one XML stream. */ 1018 ctx.feedtype = FeedTypeNone; 1019 } else { 1020 return; /* not end of field */ 1021 } 1022 1023 /* temporary string: for fields that cannot be processed 1024 * directly and need more context, for example by its tag 1025 * attributes, like the Atom link rel="alternate|enclosure". 
*/ 1026 if (tmpstr.len && ctx.field) { 1027 if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) { 1028 if (ctx.field->len) 1029 string_append(ctx.field, FieldMultiSeparator, 1); 1030 string_append(ctx.field, tmpstr.data, tmpstr.len); 1031 } else { 1032 string_clear(ctx.field); 1033 string_append(ctx.field, tmpstr.data, tmpstr.len); 1034 } 1035 } 1036 1037 /* close field */ 1038 string_clear(&tmpstr); /* reuse and clear temporary string */ 1039 1040 if (ctx.tag.id == AtomTagAuthorName) 1041 memcpy(&(ctx.tag), &atomtagauthor, sizeof(ctx.tag)); /* outer tag */ 1042 else 1043 memcpy(&(ctx.tag), ¬ag, sizeof(ctx.tag)); 1044 1045 ctx.iscontent = 0; 1046 ctx.field = NULL; 1047 } 1048 1049 int 1050 main(int argc, char *argv[]) 1051 { 1052 if (pledge("stdio", NULL) == -1) 1053 err(1, "pledge"); 1054 1055 if (argc > 1) { 1056 if (uri_parse(argv[1], &baseuri) != -1 && baseuri.proto[0]) 1057 baseurl = argv[1]; 1058 else 1059 errx(1, "baseurl incorrect or too long"); 1060 } 1061 1062 memcpy(&(ctx.tag), ¬ag, sizeof(ctx.tag)); 1063 1064 parser.xmlattr = xmlattr; 1065 parser.xmlattrentity = xmlattrentity; 1066 parser.xmlattrend = xmlattrend; 1067 parser.xmlattrstart = xmlattrstart; 1068 parser.xmlcdata = xmldata; 1069 parser.xmldata = xmldata; 1070 parser.xmldataentity = xmldataentity; 1071 parser.xmltagend = xmltagend; 1072 parser.xmltagstart = xmltagstart; 1073 parser.xmltagstartparsed = xmltagstartparsed; 1074 1075 /* NOTE: GETNEXT is defined in xml.h for inline optimization */ 1076 xml_parse(&parser); 1077 1078 checkfileerror(stdin, "<stdin>", 'r'); 1079 checkfileerror(stdout, "<stdout>", 'w'); 1080 1081 return 0; 1082 }