/* sfeed.c (29354B) */
1 #include <errno.h> 2 #include <stdint.h> 3 #include <stdio.h> 4 #include <stdlib.h> 5 #include <string.h> 6 #include <strings.h> 7 8 #include "util.h" 9 #include "xml.h" 10 11 #define ISINCONTENT(ctx) ((ctx).iscontent && !((ctx).iscontenttag)) 12 #define ISCONTENTTAG(ctx) (!((ctx).iscontent) && (ctx).iscontenttag) 13 14 /* these feed fields support multiple separated values */ 15 #define ISFEEDFIELDMULTI(t) ((t) == FeedFieldCategory) 16 17 /* string and byte-length */ 18 #define STRP(s) s,sizeof(s)-1 19 20 enum FeedType { 21 FeedTypeNone = 0, 22 FeedTypeRSS = 1, 23 FeedTypeAtom = 2 24 }; 25 26 enum ContentType { 27 ContentTypeNone = 0, 28 ContentTypePlain = 1, 29 ContentTypeHTML = 2 30 }; 31 static const char *contenttypes[] = { "", "plain", "html" }; 32 33 /* String data / memory pool */ 34 typedef struct string { 35 char *data; /* data */ 36 size_t len; /* string length */ 37 size_t bufsiz; /* allocated size */ 38 } String; 39 40 /* NOTE: the order of these fields (content, date, author) indicate the 41 * priority to use them, from least important to high. 
*/ 42 enum TagId { 43 TagUnknown = 0, 44 /* RSS */ 45 RSSTagDcdate, RSSTagPubdate, /* creation date has higher priority */ 46 RSSTagTitle, 47 RSSTagMediaDescription, RSSTagDescription, RSSTagContentEncoded, 48 RSSTagGuid, 49 RSSTagGuidPermalinkFalse, 50 RSSTagGuidPermalinkTrue, 51 /* must be defined after GUID, because it can be a link (isPermaLink) */ 52 RSSTagLink, 53 RSSTagEnclosure, 54 RSSTagAuthor, RSSTagDccreator, 55 RSSTagCategory, 56 /* Atom */ 57 /* creation date has higher priority */ 58 AtomTagModified, AtomTagUpdated, AtomTagIssued, AtomTagPublished, 59 AtomTagTitle, 60 AtomTagMediaDescription, AtomTagSummary, AtomTagContent, 61 AtomTagId, 62 AtomTagLink, 63 AtomTagLinkAlternate, 64 AtomTagLinkEnclosure, 65 AtomTagAuthor, AtomTagAuthorName, 66 AtomTagCategory, 67 TagLast 68 }; 69 70 typedef struct feedtag { 71 char *name; /* name of tag to match */ 72 size_t len; /* len of `name` */ 73 enum TagId id; /* unique ID */ 74 } FeedTag; 75 76 typedef struct field { 77 String str; 78 enum TagId tagid; /* tagid set previously, used for tag priority */ 79 } FeedField; 80 81 enum { 82 FeedFieldTime = 0, FeedFieldTitle, FeedFieldLink, FeedFieldContent, 83 FeedFieldId, FeedFieldAuthor, FeedFieldEnclosure, FeedFieldCategory, 84 FeedFieldLast 85 }; 86 87 typedef struct feedcontext { 88 String *field; /* current FeedItem field String */ 89 FeedField fields[FeedFieldLast]; /* data for current item */ 90 FeedTag tag; /* unique current parsed tag */ 91 int iscontent; /* in content data */ 92 int iscontenttag; /* in content tag */ 93 enum ContentType contenttype; /* content-type for item */ 94 enum FeedType feedtype; 95 int attrcount; /* count item HTML element attributes */ 96 } FeedContext; 97 98 static long long datetounix(long long, int, int, int, int, int); 99 static FeedTag * gettag(enum FeedType, const char *, size_t); 100 static long gettzoffset(const char *); 101 static int isattr(const char *, size_t, const char *, size_t); 102 static int istag(const char *, 
size_t, const char *, size_t); 103 static int parsetime(const char *, long long *); 104 static void printfields(void); 105 static void string_append(String *, const char *, size_t); 106 static void string_buffer_realloc(String *, size_t); 107 static void string_clear(String *); 108 static void string_print_encoded(String *); 109 static void string_print_timestamp(String *); 110 static void string_print_trimmed(String *); 111 static void string_print_trimmed_multi(String *); 112 static void string_print_uri(String *); 113 static void xmlattr(XMLParser *, const char *, size_t, const char *, size_t, 114 const char *, size_t); 115 static void xmlattrentity(XMLParser *, const char *, size_t, const char *, 116 size_t, const char *, size_t); 117 static void xmlattrend(XMLParser *, const char *, size_t, const char *, 118 size_t); 119 static void xmlattrstart(XMLParser *, const char *, size_t, const char *, 120 size_t); 121 static void xmldata(XMLParser *, const char *, size_t); 122 static void xmldataentity(XMLParser *, const char *, size_t); 123 static void xmltagend(XMLParser *, const char *, size_t, int); 124 static void xmltagstart(XMLParser *, const char *, size_t); 125 static void xmltagstartparsed(XMLParser *, const char *, size_t, int); 126 127 /* map tag name to TagId type */ 128 /* RSS, must be alphabetical order */ 129 static const FeedTag rsstags[] = { 130 { STRP("author"), RSSTagAuthor }, 131 { STRP("category"), RSSTagCategory }, 132 { STRP("content:encoded"), RSSTagContentEncoded }, 133 { STRP("dc:creator"), RSSTagDccreator }, 134 { STRP("dc:date"), RSSTagDcdate }, 135 { STRP("description"), RSSTagDescription }, 136 /* RSS: <enclosure url="" />, Atom has <link rel="enclosure" /> */ 137 { STRP("enclosure"), RSSTagEnclosure }, 138 { STRP("guid"), RSSTagGuid }, 139 { STRP("link"), RSSTagLink }, 140 { STRP("media:description"), RSSTagMediaDescription }, 141 { STRP("pubdate"), RSSTagPubdate }, 142 { STRP("title"), RSSTagTitle } 143 }; 144 145 /* Atom, must be 
alphabetical order */ 146 static const FeedTag atomtags[] = { 147 { STRP("author"), AtomTagAuthor }, 148 { STRP("category"), AtomTagCategory }, 149 { STRP("content"), AtomTagContent }, 150 { STRP("id"), AtomTagId }, 151 { STRP("issued"), AtomTagIssued }, /* Atom 0.3 */ 152 /* Atom: <link href="" />, RSS has <link></link> */ 153 { STRP("link"), AtomTagLink }, 154 { STRP("media:description"), AtomTagMediaDescription }, 155 { STRP("modified"), AtomTagModified }, /* Atom 0.3 */ 156 { STRP("published"), AtomTagPublished }, 157 { STRP("summary"), AtomTagSummary }, 158 { STRP("title"), AtomTagTitle }, 159 { STRP("updated"), AtomTagUpdated } 160 }; 161 162 /* special case: nested <author><name> */ 163 static const FeedTag atomtagauthor = { STRP("author"), AtomTagAuthor }; 164 static const FeedTag atomtagauthorname = { STRP("name"), AtomTagAuthorName }; 165 166 /* reference to no / unknown tag */ 167 static const FeedTag notag = { STRP(""), TagUnknown }; 168 169 /* map TagId type to RSS/Atom field, all tags must be defined */ 170 static const int fieldmap[TagLast] = { 171 [TagUnknown] = -1, 172 /* RSS */ 173 [RSSTagDcdate] = FeedFieldTime, 174 [RSSTagPubdate] = FeedFieldTime, 175 [RSSTagTitle] = FeedFieldTitle, 176 [RSSTagMediaDescription] = FeedFieldContent, 177 [RSSTagDescription] = FeedFieldContent, 178 [RSSTagContentEncoded] = FeedFieldContent, 179 [RSSTagGuid] = -1, 180 [RSSTagGuidPermalinkFalse] = FeedFieldId, 181 [RSSTagGuidPermalinkTrue] = FeedFieldId, /* special-case: both a link and an id */ 182 [RSSTagLink] = FeedFieldLink, 183 [RSSTagEnclosure] = FeedFieldEnclosure, 184 [RSSTagAuthor] = FeedFieldAuthor, 185 [RSSTagDccreator] = FeedFieldAuthor, 186 [RSSTagCategory] = FeedFieldCategory, 187 /* Atom */ 188 [AtomTagModified] = FeedFieldTime, 189 [AtomTagUpdated] = FeedFieldTime, 190 [AtomTagIssued] = FeedFieldTime, 191 [AtomTagPublished] = FeedFieldTime, 192 [AtomTagTitle] = FeedFieldTitle, 193 [AtomTagMediaDescription] = FeedFieldContent, 194 [AtomTagSummary] = 
FeedFieldContent, 195 [AtomTagContent] = FeedFieldContent, 196 [AtomTagId] = FeedFieldId, 197 [AtomTagLink] = -1, 198 [AtomTagLinkAlternate] = FeedFieldLink, 199 [AtomTagLinkEnclosure] = FeedFieldEnclosure, 200 [AtomTagAuthor] = -1, 201 [AtomTagAuthorName] = FeedFieldAuthor, 202 [AtomTagCategory] = FeedFieldCategory 203 }; 204 205 static const int FieldSeparator = '\t'; 206 /* separator for multiple values in a field, separator should be 1 byte */ 207 static const char FieldMultiSeparator[] = "|"; 208 static struct uri baseuri; 209 static const char *baseurl; 210 211 static FeedContext ctx; 212 static XMLParser parser; /* XML parser state */ 213 static String attrispermalink, attrrel, attrtype, tmpstr; 214 215 static int 216 tagcmp(const void *v1, const void *v2) 217 { 218 return strcasecmp(((FeedTag *)v1)->name, ((FeedTag *)v2)->name); 219 } 220 221 /* Unique tagid for parsed tag name. */ 222 static FeedTag * 223 gettag(enum FeedType feedtype, const char *name, size_t namelen) 224 { 225 FeedTag f, *r = NULL; 226 227 f.name = (char *)name; 228 229 switch (feedtype) { 230 case FeedTypeRSS: 231 r = bsearch(&f, rsstags, sizeof(rsstags) / sizeof(rsstags[0]), 232 sizeof(rsstags[0]), tagcmp); 233 break; 234 case FeedTypeAtom: 235 r = bsearch(&f, atomtags, sizeof(atomtags) / sizeof(atomtags[0]), 236 sizeof(atomtags[0]), tagcmp); 237 break; 238 default: 239 break; 240 } 241 242 return r; 243 } 244 245 static char * 246 ltrim(const char *s) 247 { 248 for (; ISSPACE((unsigned char)*s); s++) 249 ; 250 return (char *)s; 251 } 252 253 static char * 254 rtrim(const char *s) 255 { 256 const char *e; 257 258 for (e = s + strlen(s); e > s && ISSPACE((unsigned char)*(e - 1)); e--) 259 ; 260 return (char *)e; 261 } 262 263 /* Clear string only; don't free, prevents unnecessary reallocation. 
*/ 264 static void 265 string_clear(String *s) 266 { 267 if (s->data) 268 s->data[0] = '\0'; 269 s->len = 0; 270 } 271 272 static void 273 string_buffer_realloc(String *s, size_t newlen) 274 { 275 size_t alloclen; 276 277 if (newlen > SIZE_MAX / 2) { 278 alloclen = SIZE_MAX; 279 } else { 280 for (alloclen = 64; alloclen <= newlen; alloclen *= 2) 281 ; 282 } 283 if (!(s->data = realloc(s->data, alloclen))) 284 err(1, "realloc"); 285 s->bufsiz = alloclen; 286 } 287 288 /* Append data to String, s->data and data may not overlap. */ 289 static void 290 string_append(String *s, const char *data, size_t len) 291 { 292 if (!len) 293 return; 294 295 if (s->len >= SIZE_MAX - len) { 296 errno = EOVERFLOW; 297 err(1, "realloc"); 298 } 299 300 /* check if allocation is necessary, never shrink the buffer. */ 301 if (s->len + len >= s->bufsiz) 302 string_buffer_realloc(s, s->len + len + 1); 303 memcpy(s->data + s->len, data, len); 304 s->len += len; 305 s->data[s->len] = '\0'; 306 } 307 308 /* Print text, encode TABs, newlines and '\', remove other whitespace. 309 * Remove leading and trailing whitespace. 
*/ 310 static void 311 string_print_encoded(String *s) 312 { 313 const char *p, *e; 314 315 if (!s->data || !s->len) 316 return; 317 318 p = ltrim(s->data); 319 e = rtrim(p); 320 321 for (; *p && p != e; p++) { 322 switch (*p) { 323 case '\n': putchar('\\'); putchar('n'); break; 324 case '\\': putchar('\\'); putchar('\\'); break; 325 case '\t': putchar('\\'); putchar('t'); break; 326 default: 327 /* ignore control chars */ 328 if (!ISCNTRL((unsigned char)*p)) 329 putchar(*p); 330 break; 331 } 332 } 333 } 334 335 static void 336 printtrimmed(const char *s) 337 { 338 char *p, *e; 339 340 p = ltrim(s); 341 e = rtrim(p); 342 for (; *p && p != e; p++) { 343 if (ISSPACE((unsigned char)*p)) 344 putchar(' '); /* any whitespace to space */ 345 else if (!ISCNTRL((unsigned char)*p)) 346 /* ignore other control chars */ 347 putchar(*p); 348 } 349 } 350 351 /* Print text, replace TABs, carriage return and other whitespace with ' '. 352 * Other control chars are removed. Remove leading and trailing whitespace. */ 353 static void 354 string_print_trimmed(String *s) 355 { 356 if (!s->data || !s->len) 357 return; 358 359 printtrimmed(s->data); 360 } 361 362 /* Print each field with trimmed whitespace, separated by '|'. */ 363 static void 364 string_print_trimmed_multi(String *s) 365 { 366 char *p, *e; 367 int c; 368 369 if (!s->data || !s->len) 370 return; 371 372 for (p = s->data; ; p = e + 1) { 373 if ((e = strstr(p, FieldMultiSeparator))) { 374 c = *e; 375 *e = '\0'; 376 printtrimmed(p); 377 *e = c; /* restore NUL byte to original character */ 378 fputs(FieldMultiSeparator, stdout); 379 } else { 380 printtrimmed(p); 381 break; 382 } 383 } 384 } 385 386 /* Print URL, if it's a relative URL then it uses the global `baseurl`. 
*/ 387 static void 388 printuri(char *s) 389 { 390 char link[4096], *p, *e; 391 struct uri newuri, olduri; 392 int c, r = -1; 393 394 p = ltrim(s); 395 e = rtrim(p); 396 c = *e; 397 *e = '\0'; 398 399 if (baseurl && !uri_hasscheme(p) && 400 uri_parse(p, &olduri) != -1 && !olduri.proto[0] && 401 uri_makeabs(&newuri, &olduri, &baseuri) != -1 && newuri.proto[0]) 402 r = uri_format(link, sizeof(link), &newuri); 403 404 if (r >= 0 && (size_t)r < sizeof(link)) 405 printtrimmed(link); 406 else 407 printtrimmed(p); 408 409 *e = c; /* restore NUL byte to original character */ 410 } 411 412 /* Print URL, if it's a relative URL then it uses the global `baseurl`. */ 413 static void 414 string_print_uri(String *s) 415 { 416 if (!s->data || !s->len) 417 return; 418 419 printuri(s->data); 420 } 421 422 /* Print as UNIX timestamp, print nothing if the time is empty or invalid. */ 423 static void 424 string_print_timestamp(String *s) 425 { 426 long long t; 427 428 if (!s->data || !s->len) 429 return; 430 431 if (parsetime(s->data, &t) != -1) 432 printf("%lld", t); 433 } 434 435 /* Convert time fields. Returns a UNIX timestamp. 
*/ 436 static long long 437 datetounix(long long year, int mon, int day, int hour, int min, int sec) 438 { 439 static const int secs_through_month[] = { 440 0, 31 * 86400, 59 * 86400, 90 * 86400, 441 120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400, 442 243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 }; 443 int is_leap = 0, cycles, centuries = 0, leaps = 0, rem; 444 long long t; 445 446 if (year - 2ULL <= 136) { 447 leaps = (year - 68) >> 2; 448 if (!((year - 68) & 3)) { 449 leaps--; 450 is_leap = 1; 451 } else { 452 is_leap = 0; 453 } 454 t = 31536000 * (year - 70) + 86400 * leaps; 455 } else { 456 cycles = (year - 100) / 400; 457 rem = (year - 100) % 400; 458 if (rem < 0) { 459 cycles--; 460 rem += 400; 461 } 462 if (!rem) { 463 is_leap = 1; 464 } else { 465 if (rem >= 300) 466 centuries = 3, rem -= 300; 467 else if (rem >= 200) 468 centuries = 2, rem -= 200; 469 else if (rem >= 100) 470 centuries = 1, rem -= 100; 471 if (rem) { 472 leaps = rem / 4U; 473 rem %= 4U; 474 is_leap = !rem; 475 } 476 } 477 leaps += 97 * cycles + 24 * centuries - is_leap; 478 t = (year - 100) * 31536000LL + leaps * 86400LL + 946684800 + 86400; 479 } 480 t += secs_through_month[mon]; 481 if (is_leap && mon >= 2) 482 t += 86400; 483 t += 86400LL * (day - 1); 484 t += 3600LL * hour; 485 t += 60LL * min; 486 t += sec; 487 488 return t; 489 } 490 491 /* Get timezone from string, return time offset in seconds from UTC. 492 * NOTE: only parses timezones in RFC-822, many other timezone names are 493 * ambiguous anyway. 494 * ANSI and military zones are defined wrong in RFC822 and are unsupported, 495 * see note on RFC2822 4.3 page 32. 
*/ 496 static long 497 gettzoffset(const char *s) 498 { 499 static const struct { 500 char *name; 501 int offhour; 502 } tzones[] = { 503 { "CDT", -5 * 3600 }, 504 { "CST", -6 * 3600 }, 505 { "EDT", -4 * 3600 }, 506 { "EST", -5 * 3600 }, 507 { "MDT", -6 * 3600 }, 508 { "MST", -7 * 3600 }, 509 { "PDT", -7 * 3600 }, 510 { "PST", -8 * 3600 }, 511 }; 512 const char *p; 513 long tzhour = 0, tzmin = 0; 514 size_t i; 515 516 for (; ISSPACE((unsigned char)*s); s++) 517 ; 518 switch (*s) { 519 case '-': /* offset */ 520 case '+': 521 for (i = 0, p = s + 1; i < 2 && ISDIGIT((unsigned char)*p); i++, p++) 522 tzhour = (tzhour * 10) + (*p - '0'); 523 if (*p == ':') 524 p++; 525 for (i = 0; i < 2 && ISDIGIT((unsigned char)*p); i++, p++) 526 tzmin = (tzmin * 10) + (*p - '0'); 527 return ((tzhour * 3600) + (tzmin * 60)) * (s[0] == '-' ? -1 : 1); 528 default: /* timezone name */ 529 for (i = 0; ISALPHA((unsigned char)s[i]); i++) 530 ; 531 if (i != 3) 532 return 0; 533 /* compare timezone and adjust offset relative to UTC */ 534 for (i = 0; i < sizeof(tzones) / sizeof(*tzones); i++) { 535 if (!memcmp(s, tzones[i].name, 3)) 536 return tzones[i].offhour; 537 } 538 } 539 return 0; 540 } 541 542 /* Parse time string `s` into the UNIX timestamp `tp`. 543 Returns 0 on success or -1 on failure. 
*/ 544 static int 545 parsetime(const char *s, long long *tp) 546 { 547 static const struct { 548 char *name; 549 int len; 550 } mons[] = { 551 { STRP("January"), }, 552 { STRP("February"), }, 553 { STRP("March"), }, 554 { STRP("April"), }, 555 { STRP("May"), }, 556 { STRP("June"), }, 557 { STRP("July"), }, 558 { STRP("August"), }, 559 { STRP("September"), }, 560 { STRP("October"), }, 561 { STRP("November"), }, 562 { STRP("December"), }, 563 }; 564 int va[6] = { 0 }, i, j, v, vi; 565 size_t m; 566 567 for (; ISSPACE((unsigned char)*s); s++) 568 ; 569 if (!ISDIGIT((unsigned char)*s) && !ISALPHA((unsigned char)*s)) 570 return -1; 571 572 if (ISDIGIT((unsigned char)s[0]) && 573 ISDIGIT((unsigned char)s[1]) && 574 ISDIGIT((unsigned char)s[2]) && 575 ISDIGIT((unsigned char)s[3])) { 576 /* formats "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or "%Y%m%d%H%M%S" */ 577 vi = 0; 578 } else { 579 /* format: "[%a, ]%d %b %Y %H:%M:%S" */ 580 /* parse "[%a, ]%d %b %Y " part, then use time parsing as above */ 581 for (; ISALPHA((unsigned char)*s); s++) 582 ; 583 for (; ISSPACE((unsigned char)*s); s++) 584 ; 585 if (*s == ',') 586 s++; 587 for (; ISSPACE((unsigned char)*s); s++) 588 ; 589 for (v = 0, i = 0; i < 2 && ISDIGIT((unsigned char)*s); s++, i++) 590 v = (v * 10) + (*s - '0'); 591 va[2] = v; /* day */ 592 for (; ISSPACE((unsigned char)*s); s++) 593 ; 594 /* end of word month */ 595 for (j = 0; ISALPHA((unsigned char)s[j]); j++) 596 ; 597 /* check month name */ 598 if (j < 3 || j > 9) 599 return -1; /* month cannot match */ 600 for (m = 0; m < sizeof(mons) / sizeof(*mons); m++) { 601 /* abbreviation (3 length) or long name */ 602 if ((j == 3 || j == mons[m].len) && 603 !strncasecmp(mons[m].name, s, j)) { 604 va[1] = m + 1; 605 s += j; 606 break; 607 } 608 } 609 if (m >= 12) 610 return -1; /* no month found */ 611 for (; ISSPACE((unsigned char)*s); s++) 612 ; 613 for (v = 0, i = 0; i < 4 && ISDIGIT((unsigned char)*s); s++, i++) 614 v = (v * 10) + (*s - '0'); 615 /* obsolete short 
year: RFC2822 4.3 */ 616 if (i <= 3) 617 v += (v >= 0 && v <= 49) ? 2000 : 1900; 618 va[0] = v; /* year */ 619 for (; ISSPACE((unsigned char)*s); s++) 620 ; 621 /* parse only regular time part, see below */ 622 vi = 3; 623 } 624 625 /* parse time parts (and possibly remaining date parts) */ 626 for (; *s && vi < 6; vi++) { 627 for (i = 0, v = 0; i < ((vi == 0) ? 4 : 2) && 628 ISDIGIT((unsigned char)*s); s++, i++) { 629 v = (v * 10) + (*s - '0'); 630 } 631 va[vi] = v; 632 633 if ((vi < 2 && *s == '-') || 634 (vi == 2 && (*s == 'T' || ISSPACE((unsigned char)*s))) || 635 (vi > 2 && *s == ':')) 636 s++; 637 } 638 639 /* skip milliseconds in for example: "%Y-%m-%dT%H:%M:%S.000Z" */ 640 if (*s == '.') { 641 for (s++; ISDIGIT((unsigned char)*s); s++) 642 ; 643 } 644 645 /* invalid range */ 646 if (va[0] < 0 || va[0] > 9999 || 647 va[1] < 1 || va[1] > 12 || 648 va[2] < 1 || va[2] > 31 || 649 va[3] < 0 || va[3] > 23 || 650 va[4] < 0 || va[4] > 59 || 651 va[5] < 0 || va[5] > 60) /* allow leap second */ 652 return -1; 653 654 *tp = datetounix(va[0] - 1900, va[1] - 1, va[2], va[3], va[4], va[5]) - 655 gettzoffset(s); 656 657 return 0; 658 } 659 660 static void 661 printfields(void) 662 { 663 string_print_timestamp(&ctx.fields[FeedFieldTime].str); 664 putchar(FieldSeparator); 665 string_print_trimmed(&ctx.fields[FeedFieldTitle].str); 666 putchar(FieldSeparator); 667 string_print_uri(&ctx.fields[FeedFieldLink].str); 668 putchar(FieldSeparator); 669 string_print_encoded(&ctx.fields[FeedFieldContent].str); 670 putchar(FieldSeparator); 671 fputs(contenttypes[ctx.contenttype], stdout); 672 putchar(FieldSeparator); 673 string_print_trimmed(&ctx.fields[FeedFieldId].str); 674 putchar(FieldSeparator); 675 string_print_trimmed(&ctx.fields[FeedFieldAuthor].str); 676 putchar(FieldSeparator); 677 string_print_uri(&ctx.fields[FeedFieldEnclosure].str); 678 putchar(FieldSeparator); 679 string_print_trimmed_multi(&ctx.fields[FeedFieldCategory].str); 680 putchar('\n'); 681 682 if 
(ferror(stdout)) /* check for errors but do not flush */ 683 checkfileerror(stdout, "<stdout>", 'w'); 684 } 685 686 static int 687 istag(const char *name, size_t len, const char *name2, size_t len2) 688 { 689 return (len == len2 && !strcasecmp(name, name2)); 690 } 691 692 static int 693 isattr(const char *name, size_t len, const char *name2, size_t len2) 694 { 695 return (len == len2 && !strcasecmp(name, name2)); 696 } 697 698 static void 699 xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl, 700 const char *v, size_t vl) 701 { 702 /* handles transforming inline XML to data */ 703 if (ISINCONTENT(ctx)) { 704 if (ctx.contenttype == ContentTypeHTML) 705 xmldata(p, v, vl); 706 return; 707 } 708 709 if (!ctx.tag.id) 710 return; 711 712 /* content-type may be: Atom: text, xhtml, html or mime-type. 713 MRSS (media:description): plain, html. */ 714 if (ISCONTENTTAG(ctx)) { 715 if (isattr(n, nl, STRP("type"))) 716 string_append(&attrtype, v, vl); 717 return; 718 } 719 720 if (ctx.feedtype == FeedTypeRSS) { 721 if (ctx.tag.id == RSSTagEnclosure && 722 isattr(n, nl, STRP("url"))) { 723 string_append(&tmpstr, v, vl); 724 } else if (ctx.tag.id == RSSTagGuid && 725 isattr(n, nl, STRP("ispermalink"))) { 726 string_append(&attrispermalink, v, vl); 727 } 728 } else if (ctx.feedtype == FeedTypeAtom) { 729 if (ctx.tag.id == AtomTagLink) { 730 if (isattr(n, nl, STRP("rel"))) { 731 string_append(&attrrel, v, vl); 732 } else if (isattr(n, nl, STRP("href"))) { 733 string_append(&tmpstr, v, vl); 734 } 735 } else if (ctx.tag.id == AtomTagCategory && 736 isattr(n, nl, STRP("term"))) { 737 string_append(&tmpstr, v, vl); 738 } 739 } 740 } 741 742 static void 743 xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl, 744 const char *data, size_t datalen) 745 { 746 char buf[16]; 747 int len; 748 749 /* handles transforming inline XML to data */ 750 if (ISINCONTENT(ctx)) { 751 if (ctx.contenttype == ContentTypeHTML) 752 xmldata(p, data, datalen); 
753 return; 754 } 755 756 if (!ctx.tag.id) 757 return; 758 759 /* try to translate entity, else just pass as data to 760 * xmlattr handler. */ 761 if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0) 762 xmlattr(p, t, tl, n, nl, buf, (size_t)len); 763 else 764 xmlattr(p, t, tl, n, nl, data, datalen); 765 } 766 767 static void 768 xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl) 769 { 770 if (ISINCONTENT(ctx)) { 771 if (ctx.contenttype == ContentTypeHTML) { 772 /* handles transforming inline XML to data */ 773 xmldata(p, "\"", 1); 774 ctx.attrcount = 0; 775 } 776 return; 777 } 778 } 779 780 static void 781 xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl) 782 { 783 if (ISINCONTENT(ctx)) { 784 if (ctx.contenttype == ContentTypeHTML) { 785 /* handles transforming inline XML to data */ 786 if (!ctx.attrcount) 787 xmldata(p, " ", 1); 788 ctx.attrcount++; 789 xmldata(p, n, nl); 790 xmldata(p, "=\"", 2); 791 } 792 return; 793 } 794 795 if (attrispermalink.len && isattr(n, nl, STRP("ispermalink"))) 796 string_clear(&attrispermalink); 797 else if (attrrel.len && isattr(n, nl, STRP("rel"))) 798 string_clear(&attrrel); 799 else if (attrtype.len && isattr(n, nl, STRP("type"))) 800 string_clear(&attrtype); 801 else if (tmpstr.len && 802 (isattr(n, nl, STRP("href")) || 803 isattr(n, nl, STRP("term")) || 804 isattr(n, nl, STRP("url")))) 805 string_clear(&tmpstr); /* use the last value for multiple attribute values */ 806 } 807 808 static void 809 xmldata(XMLParser *p, const char *s, size_t len) 810 { 811 if (!ctx.field) 812 return; 813 814 if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) 815 string_append(&tmpstr, s, len); 816 else 817 string_append(ctx.field, s, len); 818 } 819 820 static void 821 xmldataentity(XMLParser *p, const char *data, size_t datalen) 822 { 823 char buf[16]; 824 int len; 825 826 if (!ctx.field) 827 return; 828 829 /* try to translate entity, else just pass as data to 830 * xmldata handler. 
*/ 831 if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0) 832 xmldata(p, buf, (size_t)len); 833 else 834 xmldata(p, data, datalen); 835 } 836 837 static void 838 xmltagstart(XMLParser *p, const char *t, size_t tl) 839 { 840 const FeedTag *f; 841 842 if (ISINCONTENT(ctx)) { 843 if (ctx.contenttype == ContentTypeHTML) { 844 ctx.attrcount = 0; 845 xmldata(p, "<", 1); 846 xmldata(p, t, tl); 847 } 848 return; 849 } 850 851 /* start of RSS or Atom item / entry */ 852 if (ctx.feedtype == FeedTypeNone) { 853 if (istag(t, tl, STRP("entry"))) 854 ctx.feedtype = FeedTypeAtom; 855 else if (istag(t, tl, STRP("item"))) 856 ctx.feedtype = FeedTypeRSS; 857 return; 858 } 859 860 /* field tagid already set or nested tags. */ 861 if (ctx.tag.id) { 862 /* nested <author><name> for Atom */ 863 if (ctx.tag.id == AtomTagAuthor && 864 istag(t, tl, STRP("name"))) { 865 memcpy(&(ctx.tag), &atomtagauthorname, sizeof(ctx.tag)); 866 } else { 867 return; /* other nested tags are not allowed: return */ 868 } 869 } 870 871 /* in item */ 872 if (ctx.tag.id == TagUnknown) { 873 if (!(f = gettag(ctx.feedtype, t, tl))) 874 f = ¬ag; 875 memcpy(&(ctx.tag), f, sizeof(ctx.tag)); 876 } 877 878 ctx.iscontenttag = (fieldmap[ctx.tag.id] == FeedFieldContent); 879 string_clear(&attrispermalink); 880 string_clear(&attrrel); 881 string_clear(&attrtype); 882 } 883 884 static void 885 xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) 886 { 887 enum TagId tagid; 888 889 if (ISINCONTENT(ctx)) { 890 if (ctx.contenttype == ContentTypeHTML) { 891 if (isshort) 892 xmldata(p, "/>", 2); 893 else 894 xmldata(p, ">", 1); 895 } 896 return; 897 } 898 899 /* set tag type based on it's attribute value */ 900 if (ctx.tag.id == RSSTagGuid) { 901 /* if empty the default is "true" */ 902 if (!attrispermalink.len || 903 isattr(attrispermalink.data, attrispermalink.len, STRP("true"))) 904 ctx.tag.id = RSSTagGuidPermalinkTrue; 905 else 906 ctx.tag.id = RSSTagGuidPermalinkFalse; 907 } else if (ctx.tag.id == 
AtomTagLink) { 908 /* empty or "alternate": other types could be 909 "enclosure", "related", "self" or "via" */ 910 if (!attrrel.len || isattr(attrrel.data, attrrel.len, STRP("alternate"))) 911 ctx.tag.id = AtomTagLinkAlternate; 912 else if (isattr(attrrel.data, attrrel.len, STRP("enclosure"))) 913 ctx.tag.id = AtomTagLinkEnclosure; 914 else 915 ctx.tag.id = AtomTagLink; /* unknown */ 916 } 917 918 tagid = ctx.tag.id; 919 920 /* map tag type to field: unknown or lesser priority is ignored, 921 when tags of the same type are repeated only the first is used. */ 922 if (fieldmap[tagid] == -1 || 923 (!ISFEEDFIELDMULTI(fieldmap[tagid]) && 924 tagid <= ctx.fields[fieldmap[tagid]].tagid)) { 925 return; 926 } 927 928 if (ctx.iscontenttag) { 929 ctx.iscontent = 1; 930 ctx.iscontenttag = 0; 931 932 /* detect content-type based on type attribute */ 933 if (attrtype.len) { 934 if (isattr(attrtype.data, attrtype.len, STRP("html")) || 935 isattr(attrtype.data, attrtype.len, STRP("xhtml")) || 936 isattr(attrtype.data, attrtype.len, STRP("text/html")) || 937 isattr(attrtype.data, attrtype.len, STRP("text/xhtml")) || 938 isattr(attrtype.data, attrtype.len, STRP("application/xhtml+xml"))) 939 ctx.contenttype = ContentTypeHTML; 940 else /* unknown: handle as base64 text data */ 941 ctx.contenttype = ContentTypePlain; 942 } else { 943 /* default content-type */ 944 if (tagid == RSSTagContentEncoded || tagid == RSSTagDescription) 945 ctx.contenttype = ContentTypeHTML; 946 else 947 ctx.contenttype = ContentTypePlain; 948 } 949 } 950 951 ctx.field = &(ctx.fields[fieldmap[tagid]].str); 952 ctx.fields[fieldmap[tagid]].tagid = tagid; 953 954 /* clear field if it is overwritten (with a priority order) for the new 955 value, if the field can have multiple values then do not clear it. 
*/ 956 if (!ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) 957 string_clear(ctx.field); 958 } 959 960 static void 961 xmltagend(XMLParser *p, const char *t, size_t tl, int isshort) 962 { 963 size_t i; 964 965 if (ctx.feedtype == FeedTypeNone) 966 return; 967 968 if (ISINCONTENT(ctx)) { 969 /* not a closed content field */ 970 if (!istag(ctx.tag.name, ctx.tag.len, t, tl)) { 971 if (!isshort && ctx.contenttype == ContentTypeHTML) { 972 xmldata(p, "</", 2); 973 xmldata(p, t, tl); 974 xmldata(p, ">", 1); 975 } 976 return; 977 } 978 } else if (ctx.tag.id && istag(ctx.tag.name, ctx.tag.len, t, tl)) { 979 /* matched tag end: close it */ 980 /* copy also to the link field if the attribute isPermaLink="true" 981 and it is not set by a tag with higher priority. */ 982 if (ctx.tag.id == RSSTagGuidPermalinkTrue && ctx.field && 983 ctx.tag.id > ctx.fields[FeedFieldLink].tagid) { 984 string_clear(&ctx.fields[FeedFieldLink].str); 985 string_append(&ctx.fields[FeedFieldLink].str, 986 ctx.field->data, ctx.field->len); 987 ctx.fields[FeedFieldLink].tagid = ctx.tag.id; 988 } 989 } else if (!ctx.tag.id && ((ctx.feedtype == FeedTypeAtom && 990 istag(t, tl, STRP("entry"))) || /* Atom */ 991 (ctx.feedtype == FeedTypeRSS && 992 istag(t, tl, STRP("item"))))) /* RSS */ 993 { 994 /* end of RSS or Atom entry / item */ 995 printfields(); 996 997 /* clear strings */ 998 for (i = 0; i < FeedFieldLast; i++) { 999 string_clear(&ctx.fields[i].str); 1000 ctx.fields[i].tagid = TagUnknown; 1001 } 1002 ctx.contenttype = ContentTypeNone; 1003 /* allow parsing of Atom and RSS concatenated in one XML stream. */ 1004 ctx.feedtype = FeedTypeNone; 1005 } else { 1006 return; /* not end of field */ 1007 } 1008 1009 /* temporary string: for fields that cannot be processed 1010 directly and need more context, for example by it's tag 1011 attributes, like the Atom link rel="alternate|enclosure". 
*/ 1012 if (tmpstr.len && ctx.field) { 1013 if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) { 1014 if (ctx.field->len) 1015 string_append(ctx.field, FieldMultiSeparator, 1); 1016 string_append(ctx.field, tmpstr.data, tmpstr.len); 1017 } else { 1018 string_clear(ctx.field); 1019 string_append(ctx.field, tmpstr.data, tmpstr.len); 1020 } 1021 } 1022 1023 /* close field */ 1024 string_clear(&tmpstr); /* reuse and clear temporary string */ 1025 1026 if (ctx.tag.id == AtomTagAuthorName) 1027 memcpy(&(ctx.tag), &atomtagauthor, sizeof(ctx.tag)); /* outer tag */ 1028 else 1029 memcpy(&(ctx.tag), ¬ag, sizeof(ctx.tag)); 1030 1031 ctx.iscontent = 0; 1032 ctx.field = NULL; 1033 } 1034 1035 int 1036 main(int argc, char *argv[]) 1037 { 1038 if (pledge("stdio", NULL) == -1) 1039 err(1, "pledge"); 1040 1041 if (argc > 1) { 1042 if (uri_parse(argv[1], &baseuri) != -1 && baseuri.proto[0]) 1043 baseurl = argv[1]; 1044 else 1045 errx(1, "baseurl incorrect or too long"); 1046 } 1047 1048 memcpy(&(ctx.tag), ¬ag, sizeof(ctx.tag)); 1049 1050 parser.xmlattr = xmlattr; 1051 parser.xmlattrentity = xmlattrentity; 1052 parser.xmlattrend = xmlattrend; 1053 parser.xmlattrstart = xmlattrstart; 1054 parser.xmlcdata = xmldata; 1055 parser.xmldata = xmldata; 1056 parser.xmldataentity = xmldataentity; 1057 parser.xmltagend = xmltagend; 1058 parser.xmltagstart = xmltagstart; 1059 parser.xmltagstartparsed = xmltagstartparsed; 1060 1061 /* NOTE: getnext is defined in xml.h for inline optimization */ 1062 xml_parse(&parser); 1063 1064 checkfileerror(stdin, "<stdin>", 'r'); 1065 checkfileerror(stdout, "<stdout>", 'w'); 1066 1067 return 0; 1068 }