commit e771e43d51830ec7d2a19d9d4e67cded83c1b302
parent f054e581dac4921b302e0459a40d1b4f1fbd28ae
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Wed, 21 Oct 2020 22:06:58 +0200
sfeed_web: attribute parsing improvements, improve man page
Fix attribute parsing and decode entities in attribute values. The following now
works (from helsinkitimes.fi):
<base href="https://www.helsinkitimes.fi/" />
<link href="/?format=feed&type=rss" rel="alternate" type="application/rss+xml" title="RSS 2.0" />
<link href="/?format=feed&type=atom" rel="alternate" type="application/atom+xml" title="Atom 1.0" />
Properly associate attributes with the tag they belong to. The following is now
parsed correctly (from ascii.jp):
<link rel="apple-touch-icon-precomposed" href="/img/apple-touch-icon.png" />
<link rel="alternate" type="application/rss+xml" />
Diffstat:
M | sfeed_web.1 | | | 13 | ++++++------- |
M | sfeed_web.c | | | 94 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------ |
2 files changed, 72 insertions(+), 35 deletions(-)
diff --git a/sfeed_web.1 b/sfeed_web.1
@@ -1,4 +1,4 @@
-.Dd March 15, 2020
+.Dd October 22, 2020
.Dt SFEED_WEB 1
.Os
.Sh NAME
@@ -21,13 +21,12 @@ url<TAB>content\-type<newline>
.Bl -tag -width Ds
.It url
Found relative or absolute url.
-If the url is relative and the
+.Pp
+For relative urls if a <base href="..." /> tag is found it will be used,
+otherwise if the
.Ar baseurl
-option is
-specified then the url is made absolute.
-If the url is relative and no
-.Ar baseurl
-option is specified then it is printed as is.
+option is specified then that is used, if neither are set then the relative url
+is printed.
.It content\-type
Usually application/atom+xml or application/rss+xml.
.El
diff --git a/sfeed_web.c b/sfeed_web.c
@@ -10,65 +10,101 @@
#define STRP(s) s,sizeof(s)-1
static XMLParser parser;
-static int isbase, islink, isfeedlink;
-static char abslink[4096], feedlink[4096], basehref[4096], feedtype[256];
+static int isbasetag, islinktag, ishrefattr, istypeattr;
+static char linkhref[4096], linktype[256], basehref[4096];
+static char abslink[4096];
static void
-printfeedtype(const char *s, FILE *fp)
+printvalue(const char *s)
{
for (; *s; s++)
- if (!isspace((unsigned char)*s))
- fputc(*s, fp);
+ if (!iscntrl((unsigned char)*s))
+ putchar(*s);
}
static void
xmltagstart(XMLParser *p, const char *t, size_t tl)
{
- isbase = islink = isfeedlink = 0;
- feedlink[0] = '\0';
+ isbasetag = islinktag = 0;
if (!strcasecmp(t, "base"))
- isbase = 1;
+ isbasetag = 1;
else if (!strcasecmp(t, "link"))
- islink = 1;
+ islinktag = 1;
}
static void
xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
{
- if (!isfeedlink)
+ if (!islinktag)
return;
- if (absuri(abslink, sizeof(abslink), feedlink, basehref) != -1)
- fputs(abslink, stdout);
+ if (strncasecmp(linktype, STRP("application/atom")) &&
+ strncasecmp(linktype, STRP("application/xml")) &&
+ strncasecmp(linktype, STRP("application/rss")))
+ return;
+
+ if (absuri(abslink, sizeof(abslink), linkhref, basehref) != -1)
+ printvalue(abslink);
else
- fputs(feedlink, stdout);
+ printvalue(linkhref);
putchar('\t');
- printfeedtype(feedtype, stdout);
+ printvalue(linktype);
putchar('\n');
}
static void
+xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *a, size_t al)
+{
+ ishrefattr = istypeattr = 0;
+
+ if (!isbasetag && !islinktag)
+ return;
+
+ if (!strcasecmp(a, "href")) {
+ ishrefattr = 1;
+ if (isbasetag)
+ basehref[0] = '\0';
+ else if (islinktag)
+ linkhref[0] = '\0';
+ } else if (!strcasecmp(a, "type") && islinktag) {
+ istypeattr = 1;
+ linktype[0] = '\0';
+ }
+}
+
+static void
xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
const char *v, size_t vl)
{
- if (isbase) {
- if (!strcasecmp(n, "href"))
- strlcpy(basehref, v, sizeof(basehref));
- } else if (islink) {
- if (!strcasecmp(n, "type")) {
- if (!strncasecmp(v, STRP("application/atom")) ||
- !strncasecmp(v, STRP("application/xml")) ||
- !strncasecmp(v, STRP("application/rss"))) {
- isfeedlink = 1;
- strlcpy(feedtype, v, sizeof(feedtype));
- }
- } else if (!strcasecmp(n, "href")) {
- strlcpy(feedlink, v, sizeof(feedlink));
- }
+ if (isbasetag && ishrefattr) {
+ strlcat(basehref, v, sizeof(basehref));
+ } else if (islinktag) {
+ if (ishrefattr)
+ strlcat(linkhref, v, sizeof(linkhref));
+ else if (istypeattr)
+ strlcat(linktype, v, sizeof(linktype));
}
}
+static void
+xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *a, size_t al,
+ const char *v, size_t vl)
+{
+ char buf[16];
+ int len;
+
+ if (!ishrefattr && !istypeattr)
+ return;
+
+ /* try to translate entity, else just pass as data to
+ * xmlattr handler. */
+ if ((len = xml_entitytostr(v, buf, sizeof(buf))) > 0)
+ xmlattr(p, t, tl, a, al, buf, (size_t)len);
+ else
+ xmlattr(p, t, tl, a, al, v, vl);
+}
+
int
main(int argc, char *argv[])
{
@@ -79,6 +115,8 @@ main(int argc, char *argv[])
strlcpy(basehref, argv[1], sizeof(basehref));
parser.xmlattr = xmlattr;
+ parser.xmlattrentity = xmlattrentity;
+ parser.xmlattrstart = xmlattrstart;
parser.xmltagstart = xmltagstart;
parser.xmltagstartparsed = xmltagstartparsed;