util: improve/refactor URI parsing and formatting - sfeed

commit f305b032bc19b4e81c0dd6c0398370028ea910ca
parent 30476d22307aaa38170da5241a5d5e9864c4e76d
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Tue, 16 Feb 2021 18:38:56 +0100

util: improve/refactor URI parsing and formatting

Removed/rewritten the functions:
absuri, parseuri, and encodeuri() for percent-encoding.

The functions are now split separately with the following purpose:

- uri_format: format struct uri into a string.
- uri_hasscheme: quick check if a string is absolute or not.
- uri_makeabs: make a URI absolute using a base uri and the original URI.
- uri_parse: parse a string into a struct uri.

The following URLs are better parsed:

- URLs with extra leading "/"'s in the path are kept as is, and no "/" is
  added for empty paths either.
- URLs like "http://codemadness.org" are not changed to
  "http://codemadness.org/" anymore (paths are kept as is, unless they are
  non-empty and do not start with "/").
- Paths are not percent-encoded anymore.
- URLs with a userinfo field (username, password) are parsed,
  like: ftp://user:password@[2001:db8::7]:2121/rfc/rfc1808.txt
- Non-authoritative URLs like mailto:some@email.org, magnet URIs, ISBN URIs/urn,
  like: urn:isbn:0-395-36341-1 are allowed and parsed correctly.
- Both local (file:///) and non-local (file://) are supported.
- Specifying a base URL with a port will now only use it when the relative URL
  has no host and port set and follows RFC3986 5.2.2 more closely.
- Parsing numeric port: parse as signed long and reject values <= 0; an empty
  port is allowed.
- Parsing URIs containing query, fragment, but no path separator (/) will now
  parse the component properly.

For sfeed:

- Parse the baseURI only once (no need to do it every time for making absolute
  URIs).
- If a link/enclosure is absolute already or if there is no base URL specified
  then just print the link directly. There have also been other small performance
  improvements related to handling URIs.

References:
- https://tools.ietf.org/html/rfc3986
  - Section "5.2.2. Transform References" has also been helpful.

Diffstat:
M sfeed.1  | 11 +++++++----
M sfeed.5  | 4 ++--
M sfeed.c  | 31 +++++++++++++++++++++++--------
M sfeed_gopher.c  | 18 ++++++++++++++++--
M sfeed_web.c  | 17 ++++++++++++++---
M util.c  | 284 ++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
M util.h  | 18 ++++++++++++------

7 files changed, 234 insertions(+), 149 deletions(-)
diff --git a/sfeed.1 b/sfeed.1
@@ -1,4 +1,4 @@
-.Dd January 26, 2021
+.Dd February 19, 2021
 .Dt SFEED 1
 .Os
 .Sh NAME
@@ -13,8 +13,11 @@ reads RSS or Atom feed data (XML) from stdin.
 It writes the feed data in a TAB-separated format to stdout.
 A
 .Ar baseurl
-can be specified if the links in the feed are relative URLs.
-It is recommended to always have absolute URLs in your feeds.
+can be specified if the links or enclosures in the feed are relative URLs.
+If the
+.Ar baseurl
+is a valid absolute URL then the relative links or enclosures will be
+made absolute.
 .Sh TAB-SEPARATED FORMAT FIELDS
 The items are output per line in a TSV-like format.
 .Pp
@@ -35,7 +38,7 @@ UNIX timestamp in UTC+0, empty if missing or on parse failure.
 .It title
 Title text, HTML code in titles is ignored and is treated as plain-text.
 .It link
-Absolute URL, unsafe characters are encoded.
+Link
 .It content
 Content, can have plain-text or HTML code depending on the content-type field.
 .It content-type
diff --git a/sfeed.5 b/sfeed.5
@@ -1,4 +1,4 @@
-.Dd September 19, 2020
+.Dd February 19, 2021
 .Dt SFEED 5
 .Os
 .Sh NAME
@@ -29,7 +29,7 @@ UNIX timestamp in UTC+0, empty if missing or on parse failure.
 .It title
 Title text, HTML code in titles is ignored and is treated as plain-text.
 .It link
-Absolute URL, unsafe characters are encoded.
+Link
 .It content
 Content, can have plain-text or HTML code depending on the content-type field.
 .It content-type
diff --git a/sfeed.c b/sfeed.c
@@ -204,7 +204,8 @@ static int fieldmap[TagLast] = {
 static const int FieldSeparator = '\t';
 /* separator for multiple values in a field, separator should be 1 byte */
 static const char *FieldMultiSeparator = "|";
-static const char *baseurl = "";
+static struct uri baseuri;
+static const char *baseurl;
 
 static FeedContext ctx;
 static XMLParser parser; /* XML parser state */
@@ -381,23 +382,33 @@ string_print_trimmed_multi(String *s)
 	}
 }
 
-/* always print absolute urls (using global baseurl) */
+/* always print absolute URLs (using global baseurl) */
 void
 printuri(char *s)
 {
 	char link[4096], *p, *e;
-	int c;
+	struct uri newuri, olduri;
+	int c, r = -1;
 
 	p = ltrim(s);
 	e = rtrim(p);
 	c = *e;
 	*e = '\0';
-	if (absuri(link, sizeof(link), p, baseurl) != -1)
-		fputs(link, stdout);
+
+	if (baseurl && !uri_hasscheme(p) &&
+	    uri_parse(p, &olduri) != -1 && !olduri.proto[0] &&
+	    uri_makeabs(&newuri, &olduri, &baseuri) != -1 && newuri.proto[0])
+		r = uri_format(link, sizeof(link), &newuri);
+
+	if (r >= 0 && (size_t)r < sizeof(link))
+		printtrimmed(link);
+	else
+		printtrimmed(p);
+
 	*e = c; /* restore NUL byte to original character */
 }
 
-/* always print absolute urls (using global baseurl) */
+/* always print absolute URLs (using global baseurl) */
 void
 string_print_uri(String *s)
 {
@@ -1015,8 +1026,12 @@ main(int argc, char *argv[])
 	if (pledge("stdio", NULL) == -1)
 		err(1, "pledge");
 
-	if (argc > 1)
-		baseurl = argv[1];
+	if (argc > 1) {
+		if (uri_parse(argv[1], &baseuri) != -1 && baseuri.proto[0])
+			baseurl = argv[1];
+		else
+			errx(1, "baseurl incorrect or too long");
+	}
 
 	memcpy(&(ctx.tag), &notag, sizeof(ctx.tag));
 
diff --git a/sfeed_gopher.c b/sfeed_gopher.c
@@ -38,7 +38,8 @@ static void
 printfeed(FILE *fpitems, FILE *fpin, struct feed *f)
 {
 	struct uri u;
-	char *fields[FieldLast], *itemhost, *itemport, *itempath;
+	char *fields[FieldLast];
+	char *itemhost, *itemport, *itempath, *itemquery, *itemfragment;
 	ssize_t linelen;
 	unsigned int isnew;
 	struct tm rtm, *tm;
@@ -59,15 +60,20 @@ printfeed(FILE *fpitems, FILE *fpin, struct feed *f)
 		itemport = port;
 		itemtype = 'i';
 		itempath = fields[FieldLink];
+		itemquery = "";
+		itemfragment = "";
 
 		if (fields[FieldLink][0]) {
 			itemtype = 'h';
+			/* if it's a gopher URL then change it into a direntry */
 			if (!strncmp(fields[FieldLink], "gopher://", 9) &&
-			    parseuri(fields[FieldLink], &u, 0) != -1) {
+			    uri_parse(fields[FieldLink], &u) != -1) {
 				itemhost = u.host;
 				itemport = u.port[0] ? u.port : "70";
 				itemtype = '1';
 				itempath = u.path;
+				itemquery = u.query;
+				itemfragment = u.fragment;
 
 				if (itempath[0] == '/') {
 					itempath++;
@@ -100,6 +106,14 @@ printfeed(FILE *fpitems, FILE *fpin, struct feed *f)
 		if (itemtype == 'h' && fields[FieldLink] == itempath)
 			fputs("URL:", fpitems);
 		gophertext(fpitems, itempath);
+		if (itemquery[0]) {
+			fputs("?", fpitems);
+			gophertext(fpitems, itemquery);
+		}
+		if (itemfragment[0]) {
+			fputs("#", fpitems);
+			gophertext(fpitems, itemfragment);
+		}
 		fprintf(fpitems, "\t%s\t%s\r\n", itemhost, itemport);
 	}
 	fputs(".\r\n", fpitems);
diff --git a/sfeed_web.c b/sfeed_web.c
@@ -12,7 +12,6 @@
 static XMLParser parser;
 static int isbasetag, islinktag, ishrefattr, istypeattr;
 static char linkhref[4096], linktype[256], basehref[4096];
-static char abslink[4096];
 
 static void
 printvalue(const char *s)
@@ -39,6 +38,10 @@ xmltagstart(XMLParser *p, const char *t, size_t tl)
 static void
 xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
 {
+	struct uri baseuri, linkuri, u;
+	char buf[4096];
+	int r = -1;
+
 	if (!islinktag)
 		return;
 
@@ -47,10 +50,18 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
 	    strncasecmp(linktype, STRP("application/rss")))
 		return;
 
-	if (absuri(abslink, sizeof(abslink), linkhref, basehref) != -1)
-		printvalue(abslink);
+	/* parse base URI each time: it can change. */
+	if (basehref[0] &&
+	    uri_parse(linkhref, &linkuri) != -1 && !linkuri.proto[0] &&
+	    uri_parse(basehref, &baseuri) != -1 &&
+	    uri_makeabs(&u, &linkuri, &baseuri) != -1 && u.proto[0])
+		r = uri_format(buf, sizeof(buf), &u);
+
+	if (r >= 0 && (size_t)r < sizeof(buf))
+		printvalue(buf);
 	else
 		printvalue(linkhref);
+
 	putchar('\t');
 	printvalue(linktype);
 	putchar('\n');
diff --git a/util.c b/util.c
@@ -7,167 +7,203 @@
 
 #include "util.h"
 
+/* check if string has a non-empty scheme / protocol part */
 int
-parseuri(const char *s, struct uri *u, int rel)
+uri_hasscheme(const char *s)
 {
-	const char *p = s, *b;
-	char *endptr = NULL;
+	const char *p = s;
+
+	for (; isalpha((unsigned char)*p) || isdigit((unsigned char)*p) ||
+		       *p == '+' || *p == '-' || *p == '.'; p++)
+		;
+	/* scheme, except if empty and starts with ":" then it is a path */
+	return (*p == ':' && p != s);
+}
+
+int
+uri_parse(const char *s, struct uri *u)
+{
+	const char *p = s;
+	char *endptr;
 	size_t i;
-	unsigned long l;
+	long l;
 
-	u->proto[0] = u->host[0] = u->path[0] = u->port[0] = '\0';
-	if (!*s)
-		return 0;
+	u->proto[0] = u->userinfo[0] = u->host[0] = u->port[0] = '\0';
+	u->path[0] = u->query[0] = u->fragment[0] = '\0';
 
-	/* prefix is "//", don't read protocol, skip to domain parsing */
-	if (!strncmp(p, "//", 2)) {
+	/* protocol-relative */
+	if (*p == '/' && *(p + 1) == '/') {
 		p += 2; /* skip "//" */
-	} else {
-		/* protocol part */
-		for (p = s; isalpha((unsigned char)*p) || isdigit((unsigned char)*p) ||
-			       *p == '+' || *p == '-' || *p == '.'; p++)
-			;
-		if (!strncmp(p, "://", 3)) {
-			if ((size_t)(p - s) >= sizeof(u->proto))
-				return -1; /* protocol too long */
-			memcpy(u->proto, s, p - s);
-			u->proto[p - s] = '\0';
+		goto parseauth;
+	}
+
+	/* scheme / protocol part */
+	for (; isalpha((unsigned char)*p) || isdigit((unsigned char)*p) ||
+		       *p == '+' || *p == '-' || *p == '.'; p++)
+		;
+	/* scheme, except if empty and starts with ":" then it is a path */
+	if (*p == ':' && p != s) {
+		if (*(p + 1) == '/' && *(p + 2) == '/')
 			p += 3; /* skip "://" */
-		} else {
-			p = s; /* no protocol format, set to start */
-			/* relative url: read rest as path, else as domain */
-			if (rel)
-				goto readpath;
-		}
+		else
+			p++; /* skip ":" */
+
+		if ((size_t)(p - s) >= sizeof(u->proto))
+			return -1; /* protocol too long */
+		memcpy(u->proto, s, p - s);
+		u->proto[p - s] = '\0';
+
+		if (*(p - 1) != '/')
+			goto parsepath;
+	} else {
+		p = s; /* no scheme format, reset to start */
+		goto parsepath;
+	}
+
+parseauth:
+	/* userinfo (username:password) */
+	i = strcspn(p, "@/?#");
+	if (p[i] == '@') {
+		if (i >= sizeof(u->userinfo))
+			return -1; /* userinfo too long */
+		memcpy(u->userinfo, p, i);
+		u->userinfo[i] = '\0';
+		p += i + 1;
 	}
+
 	/* IPv6 address */
 	if (*p == '[') {
-		/* bracket not found or host too long */
-		if (!(b = strchr(p, ']')) || (size_t)(b - p) < 3 ||
-		    (size_t)(b - p) >= sizeof(u->host))
+		/* bracket not found, host too short or too long */
+		i = strcspn(p, "]");
+		if (p[i] != ']' || i < 3)
 			return -1;
-		memcpy(u->host, p, b - p + 1);
-		u->host[b - p + 1] = '\0';
-		p = b + 1;
+		i++; /* including "]" */
 	} else {
 		/* domain / host part, skip until port, path or end. */
-		if ((i = strcspn(p, ":/")) >= sizeof(u->host))
-			return -1; /* host too long */
-		memcpy(u->host, p, i);
-		u->host[i] = '\0';
-		p = &p[i];
+		i = strcspn(p, ":/?#");
 	}
+	if (i >= sizeof(u->host))
+		return -1; /* host too long */
+	memcpy(u->host, p, i);
+	u->host[i] = '\0';
+	p += i;
+
 	/* port */
 	if (*p == ':') {
-		if ((i = strcspn(++p, "/")) >= sizeof(u->port))
+		p++;
+		if ((i = strcspn(p, "/?#")) >= sizeof(u->port))
 			return -1; /* port too long */
 		memcpy(u->port, p, i);
 		u->port[i] = '\0';
-		/* check for valid port: range 1 - 65535 */
+		/* check for valid port: range 1 - 65535, may be empty */
 		errno = 0;
-		l = strtoul(u->port, &endptr, 10);
-		if (errno || u->port[0] == '\0' || *endptr ||
-		    !l || l > 65535)
+		l = strtol(u->port, &endptr, 10);
+		if (i && (errno || *endptr || l <= 0 || l > 65535))
 			return -1;
-		p = &p[i];
+		p += i;
 	}
-readpath:
-	if (u->host[0]) {
-		p = &p[strspn(p, "/")];
-		strlcpy(u->path, "/", sizeof(u->path));
-	} else {
-		/* absolute uri must have a host specified */
-		if (!rel)
-			return -1;
-	}
-	/* treat truncation as an error */
-	if (strlcat(u->path, p, sizeof(u->path)) >= sizeof(u->path))
-		return -1;
-	return 0;
-}
 
-static int
-encodeuri(char *buf, size_t bufsiz, const char *s)
-{
-	static const char *table = "0123456789ABCDEF";
-	size_t i, b;
+parsepath:
+	/* path */
+	if ((i = strcspn(p, "?#")) >= sizeof(u->path))
+		return -1; /* path too long */
+	memcpy(u->path, p, i);
+	u->path[i] = '\0';
+	p += i;
 
-	for (i = 0, b = 0; s[i]; i++) {
-		if ((unsigned char)s[i] <= ' ' ||
-		    (unsigned char)s[i] >= 127) {
-			if (b + 3 >= bufsiz)
-				return -1;
-			buf[b++] = '%';
-			buf[b++] = table[((unsigned char)s[i] >> 4) & 15];
-			buf[b++] = table[(unsigned char)s[i] & 15];
-		} else if (b < bufsiz) {
-			buf[b++] = s[i];
-		} else {
-			return -1;
-		}
+	/* query */
+	if (*p == '?') {
+		p++;
+		if ((i = strcspn(p, "#")) >= sizeof(u->query))
+			return -1; /* query too long */
+		memcpy(u->query, p, i);
+		u->query[i] = '\0';
+		p += i;
+	}
+
+	/* fragment */
+	if (*p == '#') {
+		p++;
+		if ((i = strlen(p)) >= sizeof(u->fragment))
+			return -1; /* fragment too long */
+		memcpy(u->fragment, p, i);
+		u->fragment[i] = '\0';
 	}
-	if (b >= bufsiz)
-		return -1;
-	buf[b] = '\0';
 
 	return 0;
 }
 
-/* Get absolute uri; if `link` is relative use `base` to make it absolute.
- * the returned string in `buf` is uri encoded, see: encodeuri(). */
+/* Transform and try to make the URI `u` absolute using base URI `b` into `a`.
+   Follows some of the logic from "RFC 3986 - 5.2.2. Transform References".
+   Returns 0 on success, -1 on error or truncation. */
 int
-absuri(char *buf, size_t bufsiz, const char *link, const char *base)
+uri_makeabs(struct uri *a, struct uri *u, struct uri *b)
 {
-	struct uri ulink, ubase;
-	char tmp[4096], *host, *p, *port;
-	int c, r;
-	size_t i;
+	char *p;
+	int c;
 
-	buf[0] = '\0';
-	if (parseuri(base, &ubase, 0) == -1 ||
-	    parseuri(link, &ulink, 1) == -1 ||
-	    (!ulink.host[0] && !ubase.host[0]))
-		return -1;
+	strlcpy(a->fragment, u->fragment, sizeof(a->fragment));
 
-	if (!strncmp(link, "//", 2)) {
-		host = ulink.host;
-		port = ulink.port;
-	} else {
-		host = ulink.host[0] ? ulink.host : ubase.host;
-		port = ulink.port[0] ? ulink.port : ubase.port;
+	if (u->proto[0] || u->host[0]) {
+		strlcpy(a->proto, u->proto[0] ? u->proto : b->proto, sizeof(a->proto));
+		strlcpy(a->host, u->host, sizeof(a->host));
+		strlcpy(a->userinfo, u->userinfo, sizeof(a->userinfo));
+		strlcpy(a->host, u->host, sizeof(a->host));
+		strlcpy(a->port, u->port, sizeof(a->port));
+		strlcpy(a->path, u->path, sizeof(a->path));
+		strlcpy(a->query, u->query, sizeof(a->query));
+		return 0;
 	}
-	r = snprintf(tmp, sizeof(tmp), "%s://%s%s%s",
-		ulink.proto[0] ?
-			ulink.proto :
-			(ubase.proto[0] ? ubase.proto : "http"),
-		host,
-		port[0] ? ":" : "",
-		port);
-	if (r < 0 || (size_t)r >= sizeof(tmp))
-		return -1; /* error or truncation */
-
-	/* relative to root */
-	if (!ulink.host[0] && ulink.path[0] != '/') {
-		/* relative to base url path */
-		if (ulink.path[0]) {
-			if ((p = strrchr(ubase.path, '/'))) {
-				/* temporary null-terminate */
-				c = *(++p);
-				*p = '\0';
-				i = strlcat(tmp, ubase.path, sizeof(tmp));
-				*p = c; /* restore */
-				if (i >= sizeof(tmp))
-					return -1;
-			}
-		} else if (strlcat(tmp, ubase.path, sizeof(tmp)) >=
-		           sizeof(tmp)) {
-			return -1;
+
+	strlcpy(a->proto, b->proto, sizeof(a->proto));
+	strlcpy(a->host, b->host, sizeof(a->host));
+	strlcpy(a->userinfo, b->userinfo, sizeof(a->userinfo));
+	strlcpy(a->host, b->host, sizeof(a->host));
+	strlcpy(a->port, b->port, sizeof(a->port));
+
+	if (!u->path[0]) {
+		strlcpy(a->path, b->path, sizeof(a->path));
+	} else if (u->path[0] == '/') {
+		strlcpy(a->path, u->path, sizeof(a->path));
+	} else {
+		a->path[0] = (a->host[0] && b->path[0] != '/') ? '/' : '\0';
+		a->path[1] = '\0';
+
+		if ((p = strrchr(b->path, '/'))) {
+			c = *(++p);
+			*p = '\0'; /* temporary NUL-terminate */
+			if (strlcat(a->path, b->path, sizeof(a->path)) >= sizeof(a->path))
+				return -1;
+			*p = c; /* restore */
 		}
+		if (strlcat(a->path, u->path, sizeof(a->path)) >= sizeof(a->path))
+			return -1;
 	}
-	if (strlcat(tmp, ulink.path, sizeof(tmp)) >= sizeof(tmp))
-		return -1;
 
-	return encodeuri(buf, bufsiz, tmp);
+	if (u->path[0] || u->query[0])
+		strlcpy(a->query, u->query, sizeof(a->query));
+	else
+		strlcpy(a->query, b->query, sizeof(a->query));
+
+	return 0;
+}
+
+int
+uri_format(char *buf, size_t bufsiz, struct uri *u)
+{
+	return snprintf(buf, bufsiz, "%s%s%s%s%s%s%s%s%s%s%s%s",
+		u->proto,
+		u->userinfo[0] ? u->userinfo : "",
+		u->userinfo[0] ? "@" : "",
+		u->host,
+		u->port[0] ? ":" : "",
+		u->port,
+		u->host[0] && u->path[0] && u->path[0] != '/' ? "/" : "",
+		u->path,
+		u->query[0] ? "?" : "",
+		u->query,
+		u->fragment[0] ? "#" : "",
+		u->fragment);
 }
 
 /* Splits fields in the line buffer by replacing TAB separators with NUL ('\0')
diff --git a/util.h b/util.h
@@ -21,12 +21,15 @@ struct feed {
 	unsigned long total;    /* total items */
 };
 
-/* uri */
+/* URI */
 struct uri {
-	char proto[48];
+	char proto[48];     /* scheme including ":" or "://" */
+	char userinfo[256]; /* username [:password] */
 	char host[256];
-	char path[2048];
-	char port[6];     /* numeric port */
+	char port[6];       /* numeric port */
+	char path[1024];
+	char query[1024];
+	char fragment[1024];
 };
 
 enum {
@@ -35,9 +38,12 @@ enum {
 	FieldLast
 };
 
-int  absuri(char *, size_t, const char *, const char *);
+int uri_format(char *, size_t, struct uri *);
+int uri_hasscheme(const char *);
+int uri_makeabs(struct uri *, struct uri *, struct uri *);
+int uri_parse(const char *, struct uri *);
+
 void parseline(char *, char *[FieldLast]);
-int  parseuri(const char *, struct uri *, int);
 void printutf8pad(FILE *, const char *, size_t, int);
 int  strtotime(const char *, time_t *);
 void xmlencode(const char *, FILE *);

	sfeed simple feed reader - forked from git.codemadness.org/sfeed
	git clone git://src.gearsix.net/sfeed	sfeed.zip
	Log \| Files \| Refs \| Atom \| README \| LICENSE

M	sfeed.1	\|	11	+++++++----
M	sfeed.5	\|	4	++--
M	sfeed.c	\|	31	+++++++++++++++++++++++--------
M	sfeed_gopher.c	\|	18	++++++++++++++++--
M	sfeed_web.c	\|	17	++++++++++++++---
M	util.c	\|	284	++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
M	util.h	\|	18	++++++++++++------