util: rewrite uri parser - sfeed

commit ce1c1697a4d3a0e592b47ae65b4096d21d4cb90b
parent 3a598e3357e0bda6d5a5c828065feabb49b1c029
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Tue, 28 Jul 2015 21:24:06 +0200

util: rewrite uri parser

- don't print directly but use an internal buffer (also better for testing).
- encode uri when printing (security).
- add some comments.

Diffstat:
M util.c  | 186 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
M util.h  | 42 +++++++++++++++++++++++-------------------

2 files changed, 156 insertions(+), 72 deletions(-)
diff --git a/util.c b/util.c
@@ -1,72 +1,152 @@
+#include <sys/types.h>
+
 #include <ctype.h>
 #include <err.h>
 #include <errno.h>
 #include <libgen.h>
 #include <limits.h>
+#include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <stdarg.h>
 #include <string.h>
-#include <sys/types.h>
 #include <time.h>
 #include <wchar.h>
 
 #include "util.h"
 
-void
-printurlencoded(const char *s, size_t len, FILE *fp)
+static void
+encodehex(unsigned char c, char *s)
 {
+	static const char *table = "0123456789ABCDEF";
+
+	s[0] = table[((c - (c % 16)) / 16) % 16];
+	s[1] = table[c % 16];
+}
+
+int
+parseuri(const char *s, struct uri *u, int rel)
+{
+	const char *p = s;
 	size_t i;
 
-	for(i = 0; i < len && s[i]; i++) {
-		if((int)s[i] == ' ')
-			fputs("%20", fp);
-		else if((unsigned char)s[i] > 127 || iscntrl((int)s[i]))
-			fprintf(fp, "%%%02X", (unsigned char)s[i]);
-		else
-			fputc(s[i], fp);
+	memset(u, 0, sizeof(struct uri));
+	if (!*s)
+		return 0;
+
+	/* prefix is "//", don't read protocol, skip to domain parsing */
+	if (!strncmp(p, "//", 2)) {
+		p += 2; /* skip "//" */
+	} else {
+		/* protocol part */
+		for (p = s; *p && (isalpha((int)*p) || isdigit((int)*p) ||
+			       *p == '+' || *p == '-' || *p == '.'); p++)
+			;
+		if (!strncmp(p, "://", 3)) {
+			if (p - s + 1 >= (ssize_t)sizeof(u->proto))
+				return -1; /* protocol too long */
+			memcpy(u->proto, s, p - s);
+			p += 3; /* skip "://" */
+		} else {
+			p = s; /* no protocol format, set to start */
+			/* relative url: read rest as path, else as domain */
+			if (rel)
+				goto readpath;
+		}
 	}
+	/* domain / host part, skip until "/" or end. */
+	i = strcspn(p, "/");
+	if (i + 1 >= sizeof(u->host))
+		return -1; /* host too long */
+	memcpy(u->host, p, i);
+	p = &p[i];
+
+readpath:
+	if (u->host[0]) {
+		p = &p[strspn(p, "/")];
+		strlcpy(u->path, "/", sizeof(u->path));
+	} else {
+		/* having no host is an error in this case */
+		if (!rel)
+			return -1;
+	}
+	/* treat truncation as an error */
+	return strlcat(u->path, p, sizeof(u->path)) >= sizeof(u->path) ? -1 : 0;
 }
 
-/* print link; if link is relative use baseurl to make it absolute */
-void
-printlink(const char *link, const char *baseurl, FILE *fp)
+/* get absolute uri; if link is relative use baseuri to make it absolute */
+int
+absuri(const char *link, const char *base, char *buf, size_t bufsiz)
 {
-	const char *ebaseproto, *ebasedomain, *p;
-	int isrelative;
-
-	/* protocol part */
-	for(p = link; *p && (isalpha((int)*p) || isdigit((int)*p) ||
-		                *p == '+' || *p == '-' || *p == '.'); p++);
-	/* relative link (baseurl is used). */
-	isrelative = strncmp(p, "://", strlen("://"));
-	if(isrelative) {
-		if((ebaseproto = strstr(baseurl, "://"))) {
-			ebaseproto += strlen("://");
-			printurlencoded(baseurl, ebaseproto - baseurl, fp);
+	struct uri ulink, ubase;
+	char tmp[4096] = "", *p;
+	int r = -1, c;
+
+	buf[0] = '\0';
+	if (parseuri(base, &ubase, 0) == -1 ||
+	    parseuri(link, &ulink, 1) == -1)
+		return -1;
+
+	if (!ulink.host[0] && !ubase.host[0])
+		return -1;
+
+	r = snprintf(tmp, sizeof(tmp), "%s://%s",
+		ulink.proto[0] ?
+			ulink.proto :
+			(ubase.proto[0] ? ubase.proto : "http"),
+		!strncmp(link, "//", 2) ?
+			ulink.host :
+			(ulink.host[0] ? ulink.host : ubase.host));
+	if (r == -1 || (size_t)r >= sizeof(tmp))
+		return -1;
+
+	/* relative to root */
+	if (!ulink.host[0] && ulink.path[0] != '/') {
+		/* relative to base url path */
+		if (ulink.path[0]) {
+			if ((p = strrchr(ubase.path, '/'))) {
+				/* temporary null-terminate */
+				c = *(++p);
+				*p = '\0';
+				strlcat(tmp, ubase.path, sizeof(tmp));
+				*p = c; /* restore */
+			}
 		} else {
-			ebaseproto = baseurl;
-			if(*baseurl || (link[0] == '/' && link[1] == '/'))
-				fputs("http://", fp);
+			strlcat(tmp, ubase.path, sizeof(tmp));
 		}
-		if(link[0] == '/') { /* relative to baseurl domain (not path).  */
-			if(link[1] == '/') /* absolute url but with protocol from baseurl. */
-				link += 2;
-			else if((ebasedomain = strchr(ebaseproto, '/')))
-				/* relative to baseurl and baseurl path. */
-				printurlencoded(ebaseproto, ebasedomain - ebaseproto, fp);
-			else
-				printurlencoded(ebaseproto, strlen(ebaseproto), fp);
-		} else if((ebasedomain = strrchr(ebaseproto, '/'))) {
-			/* relative to baseurl and baseurl path. */
-			printurlencoded(ebaseproto, ebasedomain - ebaseproto + 1, fp);
+	}
+	if (strlcat(tmp, ulink.path, sizeof(tmp)) >= sizeof(tmp))
+		return -1;
+
+	return encodeuri(tmp, buf, bufsiz);
+}
+
+int
+encodeuri(const char *s, char *buf, size_t bufsiz)
+{
+	size_t i, b;
+
+	if (!bufsiz)
+		return -1;
+	for (i = 0, b = 0; s[i]; i++) {
+		if ((int)s[i] == ' ' ||
+		    (unsigned char)s[i] > 127 ||
+		    iscntrl((int)s[i])) {
+			if (b + 3 >= bufsiz)
+				return -1;
+			buf[b++] = '%';
+			encodehex(s[i], &buf[b]);
+			b += 2;
 		} else {
-			printurlencoded(ebaseproto, strlen(ebaseproto), fp);
-			if(*baseurl && *link)
-				fputc('/', fp);
+			if (b >= bufsiz)
+				return -1;
+			buf[b++] = s[i];
 		}
 	}
-	printurlencoded(link, strlen(link), fp);
+	if (b >= bufsiz)
+		return -1;
+	buf[b] = '\0';
+
+	return 0;
 }
 
 /* read a field-separated line from 'fp',
@@ -135,6 +215,8 @@ printxmlencoded(const char *s, FILE *fp)
 	}
 }
 
+/* print `len` columns of characters. If string is shorter pad the rest
+ * with characters `pad`. */
 void
 printutf8pad(FILE *fp, const char *s, size_t len, int pad)
 {
@@ -156,6 +238,7 @@ printutf8pad(FILE *fp, const char *s, size_t len, int pad)
 		putc(pad, fp);
 }
 
+/* parse time to time_t, assumes time_t is signed */
 int
 strtotime(const char *s, time_t *t)
 {
@@ -179,15 +262,12 @@ printcontent(const char *s, FILE *fp)
 
 	for(p = s; *p; p++) {
 		if(*p == '\\') {
-			p++;
-			if(*p == '\\')
-				fputc('\\', fp);
-			else if(*p == 't')
-				fputc('\t', fp);
-			else if(*p == 'n')
-				fputc('\n', fp);
-			else
-				fputc(*p, fp); /* unknown */
+			switch (*(++p)) {
+			case '\\': fputc('\\', fp); break;
+			case 't':  fputc('\t', fp); break;
+			case 'n':  fputc('\n', fp); break;
+			default:   fputc(*p,   fp);
+			}
 		} else {
 			fputc(*p, fp);
 		}
diff --git a/util.h b/util.h
@@ -1,6 +1,3 @@
-#include <stdio.h>
-#include <time.h>
-
 #ifdef COMPAT
 #include "compat.h"
 #endif
@@ -10,27 +7,34 @@
 
 /* feed info */
 struct feed {
-	char *         name;     /* feed name */
-	unsigned long  totalnew; /* amount of new items per feed */
-	unsigned long  total;    /* total items */
-	time_t         timenewest;
-	char           timenewestformat[64];
+	char *        name;     /* feed name */
+	unsigned long totalnew; /* amount of new items per feed */
+	unsigned long total;    /* total items */
+	time_t        timenewest;
+	char          timenewestformat[64];
+};
+
+/* uri */
+struct uri {
+	char proto[48];
+	char host[255];
+	char path[2048];
 };
 
 enum { FieldUnixTimestamp = 0, FieldTimeFormatted, FieldTitle, FieldLink,
        FieldContent, FieldContentType, FieldId, FieldAuthor, FieldFeedType,
        FieldLast };
 
-ssize_t chartoxmlentity(int, char *, size_t);
-int     parseline(char **, size_t *, char **, unsigned int, int, FILE *);
-void    printcontent(const char *, FILE *);
-void    printxmlencoded(const char *, FILE *);
-void    printlink(const char *, const char *, FILE *);
-void    printurlencoded(const char *, size_t, FILE *);
-void    printutf8pad(FILE *, const char *, size_t, int);
-int     strtotime(const char *, time_t *);
-char   *trimstart(const char *);
-char   *trimend(const char *);
-char   *xbasename(const char *);
+int    absuri(const char *, const char *, char *, size_t);
+int    encodeuri(const char *, char *, size_t);
+int    parseline(char **, size_t *, char **, unsigned int, int, FILE *);
+int    parseuri(const char *, struct uri *, int);
+void   printcontent(const char *, FILE *);
+void   printxmlencoded(const char *, FILE *);
+void   printutf8pad(FILE *, const char *, size_t, int);
+int    strtotime(const char *, time_t *);
+char * trimstart(const char *);
+char * trimend(const char *);
+char * xbasename(const char *);

	sfeed simple feed reader - forked from git.codemadness.org/sfeed
	git clone git://src.gearsix.net/sfeed	sfeed.zip
	Log \| Files \| Refs \| Atom \| README \| LICENSE

M	util.c	\|	186	++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
M	util.h	\|	42	+++++++++++++++++++++++-------------------