commit ce1c1697a4d3a0e592b47ae65b4096d21d4cb90b
parent 3a598e3357e0bda6d5a5c828065feabb49b1c029
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Tue, 28 Jul 2015 21:24:06 +0200
util: rewrite uri parser
- don't print directly but use an internal buffer (also better for testing).
- encode uri when printing (security).
- add some comments.
Diffstat:
M | util.c | | | 186 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------- |
M | util.h | | | 42 | +++++++++++++++++++++++------------------- |
2 files changed, 156 insertions(+), 72 deletions(-)
diff --git a/util.c b/util.c
@@ -1,72 +1,152 @@
+#include <sys/types.h>
+
#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <libgen.h>
#include <limits.h>
+#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
-#include <stdarg.h>
#include <string.h>
-#include <sys/types.h>
#include <time.h>
#include <wchar.h>
#include "util.h"
-void
-printurlencoded(const char *s, size_t len, FILE *fp)
+static void
+encodehex(unsigned char c, char *s)
{
+ static const char *table = "0123456789ABCDEF";
+
+ s[0] = table[((c - (c % 16)) / 16) % 16];
+ s[1] = table[c % 16];
+}
+
+int
+parseuri(const char *s, struct uri *u, int rel)
+{
+ const char *p = s;
size_t i;
- for(i = 0; i < len && s[i]; i++) {
- if((int)s[i] == ' ')
- fputs("%20", fp);
- else if((unsigned char)s[i] > 127 || iscntrl((int)s[i]))
- fprintf(fp, "%%%02X", (unsigned char)s[i]);
- else
- fputc(s[i], fp);
+ memset(u, 0, sizeof(struct uri));
+ if (!*s)
+ return 0;
+
+ /* prefix is "//", don't read protocol, skip to domain parsing */
+ if (!strncmp(p, "//", 2)) {
+ p += 2; /* skip "//" */
+ } else {
+ /* protocol part */
+ for (p = s; *p && (isalpha((int)*p) || isdigit((int)*p) ||
+ *p == '+' || *p == '-' || *p == '.'); p++)
+ ;
+ if (!strncmp(p, "://", 3)) {
+ if (p - s + 1 >= (ssize_t)sizeof(u->proto))
+ return -1; /* protocol too long */
+ memcpy(u->proto, s, p - s);
+ p += 3; /* skip "://" */
+ } else {
+ p = s; /* no protocol format, set to start */
+ /* relative url: read rest as path, else as domain */
+ if (rel)
+ goto readpath;
+ }
}
+ /* domain / host part, skip until "/" or end. */
+ i = strcspn(p, "/");
+ if (i + 1 >= sizeof(u->host))
+ return -1; /* host too long */
+ memcpy(u->host, p, i);
+ p = &p[i];
+
+readpath:
+ if (u->host[0]) {
+ p = &p[strspn(p, "/")];
+ strlcpy(u->path, "/", sizeof(u->path));
+ } else {
+ /* having no host is an error in this case */
+ if (!rel)
+ return -1;
+ }
+ /* treat truncation as an error */
+ return strlcat(u->path, p, sizeof(u->path)) >= sizeof(u->path) ? -1 : 0;
}
-/* print link; if link is relative use baseurl to make it absolute */
-void
-printlink(const char *link, const char *baseurl, FILE *fp)
+/* get absolute uri in buf; if link is relative, use base to make it absolute */
+int
+absuri(const char *link, const char *base, char *buf, size_t bufsiz)
{
- const char *ebaseproto, *ebasedomain, *p;
- int isrelative;
-
- /* protocol part */
- for(p = link; *p && (isalpha((int)*p) || isdigit((int)*p) ||
- *p == '+' || *p == '-' || *p == '.'); p++);
- /* relative link (baseurl is used). */
- isrelative = strncmp(p, "://", strlen("://"));
- if(isrelative) {
- if((ebaseproto = strstr(baseurl, "://"))) {
- ebaseproto += strlen("://");
- printurlencoded(baseurl, ebaseproto - baseurl, fp);
+ struct uri ulink, ubase;
+ char tmp[4096] = "", *p;
+ int r = -1, c;
+
+ buf[0] = '\0';
+ if (parseuri(base, &ubase, 0) == -1 ||
+ parseuri(link, &ulink, 1) == -1)
+ return -1;
+
+ if (!ulink.host[0] && !ubase.host[0])
+ return -1;
+
+ r = snprintf(tmp, sizeof(tmp), "%s://%s",
+ ulink.proto[0] ?
+ ulink.proto :
+ (ubase.proto[0] ? ubase.proto : "http"),
+ !strncmp(link, "//", 2) ?
+ ulink.host :
+ (ulink.host[0] ? ulink.host : ubase.host));
+ if (r == -1 || (size_t)r >= sizeof(tmp))
+ return -1;
+
+ /* link has no host and its path is not relative to root */
+ if (!ulink.host[0] && ulink.path[0] != '/') {
+ /* relative to base url path */
+ if (ulink.path[0]) {
+ if ((p = strrchr(ubase.path, '/'))) {
+ /* temporarily NUL-terminate just after the last '/' */
+ c = *(++p);
+ *p = '\0';
+ strlcat(tmp, ubase.path, sizeof(tmp));
+ *p = c; /* restore */
+ }
} else {
- ebaseproto = baseurl;
- if(*baseurl || (link[0] == '/' && link[1] == '/'))
- fputs("http://", fp);
+ strlcat(tmp, ubase.path, sizeof(tmp));
}
- if(link[0] == '/') { /* relative to baseurl domain (not path). */
- if(link[1] == '/') /* absolute url but with protocol from baseurl. */
- link += 2;
- else if((ebasedomain = strchr(ebaseproto, '/')))
- /* relative to baseurl and baseurl path. */
- printurlencoded(ebaseproto, ebasedomain - ebaseproto, fp);
- else
- printurlencoded(ebaseproto, strlen(ebaseproto), fp);
- } else if((ebasedomain = strrchr(ebaseproto, '/'))) {
- /* relative to baseurl and baseurl path. */
- printurlencoded(ebaseproto, ebasedomain - ebaseproto + 1, fp);
+ }
+ if (strlcat(tmp, ulink.path, sizeof(tmp)) >= sizeof(tmp))
+ return -1;
+
+ return encodeuri(tmp, buf, bufsiz);
+}
+
+int
+encodeuri(const char *s, char *buf, size_t bufsiz)
+{
+ size_t i, b;
+
+ if (!bufsiz)
+ return -1;
+ for (i = 0, b = 0; s[i]; i++) {
+ if ((int)s[i] == ' ' ||
+ (unsigned char)s[i] > 127 ||
+ iscntrl((int)s[i])) {
+ if (b + 3 >= bufsiz)
+ return -1;
+ buf[b++] = '%';
+ encodehex(s[i], &buf[b]);
+ b += 2;
} else {
- printurlencoded(ebaseproto, strlen(ebaseproto), fp);
- if(*baseurl && *link)
- fputc('/', fp);
+ if (b >= bufsiz)
+ return -1;
+ buf[b++] = s[i];
}
}
- printurlencoded(link, strlen(link), fp);
+ if (b >= bufsiz)
+ return -1;
+ buf[b] = '\0';
+
+ return 0;
}
/* read a field-separated line from 'fp',
@@ -135,6 +215,8 @@ printxmlencoded(const char *s, FILE *fp)
}
}
+/* print `len` columns of characters. If string is shorter pad the rest
+ * with characters `pad`. */
void
printutf8pad(FILE *fp, const char *s, size_t len, int pad)
{
@@ -156,6 +238,7 @@ printutf8pad(FILE *fp, const char *s, size_t len, int pad)
putc(pad, fp);
}
+/* parse time to time_t, assumes time_t is signed */
int
strtotime(const char *s, time_t *t)
{
@@ -179,15 +262,12 @@ printcontent(const char *s, FILE *fp)
for(p = s; *p; p++) {
if(*p == '\\') {
- p++;
- if(*p == '\\')
- fputc('\\', fp);
- else if(*p == 't')
- fputc('\t', fp);
- else if(*p == 'n')
- fputc('\n', fp);
- else
- fputc(*p, fp); /* unknown */
+ switch (*(++p)) {
+ case '\\': fputc('\\', fp); break;
+ case 't': fputc('\t', fp); break;
+ case 'n': fputc('\n', fp); break;
+ default: fputc(*p, fp);
+ }
} else {
fputc(*p, fp);
}
diff --git a/util.h b/util.h
@@ -1,6 +1,3 @@
-#include <stdio.h>
-#include <time.h>
-
#ifdef COMPAT
#include "compat.h"
#endif
@@ -10,27 +7,34 @@
/* feed info */
struct feed {
- char * name; /* feed name */
- unsigned long totalnew; /* amount of new items per feed */
- unsigned long total; /* total items */
- time_t timenewest;
- char timenewestformat[64];
+ char * name; /* feed name */
+ unsigned long totalnew; /* amount of new items per feed */
+ unsigned long total; /* total items */
+ time_t timenewest;
+ char timenewestformat[64];
+};
+
+/* uri */
+struct uri {
+ char proto[48];
+ char host[255];
+ char path[2048];
};
enum { FieldUnixTimestamp = 0, FieldTimeFormatted, FieldTitle, FieldLink,
FieldContent, FieldContentType, FieldId, FieldAuthor, FieldFeedType,
FieldLast };
-ssize_t chartoxmlentity(int, char *, size_t);
-int parseline(char **, size_t *, char **, unsigned int, int, FILE *);
-void printcontent(const char *, FILE *);
-void printxmlencoded(const char *, FILE *);
-void printlink(const char *, const char *, FILE *);
-void printurlencoded(const char *, size_t, FILE *);
-void printutf8pad(FILE *, const char *, size_t, int);
-int strtotime(const char *, time_t *);
-char *trimstart(const char *);
-char *trimend(const char *);
-char *xbasename(const char *);
+int absuri(const char *, const char *, char *, size_t);
+int encodeuri(const char *, char *, size_t);
+int parseline(char **, size_t *, char **, unsigned int, int, FILE *);
+int parseuri(const char *, struct uri *, int);
+void printcontent(const char *, FILE *);
+void printxmlencoded(const char *, FILE *);
+void printutf8pad(FILE *, const char *, size_t, int);
+int strtotime(const char *, time_t *);
+char * trimstart(const char *);
+char * trimend(const char *);
+char * xbasename(const char *);