sfeed

simple feed reader - forked from git.codemadness.org/sfeed
git clone git://src.gearsix.net/sfeed
Log | Files | Refs | Atom | README | LICENSE

xml.c (10011B)


      1 #include <errno.h>
      2 #include <stdio.h>
      3 #include <stdlib.h>
      4 #include <string.h>
      5 
      6 #include "xml.h"
      7 
      8 #define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26)
      9 #define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5))
     10 
     11 static void
     12 xml_parseattrs(XMLParser *x)
     13 {
     14 	size_t namelen = 0, valuelen;
     15 	int c, endsep, endname = 0, valuestart = 0;
     16 
     17 	while ((c = GETNEXT()) != EOF) {
     18 		if (ISSPACE(c)) {
     19 			if (namelen)
     20 				endname = 1;
     21 			continue;
     22 		} else if (c == '?')
     23 			; /* ignore */
     24 		else if (c == '=') {
     25 			x->name[namelen] = '\0';
     26 			valuestart = 1;
     27 			endname = 1;
     28 		} else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
     29 			/* attribute without value */
     30 			x->name[namelen] = '\0';
     31 			if (x->xmlattrstart)
     32 				x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
     33 			if (x->xmlattr)
     34 				x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
     35 			if (x->xmlattrend)
     36 				x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
     37 			endname = 0;
     38 			x->name[0] = c;
     39 			namelen = 1;
     40 		} else if (namelen && valuestart) {
     41 			/* attribute with value */
     42 			if (x->xmlattrstart)
     43 				x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
     44 
     45 			valuelen = 0;
     46 			if (c == '\'' || c == '"') {
     47 				endsep = c;
     48 			} else {
     49 				endsep = ' '; /* ISSPACE() */
     50 				goto startvalue;
     51 			}
     52 
     53 			while ((c = GETNEXT()) != EOF) {
     54 startvalue:
     55 				if (c == '&') { /* entities */
     56 					x->data[valuelen] = '\0';
     57 					/* call data function with data before entity if there is data */
     58 					if (valuelen && x->xmlattr)
     59 						x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
     60 					x->data[0] = c;
     61 					valuelen = 1;
     62 					while ((c = GETNEXT()) != EOF) {
     63 						if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
     64 							break;
     65 						if (valuelen < sizeof(x->data) - 1)
     66 							x->data[valuelen++] = c;
     67 						else {
     68 							/* entity too long for buffer, handle as normal data */
     69 							x->data[valuelen] = '\0';
     70 							if (x->xmlattr)
     71 								x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
     72 							x->data[0] = c;
     73 							valuelen = 1;
     74 							break;
     75 						}
     76 						if (c == ';') {
     77 							x->data[valuelen] = '\0';
     78 							if (x->xmlattrentity)
     79 								x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
     80 							valuelen = 0;
     81 							break;
     82 						}
     83 					}
     84 				} else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
     85 					if (valuelen < sizeof(x->data) - 1) {
     86 						x->data[valuelen++] = c;
     87 					} else {
     88 						x->data[valuelen] = '\0';
     89 						if (x->xmlattr)
     90 							x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
     91 						x->data[0] = c;
     92 						valuelen = 1;
     93 					}
     94 				}
     95 				if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
     96 					x->data[valuelen] = '\0';
     97 					if (x->xmlattr)
     98 						x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
     99 					if (x->xmlattrend)
    100 						x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
    101 					break;
    102 				}
    103 			}
    104 			namelen = endname = valuestart = 0;
    105 		} else if (namelen < sizeof(x->name) - 1) {
    106 			x->name[namelen++] = c;
    107 		}
    108 		if (c == '>') {
    109 			break;
    110 		} else if (c == '/') {
    111 			x->isshorttag = 1;
    112 			x->name[0] = '\0';
    113 			namelen = 0;
    114 		}
    115 	}
    116 }
    117 
    118 static void
    119 xml_parsecomment(XMLParser *x)
    120 {
    121 	int c, i = 0;
    122 
    123 	while ((c = GETNEXT()) != EOF) {
    124 		if (c == '-') {
    125 			if (++i > 2)
    126 				i = 2;
    127 			continue;
    128 		} else if (c == '>' && i == 2) {
    129 			return;
    130 		} else if (i) {
    131 			i = 0;
    132 		}
    133 	}
    134 }
    135 
    136 static void
    137 xml_parsecdata(XMLParser *x)
    138 {
    139 	size_t datalen = 0, i = 0;
    140 	int c;
    141 
    142 	while ((c = GETNEXT()) != EOF) {
    143 		if (c == ']' || c == '>') {
    144 			if (x->xmlcdata && datalen) {
    145 				x->data[datalen] = '\0';
    146 				x->xmlcdata(x, x->data, datalen);
    147 				datalen = 0;
    148 			}
    149 		}
    150 
    151 		if (c == ']') {
    152 			if (++i > 2) {
    153 				if (x->xmlcdata)
    154 					for (; i > 2; i--)
    155 						x->xmlcdata(x, "]", 1);
    156 				i = 2;
    157 			}
    158 			continue;
    159 		} else if (c == '>' && i == 2) {
    160 			return;
    161 		} else if (i) {
    162 			if (x->xmlcdata)
    163 				for (; i > 0; i--)
    164 					x->xmlcdata(x, "]", 1);
    165 			i = 0;
    166 		}
    167 
    168 		if (datalen < sizeof(x->data) - 1) {
    169 			x->data[datalen++] = c;
    170 		} else {
    171 			x->data[datalen] = '\0';
    172 			if (x->xmlcdata)
    173 				x->xmlcdata(x, x->data, datalen);
    174 			x->data[0] = c;
    175 			datalen = 1;
    176 		}
    177 	}
    178 }
    179 
    180 static int
    181 codepointtoutf8(long r, char *s)
    182 {
    183 	if (r == 0) {
    184 		return 0; /* NUL byte */
    185 	} else if (r <= 0x7F) {
    186 		/* 1 byte: 0aaaaaaa */
    187 		s[0] = r;
    188 		return 1;
    189 	} else if (r <= 0x07FF) {
    190 		/* 2 bytes: 00000aaa aabbbbbb */
    191 		s[0] = 0xC0 | ((r & 0x0007C0) >>  6); /* 110aaaaa */
    192 		s[1] = 0x80 |  (r & 0x00003F);        /* 10bbbbbb */
    193 		return 2;
    194 	} else if (r <= 0xFFFF) {
    195 		/* 3 bytes: aaaabbbb bbcccccc */
    196 		s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
    197 		s[1] = 0x80 | ((r & 0x000FC0) >>  6); /* 10bbbbbb */
    198 		s[2] = 0x80 |  (r & 0x00003F);        /* 10cccccc */
    199 		return 3;
    200 	} else {
    201 		/* 4 bytes: 000aaabb bbbbcccc ccdddddd */
    202 		s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
    203 		s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
    204 		s[2] = 0x80 | ((r & 0x000FC0) >>  6); /* 10cccccc */
    205 		s[3] = 0x80 |  (r & 0x00003F);        /* 10dddddd */
    206 		return 4;
    207 	}
    208 }
    209 
    210 static int
    211 namedentitytostr(const char *e, char *buf, size_t bufsiz)
    212 {
    213 	static const struct {
    214 		const char *entity;
    215 		int c;
    216 	} entities[] = {
    217 		{ "amp;",  '&'  },
    218 		{ "lt;",   '<'  },
    219 		{ "gt;",   '>'  },
    220 		{ "apos;", '\'' },
    221 		{ "quot;", '"'  },
    222 	};
    223 	size_t i;
    224 
    225 	/* buffer is too small */
    226 	if (bufsiz < 2)
    227 		return -1;
    228 
    229 	for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
    230 		if (!strcmp(e, entities[i].entity)) {
    231 			buf[0] = entities[i].c;
    232 			buf[1] = '\0';
    233 			return 1;
    234 		}
    235 	}
    236 	return -1;
    237 }
    238 
    239 static int
    240 numericentitytostr(const char *e, char *buf, size_t bufsiz)
    241 {
    242 	long l;
    243 	int len;
    244 	char *end;
    245 
    246 	/* buffer is too small */
    247 	if (bufsiz < 5)
    248 		return -1;
    249 
    250 	errno = 0;
    251 	/* hex (16) or decimal (10) */
    252 	if (*e == 'x')
    253 		l = strtol(++e, &end, 16);
    254 	else
    255 		l = strtol(e, &end, 10);
    256 	/* invalid value or not a well-formed entity or invalid code point */
    257 	if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff ||
    258 	    (l >= 0xd800 && l <= 0xdfff))
    259 		return -1;
    260 	len = codepointtoutf8(l, buf);
    261 	buf[len] = '\0';
    262 
    263 	return len;
    264 }
    265 
    266 /* convert named- or numeric entity string to buffer string
    267  * returns byte-length of string or -1 on failure. */
    268 int
    269 xml_entitytostr(const char *e, char *buf, size_t bufsiz)
    270 {
    271 	/* doesn't start with & */
    272 	if (e[0] != '&')
    273 		return -1;
    274 	/* numeric entity */
    275 	if (e[1] == '#')
    276 		return numericentitytostr(e + 2, buf, bufsiz);
    277 	else /* named entity */
    278 		return namedentitytostr(e + 1, buf, bufsiz);
    279 }
    280 
    281 void
    282 xml_parse(XMLParser *x)
    283 {
    284 	size_t datalen, tagdatalen;
    285 	int c, isend;
    286 
    287 	while ((c = GETNEXT()) != EOF && c != '<')
    288 		; /* skip until < */
    289 
    290 	while (c != EOF) {
    291 		if (c == '<') { /* parse tag */
    292 			if ((c = GETNEXT()) == EOF)
    293 				return;
    294 
    295 			if (c == '!') { /* CDATA and comments */
    296 				for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
    297 					/* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
    298 					if (tagdatalen <= sizeof("[CDATA[") - 1)
    299 						x->data[tagdatalen++] = c;
    300 					if (c == '>')
    301 						break;
    302 					else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
    303 							(x->data[0] == '-')) {
    304 						xml_parsecomment(x);
    305 						break;
    306 					} else if (c == '[') {
    307 						if (tagdatalen == sizeof("[CDATA[") - 1 &&
    308 						    !strncmp(x->data, "[CDATA[", tagdatalen)) {
    309 							xml_parsecdata(x);
    310 							break;
    311 						}
    312 					}
    313 				}
    314 			} else {
    315 				/* normal tag (open, short open, close), processing instruction. */
    316 				x->tag[0] = c;
    317 				x->taglen = 1;
    318 				x->isshorttag = isend = 0;
    319 
    320 				/* treat processing instruction as shorttag, don't strip "?" prefix. */
    321 				if (c == '?') {
    322 					x->isshorttag = 1;
    323 				} else if (c == '/') {
    324 					if ((c = GETNEXT()) == EOF)
    325 						return;
    326 					x->tag[0] = c;
    327 					isend = 1;
    328 				}
    329 
    330 				while ((c = GETNEXT()) != EOF) {
    331 					if (c == '/')
    332 						x->isshorttag = 1; /* short tag */
    333 					else if (c == '>' || ISSPACE(c)) {
    334 						x->tag[x->taglen] = '\0';
    335 						if (isend) { /* end tag, starts with </ */
    336 							if (x->xmltagend)
    337 								x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
    338 							x->tag[0] = '\0';
    339 							x->taglen = 0;
    340 						} else {
    341 							/* start tag */
    342 							if (x->xmltagstart)
    343 								x->xmltagstart(x, x->tag, x->taglen);
    344 							if (ISSPACE(c))
    345 								xml_parseattrs(x);
    346 							if (x->xmltagstartparsed)
    347 								x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
    348 						}
    349 						/* call tagend for shortform or processing instruction */
    350 						if (x->isshorttag) {
    351 							if (x->xmltagend)
    352 								x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
    353 							x->tag[0] = '\0';
    354 							x->taglen = 0;
    355 						}
    356 						break;
    357 					} else if (x->taglen < sizeof(x->tag) - 1)
    358 						x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
    359 				}
    360 			}
    361 		} else {
    362 			/* parse tag data */
    363 			datalen = 0;
    364 			while ((c = GETNEXT()) != EOF) {
    365 				if (c == '&') {
    366 					if (datalen) {
    367 						x->data[datalen] = '\0';
    368 						if (x->xmldata)
    369 							x->xmldata(x, x->data, datalen);
    370 					}
    371 					x->data[0] = c;
    372 					datalen = 1;
    373 					while ((c = GETNEXT()) != EOF) {
    374 						if (c == '<')
    375 							break;
    376 						if (datalen < sizeof(x->data) - 1)
    377 							x->data[datalen++] = c;
    378 						else {
    379 							/* entity too long for buffer, handle as normal data */
    380 							x->data[datalen] = '\0';
    381 							if (x->xmldata)
    382 								x->xmldata(x, x->data, datalen);
    383 							x->data[0] = c;
    384 							datalen = 1;
    385 							break;
    386 						}
    387 						if (c == ';') {
    388 							x->data[datalen] = '\0';
    389 							if (x->xmldataentity)
    390 								x->xmldataentity(x, x->data, datalen);
    391 							datalen = 0;
    392 							break;
    393 						}
    394 					}
    395 				} else if (c != '<') {
    396 					if (datalen < sizeof(x->data) - 1) {
    397 						x->data[datalen++] = c;
    398 					} else {
    399 						x->data[datalen] = '\0';
    400 						if (x->xmldata)
    401 							x->xmldata(x, x->data, datalen);
    402 						x->data[0] = c;
    403 						datalen = 1;
    404 					}
    405 				}
    406 				if (c == '<') {
    407 					x->data[datalen] = '\0';
    408 					if (x->xmldata && datalen)
    409 						x->xmldata(x, x->data, datalen);
    410 					break;
    411 				}
    412 			}
    413 		}
    414 	}
    415 }