sfeed

simple feed reader - forked from https://git.codemadness.org/sfeed
git clone git://src.gearsix.net/sfeed | sfeed.zip
Log | Files | Refs | Atom | README | LICENSE

xml.c (raw) (10011B)


   1 #include <errno.h>
   2 #include <stdio.h>
   3 #include <stdlib.h>
   4 #include <string.h>
   5 
   6 #include "xml.h"
   7 
   8 #define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26)
   9 #define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5))
  10 
/* Parse the attributes of a tag.
 * Reads characters with GETNEXT() until a '>' is read outside an attribute
 * value or until EOF. Each attribute is reported through the xmlattrstart,
 * xmlattr, xmlattrentity and xmlattrend callbacks of x; a callback is only
 * called when it is set.
 * The attribute name is collected in x->name and the value in x->data:
 * names longer than sizeof(x->name) - 1 are truncated, values longer than
 * sizeof(x->data) - 1 are passed to xmlattr in several chunks.
 * x->isshorttag is set when a '/' reaches the check at the end of the main
 * loop (i.e. it was not consumed as part of an attribute value). */
static void
xml_parseattrs(XMLParser *x)
{
	/* namelen: bytes stored in x->name, valuelen: bytes stored in x->data */
	size_t namelen = 0, valuelen;
	/* endsep: character that ends the current value.
	 * endname: whitespace after a non-empty name, or '=', was seen.
	 * valuestart: '=' was seen, a value is expected next. */
	int c, endsep, endname = 0, valuestart = 0;

	while ((c = GETNEXT()) != EOF) {
		if (ISSPACE(c)) {
			if (namelen)
				endname = 1;
			continue;
		} else if (c == '?')
			; /* ignore */
		else if (c == '=') {
			x->name[namelen] = '\0';
			valuestart = 1;
			endname = 1;
		} else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
			/* attribute without value: the name was followed by the
			 * start of another name, or by the end of the tag */
			x->name[namelen] = '\0';
			if (x->xmlattrstart)
				x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
			if (x->xmlattr)
				x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
			if (x->xmlattrend)
				x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
			endname = 0;
			/* c is kept as the first byte of the next name; for '>'
			 * and '/' the check at the end of the loop takes over */
			x->name[0] = c;
			namelen = 1;
		} else if (namelen && valuestart) {
			/* attribute with value */
			if (x->xmlattrstart)
				x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);

			valuelen = 0;
			if (c == '\'' || c == '"') {
				/* quoted value: ends at the matching quote */
				endsep = c;
			} else {
				/* unquoted value: ends at whitespace or '>'. c is already
				 * the first byte of the value, so enter the loop below
				 * without reading another character. */
				endsep = ' '; /* ISSPACE() */
				goto startvalue;
			}

			while ((c = GETNEXT()) != EOF) {
startvalue:
				if (c == '&') { /* entities */
					x->data[valuelen] = '\0';
					/* call data function with data before entity if there is data */
					if (valuelen && x->xmlattr)
						x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
					/* collect the entity, including '&' and ';', in x->data */
					x->data[0] = c;
					valuelen = 1;
					while ((c = GETNEXT()) != EOF) {
						/* value ended before the entity was terminated: the
						 * collected bytes are reported as data by the
						 * end-of-value check below */
						if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
							break;
						if (valuelen < sizeof(x->data) - 1)
							x->data[valuelen++] = c;
						else {
							/* entity too long for buffer, handle as normal data */
							x->data[valuelen] = '\0';
							if (x->xmlattr)
								x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
							x->data[0] = c;
							valuelen = 1;
							break;
						}
						if (c == ';') {
							/* complete entity: report it through xmlattrentity */
							x->data[valuelen] = '\0';
							if (x->xmlattrentity)
								x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
							valuelen = 0;
							break;
						}
					}
				} else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
					/* regular value byte */
					if (valuelen < sizeof(x->data) - 1) {
						x->data[valuelen++] = c;
					} else {
						/* buffer full: report this chunk and start a new one */
						x->data[valuelen] = '\0';
						if (x->xmlattr)
							x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
						x->data[0] = c;
						valuelen = 1;
					}
				}
				/* end of value: report the remaining data (possibly empty) */
				if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
					x->data[valuelen] = '\0';
					if (x->xmlattr)
						x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
					if (x->xmlattrend)
						x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
					break;
				}
			}
			/* reset the state for the next attribute */
			namelen = endname = valuestart = 0;
		} else if (namelen < sizeof(x->name) - 1) {
			/* part of the attribute name; excess bytes are dropped */
			x->name[namelen++] = c;
		}
		if (c == '>') {
			/* end of the tag */
			break;
		} else if (c == '/') {
			/* short tag such as <tag attr/>: discard the '/' stored as name */
			x->isshorttag = 1;
			x->name[0] = '\0';
			namelen = 0;
		}
	}
}
 117 
 118 static void
 119 xml_parsecomment(XMLParser *x)
 120 {
 121 	int c, i = 0;
 122 
 123 	while ((c = GETNEXT()) != EOF) {
 124 		if (c == '-') {
 125 			if (++i > 2)
 126 				i = 2;
 127 			continue;
 128 		} else if (c == '>' && i == 2) {
 129 			return;
 130 		} else if (i) {
 131 			i = 0;
 132 		}
 133 	}
 134 }
 135 
 136 static void
 137 xml_parsecdata(XMLParser *x)
 138 {
 139 	size_t datalen = 0, i = 0;
 140 	int c;
 141 
 142 	while ((c = GETNEXT()) != EOF) {
 143 		if (c == ']' || c == '>') {
 144 			if (x->xmlcdata && datalen) {
 145 				x->data[datalen] = '\0';
 146 				x->xmlcdata(x, x->data, datalen);
 147 				datalen = 0;
 148 			}
 149 		}
 150 
 151 		if (c == ']') {
 152 			if (++i > 2) {
 153 				if (x->xmlcdata)
 154 					for (; i > 2; i--)
 155 						x->xmlcdata(x, "]", 1);
 156 				i = 2;
 157 			}
 158 			continue;
 159 		} else if (c == '>' && i == 2) {
 160 			return;
 161 		} else if (i) {
 162 			if (x->xmlcdata)
 163 				for (; i > 0; i--)
 164 					x->xmlcdata(x, "]", 1);
 165 			i = 0;
 166 		}
 167 
 168 		if (datalen < sizeof(x->data) - 1) {
 169 			x->data[datalen++] = c;
 170 		} else {
 171 			x->data[datalen] = '\0';
 172 			if (x->xmlcdata)
 173 				x->xmlcdata(x, x->data, datalen);
 174 			x->data[0] = c;
 175 			datalen = 1;
 176 		}
 177 	}
 178 }
 179 
 180 static int
 181 codepointtoutf8(long r, char *s)
 182 {
 183 	if (r == 0) {
 184 		return 0; /* NUL byte */
 185 	} else if (r <= 0x7F) {
 186 		/* 1 byte: 0aaaaaaa */
 187 		s[0] = r;
 188 		return 1;
 189 	} else if (r <= 0x07FF) {
 190 		/* 2 bytes: 00000aaa aabbbbbb */
 191 		s[0] = 0xC0 | ((r & 0x0007C0) >>  6); /* 110aaaaa */
 192 		s[1] = 0x80 |  (r & 0x00003F);        /* 10bbbbbb */
 193 		return 2;
 194 	} else if (r <= 0xFFFF) {
 195 		/* 3 bytes: aaaabbbb bbcccccc */
 196 		s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
 197 		s[1] = 0x80 | ((r & 0x000FC0) >>  6); /* 10bbbbbb */
 198 		s[2] = 0x80 |  (r & 0x00003F);        /* 10cccccc */
 199 		return 3;
 200 	} else {
 201 		/* 4 bytes: 000aaabb bbbbcccc ccdddddd */
 202 		s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
 203 		s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
 204 		s[2] = 0x80 | ((r & 0x000FC0) >>  6); /* 10cccccc */
 205 		s[3] = 0x80 |  (r & 0x00003F);        /* 10dddddd */
 206 		return 4;
 207 	}
 208 }
 209 
 210 static int
 211 namedentitytostr(const char *e, char *buf, size_t bufsiz)
 212 {
 213 	static const struct {
 214 		const char *entity;
 215 		int c;
 216 	} entities[] = {
 217 		{ "amp;",  '&'  },
 218 		{ "lt;",   '<'  },
 219 		{ "gt;",   '>'  },
 220 		{ "apos;", '\'' },
 221 		{ "quot;", '"'  },
 222 	};
 223 	size_t i;
 224 
 225 	/* buffer is too small */
 226 	if (bufsiz < 2)
 227 		return -1;
 228 
 229 	for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
 230 		if (!strcmp(e, entities[i].entity)) {
 231 			buf[0] = entities[i].c;
 232 			buf[1] = '\0';
 233 			return 1;
 234 		}
 235 	}
 236 	return -1;
 237 }
 238 
 239 static int
 240 numericentitytostr(const char *e, char *buf, size_t bufsiz)
 241 {
 242 	long l;
 243 	int len;
 244 	char *end;
 245 
 246 	/* buffer is too small */
 247 	if (bufsiz < 5)
 248 		return -1;
 249 
 250 	errno = 0;
 251 	/* hex (16) or decimal (10) */
 252 	if (*e == 'x')
 253 		l = strtol(++e, &end, 16);
 254 	else
 255 		l = strtol(e, &end, 10);
 256 	/* invalid value or not a well-formed entity or invalid code point */
 257 	if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff ||
 258 	    (l >= 0xd800 && l <= 0xdfff))
 259 		return -1;
 260 	len = codepointtoutf8(l, buf);
 261 	buf[len] = '\0';
 262 
 263 	return len;
 264 }
 265 
 266 /* convert named- or numeric entity string to buffer string
 267  * returns byte-length of string or -1 on failure. */
 268 int
 269 xml_entitytostr(const char *e, char *buf, size_t bufsiz)
 270 {
 271 	/* doesn't start with & */
 272 	if (e[0] != '&')
 273 		return -1;
 274 	/* numeric entity */
 275 	if (e[1] == '#')
 276 		return numericentitytostr(e + 2, buf, bufsiz);
 277 	else /* named entity */
 278 		return namedentitytostr(e + 1, buf, bufsiz);
 279 }
 280 
/* Parse the input read with GETNEXT() until EOF and report its parts through
 * the callbacks of x; a callback is only called when it is set.
 * - Everything before the first '<' is skipped.
 * - Comments are skipped, CDATA sections are handled by xml_parsecdata(),
 *   any other "<!...>" construct is skipped up to the first '>'.
 * - Start tags call xmltagstart, then their attributes are parsed with
 *   xml_parseattrs(), then xmltagstartparsed is called. End tags, short
 *   tags and processing instructions call xmltagend.
 * - Text between tags is passed to xmldata, entities in it ('&' up to and
 *   including ';') to xmldataentity. Text longer than sizeof(x->data) - 1
 *   is delivered in several chunks.
 * Tag names longer than sizeof(x->tag) - 1 are truncated. */
void
xml_parse(XMLParser *x)
{
	/* datalen: bytes of text in x->data, tagdatalen: bytes of a "<!" prefix */
	size_t datalen, tagdatalen;
	/* isend: the current tag is an end tag ("</...") */
	int c, isend;

	while ((c = GETNEXT()) != EOF && c != '<')
		; /* skip until < */

	while (c != EOF) {
		if (c == '<') { /* parse tag */
			if ((c = GETNEXT()) == EOF)
				return;

			if (c == '!') { /* CDATA and comments */
				/* collect the first bytes after "<!" in x->data to tell
				 * "--" and "[CDATA[" apart from other constructs */
				for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
					/* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
					if (tagdatalen <= sizeof("[CDATA[") - 1)
						x->data[tagdatalen++] = c;
					if (c == '>')
						break;
					else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
							(x->data[0] == '-')) {
						/* "<!--": skip the comment */
						xml_parsecomment(x);
						break;
					} else if (c == '[') {
						if (tagdatalen == sizeof("[CDATA[") - 1 &&
						    !strncmp(x->data, "[CDATA[", tagdatalen)) {
							xml_parsecdata(x);
							break;
						}
					}
				}
			} else {
				/* normal tag (open, short open, close), processing instruction. */
				x->tag[0] = c;
				x->taglen = 1;
				x->isshorttag = isend = 0;

				/* treat processing instruction as shorttag, don't strip "?" prefix. */
				if (c == '?') {
					x->isshorttag = 1;
				} else if (c == '/') {
					/* end tag: the name starts after the '/' */
					if ((c = GETNEXT()) == EOF)
						return;
					x->tag[0] = c;
					isend = 1;
				}

				/* read the rest of the tag name, up to whitespace or '>' */
				while ((c = GETNEXT()) != EOF) {
					if (c == '/')
						x->isshorttag = 1; /* short tag */
					else if (c == '>' || ISSPACE(c)) {
						x->tag[x->taglen] = '\0';
						if (isend) { /* end tag, starts with </ */
							if (x->xmltagend)
								x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
							x->tag[0] = '\0';
							x->taglen = 0;
						} else {
							/* start tag */
							if (x->xmltagstart)
								x->xmltagstart(x, x->tag, x->taglen);
							/* attributes can only follow when the name ended
							 * with whitespace; xml_parseattrs() reads up to
							 * the closing '>' */
							if (ISSPACE(c))
								xml_parseattrs(x);
							if (x->xmltagstartparsed)
								x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
						}
						/* call tagend for shortform or processing instruction */
						if (x->isshorttag) {
							if (x->xmltagend)
								x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
							x->tag[0] = '\0';
							x->taglen = 0;
						}
						break;
					} else if (x->taglen < sizeof(x->tag) - 1)
						x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
				}
			}
		} else {
			/* parse tag data */
			datalen = 0;
			while ((c = GETNEXT()) != EOF) {
				if (c == '&') {
					/* deliver the text collected before the entity */
					if (datalen) {
						x->data[datalen] = '\0';
						if (x->xmldata)
							x->xmldata(x, x->data, datalen);
					}
					/* collect the entity, including '&' and ';', in x->data */
					x->data[0] = c;
					datalen = 1;
					while ((c = GETNEXT()) != EOF) {
						/* a tag starts before the entity was terminated: the
						 * collected bytes are delivered as text below */
						if (c == '<')
							break;
						if (datalen < sizeof(x->data) - 1)
							x->data[datalen++] = c;
						else {
							/* entity too long for buffer, handle as normal data */
							x->data[datalen] = '\0';
							if (x->xmldata)
								x->xmldata(x, x->data, datalen);
							x->data[0] = c;
							datalen = 1;
							break;
						}
						if (c == ';') {
							/* complete entity: report it through xmldataentity */
							x->data[datalen] = '\0';
							if (x->xmldataentity)
								x->xmldataentity(x, x->data, datalen);
							datalen = 0;
							break;
						}
					}
				} else if (c != '<') {
					/* regular text byte */
					if (datalen < sizeof(x->data) - 1) {
						x->data[datalen++] = c;
					} else {
						/* buffer full: deliver this chunk and start a new one */
						x->data[datalen] = '\0';
						if (x->xmldata)
							x->xmldata(x, x->data, datalen);
						x->data[0] = c;
						datalen = 1;
					}
				}
				/* start of the next tag: deliver the remaining text and
				 * let the outer loop parse the tag */
				if (c == '<') {
					x->data[datalen] = '\0';
					if (x->xmldata && datalen)
						x->xmldata(x, x->data, datalen);
					break;
				}
			}
		}
	}
}