/* xml.c */
1 #include <errno.h> 2 #include <stdio.h> 3 #include <stdlib.h> 4 #include <string.h> 5 6 #include "xml.h" 7 8 #define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26) 9 #define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5)) 10 11 static void 12 xml_parseattrs(XMLParser *x) 13 { 14 size_t namelen = 0, valuelen; 15 int c, endsep, endname = 0, valuestart = 0; 16 17 while ((c = GETNEXT()) != EOF) { 18 if (ISSPACE(c)) { 19 if (namelen) 20 endname = 1; 21 continue; 22 } else if (c == '?') 23 ; /* ignore */ 24 else if (c == '=') { 25 x->name[namelen] = '\0'; 26 valuestart = 1; 27 endname = 1; 28 } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) { 29 /* attribute without value */ 30 x->name[namelen] = '\0'; 31 if (x->xmlattrstart) 32 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen); 33 if (x->xmlattr) 34 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0); 35 if (x->xmlattrend) 36 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen); 37 endname = 0; 38 x->name[0] = c; 39 namelen = 1; 40 } else if (namelen && valuestart) { 41 /* attribute with value */ 42 if (x->xmlattrstart) 43 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen); 44 45 valuelen = 0; 46 if (c == '\'' || c == '"') { 47 endsep = c; 48 } else { 49 endsep = ' '; /* ISSPACE() */ 50 goto startvalue; 51 } 52 53 while ((c = GETNEXT()) != EOF) { 54 startvalue: 55 if (c == '&') { /* entities */ 56 x->data[valuelen] = '\0'; 57 /* call data function with data before entity if there is data */ 58 if (valuelen && x->xmlattr) 59 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); 60 x->data[0] = c; 61 valuelen = 1; 62 while ((c = GETNEXT()) != EOF) { 63 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) 64 break; 65 if (valuelen < sizeof(x->data) - 1) 66 x->data[valuelen++] = c; 67 else { 68 /* entity too long for buffer, handle as normal data */ 69 x->data[valuelen] = '\0'; 70 if (x->xmlattr) 71 x->xmlattr(x, x->tag, 
x->taglen, x->name, namelen, x->data, valuelen); 72 x->data[0] = c; 73 valuelen = 1; 74 break; 75 } 76 if (c == ';') { 77 x->data[valuelen] = '\0'; 78 if (x->xmlattrentity) 79 x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); 80 valuelen = 0; 81 break; 82 } 83 } 84 } else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) { 85 if (valuelen < sizeof(x->data) - 1) { 86 x->data[valuelen++] = c; 87 } else { 88 x->data[valuelen] = '\0'; 89 if (x->xmlattr) 90 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); 91 x->data[0] = c; 92 valuelen = 1; 93 } 94 } 95 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) { 96 x->data[valuelen] = '\0'; 97 if (x->xmlattr) 98 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); 99 if (x->xmlattrend) 100 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen); 101 break; 102 } 103 } 104 namelen = endname = valuestart = 0; 105 } else if (namelen < sizeof(x->name) - 1) { 106 x->name[namelen++] = c; 107 } 108 if (c == '>') { 109 break; 110 } else if (c == '/') { 111 x->isshorttag = 1; 112 x->name[0] = '\0'; 113 namelen = 0; 114 } 115 } 116 } 117 118 static void 119 xml_parsecomment(XMLParser *x) 120 { 121 int c, i = 0; 122 123 while ((c = GETNEXT()) != EOF) { 124 if (c == '-') { 125 if (++i > 2) 126 i = 2; 127 continue; 128 } else if (c == '>' && i == 2) { 129 return; 130 } else if (i) { 131 i = 0; 132 } 133 } 134 } 135 136 static void 137 xml_parsecdata(XMLParser *x) 138 { 139 size_t datalen = 0, i = 0; 140 int c; 141 142 while ((c = GETNEXT()) != EOF) { 143 if (c == ']' || c == '>') { 144 if (x->xmlcdata && datalen) { 145 x->data[datalen] = '\0'; 146 x->xmlcdata(x, x->data, datalen); 147 datalen = 0; 148 } 149 } 150 151 if (c == ']') { 152 if (++i > 2) { 153 if (x->xmlcdata) 154 for (; i > 2; i--) 155 x->xmlcdata(x, "]", 1); 156 i = 2; 157 } 158 continue; 159 } else if (c == '>' && i == 2) { 160 return; 161 } else if (i) { 162 if (x->xmlcdata) 163 for 
(; i > 0; i--) 164 x->xmlcdata(x, "]", 1); 165 i = 0; 166 } 167 168 if (datalen < sizeof(x->data) - 1) { 169 x->data[datalen++] = c; 170 } else { 171 x->data[datalen] = '\0'; 172 if (x->xmlcdata) 173 x->xmlcdata(x, x->data, datalen); 174 x->data[0] = c; 175 datalen = 1; 176 } 177 } 178 } 179 180 static int 181 codepointtoutf8(long r, char *s) 182 { 183 if (r == 0) { 184 return 0; /* NUL byte */ 185 } else if (r <= 0x7F) { 186 /* 1 byte: 0aaaaaaa */ 187 s[0] = r; 188 return 1; 189 } else if (r <= 0x07FF) { 190 /* 2 bytes: 00000aaa aabbbbbb */ 191 s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */ 192 s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */ 193 return 2; 194 } else if (r <= 0xFFFF) { 195 /* 3 bytes: aaaabbbb bbcccccc */ 196 s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */ 197 s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */ 198 s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */ 199 return 3; 200 } else { 201 /* 4 bytes: 000aaabb bbbbcccc ccdddddd */ 202 s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */ 203 s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */ 204 s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */ 205 s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */ 206 return 4; 207 } 208 } 209 210 static int 211 namedentitytostr(const char *e, char *buf, size_t bufsiz) 212 { 213 static const struct { 214 const char *entity; 215 int c; 216 } entities[] = { 217 { "amp;", '&' }, 218 { "lt;", '<' }, 219 { "gt;", '>' }, 220 { "apos;", '\'' }, 221 { "quot;", '"' }, 222 }; 223 size_t i; 224 225 /* buffer is too small */ 226 if (bufsiz < 2) 227 return -1; 228 229 for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) { 230 if (!strcmp(e, entities[i].entity)) { 231 buf[0] = entities[i].c; 232 buf[1] = '\0'; 233 return 1; 234 } 235 } 236 return -1; 237 } 238 239 static int 240 numericentitytostr(const char *e, char *buf, size_t bufsiz) 241 { 242 long l; 243 int len; 244 char *end; 245 246 /* buffer is too small */ 247 if (bufsiz < 5) 248 return -1; 249 250 errno = 
0; 251 /* hex (16) or decimal (10) */ 252 if (*e == 'x') 253 l = strtol(++e, &end, 16); 254 else 255 l = strtol(e, &end, 10); 256 /* invalid value or not a well-formed entity or invalid code point */ 257 if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff || 258 (l >= 0xd800 && l <= 0xdfff)) 259 return -1; 260 len = codepointtoutf8(l, buf); 261 buf[len] = '\0'; 262 263 return len; 264 } 265 266 /* convert named- or numeric entity string to buffer string 267 * returns byte-length of string or -1 on failure. */ 268 int 269 xml_entitytostr(const char *e, char *buf, size_t bufsiz) 270 { 271 /* doesn't start with & */ 272 if (e[0] != '&') 273 return -1; 274 /* numeric entity */ 275 if (e[1] == '#') 276 return numericentitytostr(e + 2, buf, bufsiz); 277 else /* named entity */ 278 return namedentitytostr(e + 1, buf, bufsiz); 279 } 280 281 void 282 xml_parse(XMLParser *x) 283 { 284 size_t datalen, tagdatalen; 285 int c, isend; 286 287 while ((c = GETNEXT()) != EOF && c != '<') 288 ; /* skip until < */ 289 290 while (c != EOF) { 291 if (c == '<') { /* parse tag */ 292 if ((c = GETNEXT()) == EOF) 293 return; 294 295 if (c == '!') { /* CDATA and comments */ 296 for (tagdatalen = 0; (c = GETNEXT()) != EOF;) { 297 /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */ 298 if (tagdatalen <= sizeof("[CDATA[") - 1) 299 x->data[tagdatalen++] = c; 300 if (c == '>') 301 break; 302 else if (c == '-' && tagdatalen == sizeof("--") - 1 && 303 (x->data[0] == '-')) { 304 xml_parsecomment(x); 305 break; 306 } else if (c == '[') { 307 if (tagdatalen == sizeof("[CDATA[") - 1 && 308 !strncmp(x->data, "[CDATA[", tagdatalen)) { 309 xml_parsecdata(x); 310 break; 311 } 312 } 313 } 314 } else { 315 /* normal tag (open, short open, close), processing instruction. */ 316 x->tag[0] = c; 317 x->taglen = 1; 318 x->isshorttag = isend = 0; 319 320 /* treat processing instruction as shorttag, don't strip "?" prefix. 
*/ 321 if (c == '?') { 322 x->isshorttag = 1; 323 } else if (c == '/') { 324 if ((c = GETNEXT()) == EOF) 325 return; 326 x->tag[0] = c; 327 isend = 1; 328 } 329 330 while ((c = GETNEXT()) != EOF) { 331 if (c == '/') 332 x->isshorttag = 1; /* short tag */ 333 else if (c == '>' || ISSPACE(c)) { 334 x->tag[x->taglen] = '\0'; 335 if (isend) { /* end tag, starts with </ */ 336 if (x->xmltagend) 337 x->xmltagend(x, x->tag, x->taglen, x->isshorttag); 338 x->tag[0] = '\0'; 339 x->taglen = 0; 340 } else { 341 /* start tag */ 342 if (x->xmltagstart) 343 x->xmltagstart(x, x->tag, x->taglen); 344 if (ISSPACE(c)) 345 xml_parseattrs(x); 346 if (x->xmltagstartparsed) 347 x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag); 348 } 349 /* call tagend for shortform or processing instruction */ 350 if (x->isshorttag) { 351 if (x->xmltagend) 352 x->xmltagend(x, x->tag, x->taglen, x->isshorttag); 353 x->tag[0] = '\0'; 354 x->taglen = 0; 355 } 356 break; 357 } else if (x->taglen < sizeof(x->tag) - 1) 358 x->tag[x->taglen++] = c; /* NOTE: tag name truncation */ 359 } 360 } 361 } else { 362 /* parse tag data */ 363 datalen = 0; 364 while ((c = GETNEXT()) != EOF) { 365 if (c == '&') { 366 if (datalen) { 367 x->data[datalen] = '\0'; 368 if (x->xmldata) 369 x->xmldata(x, x->data, datalen); 370 } 371 x->data[0] = c; 372 datalen = 1; 373 while ((c = GETNEXT()) != EOF) { 374 if (c == '<') 375 break; 376 if (datalen < sizeof(x->data) - 1) 377 x->data[datalen++] = c; 378 else { 379 /* entity too long for buffer, handle as normal data */ 380 x->data[datalen] = '\0'; 381 if (x->xmldata) 382 x->xmldata(x, x->data, datalen); 383 x->data[0] = c; 384 datalen = 1; 385 break; 386 } 387 if (c == ';') { 388 x->data[datalen] = '\0'; 389 if (x->xmldataentity) 390 x->xmldataentity(x, x->data, datalen); 391 datalen = 0; 392 break; 393 } 394 } 395 } else if (c != '<') { 396 if (datalen < sizeof(x->data) - 1) { 397 x->data[datalen++] = c; 398 } else { 399 x->data[datalen] = '\0'; 400 if (x->xmldata) 401 
x->xmldata(x, x->data, datalen); 402 x->data[0] = c; 403 datalen = 1; 404 } 405 } 406 if (c == '<') { 407 x->data[datalen] = '\0'; 408 if (x->xmldata && datalen) 409 x->xmldata(x, x->data, datalen); 410 break; 411 } 412 } 413 } 414 } 415 }