commit 8262579564087b850fd05ed2fb72bfbdd7c9982a
parent 77606ed0b26aa364dd53b1b9689ff3fd8f0489f3
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Wed, 22 Aug 2018 16:08:01 +0200
xml: improve parsing of invalid attribute values separated by whitespace
It is invalid XML, but this allows parsing old HTML pages as well.
For example:
<input id=cb checked type="checkbox" title='checkbox' />
or
<FONT FACE=wingdings SIZE=12><BLINK>oh hai</BLINK></FONT>
Diffstat:
M | xml.c | | | 34 | ++++++++++++++++++++++------------ |
1 file changed, 22 insertions(+), 12 deletions(-)
diff --git a/xml.c b/xml.c
@@ -14,19 +14,20 @@ static void
xml_parseattrs(XMLParser *x)
{
size_t namelen = 0, valuelen;
- int c, endsep, endname = 0;
+ int c, endsep, endname = 0, valuestart = 0;
while ((c = x->getnext()) != EOF) {
- if (isspace(c)) { /* TODO: simplify endname ? */
+ if (isspace(c)) {
if (namelen)
endname = 1;
continue;
- }
- if (c == '?')
+ } else if (c == '?')
; /* ignore */
else if (c == '=') {
x->name[namelen] = '\0';
- } else if (namelen && ((endname && isalpha(c)) || (c == '>' || c == '/'))) {
+ valuestart = 1;
+ endname = 1;
+ } else if (namelen && ((endname && !valuestart && isalpha(c)) || (c == '>' || c == '/'))) {
/* attribute without value */
x->name[namelen] = '\0';
if (x->xmlattrstart)
@@ -38,12 +39,21 @@ xml_parseattrs(XMLParser *x)
endname = 0;
x->name[0] = c;
namelen = 1;
- } else if (namelen && (c == '\'' || c == '"')) {
+ } else if (namelen && valuestart) {
/* attribute with value */
- endsep = c; /* c is end separator */
if (x->xmlattrstart)
x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
- for (valuelen = 0; (c = x->getnext()) != EOF;) {
+
+ valuelen = 0;
+ if (c == '\'' || c == '"') {
+ endsep = c;
+ } else {
+ endsep = ' '; /* isspace() */
+ goto startvalue;
+ }
+
+ while ((c = x->getnext()) != EOF) {
+startvalue:
if (c == '&') { /* entities */
x->data[valuelen] = '\0';
/* call data function with data before entity if there is data */
@@ -52,7 +62,7 @@ xml_parseattrs(XMLParser *x)
x->data[0] = c;
valuelen = 1;
while ((c = x->getnext()) != EOF) {
- if (c == endsep)
+ if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c))))
break;
if (valuelen < sizeof(x->data) - 1)
x->data[valuelen++] = c;
@@ -73,7 +83,7 @@ xml_parseattrs(XMLParser *x)
break;
}
}
- } else if (c != endsep) {
+ } else if (c != endsep && !(endsep == ' ' && (c == '>' || isspace(c)))) {
if (valuelen < sizeof(x->data) - 1) {
x->data[valuelen++] = c;
} else {
@@ -84,7 +94,7 @@ xml_parseattrs(XMLParser *x)
valuelen = 1;
}
}
- if (c == endsep) {
+ if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) {
x->data[valuelen] = '\0';
if (x->xmlattr)
x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
@@ -93,7 +103,7 @@ xml_parseattrs(XMLParser *x)
break;
}
}
- namelen = endname = 0;
+ namelen = endname = valuestart = 0;
} else if (namelen < sizeof(x->name) - 1) {
x->name[namelen++] = c;
}