json: Fix \uXXXX for surrogate pairs

The JSON parser treats each half of a surrogate pair as an unpaired
surrogate.  Fix it to recognize surrogate pairs.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-Id: <20180823164025.12553-30-armbru@redhat.com>
This commit is contained in:
Markus Armbruster 2018-08-23 18:39:56 +02:00
parent 46a628b139
commit dc45a07c36
2 changed files with 40 additions and 23 deletions

View file

@ -64,16 +64,27 @@ static void GCC_FMT_ATTR(3, 4) parse_error(JSONParserContext *ctxt,
error_setg(&ctxt->err, "JSON parse error, %s", message); error_setg(&ctxt->err, "JSON parse error, %s", message);
} }
static int hex2decimal(char ch) static int cvt4hex(const char *s)
{ {
if (ch >= '0' && ch <= '9') { int cp, i;
return (ch - '0');
} else if (ch >= 'a' && ch <= 'f') { cp = 0;
return 10 + (ch - 'a'); for (i = 0; i < 4; i++) {
} else if (ch >= 'A' && ch <= 'F') { if (!qemu_isxdigit(s[i])) {
return 10 + (ch - 'A'); return -1;
}
cp <<= 4;
if (s[i] >= '0' && s[i] <= '9') {
cp |= s[i] - '0';
} else if (s[i] >= 'a' && s[i] <= 'f') {
cp |= 10 + s[i] - 'a';
} else if (s[i] >= 'A' && s[i] <= 'F') {
cp |= 10 + s[i] - 'A';
} else {
return -1;
}
} }
abort(); return cp;
} }
/** /**
@ -115,7 +126,8 @@ static QString *parse_string(JSONParserContext *ctxt, JSONToken *token)
const char *ptr = token->str; const char *ptr = token->str;
QString *str; QString *str;
char quote; char quote;
int cp, i; const char *beg;
int cp, trailing;
char *end; char *end;
ssize_t len; ssize_t len;
char utf8_buf[5]; char utf8_buf[5];
@ -127,7 +139,7 @@ static QString *parse_string(JSONParserContext *ctxt, JSONToken *token)
while (*ptr != quote) { while (*ptr != quote) {
assert(*ptr); assert(*ptr);
if (*ptr == '\\') { if (*ptr == '\\') {
ptr++; beg = ptr++;
switch (*ptr++) { switch (*ptr++) {
case '"': case '"':
qstring_append_chr(str, '"'); qstring_append_chr(str, '"');
@ -157,22 +169,28 @@ static QString *parse_string(JSONParserContext *ctxt, JSONToken *token)
qstring_append_chr(str, '\t'); qstring_append_chr(str, '\t');
break; break;
case 'u': case 'u':
cp = 0; cp = cvt4hex(ptr);
for (i = 0; i < 4; i++) { ptr += 4;
if (!qemu_isxdigit(*ptr)) {
parse_error(ctxt, token, /* handle surrogate pairs */
"invalid hex escape sequence in string"); if (cp >= 0xD800 && cp <= 0xDBFF
goto out; && ptr[0] == '\\' && ptr[1] == 'u') {
/* leading surrogate followed by \u */
cp = 0x10000 + ((cp & 0x3FF) << 10);
trailing = cvt4hex(ptr + 2);
if (trailing >= 0xDC00 && trailing <= 0xDFFF) {
/* followed by trailing surrogate */
cp |= trailing & 0x3FF;
ptr += 6;
} else {
cp = -1; /* invalid */
} }
cp <<= 4;
cp |= hex2decimal(*ptr);
ptr++;
} }
if (mod_utf8_encode(utf8_buf, sizeof(utf8_buf), cp) < 0) { if (mod_utf8_encode(utf8_buf, sizeof(utf8_buf), cp) < 0) {
parse_error(ctxt, token, parse_error(ctxt, token,
"\\u%.4s is not a valid Unicode character", "%.*s is not a valid Unicode character",
ptr - 3); (int)(ptr - beg), beg);
goto out; goto out;
} }
qstring_append(str, utf8_buf); qstring_append(str, utf8_buf);

View file

@ -63,8 +63,7 @@ static void escaped_string(void)
{ "double byte utf-8 \\u00A2", "double byte utf-8 \xc2\xa2" }, { "double byte utf-8 \\u00A2", "double byte utf-8 \xc2\xa2" },
{ "triple byte utf-8 \\u20AC", "triple byte utf-8 \xe2\x82\xac" }, { "triple byte utf-8 \\u20AC", "triple byte utf-8 \xe2\x82\xac" },
{ "quadruple byte utf-8 \\uD834\\uDD1E", /* U+1D11E */ { "quadruple byte utf-8 \\uD834\\uDD1E", /* U+1D11E */
/* bug: want \xF0\x9D\x84\x9E */ "quadruple byte utf-8 \xF0\x9D\x84\x9E" },
NULL },
{ "\\", NULL }, { "\\", NULL },
{ "\\z", NULL }, { "\\z", NULL },
{ "\\ux", NULL }, { "\\ux", NULL },