json: Reject invalid UTF-8 sequences

We reject bytes that can't occur in valid UTF-8 (\xC0..\xC1, \xF5..\xFF) in the lexer. That's insufficient; there's plenty of invalid UTF-8 not containing these bytes, as demonstrated by check-qjson: * Malformed sequences - Unexpected continuation bytes - Missing continuation bytes after start bytes other than \xC0..\xC1, \xF5..\xFD. * Overlong sequences with start bytes other than \xC0..\xC1, \xF5..\xFD. * Invalid code points Fixing this in the lexer would be bothersome. Fixing it in the parser is straightforward, so do that. Signed-off-by: Markus Armbruster <armbru@redhat.com> Reviewed-by: Eric Blake <eblake@redhat.com> Message-Id: <20180823164025.12553-23-armbru@redhat.com>
2025-08-03 07:43:54 -06:00 · 2018-08-23 18:39:49 +02:00 · 2018-08-23 18:39:49 +02:00 · e59f39d403
commit e59f39d403
parent a89d3104a2
4 changed files with 122 additions and 105 deletions
--- a/tests/check-qjson.c
+++ b/tests/check-qjson.c
@ -152,13 +152,6 @@ static void string_with_quotes(void)
 static void utf8_string(void)
 {
    /*
-     * FIXME Current behavior for invalid UTF-8 sequences is
-     * incorrect.  This test expects current, incorrect results.
-     * They're all marked "bug:" below, and are to be replaced by
-     * correct ones as the bugs get fixed.
-     *
-     * The JSON parser rejects some, but not all invalid sequences.
-     *
     * Problem: we can't easily deal with embedded U+0000.  Parsing
     * the JSON string "this \\u0000" is fun" yields "this \0 is fun",
     * which gets misinterpreted as NUL-terminated "this ".  We should
@ -177,12 +170,6 @@ static void utf8_string(void)
        /* Expected unparse output, defaults to @json_in */
        const char *json_out;
    } test_cases[] = {
-        /*
-         * Bug markers used here:
-         * - bug: not rejected
-         *   JSON parser fails to reject invalid sequence(s)
-         */
-
        /* 0  Control characters */
        {
            /*
@ -330,7 +317,7 @@ static void utf8_string(void)
        {
            /* first one beyond Unicode range: U+110000 */
            "\xF4\x90\x80\x80",
-            "\xF4\x90\x80\x80",
+            NULL,
            "\\uFFFD",
        },
        /* 3  Malformed sequences */
@ -338,49 +325,49 @@ static void utf8_string(void)
        /* 3.1.1  First continuation byte */
        {
            "\x80",
-            "\x80",             /* bug: not rejected */
+            NULL,
            "\\uFFFD",
        },
        /* 3.1.2  Last continuation byte */
        {
            "\xBF",
-            "\xBF",             /* bug: not rejected */
+            NULL,
            "\\uFFFD",
        },
        /* 3.1.3  2 continuation bytes */
        {
            "\x80\xBF",
-            "\x80\xBF",         /* bug: not rejected */
+            NULL,
            "\\uFFFD\\uFFFD",
        },
        /* 3.1.4  3 continuation bytes */
        {
            "\x80\xBF\x80",
-            "\x80\xBF\x80",     /* bug: not rejected */
+            NULL,
            "\\uFFFD\\uFFFD\\uFFFD",
        },
        /* 3.1.5  4 continuation bytes */
        {
            "\x80\xBF\x80\xBF",
-            "\x80\xBF\x80\xBF", /* bug: not rejected */
+            NULL,
            "\\uFFFD\\uFFFD\\uFFFD\\uFFFD",
        },
        /* 3.1.6  5 continuation bytes */
        {
            "\x80\xBF\x80\xBF\x80",
-            "\x80\xBF\x80\xBF\x80", /* bug: not rejected */
+            NULL,
            "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD",
        },
        /* 3.1.7  6 continuation bytes */
        {
            "\x80\xBF\x80\xBF\x80\xBF",
-            "\x80\xBF\x80\xBF\x80\xBF", /* bug: not rejected */
+            NULL,
            "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD",
        },
        /* 3.1.8  7 continuation bytes */
        {
            "\x80\xBF\x80\xBF\x80\xBF\x80",
-            "\x80\xBF\x80\xBF\x80\xBF\x80", /* bug: not rejected */
+            NULL,
            "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD",
        },
        /* 3.1.9  Sequence of all 64 possible continuation bytes */
@ -393,16 +380,7 @@ static void utf8_string(void)
            "\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF"
            "\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7"
            "\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF",
-             /* bug: not rejected */
-            "\x80\x81\x82\x83\x84\x85\x86\x87"
-            "\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F"
-            "\x90\x91\x92\x93\x94\x95\x96\x97"
-            "\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F"
-            "\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7"
-            "\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF"
-            "\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7"
-            "\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF",
-            "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD"
+            NULL,
            "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD"
            "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD"
            "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD"
@ -410,6 +388,7 @@ static void utf8_string(void)
            "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD"
            "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD"
            "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD"
+            "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD",
        },
        /* 3.2  Lonely start characters */
        /* 3.2.1  All 32 first bytes of 2-byte sequences, followed by space */
@ -418,7 +397,7 @@ static void utf8_string(void)
            "\xC8 \xC9 \xCA \xCB \xCC \xCD \xCE \xCF "
            "\xD0 \xD1 \xD2 \xD3 \xD4 \xD5 \xD6 \xD7 "
            "\xD8 \xD9 \xDA \xDB \xDC \xDD \xDE \xDF ",
-            NULL,               /* bug: accepted partly, see FIXME below */
+            NULL,
            "\\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD "
            "\\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD "
            "\\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD "
@ -428,16 +407,14 @@ static void utf8_string(void)
        {
            "\xE0 \xE1 \xE2 \xE3 \xE4 \xE5 \xE6 \xE7 "
            "\xE8 \xE9 \xEA \xEB \xEC \xED \xEE \xEF ",
-            /* bug: not rejected */
-            "\xE0 \xE1 \xE2 \xE3 \xE4 \xE5 \xE6 \xE7 "
-            "\xE8 \xE9 \xEA \xEB \xEC \xED \xEE \xEF ",
+            NULL,
            "\\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD "
            "\\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD ",
        },
        /* 3.2.3  All 8 first bytes of 4-byte sequences, followed by space */
        {
            "\xF0 \xF1 \xF2 \xF3 \xF4 \xF5 \xF6 \xF7 ",
-            NULL,               /* bug: accepted partly, see FIXME below */
+            NULL,
            "\\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD ",
        },
        /* 3.2.4  All 4 first bytes of 5-byte sequences, followed by space */
@ -462,13 +439,13 @@ static void utf8_string(void)
        /* 3.3.2  3-byte sequence with last byte missing (U+0000) */
        {
            "\xE0\x80",
-            "\xE0\x80",         /* bug: not rejected */
+            NULL,
            "\\uFFFD",
        },
        /* 3.3.3  4-byte sequence with last byte missing (U+0000) */
        {
            "\xF0\x80\x80",
-            "\xF0\x80\x80",     /* bug: not rejected */
+            NULL,
            "\\uFFFD",
        },
        /* 3.3.4  5-byte sequence with last byte missing (U+0000) */
@ -486,13 +463,13 @@ static void utf8_string(void)
        /* 3.3.6  2-byte sequence with last byte missing (U+07FF) */
        {
            "\xDF",
-            "\xDF",             /* bug: not rejected */
+            NULL,
            "\\uFFFD",
        },
        /* 3.3.7  3-byte sequence with last byte missing (U+FFFF) */
        {
            "\xEF\xBF",
-            "\xEF\xBF",         /* bug: not rejected */
+            NULL,
            "\\uFFFD",
        },
        /* 3.3.8  4-byte sequence with last byte missing (U+1FFFFF) */
@ -517,7 +494,7 @@ static void utf8_string(void)
        {
            "\xC0\xE0\x80\xF0\x80\x80\xF8\x80\x80\x80\xFC\x80\x80\x80\x80"
            "\xDF\xEF\xBF\xF7\xBF\xBF\xFB\xBF\xBF\xBF\xFD\xBF\xBF\xBF\xBF",
-            NULL,               /* bug: accepted partly, see FIXME below */
+            NULL,
            "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD"
            "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD",
        },
@ -546,12 +523,12 @@ static void utf8_string(void)
        },
        {
            "\xE0\x80\xAF",
-            "\xE0\x80\xAF",     /* bug: not rejected */
+            NULL,
            "\\uFFFD",
        },
        {
            "\xF0\x80\x80\xAF",
-            "\xF0\x80\x80\xAF", /* bug: not rejected */
+            NULL,
            "\\uFFFD",
        },
        {
@ -579,7 +556,7 @@ static void utf8_string(void)
        {
            /* \U+07FF */
            "\xE0\x9F\xBF",
-            "\xE0\x9F\xBF",     /* bug: not rejected */
+            NULL,
            "\\uFFFD",
        },
        {
@ -590,7 +567,7 @@ static void utf8_string(void)
             * also 2.2.3
             */
            "\xF0\x8F\xBF\xBC",
-            "\xF0\x8F\xBF\xBC", /* bug: not rejected */
+            NULL,
            "\\uFFFD",
        },
        {
@ -615,13 +592,13 @@ static void utf8_string(void)
        {
            /* \U+0000 */
            "\xE0\x80\x80",
-            "\xE0\x80\x80",     /* bug: not rejected */
+            NULL,
            "\\uFFFD",
        },
        {
            /* \U+0000 */
            "\xF0\x80\x80\x80",
-            "\xF0\x80\x80\x80", /* bug: not rejected */
+            NULL,
            "\\uFFFD",
        },
        {
@ -641,92 +618,92 @@ static void utf8_string(void)
        {
            /* \U+D800 */
            "\xED\xA0\x80",
-            "\xED\xA0\x80",     /* bug: not rejected */
+            NULL,
            "\\uFFFD",
        },
        {
            /* \U+DB7F */
            "\xED\xAD\xBF",
-            "\xED\xAD\xBF",     /* bug: not rejected */
+            NULL,
            "\\uFFFD",
        },
        {
            /* \U+DB80 */
            "\xED\xAE\x80",
-            "\xED\xAE\x80",     /* bug: not rejected */
+            NULL,
            "\\uFFFD",
        },
        {
            /* \U+DBFF */
            "\xED\xAF\xBF",
-            "\xED\xAF\xBF",     /* bug: not rejected */
+            NULL,
            "\\uFFFD",
        },
        {
            /* \U+DC00 */
            "\xED\xB0\x80",
-            "\xED\xB0\x80",     /* bug: not rejected */
+            NULL,
            "\\uFFFD",
        },
        {
            /* \U+DF80 */
            "\xED\xBE\x80",
-            "\xED\xBE\x80",     /* bug: not rejected */
+            NULL,
            "\\uFFFD",
        },
        {
            /* \U+DFFF */
            "\xED\xBF\xBF",
-            "\xED\xBF\xBF",     /* bug: not rejected */
+            NULL,
            "\\uFFFD",
        },
        /* 5.2  Paired UTF-16 surrogates */
        {
            /* \U+D800\U+DC00 */
            "\xED\xA0\x80\xED\xB0\x80",
-            "\xED\xA0\x80\xED\xB0\x80", /* bug: not rejected */
+            NULL,
            "\\uFFFD\\uFFFD",
        },
        {
            /* \U+D800\U+DFFF */
            "\xED\xA0\x80\xED\xBF\xBF",
-            "\xED\xA0\x80\xED\xBF\xBF", /* bug: not rejected */
+            NULL,
            "\\uFFFD\\uFFFD",
        },
        {
            /* \U+DB7F\U+DC00 */
            "\xED\xAD\xBF\xED\xB0\x80",
-            "\xED\xAD\xBF\xED\xB0\x80", /* bug: not rejected */
+            NULL,
            "\\uFFFD\\uFFFD",
        },
        {
            /* \U+DB7F\U+DFFF */
            "\xED\xAD\xBF\xED\xBF\xBF",
-            "\xED\xAD\xBF\xED\xBF\xBF", /* bug: not rejected */
+            NULL,
            "\\uFFFD\\uFFFD",
        },
        {
            /* \U+DB80\U+DC00 */
            "\xED\xAE\x80\xED\xB0\x80",
-            "\xED\xAE\x80\xED\xB0\x80", /* bug: not rejected */
+            NULL,
            "\\uFFFD\\uFFFD",
        },
        {
            /* \U+DB80\U+DFFF */
            "\xED\xAE\x80\xED\xBF\xBF",
-            "\xED\xAE\x80\xED\xBF\xBF", /* bug: not rejected */
+            NULL,
            "\\uFFFD\\uFFFD",
        },
        {
            /* \U+DBFF\U+DC00 */
            "\xED\xAF\xBF\xED\xB0\x80",
-            "\xED\xAF\xBF\xED\xB0\x80", /* bug: not rejected */
+            NULL,
            "\\uFFFD\\uFFFD",
        },
        {
            /* \U+DBFF\U+DFFF */
            "\xED\xAF\xBF\xED\xBF\xBF",
-            "\xED\xAF\xBF\xED\xBF\xBF", /* bug: not rejected */
+            NULL,
            "\\uFFFD\\uFFFD",
        },
        /* 5.3  Other illegal code positions */
@ -734,25 +711,25 @@ static void utf8_string(void)
        {
            /* \U+FFFE */
            "\xEF\xBF\xBE",
-            "\xEF\xBF\xBE",     /* bug: not rejected */
+            NULL,
            "\\uFFFD",
        },
        {
            /* \U+FFFF */
            "\xEF\xBF\xBF",
-            "\xEF\xBF\xBF",     /* bug: not rejected */
+            NULL,
            "\\uFFFD",
        },
        {
            /* U+FDD0 */
            "\xEF\xB7\x90",
-            "\xEF\xB7\x90",     /* bug: not rejected */
+            NULL,
            "\\uFFFD",
        },
        {
            /* U+FDEF */
            "\xEF\xB7\xAF",
-            "\xEF\xB7\xAF",     /* bug: not rejected */
+            NULL,
            "\\uFFFD",
        },
        /* Plane 1 .. 16 noncharacters */
@ -774,23 +751,7 @@ static void utf8_string(void)
            "\xF3\xAF\xBF\xBE\xF3\xAF\xBF\xBF"
            "\xF3\xBF\xBF\xBE\xF3\xBF\xBF\xBF"
            "\xF4\x8F\xBF\xBE\xF4\x8F\xBF\xBF",
-            /* bug: not rejected */
-            "\xF0\x9F\xBF\xBE\xF0\x9F\xBF\xBF"
-            "\xF0\xAF\xBF\xBE\xF0\xAF\xBF\xBF"
-            "\xF0\xBF\xBF\xBE\xF0\xBF\xBF\xBF"
-            "\xF1\x8F\xBF\xBE\xF1\x8F\xBF\xBF"
-            "\xF1\x9F\xBF\xBE\xF1\x9F\xBF\xBF"
-            "\xF1\xAF\xBF\xBE\xF1\xAF\xBF\xBF"
-            "\xF1\xBF\xBF\xBE\xF1\xBF\xBF\xBF"
-            "\xF2\x8F\xBF\xBE\xF2\x8F\xBF\xBF"
-            "\xF2\x9F\xBF\xBE\xF2\x9F\xBF\xBF"
-            "\xF2\xAF\xBF\xBE\xF2\xAF\xBF\xBF"
-            "\xF2\xBF\xBF\xBE\xF2\xBF\xBF\xBF"
-            "\xF3\x8F\xBF\xBE\xF3\x8F\xBF\xBF"
-            "\xF3\x9F\xBF\xBE\xF3\x9F\xBF\xBF"
-            "\xF3\xAF\xBF\xBE\xF3\xAF\xBF\xBF"
-            "\xF3\xBF\xBF\xBE\xF3\xBF\xBF\xBF"
-            "\xF4\x8F\xBF\xBE\xF4\x8F\xBF\xBF",
+            NULL,
            "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD"
            "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD"
            "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD"
@ -829,14 +790,6 @@ static void utf8_string(void)
                    }
                    in = strndup(tail, end - tail);
                    str = from_json_str(in, j, NULL);
-                    /*
-                     * FIXME JSON parser accepts invalid sequence
-                     * starting with \xC2..\xF4
-                     */
-                    if (*in >= '\xC2' && *in <= '\xF4') {
-                        g_free(str);
-                        str = NULL;
-                    }
                    g_assert(!str);
                    g_free(in);
                }