chiark / gitweb /
shared: json - support escaping utf16 surrogate pairs
authorTom Gundersen <teg@jklm.no>
Mon, 22 Dec 2014 13:53:40 +0000 (14:53 +0100)
committerTom Gundersen <teg@jklm.no>
Mon, 22 Dec 2014 19:27:20 +0000 (20:27 +0100)
We originally only supported decoding escaped UCS-2 characters (as \uxxxx), which
covers only the BMP. Also support decoding escaped UTF-16 surrogate pairs (of the
form \uxxxx\uyyyy) to cover all of Unicode.

src/shared/json.c
src/test/test-json.c

index 47f801c8589223d511e7ebf1ec3e03bce14af64c..bb3d26f0e5eabf69bdb043fed1106b7f9bfcbfd0 100644 (file)
@@ -53,6 +53,42 @@ static void inc_lines(unsigned *line, const char *s, size_t n) {
         }
 }
 
         }
 }
 
+static int unhex_ucs2(const char *c, uint16_t *ret) {
+        int aa, bb, cc, dd;
+        uint16_t x;
+        /* Decodes the four hex digits c[0]..c[3] into one 16-bit value and
+         * stores it in *ret. Returns 0 on success, or -EINVAL if unhexchar()
+         * returns a negative value for any digit or if the decoded value is
+         * zero. *ret is written only on success. */
+        assert(c);
+        assert(ret);
+
+        aa = unhexchar(c[0]);
+        if (aa < 0)
+                return -EINVAL;
+
+        bb = unhexchar(c[1]);
+        if (bb < 0)
+                return -EINVAL;
+
+        cc = unhexchar(c[2]);
+        if (cc < 0)
+                return -EINVAL;
+
+        dd = unhexchar(c[3]);
+        if (dd < 0)
+                return -EINVAL;
+        /* Combine the four nibbles; aa is the most significant one. */
+        x =     ((uint16_t) aa << 12) |
+                ((uint16_t) bb << 8) |
+                ((uint16_t) cc << 4) |
+                ((uint16_t) dd);
+        /* x is unsigned, so "<= 0" can only match a decoded value of 0.
+         * NOTE(review): presumably rejected because a NUL byte cannot be
+         * represented in the C-string result -- confirm against the caller. */
+        if (x <= 0)
+                return -EINVAL;
+
+        *ret = x;
+
+        return 0;
+}
+
 static int json_parse_string(const char **p, char **ret) {
         _cleanup_free_ char *s = NULL;
         size_t n = 0, allocated = 0;
 static int json_parse_string(const char **p, char **ret) {
         _cleanup_free_ char *s = NULL;
         size_t n = 0, allocated = 0;
@@ -119,39 +155,40 @@ static int json_parse_string(const char **p, char **ret) {
                         else if (*c == 't')
                                 ch = '\t';
                         else if (*c == 'u') {
                         else if (*c == 't')
                                 ch = '\t';
                         else if (*c == 'u') {
-                                int aa, bb, cc, dd;
                                 uint16_t x;
                                 uint16_t x;
+                                int r;
 
 
-                                aa = unhexchar(c[1]);
-                                if (aa < 0)
-                                        return -EINVAL;
+                                r = unhex_ucs2(c + 1, &x);
+                                if (r < 0)
+                                        return r;
 
 
-                                bb = unhexchar(c[2]);
-                                if (bb < 0)
-                                        return -EINVAL;
+                                c += 5;
 
 
-                                cc = unhexchar(c[3]);
-                                if (cc < 0)
-                                        return -EINVAL;
+                                if (!GREEDY_REALLOC(s, allocated, n + 4))
+                                        return -ENOMEM;
 
 
-                                dd = unhexchar(c[4]);
-                                if (dd < 0)
+                                if (!utf16_is_surrogate(x))
+                                        n += utf8_encode_unichar(s + n, x);
+                                else if (utf16_is_trailing_surrogate(x))
                                         return -EINVAL;
                                         return -EINVAL;
+                                else {
+                                        uint16_t y;
 
 
+                                        if (c[0] != '\\' || c[1] != 'u')
+                                                return -EINVAL;
 
 
-                                x =     ((uint16_t) aa << 12) |
-                                        ((uint16_t) bb << 8) |
-                                        ((uint16_t) cc << 4) |
-                                        ((uint16_t) dd);
+                                        r = unhex_ucs2(c + 2, &y);
+                                        if (r < 0)
+                                                return r;
 
 
-                                if (x <= 0)
-                                        return -EINVAL;
+                                        c += 6;
 
 
-                                if (!GREEDY_REALLOC(s, allocated, n + 4))
-                                        return -ENOMEM;
+                                        if (!utf16_is_trailing_surrogate(y))
+                                                return -EINVAL;
+
+                                        n += utf8_encode_unichar(s + n, utf16_surrogate_pair_to_unichar(x, y));
+                                }
 
 
-                                n += utf8_encode_unichar(s + n, x);
-                                c += 5;
                                 continue;
                         } else
                                 return -EINVAL;
                                 continue;
                         } else
                                 return -EINVAL;
index e53e8ed50f18d932ebd1c6bb03b29253a895434f..b09131891cf581e24f460e9d03ace596c728210e 100644 (file)
@@ -99,6 +99,9 @@ int main(int argc, char *argv[]) {
         test_one("\"\xef\xbf\xbd\"", JSON_STRING, "\xef\xbf\xbd", JSON_END);
         test_one("\"\\ufffd\"", JSON_STRING, "\xef\xbf\xbd", JSON_END);
         test_one("\"\\uf\"", -EINVAL);
         test_one("\"\xef\xbf\xbd\"", JSON_STRING, "\xef\xbf\xbd", JSON_END);
         test_one("\"\\ufffd\"", JSON_STRING, "\xef\xbf\xbd", JSON_END);
         test_one("\"\\uf\"", -EINVAL);
+        test_one("\"\\ud800a\"", -EINVAL);
+        test_one("\"\\udc00\\udc00\"", -EINVAL);
+        test_one("\"\\ud801\\udc37\"", JSON_STRING, "\xf0\x90\x90\xb7", JSON_END);
 
         return 0;
 }
 
         return 0;
 }