From 71573c4556aef59cb31c9cf48e8e8f74d01df537 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 10 Apr 2015 11:27:47 +0200 Subject: [PATCH] util: when unescaping C escape sequences support C++11 \u and \U unicode literals We simply recode them in utf8. --- src/shared/utf8.c | 13 +++-- src/shared/utf8.h | 2 + src/shared/util.c | 146 ++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 137 insertions(+), 24 deletions(-) diff --git a/src/shared/utf8.c b/src/shared/utf8.c index 013c110f0..800884ffe 100644 --- a/src/shared/utf8.c +++ b/src/shared/utf8.c @@ -52,7 +52,7 @@ #include "utf8.h" #include "util.h" -static inline bool is_unicode_valid(uint32_t ch) { +bool unichar_is_valid(uint32_t ch) { if (ch >= 0x110000) /* End of unicode space */ return false; @@ -66,7 +66,7 @@ static inline bool is_unicode_valid(uint32_t ch) { return true; } -static bool is_unicode_control(uint32_t ch) { +static bool unichar_is_control(uint32_t ch) { /* 0 to ' '-1 is the C0 range. @@ -156,7 +156,7 @@ bool utf8_is_printable_newline(const char* str, size_t length, bool newline) { val = utf8_encoded_to_unichar(p); if (val < 0 || - is_unicode_control(val) || + unichar_is_control(val) || (!newline && val == '\n')) return false; @@ -276,6 +276,7 @@ char *ascii_is_valid(const char *str) { * occupy. */ size_t utf8_encode_unichar(char *out_utf8, uint32_t g) { + if (g < (1 << 7)) { if (out_utf8) out_utf8[0] = g & 0x7f; @@ -301,9 +302,9 @@ size_t utf8_encode_unichar(char *out_utf8, uint32_t g) { out_utf8[3] = 0x80 | (g & 0x3f); } return 4; - } else { - return 0; } + + return 0; } char *utf16_to_utf8(const void *s, size_t length) { @@ -394,7 +395,7 @@ int utf8_encoded_valid_unichar(const char *str) { return -EINVAL; /* check if value has valid range */ - if (!is_unicode_valid(unichar)) + if (!unichar_is_valid(unichar)) return -EINVAL; return len; diff --git a/src/shared/utf8.h b/src/shared/utf8.h index 77f663438..e745649f0 100644 --- a/src/shared/utf8.h +++ b/src/shared/utf8.h @@ -27,6 +27,8 @@ #define UTF8_REPLACEMENT_CHARACTER "\xef\xbf\xbd" +bool unichar_is_valid(uint32_t c); + const char *utf8_is_valid(const char *s) _pure_; char *ascii_is_valid(const char *s) _pure_; diff --git a/src/shared/util.c b/src/shared/util.c index 8071bb231..ebd4d58f9 100644 --- a/src/shared/util.c +++ b/src/shared/util.c @@ -1368,13 +1368,17 @@ char *cescape(const char *s) { return r; } -static int cunescape_one(const char *p, size_t length, char *ret) { +static int cunescape_one(const char *p, size_t length, char *ret, uint32_t *ret_unicode) { int r = 1; assert(p); assert(*p); assert(ret); + /* Unescapes C style. Returns the unescaped character in ret, + * unless we encountered a \u sequence in which case the full + * unicode character is returned in ret_unicode, instead. */ + if (length != (size_t) -1 && length < 1) return -EINVAL; @@ -1431,15 +1435,92 @@ static int cunescape_one(const char *p, size_t length, char *ret) { if (b < 0) return -EINVAL; - /* don't allow NUL bytes */ + /* Don't allow NUL bytes */ if (a == 0 && b == 0) return -EINVAL; - *ret = (char) ((a << 4) | b); + *ret = (char) ((a << 4U) | b); r = 3; break; } + case 'u': { + /* C++11 style 16bit unicode */ + + int a[4]; + unsigned i; + uint32_t c; + + if (length != (size_t) -1 && length < 5) + return -EINVAL; + + for (i = 0; i < 4; i++) { + a[i] = unhexchar(p[1 + i]); + if (a[i] < 0) + return a[i]; + } + + c = ((uint32_t) a[0] << 12U) | ((uint32_t) a[1] << 8U) | ((uint32_t) a[2] << 4U) | (uint32_t) a[3]; + + /* Don't allow 0 chars */ + if (c == 0) + return -EINVAL; + + if (c < 128) + *ret = c; + else { + if (!ret_unicode) + return -EINVAL; + + *ret = 0; + *ret_unicode = c; + } + + r = 5; + break; + } + + case 'U': { + /* C++11 style 32bit unicode */ + + int a[8]; + unsigned i; + uint32_t c; + + if (length != (size_t) -1 && length < 9) + return -EINVAL; + + for (i = 0; i < 8; i++) { + a[i] = unhexchar(p[1 + i]); + if (a[i] < 0) + return a[i]; + } + + c = ((uint32_t) a[0] << 28U) | ((uint32_t) a[1] << 24U) | ((uint32_t) a[2] << 20U) | ((uint32_t) a[3] << 16U) | + ((uint32_t) a[4] << 12U) | ((uint32_t) a[5] << 8U) | ((uint32_t) a[6] << 4U) | (uint32_t) a[7]; + + /* Don't allow 0 chars */ + if (c == 0) + return -EINVAL; + + /* Don't allow invalid code points */ + if (!unichar_is_valid(c)) + return -EINVAL; + + if (c < 128) + *ret = c; + else { + if (!ret_unicode) + return -EINVAL; + + *ret = 0; + *ret_unicode = c; + } + + r = 9; + break; + } + case '0': case '1': case '2': @@ -1449,7 +1530,8 @@ static int cunescape_one(const char *p, size_t length, char *ret) { case '6': case '7': { /* octal encoding */ - int a, b, c, m; + int a, b, c; + uint32_t m; if (length != (size_t) -1 && length < 4) return -EINVAL; @@ -1471,11 +1553,11 @@ static int cunescape_one(const char *p, size_t length, char *ret) { return -EINVAL; /* Don't allow bytes above 255 */ - m = (a << 6) | (b << 3) | c; + m = ((uint32_t) a << 6U) | ((uint32_t) b << 3U) | (uint32_t) c; if (m > 255) return -EINVAL; - *ret = (char) m; + *ret = m; r = 3; break; } @@ -1508,6 +1590,8 @@ int cunescape_length_with_prefix(const char *s, size_t length, const char *prefi for (f = s, t = r + pl; f < s + length; f++) { size_t remaining; + uint32_t u; + char c; int k; remaining = s + length - f; @@ -1530,7 +1614,7 @@ int cunescape_length_with_prefix(const char *s, size_t length, const char *prefi return -EINVAL; } - k = cunescape_one(f + 1, remaining - 1, t); + k = cunescape_one(f + 1, remaining - 1, &c, &u); if (k < 0) { if (flags & UNESCAPE_RELAX) { /* Invalid escape code, let's take it literal then */ @@ -1542,8 +1626,14 @@ int cunescape_length_with_prefix(const char *s, size_t length, const char *prefi return k; } + if (c != 0) + /* Non-Unicode? Let's encode this directly */ + *(t++) = c; + else + /* Unicode? Then let's encode this in UTF-8 */ + t += utf8_encode_unichar(t, u); + f += k; - t++; } *t = 0; @@ -7219,16 +7309,22 @@ int unquote_first_word(const char **p, char **ret, UnquoteFlags flags) { return -ENOMEM; if (flags & UNQUOTE_CUNESCAPE) { - r = cunescape_one(*p, (size_t) -1, &c); + uint32_t u; + + r = cunescape_one(*p, (size_t) -1, &c, &u); if (r < 0) return -EINVAL; (*p) += r - 1; - } - s[sz++] = c; - state = VALUE; + if (c != 0) + s[sz++] = c; /* normal explicit char */ + else + sz += utf8_encode_unichar(s, u); /* unicode chars we'll encode as utf8 */ + } else + s[sz++] = c; + state = VALUE; break; case SINGLE_QUOTE: @@ -7260,14 +7356,21 @@ int unquote_first_word(const char **p, char **ret, UnquoteFlags flags) { return -ENOMEM; if (flags & UNQUOTE_CUNESCAPE) { - r = cunescape_one(*p, (size_t) -1, &c); + uint32_t u; + + r = cunescape_one(*p, (size_t) -1, &c, &u); if (r < 0) return -EINVAL; (*p) += r - 1; - } - s[sz++] = c; + if (c != 0) + s[sz++] = c; + else + sz += utf8_encode_unichar(s, u); + } else + s[sz++] = c; + state = SINGLE_QUOTE; break; @@ -7298,14 +7401,21 @@ int unquote_first_word(const char **p, char **ret, UnquoteFlags flags) { return -ENOMEM; if (flags & UNQUOTE_CUNESCAPE) { - r = cunescape_one(*p, (size_t) -1, &c); + uint32_t u; + + r = cunescape_one(*p, (size_t) -1, &c, &u); if (r < 0) return -EINVAL; (*p) += r - 1; - } - s[sz++] = c; + if (c != 0) + s[sz++] = c; + else + sz += utf8_encode_unichar(s, u); + } else + s[sz++] = c; + state = DOUBLE_QUOTE; break; -- 2.30.2