X-Git-Url: https://www.chiark.greenend.org.uk/ucgi/~ianmdlvl/git?a=blobdiff_plain;f=src%2Fshared%2Futf8.c;h=800884ffee99c77b36c6c16f35015a70442409fa;hb=d896ac2d2fbce41a0b11a0618a685adeaf18b8fe;hp=ab57f4276b038734f6c65a6e39b41b954c90502f;hpb=2bb4c7e384c31de4727f1330da3f4de2f0bb7784;p=elogind.git diff --git a/src/shared/utf8.c b/src/shared/utf8.c index ab57f4276..800884ffe 100644 --- a/src/shared/utf8.c +++ b/src/shared/utf8.c @@ -52,7 +52,7 @@ #include "utf8.h" #include "util.h" -static inline bool is_unicode_valid(uint32_t ch) { +bool unichar_is_valid(uint32_t ch) { if (ch >= 0x110000) /* End of unicode space */ return false; @@ -66,7 +66,7 @@ static inline bool is_unicode_valid(uint32_t ch) { return true; } -static bool is_unicode_control(uint32_t ch) { +static bool unichar_is_control(uint32_t ch) { /* 0 to ' '-1 is the C0 range. @@ -156,7 +156,7 @@ bool utf8_is_printable_newline(const char* str, size_t length, bool newline) { val = utf8_encoded_to_unichar(p); if (val < 0 || - is_unicode_control(val) || + unichar_is_control(val) || (!newline && val == '\n')) return false; @@ -276,6 +276,7 @@ char *ascii_is_valid(const char *str) { * occupy. */ size_t utf8_encode_unichar(char *out_utf8, uint32_t g) { + if (g < (1 << 7)) { if (out_utf8) out_utf8[0] = g & 0x7f; @@ -301,21 +302,51 @@ size_t utf8_encode_unichar(char *out_utf8, uint32_t g) { out_utf8[3] = 0x80 | (g & 0x3f); } return 4; - } else { - return 0; } + + return 0; } char *utf16_to_utf8(const void *s, size_t length) { const uint8_t *f; char *r, *t; - r = new(char, (length*3+1)/2 + 1); + r = new(char, (length * 4 + 1) / 2 + 1); if (!r) return NULL; - for (f = s, t = r; f < (const uint8_t*) s + length; f += 2) - t += utf8_encode_unichar(t, (f[1] << 8) | f[0]); + f = s; + t = r; + + while (f < (const uint8_t*) s + length) { + uint16_t w1, w2; + + /* see RFC 2781 section 2.2 */ + + w1 = f[1] << 8 | f[0]; + f += 2; + + if (!utf16_is_surrogate(w1)) { + t += utf8_encode_unichar(t, w1); + + continue; + } + + if (utf16_is_trailing_surrogate(w1)) + continue; + else if (f >= (const uint8_t*) s + length) + break; + + w2 = f[1] << 8 | f[0]; + f += 2; + + if (!utf16_is_trailing_surrogate(w2)) { + f -= 2; + continue; + } + + t += utf8_encode_unichar(t, utf16_surrogate_pair_to_unichar(w1, w2)); + } *t = 0; return r; @@ -364,7 +395,7 @@ int utf8_encoded_valid_unichar(const char *str) { return -EINVAL; /* check if value has valid range */ - if (!is_unicode_valid(unichar)) + if (!unichar_is_valid(unichar)) return -EINVAL; return len;