From bb5c7798391ce342923ac5f142fc165c1d2e5dfc Mon Sep 17 00:00:00 2001 Message-Id: From: Mark Wooding Date: Sun, 17 Nov 2013 11:31:33 +0000 Subject: [PATCH] UTF-16 support Organization: Straylight/Edgeware From: Richard Kettlewell --- lib/unicode.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++++- lib/unicode.h | 33 +++++++++++- lib/vector.h | 4 +- 3 files changed, 177 insertions(+), 3 deletions(-) diff --git a/lib/unicode.c b/lib/unicode.c index 675f703..27f82b7 100644 --- a/lib/unicode.c +++ b/lib/unicode.c @@ -1,6 +1,6 @@ /* * This file is part of DisOrder - * Copyright (C) 2007, 2009 Richard Kettlewell + * Copyright (C) 2007, 2009, 2013 Richard Kettlewell * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -305,6 +305,133 @@ error: return 0; } +/** @brief Convert UTF-16 to UTF-8 + * @param s Source string + * @param ns Length of source string in 16-bit code units + * @param ndp Where to store length of destination string (or NULL) + * @return Newly allocated destination string or NULL on error + * + * If the UTF-16 is not valid then NULL is returned. A UTF-16 sequence is + * invalid if it contains an incomplete surrogate. + * + * The return value is always 0-terminated. The value returned via @p *ndp + * does not include the terminator. 
+ *
+ * Note that @p ns counts 16-bit code units rather than code points: a
+ * surrogate pair occupies two elements of @p s.
+ */
+char *utf16_to_utf8(const uint16_t *s, size_t ns, size_t *ndp) {
+  struct dynstr d;
+  uint32_t c;
+
+  dynstr_init(&d);
+  while(ns > 0) {
+    c = *s++;
+    --ns;
+    if(c >= 0xD800 && c <= 0xDBFF) {
+      /* High surrogate: it must be followed by a low surrogate.  Both bounds
+       * are checked against the following unit; comparing c with 0xDFFF
+       * would always succeed here and let values above 0xDFFF through. */
+      if(!ns || *s < 0xDC00 || *s > 0xDFFF)
+        goto error;
+      c = ((c - 0xD800) << 10) + (*s++ - 0xDC00) + 0x10000;
+      /* The low surrogate has been consumed as well, so count it; otherwise
+       * the loop would run one unit past the end of the input. */
+      --ns;
+    } else if(c >= 0xDC00 && c <= 0xDFFF)
+      goto error;               /* low surrogate without a preceding high one */
+    /* c is now a non-surrogate code point no greater than 0x10FFFF */
+    if(c < 0x80)
+      dynstr_append(&d, c);
+    else if(c < 0x0800) {
+      dynstr_append(&d, 0xC0 | (c >> 6));
+      dynstr_append(&d, 0x80 | (c & 0x3F));
+    } else if(c < 0x10000) {
+      dynstr_append(&d, 0xE0 | (c >> 12));
+      dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
+      dynstr_append(&d, 0x80 | (c & 0x3F));
+    } else {
+      /* Only reachable via a valid surrogate pair, so c <= 0x10FFFF */
+      dynstr_append(&d, 0xF0 | (c >> 18));
+      dynstr_append(&d, 0x80 | ((c >> 12) & 0x3F));
+      dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
+      dynstr_append(&d, 0x80 | (c & 0x3F));
+    }
+  }
+  dynstr_terminate(&d);
+  if(ndp)
+    *ndp = d.nvec;
+  return d.vec;
+error:
+  xfree(d.vec);
+  return 0;
+}
+
+/** @brief Convert UTF-8 to UTF-16
+ * @param s Source string
+ * @param ns Length of source string in bytes
+ * @param ndp Where to store length of destination string (or NULL)
+ * @return Newly allocated destination string or NULL on error
+ *
+ * The return value is always 0-terminated. The value returned via @p *ndp
+ * does not include the terminator.
+ *
+ * If the UTF-8 is not valid then NULL is returned. 
A UTF-8 sequence
+ * for a code point is invalid if:
+ * - it is not the shortest possible sequence for the code point
+ * - it codes for a UTF-16 surrogate
+ * - it codes for a value outside the unicode code space
+ *
+ * The input is consumed one encoded sequence at a time, @p ns being reduced
+ * by the number of bytes in each sequence.  Code points from 0x10000 upwards
+ * are written as a surrogate pair.
+ */
+uint16_t *utf8_to_utf16(const char *s, size_t ns, size_t *ndp) {
+  const uint8_t *in = (const uint8_t *)s;
+  struct dynstr_utf16 out;
+  uint32_t cp;
+  int k;
+
+  dynstr_utf16_init(&out);
+  while(ns > 0) {
+    /* Table entry for the lead byte: sequence length and 2nd-byte limits */
+    const struct unicode_utf8_row *const row = &unicode_utf8_valid[*in];
+
+    /* The whole sequence must fit in what is left of the input */
+    if(row->count > ns)
+      goto error;
+    switch(row->count) {
+    case 1:
+      cp = *in;
+      break;
+    case 2:
+    case 3:
+    case 4:
+      /* Second byte must lie in the range allowed for this lead byte */
+      if(in[1] < row->min2 || in[1] > row->max2)
+        goto error;
+      /* Keep the lead byte's payload bits: mask 0x1F, 0x0F or 0x07 */
+      cp = *in & (0xFF >> (row->count + 1));
+      break;
+    default:
+      goto error;
+    }
+    /* Fold in the continuation bytes, each of the form 10xxxxxx */
+    for(k = 1; k < row->count; ++k) {
+      if((in[k] & 0xC0) != 0x80)
+        goto error;
+      cp = (cp << 6) | (in[k] & 0x3F);
+    }
+    if(cp < 0x10000)
+      dynstr_utf16_append(&out, cp);
+    else {
+      const uint32_t v = cp - 0x10000;
+
+      dynstr_utf16_append(&out, 0xD800 + (v >> 10));
+      dynstr_utf16_append(&out, 0xDC00 + (v & 0x03FF));
+    }
+    in += row->count;
+    ns -= row->count;
+  }
+  dynstr_utf16_terminate(&out);
+  if(ndp)
+    *ndp = out.nvec;
+  return out.vec;
+error:
+  xfree(out.vec);
+  return 0;
+}
+
 /** @brief Test whether [s,s+ns) is valid UTF-8
  * @param s Start of string
  * @param ns Length of string
@@ -1578,6 +1705,20 @@ error:
 
 /*@}*/
 
+/** @brief Return the length of a 0-terminated UTF-16 string
+ * @param s Pointer to 0-terminated string
+ * @return Length of string in 16-bit code units (excluding terminator)
+ *
+ * Unlike the conversion functions no validity checking is done on the string. 
+ */
+size_t utf16_len(const uint16_t *s) {
+  size_t n = 0;                 /* number of units seen before the terminator */
+
+  while(s[n] != 0)
+    ++n;
+  return n;
+}
+
 /*
 Local Variables:
 c-basic-offset:2
diff --git a/lib/unicode.h b/lib/unicode.h
index b6d6813..0d0657d 100644
--- a/lib/unicode.h
+++ b/lib/unicode.h
@@ -1,6 +1,6 @@
 /*
  * This file is part of DisOrder
- * Copyright (C) 2007 Richard Kettlewell
+ * Copyright (C) 2007, 2008, 2013 Richard Kettlewell
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -44,11 +44,14 @@ typedef int unicode_property_tailor(uint32_t c);
 
 char *utf32_to_utf8(const uint32_t *s, size_t ns, size_t *nd);
 uint32_t *utf8_to_utf32(const char *s, size_t ns, size_t *nd);
+char *utf16_to_utf8(const uint16_t *s, size_t ns, size_t *nd);
+uint16_t *utf8_to_utf16(const char *s, size_t ns, size_t *nd);
 int utf8_valid(const char *s, size_t ns);
 
 int utf32_combining_class(uint32_t c);
 
 size_t utf32_len(const uint32_t *s);
+size_t utf16_len(const uint16_t *s);
 int utf32_cmp(const uint32_t *a, const uint32_t *b);
 
 uint32_t *utf32_decompose_canon(const uint32_t *s, size_t ns, size_t *ndp);
@@ -99,6 +102,16 @@ static inline char *utf32nt_to_utf8(const uint32_t *s) {
   return utf32_to_utf8(s, utf32_len(s), 0);
 }
 
+/** @brief Convert 0-terminated UTF-16 to UTF-8
+ * @param s 0-terminated UTF-16 string
+ * @return 0-terminated UTF-8 string or 0 on error
+ *
+ * See utf16_to_utf8() for possible causes of errors. 
+ */
+static inline char *utf16nt_to_utf8(const uint16_t *s) {
+  return utf16_to_utf8(s, utf16_len(s), 0);
+}
+
 /** @brief Convert 0-terminated UTF-8 to UTF-32
  * @param s 0-terminated UTF-8 string
  * @return 0-terminated UTF-32 string or 0 on error
@@ -109,6 +122,24 @@ static inline uint32_t *utf8nt_to_utf32(const char *s) {
   return utf8_to_utf32(s, strlen(s), 0);
 }
 
+/** @brief Convert 0-terminated UTF-8 to UTF-16
+ * @param s 0-terminated UTF-8 string
+ * @return 0-terminated UTF-16 string or 0 on error
+ *
+ * See utf8_to_utf16() for possible causes of errors.
+ */
+static inline uint16_t *utf8nt_to_utf16(const char *s) {
+  return utf8_to_utf16(s, strlen(s), 0);
+}
+
+/** @brief Convert 0-terminated UTF-8 to a wide character string
+ * @param s 0-terminated UTF-8 string
+ * @return Result of utf8nt_to_utf32() cast to @c wchar_t *
+ *
+ * NOTE(review): the cast assumes @c wchar_t has the same size and
+ * representation as @c uint32_t.  That does not hold where @c wchar_t is
+ * 16 bits wide (e.g. Windows) - confirm for the target platforms.
+ */
+static inline wchar_t *utf8nt_to_wchar(const char *s) {
+  return (wchar_t *)utf8nt_to_utf32(s);
+}
+
+/** @brief Convert a 0-terminated wide character string to UTF-8
+ * @param s 0-terminated wide character string, read as UTF-32
+ * @return Result of utf32nt_to_utf8() applied to @p s
+ *
+ * NOTE(review): relies on the same @c wchar_t / @c uint32_t equivalence as
+ * utf8nt_to_wchar() - confirm for the target platforms.
+ */
+static inline char *wcharnt_to_utf8(const wchar_t *s) {
+  return utf32nt_to_utf8((const uint32_t *)s);
+}
+
 #endif /* UNICODE_H */
 
 /*
diff --git a/lib/vector.h b/lib/vector.h
index da84784..fdef9cf 100644
--- a/lib/vector.h
+++ b/lib/vector.h
@@ -1,6 +1,6 @@
 /*
  * This file is part of DisOrder.
- * Copyright (C) 2004, 2005, 2007-2009 Richard Kettlewell
+ * Copyright (C) 2004, 2005, 2007-2009, 2013 Richard Kettlewell
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -79,6 +79,8 @@ VECTOR_TYPE(vector, char *, xrealloc);
 /** @brief A dynamic string */
 VECTOR_TYPE(dynstr, char, xrealloc_noptr);
 /** @brief A dynamic unicode string */
+VECTOR_TYPE(dynstr_utf16, uint16_t, xrealloc_noptr);
+/** @brief A dynamic wide character string */
 VECTOR_TYPE(dynstr_ucs4, uint32_t, xrealloc_noptr);
 /** @brief A dynamic array of pointers to unicode string */
 VECTOR_TYPE(vector32, uint32_t *, xrealloc);
-- 
[mdw]