+
+/* Converts a buffer of little-endian UTF-16 code units into a newly allocated,
+ * NUL-terminated UTF-8 string.
+ *
+ * s:      input buffer; it is read byte-wise, so no alignment is required
+ * length: size of the input in BYTES (not code units); a trailing odd byte
+ *         cannot form a code unit and is ignored
+ *
+ * Conversion stops early at the first 0x0000 code unit. A leading/trailing
+ * surrogate pair is combined into one code point and emitted as a 4-byte
+ * sequence (RFC 2781, section 2.2). Unpaired surrogates have no valid UTF-8
+ * representation and are dropped.
+ *
+ * Returns the string obtained from new(), or NULL if the allocation fails or
+ * the output size computation would overflow. */
+char *utf16_to_utf8(const void *s, size_t length) {
+        const uint8_t *f, *end;
+        uint8_t *t;
+        char *r;
+
+        /* Worst case is 3 output bytes per 2 input bytes (a surrogate pair
+         * turns 4 input bytes into 4 output bytes), plus the trailing NUL.
+         * Refuse lengths for which length*3+1 would wrap around. */
+        if (length > (SIZE_MAX - 1) / 3)
+                return NULL;
+
+        r = new(char, (length*3+1)/2 + 1);
+        if (!r)
+                return NULL;
+
+        t = (uint8_t*) r;
+        f = s;
+        end = f + length;
+
+        /* Only iterate while a complete 2-byte code unit is available, so
+         * that an odd length never causes a read past the buffer. */
+        while (end - f >= 2) {
+                uint16_t w1;
+                uint32_t c;
+
+                w1 = (uint16_t) ((f[1] << 8) | f[0]);
+                f += 2;
+
+                if (w1 == 0)
+                        break;
+
+                if (w1 < 0xd800 || w1 > 0xdfff)
+                        c = w1;
+                else {
+                        uint16_t w2;
+
+                        /* trailing surrogate without a leading one */
+                        if (w1 >= 0xdc00)
+                                continue;
+
+                        /* leading surrogate cut off by the end of the input */
+                        if (end - f < 2)
+                                break;
+
+                        w2 = (uint16_t) ((f[1] << 8) | f[0]);
+
+                        /* Not a trailing surrogate: drop w1 and let the
+                         * next iteration process w2 on its own. */
+                        if (w2 < 0xdc00 || w2 > 0xdfff)
+                                continue;
+
+                        f += 2;
+                        c = 0x10000 + (((uint32_t) (w1 & 0x3ff) << 10) | (uint32_t) (w2 & 0x3ff));
+                }
+
+                if (c < 0x80) {
+                        *(t++) = (uint8_t) c;
+                } else if (c < 0x800) {
+                        *(t++) = (uint8_t) (0xc0 | (c >> 6));
+                        *(t++) = (uint8_t) (0x80 | (c & 0x3f));
+                } else if (c < 0x10000) {
+                        *(t++) = (uint8_t) (0xe0 | (c >> 12));
+                        *(t++) = (uint8_t) (0x80 | ((c >> 6) & 0x3f));
+                        *(t++) = (uint8_t) (0x80 | (c & 0x3f));
+                } else {
+                        *(t++) = (uint8_t) (0xf0 | (c >> 18));
+                        *(t++) = (uint8_t) (0x80 | ((c >> 12) & 0x3f));
+                        *(t++) = (uint8_t) (0x80 | ((c >> 6) & 0x3f));
+                        *(t++) = (uint8_t) (0x80 | (c & 0x3f));
+                }
+        }
+
+        *t = 0;
+
+        return r;
+}
+
+/* Number of bytes needed to encode the code point `unichar`: the result is
+ * the 1-based position of the first exclusive upper bound the value falls
+ * below, or 6 when it is at or above every bound. Values below 0x80
+ * (including negative ones) yield 1. */
+static int utf8_unichar_to_encoded_len(int unichar) {
+        static const int upper_bounds[] = {
+                0x80,      /* 1 byte  */
+                0x800,     /* 2 bytes */
+                0x10000,   /* 3 bytes */
+                0x200000,  /* 4 bytes */
+                0x4000000, /* 5 bytes */
+        };
+        int n;
+
+        for (n = 0; n < (int) (sizeof(upper_bounds) / sizeof(upper_bounds[0])); n++) {
+                if (unichar < upper_bounds[n]) {
+                        return n + 1;
+                }
+        }
+
+        return 6;
+}
+
+/* Validates the single UTF-8 encoded character at the start of `str`.
+ *
+ * Returns the number of bytes the character occupies, or -1 if:
+ *   - utf8_encoded_expected_len() reports 0 for the sequence,
+ *   - the lead byte lacks its high bit, or a following byte is not a
+ *     continuation byte of the form 10xxxxxx,
+ *   - the decoded value would normally be encoded with a different number
+ *     of bytes (overlong form), or
+ *   - is_unicode_valid() rejects the decoded value.
+ *
+ * NOTE(review): assumes `str` is NUL-terminated; the byte check below relies
+ * on that to avoid reading past a truncated sequence. */
+int utf8_encoded_valid_unichar(const char *str) {
+        int len;
+        int unichar;
+        int i;
+
+        len = utf8_encoded_expected_len(str);
+        if (len == 0)
+                return -1;
+
+        /* ascii is valid */
+        if (len == 1)
+                return 1;
+
+        /* Check that all expected bytes are present and well-formed. The lead
+         * byte only needs its high bit set; every following byte must match
+         * 10xxxxxx. A NUL terminator fails either test, so the loop bails out
+         * before stepping over the end of the string. */
+        for (i = 0; i < len; i++) {
+                int mask = i == 0 ? 0x80 : 0xc0;
+
+                if (((unsigned char) str[i] & mask) != 0x80)
+                        return -1;
+        }
+
+        unichar = utf8_encoded_to_unichar(str);
+
+        /* check if encoded length matches encoded value */
+        if (utf8_unichar_to_encoded_len(unichar) != len)
+                return -1;
+
+        /* check if value has valid range */
+        if (!is_unicode_valid(unichar))
+                return -1;
+
+        return len;
+}