#include <stdbool.h>
#include "utf8.h"
+#include "util.h"
#define FILTER_CHAR '_'
'\t' is in C0 range, but more or less harmless and commonly used.
*/
- return (ch < ' ' && ch != '\t') ||
+ return (ch < ' ' && ch != '\t' && ch != '\n') ||
(0x7F <= ch && ch <= 0x9F);
}
-char* utf8_is_printable_n(const char* str, size_t length) {
+bool utf8_is_printable(const char* str, size_t length) {
uint32_t val = 0;
uint32_t min = 0;
const uint8_t *p;
min = (1 << 16);
val = (uint32_t) (*p & 0x07);
} else
- goto error;
+ return false;
p++;
length--;
if (!length || !is_continuation_char(*p))
- goto error;
+ return false;
merge_continuation_char(&val, *p);
TWO_REMAINING:
p++;
length--;
if (!is_continuation_char(*p))
- goto error;
+ return false;
merge_continuation_char(&val, *p);
ONE_REMAINING:
p++;
length--;
if (!is_continuation_char(*p))
- goto error;
+ return false;
merge_continuation_char(&val, *p);
if (val < min)
- goto error;
+ return false;
}
if (is_unicode_control(val))
- goto error;
+ return false;
}
- return (char*) str;
-
-error:
- return NULL;
+ return true;
}
static char* utf8_validate(const char *str, char *output) {
return r;
}
+
+/*
+ * Convert a little-endian UTF-16 buffer into a newly allocated,
+ * NUL-terminated UTF-8 string.
+ *
+ *   s       input buffer, read as little-endian 16-bit code units
+ *   length  size of the input in BYTES (not code units); a trailing odd
+ *           byte cannot form a code unit and is ignored
+ *
+ * Conversion stops at the first zero code unit or at the end of the input,
+ * whichever comes first. A valid surrogate pair is combined into a single
+ * code point and emitted as one 4-byte sequence; unpaired surrogates are
+ * dropped, since they have no valid UTF-8 representation.
+ *
+ * Returns the buffer obtained from new() (ownership passes to the caller),
+ * or NULL if the size computation would overflow or the allocation fails.
+ */
+char *utf16_to_utf8(const void *s, size_t length) {
+        const uint8_t *f = s;
+        const uint8_t *end;
+        size_t units = length / 2;
+        uint8_t *t;
+        char *r;
+
+        /* Worst case is 3 output bytes per input code unit (a surrogate pair
+         * uses 2 units for 4 bytes), plus the terminator. Refuse sizes that
+         * would wrap around. */
+        if (units > (SIZE_MAX - 1) / 3)
+                return NULL;
+
+        r = new(char, units * 3 + 1);
+        if (!r)
+                return NULL;
+
+        t = (uint8_t*) r;
+
+        /* Only whole code units are consumed, so f[1] is always in bounds. */
+        end = f + units * 2;
+
+        while (f < end) {
+                uint32_t c;
+
+                c = ((uint32_t) f[1] << 8) | f[0];
+                f += 2;
+
+                if (c == 0)
+                        break;
+
+                if (c >= 0xd800 && c <= 0xdbff) {
+                        uint32_t low;
+
+                        /* Leading surrogate at the very end: nothing to pair with. */
+                        if (f >= end)
+                                break;
+
+                        low = ((uint32_t) f[1] << 8) | f[0];
+
+                        /* Not followed by a trailing surrogate: drop the lead
+                         * and let the next iteration handle the following unit. */
+                        if (low < 0xdc00 || low > 0xdfff)
+                                continue;
+
+                        f += 2;
+                        c = 0x10000 + ((c - 0xd800) << 10) + (low - 0xdc00);
+                } else if (c >= 0xdc00 && c <= 0xdfff) {
+                        /* Trailing surrogate without a lead: drop it. */
+                        continue;
+                }
+
+                if (c < 0x80) {
+                        *(t++) = (uint8_t) c;
+                } else if (c < 0x800) {
+                        *(t++) = (uint8_t) (0xc0 | (c >> 6));
+                        *(t++) = (uint8_t) (0x80 | (c & 0x3f));
+                } else if (c < 0x10000) {
+                        *(t++) = (uint8_t) (0xe0 | (c >> 12));
+                        *(t++) = (uint8_t) (0x80 | ((c >> 6) & 0x3f));
+                        *(t++) = (uint8_t) (0x80 | (c & 0x3f));
+                } else {
+                        *(t++) = (uint8_t) (0xf0 | (c >> 18));
+                        *(t++) = (uint8_t) (0x80 | ((c >> 12) & 0x3f));
+                        *(t++) = (uint8_t) (0x80 | ((c >> 6) & 0x3f));
+                        *(t++) = (uint8_t) (0x80 | (c & 0x3f));
+                }
+        }
+
+        *t = 0;
+
+        return r;
+}