return buf;
}
-/** @brief Convert UTF-8 to UCS-4
- * @param mb Pointer to 0-terminated UTF-8 string
- * @return Pointer to 0-terminated UCS-4 string
- *
- * Not everybody's iconv supports UCS-4, and it's inconvenient to have to know
- * our endianness, and it's easy to convert it ourselves, so we do. See also
- * @ref ucs42utf8().
- */
-uint32_t *utf82ucs4(const char *mb) {
- struct dynstr_ucs4 d;
- uint32_t c;
-
- dynstr_ucs4_init(&d);
- while(*mb) {
- PARSE_UTF8(mb, c,
- error(0, "invalid UTF-8 sequence"); return 0;);
- dynstr_ucs4_append(&d, c);
- }
- dynstr_ucs4_terminate(&d);
- return d.vec;
-}
-
-/** @brief Convert one UCS-4 character to UTF-8
- * @param c Character to convert
- * @param d Dynamic string to append UTF-8 sequence to
- * @return 0 on success, -1 on error
- */
-int one_ucs42utf8(uint32_t c, struct dynstr *d) {
- if(c < 0x80)
- dynstr_append(d, c);
- else if(c < 0x800) {
- dynstr_append(d, 0xC0 | (c >> 6));
- dynstr_append(d, 0x80 | (c & 0x3F));
- } else if(c < 0x10000) {
- dynstr_append(d, 0xE0 | (c >> 12));
- dynstr_append(d, 0x80 | ((c >> 6) & 0x3F));
- dynstr_append(d, 0x80 | (c & 0x3F));
- } else if(c < 0x110000) {
- dynstr_append(d, 0xF0 | (c >> 18));
- dynstr_append(d, 0x80 | ((c >> 12) & 0x3F));
- dynstr_append(d, 0x80 | ((c >> 6) & 0x3F));
- dynstr_append(d, 0x80 | (c & 0x3F));
- } else {
- error(0, "invalid UCS-4 character %#"PRIx32, c);
- return -1;
- }
- return 0;
-}
-
-/** @brief Convert UCS-4 to UTF-8
- * @param u Pointer to 0-terminated UCS-4 string
- * @return Pointer to 0-terminated UTF-8 string
- *
- * See @ref utf82ucs4().
- */
-char *ucs42utf8(const uint32_t *u) {
- struct dynstr d;
- uint32_t c;
-
- dynstr_init(&d);
- while((c = *u++)) {
- if(one_ucs42utf8(c, &d))
- return 0;
- }
- dynstr_terminate(&d);
- return d.vec;
-}
-
/** @brief Convert from the local multibyte encoding to UTF-8 */
char *mb2utf8(const char *mb) {
return convert(nl_langinfo(CODESET), "UTF-8", mb, strlen(mb) + 1);
else return xstrdup(any);
}
-/** @brief strlen workalike for UCS-4 strings
- *
- * We don't rely on the local @c wchar_t being UCS-4.
- */
-int ucs4cmp(const uint32_t *a, const uint32_t *b) {
- while(*a && *b && *a == *b) ++a, ++b;
- if(*a > *b) return 1;
- else if(*a < *b) return -1;
- else return 0;
-}
-
/** @brief Return nonzero if @p c is a combining character */
static int combining(int c) {
if(c < UNICODE_NCHARS) {
/* Character encoding conversion routines */
-int one_ucs42utf8(uint32_t c, struct dynstr *d);
-
-uint32_t *utf82ucs4(const char *mb);
-char *ucs42utf8(const uint32_t *u);
char *mb2utf8(const char *mb);
char *utf82mb(const char *utf8);
/* various conversions, between multibyte strings (mb) in
return s;
}
-int ucs4cmp(const uint32_t *a, const uint32_t *b);
-/* like strcmp */
-
const char *truncate_for_display(const char *s, long max);
#endif /* CHARSET_H */
char *u8; \
\
insist(validutf8(CHARS)); \
- ucs = utf82ucs4(CHARS); \
+ ucs = utf8_to_utf32(CHARS, strlen(CHARS), 0); \
insist(ucs != 0); \
- insist(!ucs4cmp(w, ucs)); \
- u8 = ucs42utf8(ucs); \
+ insist(!utf32_cmp(w, ucs)); \
+ u8 = utf32_to_utf8(ucs, utf32_len(ucs), 0); \
insist(u8 != 0); \
insist(!strcmp(u8, CHARS)); \
} while(0)
break;
}
if(l) {
+ uint32_t *d;
/* Case-folded data is now normalized */
- canon_expected = ucs42utf8(utf32_decompose_canon(&l, 1, 0));
+ d = utf32_decompose_canon(&l, 1, 0);
+ canon_expected = utf32_to_utf8(d, utf32_len(d), 0);
if(strcmp(canon_folded, canon_expected)) {
fprintf(stderr, "%s:%d: canon-casefolding %#lx got '%s', expected '%s'\n",
__FILE__, __LINE__, (unsigned long)c,
count_error();
}
++tests;
- compat_expected = ucs42utf8(utf32_decompose_compat(&l, 1, 0));
+ d = utf32_decompose_compat(&l, 1, 0);
+ compat_expected = utf32_to_utf8(d, utf32_len(d), 0);
if(strcmp(compat_folded, compat_expected)) {
fprintf(stderr, "%s:%d: compat-casefolding %#lx got '%s', expected '%s'\n",
__FILE__, __LINE__, (unsigned long)c,
int utf32_iterator_grapheme_boundary(utf32_iterator it);
int utf32_iterator_word_boundary(utf32_iterator it);
+/** @brief Convert 0-terminated UTF-32 to UTF-8
+ * @param s 0-terminated UTF-32 string (length is measured with utf32_len())
+ * @return 0-terminated UTF-8 string or 0 on error
+ *
+ * Wraps utf32_to_utf8(), passing 0 as its last argument; see it for error causes.
+ */
+static inline char *utf32nt_to_utf8(const uint32_t *s) {
+ return utf32_to_utf8(s, utf32_len(s), 0);
+}
+
+/** @brief Convert 0-terminated UTF-8 to UTF-32
+ * @param s 0-terminated UTF-8 string (length is measured with strlen())
+ * @return 0-terminated UTF-32 string or 0 on error
+ *
+ * Wraps utf8_to_utf32(), passing 0 as its last argument; see it for error causes.
+ */
+static inline uint32_t *utf8nt_to_utf32(const char *s) {
+ return utf8_to_utf32(s, strlen(s), 0);
+}
+
#endif /* UNICODE_H */
/*
int n;
if(!raw) {
- if(!(ucs = utf82ucs4(s))) exit(EXIT_FAILURE);
+ if(!(ucs = utf8_to_utf32(s, strlen(s), 0))) exit(EXIT_FAILURE);
} else {
ucs = xmalloc_noptr((strlen(s) + 1) * sizeof(uint32_t));
for(n = 0; s[n]; ++n)