2 * This file is part of DisOrder
3 * Copyright (C) 2007 Richard Kettlewell
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20 /** @file lib/unicode.c
21 * @brief Unicode support functions
23 * Here by UTF-8 and UTF-8 we mean the encoding forms of those names (not the
26 * The idea is that all the strings that hit the database will be in a
27 * particular normalization form, and for the search and tags database
28 * in case-folded form, so they can be naively compared within the
31 * As the code stands this guarantee is not well met!
38 #include <stdio.h> /* TODO */
45 /** @defgroup utftransform Functions that transform between different Unicode encoding forms */
48 /** @brief Convert UTF-32 to UTF-8
49 * @param s Source string
50 * @param ns Length of source string in code points
51 * @param ndp Where to store length of destination string (or NULL)
52 * @return Newly allocated destination string or NULL on error
54 * If the UTF-32 is not valid then NULL is returned. A UTF-32 code point is
56 * - it codes for a UTF-16 surrogate
57 * - it codes for a value outside the unicode code space
59 * The return value is always 0-terminated. The value returned via @p *ndp
60 * does not include the terminator.
62 char *utf32_to_utf8(const uint32_t *s, size_t ns, size_t *ndp) {
72 dynstr_append(&d, 0xC0 | (c >> 6));
73 dynstr_append(&d, 0x80 | (c & 0x3F));
74 } else if(c < 0x10000) {
75 if(c >= 0xD800 && c <= 0xDFFF)
77 dynstr_append(&d, 0xE0 | (c >> 12));
78 dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
79 dynstr_append(&d, 0x80 | (c & 0x3F));
80 } else if(c < 0x110000) {
81 dynstr_append(&d, 0xF0 | (c >> 18));
82 dynstr_append(&d, 0x80 | ((c >> 12) & 0x3F));
83 dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
84 dynstr_append(&d, 0x80 | (c & 0x3F));
98 /** @brief Convert UTF-8 to UTF-32
99 * @param s Source string
100 * @param ns Length of source string in code points
101 * @param ndp Where to store length of destination string (or NULL)
102 * @return Newly allocated destination string or NULL
104 * The return value is always 0-terminated. The value returned via @p *ndp
105 * does not include the terminator.
107 * If the UTF-8 is not valid then NULL is returned. A UTF-8 sequence
108 * for a code point is invalid if:
109 * - it is not the shortest possible sequence for the code point
110 * - it codes for a UTF-16 surrogate
111 * - it codes for a value outside the unicode code space
113 uint32_t *utf8_to_utf32(const char *s, size_t ns, size_t *ndp) {
114 struct dynstr_ucs4 d;
116 const uint8_t *ss = (const uint8_t *)s;
118 dynstr_ucs4_init(&d);
122 /* Acceptable UTF-8 is that which codes for Unicode Scalar Values
123 * (Unicode 5.0.0 s3.9 D76)
126 * 7 data bits gives 0x00 - 0x7F and all are acceptable
129 * 11 data bits gives 0x0000 - 0x07FF but only 0x0080 - 0x07FF acceptable
131 * 1110xxxx 10xxxxxx 10xxxxxx
132 * 16 data bits gives 0x0000 - 0xFFFF but only 0x0800 - 0xFFFF acceptable
133 * (and UTF-16 surrogates are not acceptable)
135 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
136 * 21 data bits gives 0x00000000 - 0x001FFFFF
137 * but only 0x00010000 - 0x0010FFFF are acceptable
139 * It is NOT always the case that the data bits in the first byte are
140 * always non-0 for the acceptable values, so we do a separate check after
146 if(ns < 1) goto error;
149 if((c & 0xC0) != 0x80) goto error;
150 c32 = (c32 << 6) | (c & 0x3F);
151 if(c32 < 0x80) goto error;
152 } else if(c <= 0xEF) {
153 if(ns < 2) goto error;
156 if((c & 0xC0) != 0x80) goto error;
157 c32 = (c32 << 6) | (c & 0x3F);
159 if((c & 0xC0) != 0x80) goto error;
160 c32 = (c32 << 6) | (c & 0x3F);
161 if(c32 < 0x0800 || (c32 >= 0xD800 && c32 <= 0xDFFF)) goto error;
162 } else if(c <= 0xF7) {
163 if(ns < 3) goto error;
166 if((c & 0xC0) != 0x80) goto error;
167 c32 = (c32 << 6) | (c & 0x3F);
169 if((c & 0xC0) != 0x80) goto error;
170 c32 = (c32 << 6) | (c & 0x3F);
172 if((c & 0xC0) != 0x80) goto error;
173 c32 = (c32 << 6) | (c & 0x3F);
174 if(c32 < 0x00010000 || c32 > 0x0010FFFF) goto error;
177 dynstr_ucs4_append(&d, c32);
179 dynstr_ucs4_terminate(&d);
189 /** @defgroup utf32 Functions that operate on UTF-32 strings */
192 /** @brief Return the length of a 0-terminated UTF-32 string
193 * @param s Pointer to 0-terminated string
194 * @return Length of string in code points (excluding terminator)
196 * Unlike the conversion functions no validity checking is done on the string.
198 size_t utf32_len(const uint32_t *s) {
199 const uint32_t *t = s;
203 return (size_t)(t - s);
206 /** @brief Return the combining class of @p c
207 * @param c Code point
208 * @return Combining class of @p c
210 static inline int utf32__combining_class(uint32_t c) {
211 if(c < UNICODE_NCHARS)
212 return unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].ccc;
216 /** @brief Stably sort [s,s+ns) into descending order of combining class
217 * @param s Start of array
218 * @param ns Number of elements, must be at least 1
219 * @param buffer Buffer of at least @p ns elements
221 static void utf32__sort_ccc(uint32_t *s, size_t ns, uint32_t *buffer) {
222 uint32_t *a, *b, *bp;
226 case 1: /* 1-element array is always sorted */
228 case 2: /* 2-element arrays are trivial to sort */
229 if(utf32__combining_class(s[0]) > utf32__combining_class(s[1])) {
236 /* Partition the array */
241 /* Sort the two halves of the array */
242 utf32__sort_ccc(a, na, buffer);
243 utf32__sort_ccc(b, nb, buffer);
244 /* Merge them back into one, via the buffer */
246 while(na > 0 && nb > 0) {
247 /* We want descending order of combining class (hence <)
248 * and we want stability within combining classes (hence <=)
250 if(utf32__combining_class(*a) <= utf32__combining_class(*b)) {
266 memcpy(s, buffer, ns * sizeof(uint32_t));
271 /** @brief Put combining characters into canonical order
272 * @param s Pointer to UTF-32 string
273 * @param ns Length of @p s
274 * @return 0 on success, -1 on error
276 * @p s is modified in-place. See Unicode 5.0 s3.11 for details of the
279 * Currently we only support a maximum of 1024 combining characters after each
280 * base character. If this limit is exceeded then -1 is returned.
282 static int utf32__canonical_ordering(uint32_t *s, size_t ns) {
284 uint32_t buffer[1024];
286 /* The ordering amounts to a stable sort of each contiguous group of
287 * characters with non-0 combining class. */
289 /* Skip non-combining characters */
290 if(utf32__combining_class(*s) == 0) {
295 /* We must now have at least one combining character; see how many
297 for(nc = 1; nc < ns && utf32__combining_class(s[nc]) != 0; ++nc)
302 utf32__sort_ccc(s, nc, buffer);
309 /* Magic numbers from UAX #15 s16 */
317 #define NCount (VCount * TCount)
318 #define SCount (LCount * NCount)
320 /** @brief Guts of the decomposition lookup functions */
321 #define utf32__decompose_one_generic(WHICH) do { \
322 const uint32_t *dc = \
323 (c < UNICODE_NCHARS \
324 ? unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].WHICH \
327 /* Found a canonical decomposition in the table */ \
329 utf32__decompose_one_##WHICH(d, *dc++); \
330 } else if(c >= SBase && c < SBase + SCount) { \
331 /* Mechanically decomposable Hangul syllable (UAX #15 s16) */ \
332 const uint32_t SIndex = c - SBase; \
333 const uint32_t L = LBase + SIndex / NCount; \
334 const uint32_t V = VBase + (SIndex % NCount) / TCount; \
335 const uint32_t T = TBase + SIndex % TCount; \
336 dynstr_ucs4_append(d, L); \
337 dynstr_ucs4_append(d, V); \
339 dynstr_ucs4_append(d, T); \
341 /* Equal to own canonical decomposition */ \
342 dynstr_ucs4_append(d, c); \
345 /** @brief Recursively compute the canonical decomposition of @p c
346 * @param d Dynamic string to store decomposition in
347 * @param c Code point to decompose (must be a valid!)
348 * @return 0 on success, -1 on error
350 static void utf32__decompose_one_canon(struct dynstr_ucs4 *d, uint32_t c) {
351 utf32__decompose_one_generic(canon);
354 /** @brief Recursively compute the compatibility decomposition of @p c
355 * @param d Dynamic string to store decomposition in
356 * @param c Code point to decompose (must be a valid!)
357 * @return 0 on success, -1 on error
359 static void utf32__decompose_one_compat(struct dynstr_ucs4 *d, uint32_t c) {
360 utf32__decompose_one_generic(compat);
363 /** @brief Guts of the decomposition functions */
364 #define utf32__decompose_generic(WHICH) do { \
365 struct dynstr_ucs4 d; \
368 dynstr_ucs4_init(&d); \
371 if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF) \
373 utf32__decompose_one_##WHICH(&d, c); \
376 if(utf32__canonical_ordering(d.vec, d.nvec)) \
378 dynstr_ucs4_terminate(&d); \
387 /** @brief Canonically decompose @p [s,s+ns)
388 * @param s Pointer to string
389 * @param ns Length of string
390 * @param ndp Where to store length of result
391 * @return Pointer to result string, or NULL
393 * Computes the canonical decomposition of a string and stably sorts combining
394 * characters into canonical order. The result is in Normalization Form D and
395 * (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
396 * NormalizationTest.txt.
398 * Returns NULL if the string is not valid for either of the following reasons:
399 * - it codes for a UTF-16 surrogate
400 * - it codes for a value outside the unicode code space
402 uint32_t *utf32_decompose_canon(const uint32_t *s, size_t ns, size_t *ndp) {
403 utf32__decompose_generic(canon);
406 /** @brief Compatibility decompose @p [s,s+ns)
407 * @param s Pointer to string
408 * @param ns Length of string
409 * @param ndp Where to store length of result
410 * @return Pointer to result string, or NULL
412 * Computes the compatibility decomposition of a string and stably sorts
413 * combining characters into canonical order. The result is in Normalization
414 * Form KD and (at the time of writing!) passes the NFKD tests defined in
415 * Unicode 5.0's NormalizationTest.txt.
417 * Returns NULL if the string is not valid for either of the following reasons:
418 * - it codes for a UTF-16 surrogate
419 * - it codes for a value outside the unicode code space
421 uint32_t *utf32_decompose_compat(const uint32_t *s, size_t ns, size_t *ndp) {
422 utf32__decompose_generic(compat);
425 /** @brief Single-character case-fold and decompose operation */
426 #define utf32__casefold_one(WHICH) do { \
427 const uint32_t *cf = \
428 (c < UNICODE_NCHARS \
429 ? unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].casefold \
432 /* Found a case-fold mapping in the table */ \
434 utf32__decompose_one_##WHICH(&d, *cf++); \
436 utf32__decompose_one_##WHICH(&d, c); \
439 /** @brief Case-fold @p [s,s+ns)
440 * @param s Pointer to string
441 * @param ns Length of string
442 * @param ndp Where to store length of result
443 * @return Pointer to result string, or NULL
445 * Case-fold the string at @p s according to full default case-folding rules
446 * (s3.13) for caseless matching. The result will be in NFD.
448 * Returns NULL if the string is not valid for either of the following reasons:
449 * - it codes for a UTF-16 surrogate
450 * - it codes for a value outside the unicode code space
452 uint32_t *utf32_casefold_canon(const uint32_t *s, size_t ns, size_t *ndp) {
453 struct dynstr_ucs4 d;
458 /* If the canonical decomposition of the string includes any combining
459 * character that case-folds to a non-combining character then we must
460 * normalize before we fold. In Unicode 5.0.0 this means 0345 COMBINING
461 * GREEK YPOGEGRAMMENI in its decomposition and the various characters that
462 * canonically decompose to it. */
463 for(n = 0; n < ns; ++n) {
465 if(c < UNICODE_NCHARS
466 && (unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].flags
467 & unicode_normalize_before_casefold))
471 /* We need a preliminary decomposition */
472 if(!(ss = utf32_decompose_canon(s, ns, &ns)))
476 dynstr_ucs4_init(&d);
479 if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF)
481 utf32__casefold_one(canon);
484 if(utf32__canonical_ordering(d.vec, d.nvec))
486 dynstr_ucs4_terminate(&d);
496 /** @brief Compatibilit case-fold @p [s,s+ns)
497 * @param s Pointer to string
498 * @param ns Length of string
499 * @param ndp Where to store length of result
500 * @return Pointer to result string, or NULL
502 * Case-fold the string at @p s according to full default case-folding rules
503 * (s3.13) for compatibility caseless matching. The result will be in NFKD.
505 * Returns NULL if the string is not valid for either of the following reasons:
506 * - it codes for a UTF-16 surrogate
507 * - it codes for a value outside the unicode code space
509 uint32_t *utf32_casefold_compat(const uint32_t *s, size_t ns, size_t *ndp) {
510 struct dynstr_ucs4 d;
515 for(n = 0; n < ns; ++n) {
517 if(c < UNICODE_NCHARS
518 && (unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].flags
519 & unicode_normalize_before_casefold))
523 /* We need a preliminary _canonical_ decomposition */
524 if(!(ss = utf32_decompose_canon(s, ns, &ns)))
528 /* This computes NFKD(toCaseFold(s)) */
529 #define compat_casefold_middle() do { \
530 dynstr_ucs4_init(&d); \
533 if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF) \
535 utf32__casefold_one(compat); \
538 if(utf32__canonical_ordering(d.vec, d.nvec)) \
541 /* Do the inner (NFKD o toCaseFold) */
542 compat_casefold_middle();
543 /* We can do away with the NFD'd copy of the input now */
547 /* Do the outer (NFKD o toCaseFold) */
548 compat_casefold_middle();
550 dynstr_ucs4_terminate(&d);
560 /** @brief Order a pair of UTF-32 strings
561 * @param a First 0-terminated string
562 * @param b Second 0-terminated string
563 * @return -1, 0 or 1 for a less than, equal to or greater than b
565 * "Comparable to strcmp() at its best."
567 int utf32_cmp(const uint32_t *a, const uint32_t *b) {
568 while(*a && *b && *a == *b) {
572 return *a < *b ? -1 : (*a > *b ? 1 : 0);
576 /** @defgroup Functions that operate on UTF-8 strings */
579 /** @brief Wrapper to transform a UTF-8 string using the UTF-32 function */
580 #define utf8__transform(FN) do { \
581 uint32_t *to32 = 0, *decomp32 = 0; \
582 size_t nto32, ndecomp32; \
585 if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error; \
586 if(!(decomp32 = FN(to32, nto32, &ndecomp32))) goto error; \
587 decomp8 = utf32_to_utf8(decomp32, ndecomp32, ndp); \
594 /** @brief Canonically decompose @p [s,s+ns)
595 * @param s Pointer to string
596 * @param ns Length of string
597 * @param ndp Where to store length of result
598 * @return Pointer to result string, or NULL
600 * Computes the canonical decomposition of a string and stably sorts combining
601 * characters into canonical order. The result is in Normalization Form D and
602 * (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
603 * NormalizationTest.txt.
605 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
608 * See also utf32_decompose_canon().
610 char *utf8_decompose_canon(const char *s, size_t ns, size_t *ndp) {
611 utf8__transform(utf32_decompose_canon);
614 /** @brief Compatibility decompose @p [s,s+ns)
615 * @param s Pointer to string
616 * @param ns Length of string
617 * @param ndp Where to store length of result
618 * @return Pointer to result string, or NULL
620 * Computes the compatibility decomposition of a string and stably sorts
621 * combining characters into canonical order. The result is in Normalization
622 * Form KD and (at the time of writing!) passes the NFKD tests defined in
623 * Unicode 5.0's NormalizationTest.txt.
625 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
628 * See also utf32_decompose_compat().
630 char *utf8_decompose_compat(const char *s, size_t ns, size_t *ndp) {
631 utf8__transform(utf32_decompose_compat);
634 /** @brief Case-fold @p [s,s+ns)
635 * @param s Pointer to string
636 * @param ns Length of string
637 * @param ndp Where to store length of result
638 * @return Pointer to result string, or NULL
640 * Case-fold the string at @p s according to full default case-folding rules
641 * (s3.13). The result will be in NFD.
643 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
646 char *utf8_casefold_canon(const char *s, size_t ns, size_t *ndp) {
647 utf8__transform(utf32_casefold_canon);
650 /** @brief Compatibility case-fold @p [s,s+ns)
651 * @param s Pointer to string
652 * @param ns Length of string
653 * @param ndp Where to store length of result
654 * @return Pointer to result string, or NULL
656 * Case-fold the string at @p s according to full default case-folding rules
657 * (s3.13). The result will be in NFKD.
659 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
662 char *utf8_casefold_compat(const char *s, size_t ns, size_t *ndp) {
663 utf8__transform(utf32_casefold_compat);