2 * This file is part of DisOrder
3 * Copyright (C) 2007 Richard Kettlewell
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20 /** @file lib/unicode.c
21 * @brief Unicode support functions
23 * Here by UTF-8 and UTF-32 we mean the encoding forms of those names (not the
24 * encoding schemes). The primary encoding form is UTF-32 but convenience
25 * wrappers using UTF-8 are provided for a number of functions.
27 * The idea is that all the strings that hit the database will be in a
28 * particular normalization form, and for the search and tags database
29 * in case-folded form, so they can be naively compared within the
32 * As the code stands this guarantee is not well met!
39 #include <stdio.h> /* TODO */
46 /** @defgroup utftransform Functions that transform between different Unicode encoding forms */
49 /** @brief Convert UTF-32 to UTF-8
50 * @param s Source string
51 * @param ns Length of source string in code points
52 * @param ndp Where to store length of destination string (or NULL)
53 * @return Newly allocated destination string or NULL on error
55 * If the UTF-32 is not valid then NULL is returned. A UTF-32 code point is
57 * - it codes for a UTF-16 surrogate
58 * - it codes for a value outside the unicode code space
60 * The return value is always 0-terminated. The value returned via @p *ndp
61 * does not include the terminator.
63 char *utf32_to_utf8(const uint32_t *s, size_t ns, size_t *ndp) {
73 dynstr_append(&d, 0xC0 | (c >> 6));
74 dynstr_append(&d, 0x80 | (c & 0x3F));
75 } else if(c < 0x10000) {
76 if(c >= 0xD800 && c <= 0xDFFF)
78 dynstr_append(&d, 0xE0 | (c >> 12));
79 dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
80 dynstr_append(&d, 0x80 | (c & 0x3F));
81 } else if(c < 0x110000) {
82 dynstr_append(&d, 0xF0 | (c >> 18));
83 dynstr_append(&d, 0x80 | ((c >> 12) & 0x3F));
84 dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
85 dynstr_append(&d, 0x80 | (c & 0x3F));
99 /** @brief Convert UTF-8 to UTF-32
100 * @param s Source string
101 * @param ns Length of source string in bytes
102 * @param ndp Where to store length of destination string (or NULL)
103 * @return Newly allocated destination string or NULL
105 * The return value is always 0-terminated. The value returned via @p *ndp
106 * does not include the terminator.
108 * If the UTF-8 is not valid then NULL is returned. A UTF-8 sequence
109 * for a code point is invalid if:
110 * - it is not the shortest possible sequence for the code point
111 * - it codes for a UTF-16 surrogate
112 * - it codes for a value outside the unicode code space
114 uint32_t *utf8_to_utf32(const char *s, size_t ns, size_t *ndp) {
115 struct dynstr_ucs4 d;
117 const uint8_t *ss = (const uint8_t *)s;
119 dynstr_ucs4_init(&d);
123 /* Acceptable UTF-8 is that which codes for Unicode Scalar Values
124 * (Unicode 5.0.0 s3.9 D76)
127 * 7 data bits gives 0x00 - 0x7F and all are acceptable
130 * 11 data bits gives 0x0000 - 0x07FF but only 0x0080 - 0x07FF acceptable
132 * 1110xxxx 10xxxxxx 10xxxxxx
133 * 16 data bits gives 0x0000 - 0xFFFF but only 0x0800 - 0xFFFF acceptable
134 * (and UTF-16 surrogates are not acceptable)
136 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
137 * 21 data bits gives 0x00000000 - 0x001FFFFF
138 * but only 0x00010000 - 0x0010FFFF are acceptable
140 * It is NOT the case that the data bits in the first byte are always
141 * non-0 for the acceptable values, so we do a separate check after
147 if(ns < 1) goto error;
150 if((c & 0xC0) != 0x80) goto error;
151 c32 = (c32 << 6) | (c & 0x3F);
152 if(c32 < 0x80) goto error;
153 } else if(c <= 0xEF) {
154 if(ns < 2) goto error;
157 if((c & 0xC0) != 0x80) goto error;
158 c32 = (c32 << 6) | (c & 0x3F);
160 if((c & 0xC0) != 0x80) goto error;
161 c32 = (c32 << 6) | (c & 0x3F);
162 if(c32 < 0x0800 || (c32 >= 0xD800 && c32 <= 0xDFFF)) goto error;
163 } else if(c <= 0xF7) {
164 if(ns < 3) goto error;
167 if((c & 0xC0) != 0x80) goto error;
168 c32 = (c32 << 6) | (c & 0x3F);
170 if((c & 0xC0) != 0x80) goto error;
171 c32 = (c32 << 6) | (c & 0x3F);
173 if((c & 0xC0) != 0x80) goto error;
174 c32 = (c32 << 6) | (c & 0x3F);
175 if(c32 < 0x00010000 || c32 > 0x0010FFFF) goto error;
178 dynstr_ucs4_append(&d, c32);
180 dynstr_ucs4_terminate(&d);
190 /** @defgroup utf32 Functions that operate on UTF-32 strings */
193 /** @brief Return the length of a 0-terminated UTF-32 string
194 * @param s Pointer to 0-terminated string
195 * @return Length of string in code points (excluding terminator)
197 * Unlike the conversion functions no validity checking is done on the string.
199 size_t utf32_len(const uint32_t *s) {
200 const uint32_t *t = s;
204 return (size_t)(t - s);
207 /** @brief Return the @ref unidata structure for code point @p c
209 * @p c can be any 32-bit value, a sensible value will be returned regardless.
211 static const struct unidata *utf32__unidata(uint32_t c) {
212 /* The bottom half of the table contains almost everything of interest
213 * and we can just return the right thing straight away */
214 if(c < UNICODE_BREAK_START)
215 return &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
216 /* Within the break everything is unassigned */
217 if(c < UNICODE_BREAK_END)
218 return utf32__unidata(0xFFFF); /* guaranteed to be Cn */
219 /* Planes 15 and 16 are (mostly) private use */
220 if((c >= 0xF0000 && c <= 0xFFFFD)
221 || (c >= 0x100000 && c <= 0x10FFFD))
222 return utf32__unidata(0xE000); /* first Co code point */
223 /* Everything else above the break top is unassigned */
224 if(c >= UNICODE_BREAK_TOP)
225 return utf32__unidata(0xFFFF); /* guaranteed to be Cn */
226 /* Currently the rest is language tags and variation selectors */
227 c -= (UNICODE_BREAK_END - UNICODE_BREAK_START);
228 return &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
231 /** @brief Return the combining class of @p c
232 * @param c Code point
233 * @return Combining class of @p c
235 static inline int utf32__combining_class(uint32_t c) {
236 return utf32__unidata(c)->ccc;
239 /** @brief Stably sort [s,s+ns) into ascending order of combining class
240 * @param s Start of array
241 * @param ns Number of elements, must be at least 1
242 * @param buffer Buffer of at least @p ns elements
244 static void utf32__sort_ccc(uint32_t *s, size_t ns, uint32_t *buffer) {
245 uint32_t *a, *b, *bp;
249 case 1: /* 1-element array is always sorted */
251 case 2: /* 2-element arrays are trivial to sort */
252 if(utf32__combining_class(s[0]) > utf32__combining_class(s[1])) {
259 /* Partition the array */
264 /* Sort the two halves of the array */
265 utf32__sort_ccc(a, na, buffer);
266 utf32__sort_ccc(b, nb, buffer);
267 /* Merge them back into one, via the buffer */
269 while(na > 0 && nb > 0) {
270 /* We want ascending order of combining class (hence <)
271 * and we want stability within combining classes (hence <=)
273 if(utf32__combining_class(*a) <= utf32__combining_class(*b)) {
289 memcpy(s, buffer, ns * sizeof(uint32_t));
294 /** @brief Put combining characters into canonical order
295 * @param s Pointer to UTF-32 string
296 * @param ns Length of @p s
297 * @return 0 on success, -1 on error
299 * @p s is modified in-place. See Unicode 5.0 s3.11 for details of the
302 * Currently we only support a maximum of 1024 combining characters after each
303 * base character. If this limit is exceeded then -1 is returned.
305 static int utf32__canonical_ordering(uint32_t *s, size_t ns) {
307 uint32_t buffer[1024];
309 /* The ordering amounts to a stable sort of each contiguous group of
310 * characters with non-0 combining class. */
312 /* Skip non-combining characters */
313 if(utf32__combining_class(*s) == 0) {
318 /* We must now have at least one combining character; see how many
320 for(nc = 1; nc < ns && utf32__combining_class(s[nc]) != 0; ++nc)
325 utf32__sort_ccc(s, nc, buffer);
332 /* Magic numbers from UAX #15 s16 */
340 #define NCount (VCount * TCount)
341 #define SCount (LCount * NCount)
343 /** @brief Guts of the decomposition lookup functions */
344 #define utf32__decompose_one_generic(WHICH) do { \
345 const uint32_t *dc = utf32__unidata(c)->WHICH; \
347 /* Found a canonical decomposition in the table */ \
349 utf32__decompose_one_##WHICH(d, *dc++); \
350 } else if(c >= SBase && c < SBase + SCount) { \
351 /* Mechanically decomposable Hangul syllable (UAX #15 s16) */ \
352 const uint32_t SIndex = c - SBase; \
353 const uint32_t L = LBase + SIndex / NCount; \
354 const uint32_t V = VBase + (SIndex % NCount) / TCount; \
355 const uint32_t T = TBase + SIndex % TCount; \
356 dynstr_ucs4_append(d, L); \
357 dynstr_ucs4_append(d, V); \
359 dynstr_ucs4_append(d, T); \
361 /* Equal to own canonical decomposition */ \
362 dynstr_ucs4_append(d, c); \
365 /** @brief Recursively compute the canonical decomposition of @p c
366 * @param d Dynamic string to store decomposition in
367 * @param c Code point to decompose (must be valid!)
368 * @return Nothing; the decomposition is appended to @p d
370 static void utf32__decompose_one_canon(struct dynstr_ucs4 *d, uint32_t c) {
371 utf32__decompose_one_generic(canon);
374 /** @brief Recursively compute the compatibility decomposition of @p c
375 * @param d Dynamic string to store decomposition in
376 * @param c Code point to decompose (must be valid!)
377 * @return Nothing; the decomposition is appended to @p d
379 static void utf32__decompose_one_compat(struct dynstr_ucs4 *d, uint32_t c) {
380 utf32__decompose_one_generic(compat);
383 /** @brief Guts of the decomposition functions */
384 #define utf32__decompose_generic(WHICH) do { \
385 struct dynstr_ucs4 d; \
388 dynstr_ucs4_init(&d); \
391 if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF) \
393 utf32__decompose_one_##WHICH(&d, c); \
396 if(utf32__canonical_ordering(d.vec, d.nvec)) \
398 dynstr_ucs4_terminate(&d); \
407 /** @brief Canonically decompose @p [s,s+ns)
408 * @param s Pointer to string
409 * @param ns Length of string
410 * @param ndp Where to store length of result
411 * @return Pointer to result string, or NULL
413 * Computes the canonical decomposition of a string and stably sorts combining
414 * characters into canonical order. The result is in Normalization Form D and
415 * (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
416 * NormalizationTest.txt.
418 * Returns NULL if the string is not valid for either of the following reasons:
419 * - it codes for a UTF-16 surrogate
420 * - it codes for a value outside the unicode code space
422 uint32_t *utf32_decompose_canon(const uint32_t *s, size_t ns, size_t *ndp) {
423 utf32__decompose_generic(canon);
426 /** @brief Compatibility decompose @p [s,s+ns)
427 * @param s Pointer to string
428 * @param ns Length of string
429 * @param ndp Where to store length of result
430 * @return Pointer to result string, or NULL
432 * Computes the compatibility decomposition of a string and stably sorts
433 * combining characters into canonical order. The result is in Normalization
434 * Form KD and (at the time of writing!) passes the NFKD tests defined in
435 * Unicode 5.0's NormalizationTest.txt.
437 * Returns NULL if the string is not valid for either of the following reasons:
438 * - it codes for a UTF-16 surrogate
439 * - it codes for a value outside the unicode code space
441 uint32_t *utf32_decompose_compat(const uint32_t *s, size_t ns, size_t *ndp) {
442 utf32__decompose_generic(compat);
445 /** @brief Single-character case-fold and decompose operation */
446 #define utf32__casefold_one(WHICH) do { \
447 const uint32_t *cf = utf32__unidata(c)->casefold; \
449 /* Found a case-fold mapping in the table */ \
451 utf32__decompose_one_##WHICH(&d, *cf++); \
453 utf32__decompose_one_##WHICH(&d, c); \
456 /** @brief Case-fold @p [s,s+ns)
457 * @param s Pointer to string
458 * @param ns Length of string
459 * @param ndp Where to store length of result
460 * @return Pointer to result string, or NULL
462 * Case-fold the string at @p s according to full default case-folding rules
463 * (s3.13) for caseless matching. The result will be in NFD.
465 * Returns NULL if the string is not valid for either of the following reasons:
466 * - it codes for a UTF-16 surrogate
467 * - it codes for a value outside the unicode code space
469 uint32_t *utf32_casefold_canon(const uint32_t *s, size_t ns, size_t *ndp) {
470 struct dynstr_ucs4 d;
475 /* If the canonical decomposition of the string includes any combining
476 * character that case-folds to a non-combining character then we must
477 * normalize before we fold. In Unicode 5.0.0 this means 0345 COMBINING
478 * GREEK YPOGEGRAMMENI in its decomposition and the various characters that
479 * canonically decompose to it. */
480 for(n = 0; n < ns; ++n)
481 if(utf32__unidata(s[n])->flags & unicode_normalize_before_casefold)
484 /* We need a preliminary decomposition */
485 if(!(ss = utf32_decompose_canon(s, ns, &ns)))
489 dynstr_ucs4_init(&d);
492 if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF)
494 utf32__casefold_one(canon);
497 if(utf32__canonical_ordering(d.vec, d.nvec))
499 dynstr_ucs4_terminate(&d);
509 /** @brief Compatibility case-fold @p [s,s+ns)
510 * @param s Pointer to string
511 * @param ns Length of string
512 * @param ndp Where to store length of result
513 * @return Pointer to result string, or NULL
515 * Case-fold the string at @p s according to full default case-folding rules
516 * (s3.13) for compatibility caseless matching. The result will be in NFKD.
518 * Returns NULL if the string is not valid for either of the following reasons:
519 * - it codes for a UTF-16 surrogate
520 * - it codes for a value outside the unicode code space
522 uint32_t *utf32_casefold_compat(const uint32_t *s, size_t ns, size_t *ndp) {
523 struct dynstr_ucs4 d;
528 for(n = 0; n < ns; ++n)
529 if(utf32__unidata(s[n])->flags & unicode_normalize_before_casefold)
532 /* We need a preliminary _canonical_ decomposition */
533 if(!(ss = utf32_decompose_canon(s, ns, &ns)))
537 /* This computes NFKD(toCaseFold(s)) */
538 #define compat_casefold_middle() do { \
539 dynstr_ucs4_init(&d); \
542 if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF) \
544 utf32__casefold_one(compat); \
547 if(utf32__canonical_ordering(d.vec, d.nvec)) \
550 /* Do the inner (NFKD o toCaseFold) */
551 compat_casefold_middle();
552 /* We can do away with the NFD'd copy of the input now */
556 /* Do the outer (NFKD o toCaseFold) */
557 compat_casefold_middle();
559 dynstr_ucs4_terminate(&d);
569 /** @brief Order a pair of UTF-32 strings
570 * @param a First 0-terminated string
571 * @param b Second 0-terminated string
572 * @return -1, 0 or 1 for a less than, equal to or greater than b
574 * "Comparable to strcmp() at its best."
576 int utf32_cmp(const uint32_t *a, const uint32_t *b) {
577 while(*a && *b && *a == *b) {
581 return *a < *b ? -1 : (*a > *b ? 1 : 0);
584 /** @brief Return the General_Category value for @p c
586 * @return General_Category property value
588 static inline enum unicode_General_Category utf32__general_category(uint32_t c) {
589 return utf32__unidata(c)->general_category;
592 /** @brief Check Grapheme_Cluster_Break property
593 * @param c Code point
594 * @return 0 if it is as described, 1 otherwise
596 static int utf32__is_control_or_cr_or_lf(uint32_t c) {
597 switch(utf32__general_category(c)) {
600 case unicode_General_Category_Zl:
601 case unicode_General_Category_Zp:
602 case unicode_General_Category_Cc:
604 case unicode_General_Category_Cf:
605 if(c == 0x200C || c == 0x200D)
611 #define Hangul_Syllable_Type_NA 0
612 #define Hangul_Syllable_Type_L 0x1100
613 #define Hangul_Syllable_Type_V 0x1160
614 #define Hangul_Syllable_Type_T 0x11A8
615 #define Hangul_Syllable_Type_LV 0xAC00
616 #define Hangul_Syllable_Type_LVT 0xAC01
618 /** @brief Determine Hangul_Syllable_Type of @p c
619 * @param c Code point
620 * @return Equivalence class of @p c, or Hangul_Syllable_Type_NA
622 * If this is a Hangul character then a representative member of its
623 * equivalence class is returned. Otherwise Hangul_Syllable_Type_NA is
626 static uint32_t utf32__hangul_syllable_type(uint32_t c) {
627 /* Dispose of the bulk of the non-Hangul code points first */
628 if(c < 0x1100) return Hangul_Syllable_Type_NA;
629 if(c > 0x1200 && c < 0xAC00) return Hangul_Syllable_Type_NA;
630 if(c >= 0xD800) return Hangul_Syllable_Type_NA;
631 /* Now we pick out the assigned Hangul code points */
632 if((c >= 0x1100 && c <= 0x1159) || c == 0x115F) return Hangul_Syllable_Type_L;
633 if(c >= 0x1160 && c <= 0x11A2) return Hangul_Syllable_Type_V;
634 if(c >= 0x11A8 && c <= 0x11F9) return Hangul_Syllable_Type_T;
635 if(c >= 0xAC00 && c <= 0xD7A3) {
637 return Hangul_Syllable_Type_LV;
639 return Hangul_Syllable_Type_LVT;
641 return Hangul_Syllable_Type_NA;
644 /** @brief Determine Word_Break property
645 * @param c Code point
646 * @return Word_Break property value of @p c
648 static enum unicode_Word_Break utf32__word_break(uint32_t c) {
649 if(c < 0xAC00 || c > 0xD7A3)
650 return utf32__unidata(c)->word_break;
652 return unicode_Word_Break_ALetter;
655 /** @brief Identify a grapheme cluster boundary
656 * @param s Start of string (must be NFD)
657 * @param ns Length of string
658 * @param n Index within string (in [0,ns].)
659 * @return 1 at a grapheme cluster boundary, 0 otherwise
661 * This function identifies default grapheme cluster boundaries as described in
662 * UAX #29 s3. It returns 1 if @p n points at the code point just after a
663 * grapheme cluster boundary (including the hypothetical code point just after
664 * the end of the string).
666 int utf32_is_gcb(const uint32_t *s, size_t ns, size_t n) {
667 uint32_t before, after;
668 uint32_t hbefore, hafter;
670 if(n == 0 || n == ns)
672 /* Now we know that s[n-1] and s[n] are safe to inspect */
676 if(before == 0x000D && after == 0x000A)
679 if(utf32__is_control_or_cr_or_lf(before)
680 || utf32__is_control_or_cr_or_lf(after))
682 hbefore = utf32__hangul_syllable_type(before);
683 hafter = utf32__hangul_syllable_type(after);
685 if(hbefore == Hangul_Syllable_Type_L
686 && (hafter == Hangul_Syllable_Type_L
687 || hafter == Hangul_Syllable_Type_V
688 || hafter == Hangul_Syllable_Type_LV
689 || hafter == Hangul_Syllable_Type_LVT))
692 if((hbefore == Hangul_Syllable_Type_LV
693 || hbefore == Hangul_Syllable_Type_V)
694 && (hafter == Hangul_Syllable_Type_V
695 || hafter == Hangul_Syllable_Type_T))
698 if((hbefore == Hangul_Syllable_Type_LVT
699 || hbefore == Hangul_Syllable_Type_T)
700 && hafter == Hangul_Syllable_Type_T)
703 if(utf32__word_break(after) == unicode_Word_Break_Extend)
709 /** @brief Return true if @p wb is ignorable for boundary specifications */
710 static inline int utf32__boundary_ignorable(enum unicode_Word_Break wb) {
711 return (wb == unicode_Word_Break_Extend
712 || wb == unicode_Word_Break_Format);
715 /** @brief Identify a word boundary
716 * @param s Start of string (must be NFD)
717 * @param ns Length of string
718 * @param n Index within string (in [0,ns].)
719 * @return 1 at a word boundary, 0 otherwise
721 * This function identifies default word boundaries as described in UAX #29 s4.
722 * It returns 1 if @p n points at the code point just after a word boundary
723 * (including the hypothetical code point just after the end of the string).
725 int utf32_is_word_boundary(const uint32_t *s, size_t ns, size_t n) {
726 enum unicode_Word_Break twobefore, before, after, twoafter;
730 if(n == 0 || n == ns)
733 if(s[n-1] == 0x000D && s[n] == 0x000A)
736 /* (!Sep) x (Extend|Format) as in UAX #29 s6.2 */
737 switch(s[n-1]) { /* bit of a bodge */
745 if(utf32__boundary_ignorable(utf32__word_break(s[n])))
749 /* Gather the property values we'll need for the rest of the test taking the
750 * s6.2 changes into account */
751 /* First we look at the code points after the proposed boundary */
753 after = utf32__word_break(s[nn++]);
754 if(!utf32__boundary_ignorable(after)) {
755 /* X (Extend|Format)* -> X */
756 while(nn < ns && utf32__boundary_ignorable(utf32__word_break(s[nn])))
759 /* It's possible now that nn=ns */
761 twoafter = utf32__word_break(s[nn]);
763 twoafter = unicode_Word_Break_Other;
765 /* Next we look at the code points before the proposed boundary. This is a
768 while(nn > 0 && utf32__boundary_ignorable(utf32__word_break(s[nn - 1])))
771 /* s[nn] must be ignorable */
772 before = utf32__word_break(s[nn]);
773 twobefore = unicode_Word_Break_Other;
775 /* s[nn] is ignorable or after the proposed boundary; but s[nn-1] is not
777 before = utf32__word_break(s[nn - 1]);
779 /* Repeat the exercise */
780 while(nn > 0 && utf32__boundary_ignorable(utf32__word_break(s[nn - 1])))
783 twobefore = utf32__word_break(s[nn]);
785 twobefore = utf32__word_break(s[nn - 1]);
789 if(before == unicode_Word_Break_ALetter
790 && after == unicode_Word_Break_ALetter)
793 if(before == unicode_Word_Break_ALetter
794 && after == unicode_Word_Break_MidLetter
795 && twoafter == unicode_Word_Break_ALetter)
798 if(twobefore == unicode_Word_Break_ALetter
799 && before == unicode_Word_Break_MidLetter
800 && after == unicode_Word_Break_ALetter)
803 if(before == unicode_Word_Break_Numeric
804 && after == unicode_Word_Break_Numeric)
807 if(before == unicode_Word_Break_ALetter
808 && after == unicode_Word_Break_Numeric)
811 if(before == unicode_Word_Break_Numeric
812 && after == unicode_Word_Break_ALetter)
815 if(twobefore == unicode_Word_Break_Numeric
816 && before == unicode_Word_Break_MidNum
817 && after == unicode_Word_Break_Numeric)
820 if(before == unicode_Word_Break_Numeric
821 && after == unicode_Word_Break_MidNum
822 && twoafter == unicode_Word_Break_Numeric)
825 if(before == unicode_Word_Break_Katakana
826 && after == unicode_Word_Break_Katakana)
829 if((before == unicode_Word_Break_ALetter
830 || before == unicode_Word_Break_Numeric
831 || before == unicode_Word_Break_Katakana
832 || before == unicode_Word_Break_ExtendNumLet)
833 && after == unicode_Word_Break_ExtendNumLet)
836 if(before == unicode_Word_Break_ExtendNumLet
837 && (after == unicode_Word_Break_ALetter
838 || after == unicode_Word_Break_Numeric
839 || after == unicode_Word_Break_Katakana))
846 /** @defgroup utf8 Functions that operate on UTF-8 strings */
849 /** @brief Wrapper to transform a UTF-8 string using the UTF-32 function */
850 #define utf8__transform(FN) do { \
851 uint32_t *to32 = 0, *decomp32 = 0; \
852 size_t nto32, ndecomp32; \
855 if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error; \
856 if(!(decomp32 = FN(to32, nto32, &ndecomp32))) goto error; \
857 decomp8 = utf32_to_utf8(decomp32, ndecomp32, ndp); \
864 /** @brief Canonically decompose @p [s,s+ns)
865 * @param s Pointer to string
866 * @param ns Length of string
867 * @param ndp Where to store length of result
868 * @return Pointer to result string, or NULL
870 * Computes the canonical decomposition of a string and stably sorts combining
871 * characters into canonical order. The result is in Normalization Form D and
872 * (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
873 * NormalizationTest.txt.
875 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
878 * See also utf32_decompose_canon().
880 char *utf8_decompose_canon(const char *s, size_t ns, size_t *ndp) {
881 utf8__transform(utf32_decompose_canon);
884 /** @brief Compatibility decompose @p [s,s+ns)
885 * @param s Pointer to string
886 * @param ns Length of string
887 * @param ndp Where to store length of result
888 * @return Pointer to result string, or NULL
890 * Computes the compatibility decomposition of a string and stably sorts
891 * combining characters into canonical order. The result is in Normalization
892 * Form KD and (at the time of writing!) passes the NFKD tests defined in
893 * Unicode 5.0's NormalizationTest.txt.
895 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
898 * See also utf32_decompose_compat().
900 char *utf8_decompose_compat(const char *s, size_t ns, size_t *ndp) {
901 utf8__transform(utf32_decompose_compat);
904 /** @brief Case-fold @p [s,s+ns)
905 * @param s Pointer to string
906 * @param ns Length of string
907 * @param ndp Where to store length of result
908 * @return Pointer to result string, or NULL
910 * Case-fold the string at @p s according to full default case-folding rules
911 * (s3.13). The result will be in NFD.
913 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
916 char *utf8_casefold_canon(const char *s, size_t ns, size_t *ndp) {
917 utf8__transform(utf32_casefold_canon);
920 /** @brief Compatibility case-fold @p [s,s+ns)
921 * @param s Pointer to string
922 * @param ns Length of string
923 * @param ndp Where to store length of result
924 * @return Pointer to result string, or NULL
926 * Case-fold the string at @p s according to full default case-folding rules
927 * (s3.13). The result will be in NFKD.
929 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
932 char *utf8_casefold_compat(const char *s, size_t ns, size_t *ndp) {
933 utf8__transform(utf32_casefold_compat);