Commit | Line | Data |
---|---|---|
e5a5a138 RK |
1 | /* |
2 | * This file is part of DisOrder | |
3 | * Copyright (C) 2007 Richard Kettlewell | |
4 | * | |
5 | * This program is free software; you can redistribute it and/or modify | |
6 | * it under the terms of the GNU General Public License as published by | |
7 | * the Free Software Foundation; either version 2 of the License, or | |
8 | * (at your option) any later version. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License for more details. | |
14 | * | |
15 | * You should have received a copy of the GNU General Public License | |
16 | * along with this program; if not, write to the Free Software | |
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 | |
18 | * USA | |
19 | */ | |
20 | /** @file lib/unicode.c | |
21 | * @brief Unicode support functions | |
22 | * | |
23 | * Here by UTF-8 and UTF-8 we mean the encoding forms of those names (not the | |
24 | * encoding schemes). | |
25 | * | |
26 | * The idea is that all the strings that hit the database will be in a | |
27 | * particular normalization form, and for the search and tags database | |
28 | * in case-folded form, so they can be naively compared within the | |
29 | * database code. | |
30 | * | |
31 | * As the code stands this guarantee is not well met! | |
32 | */ | |
33 | ||
34 | #include <config.h> | |
35 | #include "types.h" | |
36 | ||
37 | #include <string.h> | |
38 | #include <stdio.h> /* TODO */ | |
39 | ||
40 | #include "mem.h" | |
41 | #include "vector.h" | |
42 | #include "unicode.h" | |
43 | #include "unidata.h" | |
44 | ||
45 | /** @defgroup utftransform Functions that transform between different Unicode encoding forms */ | |
46 | /*@{*/ | |
47 | ||
48 | /** @brief Convert UTF-32 to UTF-8 | |
49 | * @param s Source string | |
50 | * @param ns Length of source string in code points | |
51 | * @param ndp Where to store length of destination string (or NULL) | |
52 | * @return Newly allocated destination string or NULL on error | |
53 | * | |
54 | * If the UTF-32 is not valid then NULL is returned. A UTF-32 code | |
55 | * point is invalid if: | |
56 | * - it codes for a UTF-16 surrogate | |
57 | * - it codes for a value outside the unicode code space | |
58 | * | |
59 | * The return value is always 0-terminated. The value returned via @p | |
60 | * *ndp does not include the terminator. | |
61 | */ | |
62 | char *utf32_to_utf8(const uint32_t *s, size_t ns, size_t *ndp) { | |
63 | struct dynstr d; | |
64 | uint32_t c; | |
65 | ||
66 | dynstr_init(&d); | |
67 | while(ns > 0) { | |
68 | c = *s++; | |
69 | if(c < 0x80) | |
70 | dynstr_append(&d, c); | |
71 | else if(c < 0x0800) { | |
72 | dynstr_append(&d, 0xC0 | (c >> 6)); | |
73 | dynstr_append(&d, 0x80 | (c & 0x3F)); | |
74 | } else if(c < 0x10000) { | |
75 | if(c >= 0xDF800 && c <= 0xDFFF) | |
76 | goto error; | |
77 | dynstr_append(&d, 0xE0 | (c >> 12)); | |
78 | dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F)); | |
79 | dynstr_append(&d, 0x80 | (c & 0x3F)); | |
80 | } else if(c < 0x110000) { | |
81 | dynstr_append(&d, 0xF0 | (c >> 18)); | |
82 | dynstr_append(&d, 0x80 | ((c >> 12) & 0x3F)); | |
83 | dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F)); | |
84 | dynstr_append(&d, 0x80 | (c & 0x3F)); | |
85 | } else | |
86 | goto error; | |
87 | --ns; | |
88 | } | |
89 | dynstr_terminate(&d); | |
90 | if(ndp) | |
91 | *ndp = d.nvec; | |
92 | return d.vec; | |
93 | error: | |
94 | xfree(d.vec); | |
95 | return 0; | |
96 | } | |
97 | ||
98 | /** @brief Convert UTF-8 to UTF-32 | |
99 | * @param s Source string | |
100 | * @param ns Length of source string in code points | |
101 | * @param ndp Where to store length of destination string (or NULL) | |
102 | * @return Newly allocated destination string or NULL | |
103 | * | |
104 | * The return value is always 0-terminated. The value returned via @p | |
105 | * *ndp does not include the terminator. | |
106 | * | |
107 | * If the UTF-8 is not valid then NULL is returned. A UTF-8 sequence | |
108 | * for a code point is invalid if: | |
109 | * - it is not the shortest possible sequence for the code point | |
110 | * - it codes for a UTF-16 surrogate | |
111 | * - it codes for a value outside the unicode code space | |
112 | */ | |
113 | uint32_t *utf8_to_utf32(const char *s, size_t ns, size_t *ndp) { | |
114 | struct dynstr_ucs4 d; | |
115 | uint32_t c32, c; | |
116 | const uint8_t *ss = (const uint8_t *)s; | |
117 | ||
118 | dynstr_ucs4_init(&d); | |
119 | while(ns > 0) { | |
120 | c = *ss++; | |
121 | --ns; | |
122 | /* | |
123 | * Acceptable UTF-8 is: | |
124 | * | |
125 | * 0xxxxxxx | |
126 | * 7 data bits gives 0x00 - 0x7F and all are acceptable | |
127 | * | |
128 | * 110xxxxx 10xxxxxx | |
129 | * 11 data bits gives 0x0000 - 0x07FF but only 0x0080 - 0x07FF acceptable | |
130 | * | |
131 | * 1110xxxx 10xxxxxx 10xxxxxx | |
132 | * 16 data bits gives 0x0000 - 0xFFFF but only 0x0800 - 0xFFFF acceptable | |
133 | * (and UTF-16 surrogates are not acceptable) | |
134 | * | |
135 | * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | |
136 | * 21 data bits gives 0x00000000 - 0x001FFFFF | |
137 | * but only 0x00010000 - 0x0010FFFF are acceptable | |
138 | * | |
139 | * It is NOT always the case that the data bits in the first byte | |
140 | * are always non-0 for the acceptable values, so we do a separate | |
141 | * check after decoding. | |
142 | */ | |
143 | if(c < 0x80) | |
144 | c32 = c; | |
145 | else if(c <= 0xDF) { | |
146 | if(ns < 1) goto error; | |
147 | c32 = c & 0x1F; | |
148 | c = *ss++; | |
149 | if((c & 0xC0) != 0x80) goto error; | |
150 | c32 = (c32 << 6) | (c & 0x3F); | |
151 | if(c32 < 0x80) goto error; | |
152 | } else if(c <= 0xEF) { | |
153 | if(ns < 2) goto error; | |
154 | c32 = c & 0x0F; | |
155 | c = *ss++; | |
156 | if((c & 0xC0) != 0x80) goto error; | |
157 | c32 = (c32 << 6) | (c & 0x3F); | |
158 | c = *ss++; | |
159 | if((c & 0xC0) != 0x80) goto error; | |
160 | c32 = (c32 << 6) | (c & 0x3F); | |
161 | if(c32 < 0x0800 || (c32 >= 0xD800 && c32 <= 0xDFFF)) goto error; | |
162 | } else if(c <= 0xF7) { | |
163 | if(ns < 3) goto error; | |
164 | c32 = c & 0x07; | |
165 | c = *ss++; | |
166 | if((c & 0xC0) != 0x80) goto error; | |
167 | c32 = (c32 << 6) | (c & 0x3F); | |
168 | c = *ss++; | |
169 | if((c & 0xC0) != 0x80) goto error; | |
170 | c32 = (c32 << 6) | (c & 0x3F); | |
171 | c = *ss++; | |
172 | if((c & 0xC0) != 0x80) goto error; | |
173 | c32 = (c32 << 6) | (c & 0x3F); | |
174 | if(c32 < 0x00010000 || c32 > 0x0010FFFF) goto error; | |
175 | } else | |
176 | goto error; | |
177 | dynstr_ucs4_append(&d, c32); | |
178 | } | |
179 | dynstr_ucs4_terminate(&d); | |
180 | if(ndp) | |
181 | *ndp = d.nvec; | |
182 | return d.vec; | |
183 | error: | |
184 | xfree(d.vec); | |
185 | return 0; | |
186 | } | |
187 | ||
188 | /*@}*/ | |
189 | /** @defgroup utf32 Functions that operate on UTF-32 strings */ | |
190 | /*@{*/ | |
191 | ||
192 | /** @brief Return the length of a 0-terminated UTF-32 string | |
193 | * @param s Pointer to 0-terminated string | |
194 | * @return Length of string in code points (excluding terminator) | |
195 | * | |
196 | * Unlike the conversion functions no validity checking is done on the | |
197 | * string. | |
198 | */ | |
199 | size_t utf32_len(const uint32_t *s) { | |
200 | const uint32_t *t = s; | |
201 | ||
202 | while(*t) | |
203 | ++t; | |
204 | return (size_t)(t - s); | |
205 | } | |
206 | ||
207 | /** @brief Return the combining class of @p c | |
208 | * @param c Code point | |
209 | * @return Combining class of @p c | |
210 | */ | |
211 | static inline int utf32__combining_class(uint32_t c) { | |
212 | if(c < UNICODE_NCHARS) | |
213 | return unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].ccc; | |
214 | return 0; | |
215 | } | |
216 | ||
217 | /** @brief Stably sort [s,s+ns) into descending order of combining class | |
218 | * @param s Start of array | |
219 | * @param ns Number of elements, must be at least 1 | |
220 | * @param buffer Buffer of at least @p ns elements | |
221 | */ | |
222 | static void utf32__sort_ccc(uint32_t *s, size_t ns, uint32_t *buffer) { | |
223 | uint32_t *a, *b, *bp; | |
224 | size_t na, nb; | |
225 | ||
226 | switch(ns) { | |
227 | case 1: /* 1-element array is always sorted */ | |
228 | return; | |
229 | case 2: /* 2-element arrays are trivial to sort */ | |
230 | if(utf32__combining_class(s[0]) > utf32__combining_class(s[1])) { | |
231 | uint32_t tmp = s[0]; | |
232 | s[0] = s[1]; | |
233 | s[1] = tmp; | |
234 | } | |
235 | return; | |
236 | default: | |
237 | /* Partition the array */ | |
238 | na = ns / 2; | |
239 | nb = ns - na; | |
240 | a = s; | |
241 | b = s + na; | |
242 | /* Sort the two halves of the array */ | |
243 | utf32__sort_ccc(a, na, buffer); | |
244 | utf32__sort_ccc(b, nb, buffer); | |
245 | /* Merge them back into one, via the buffer */ | |
246 | bp = buffer; | |
247 | while(na > 0 && nb > 0) { | |
248 | /* We want descending order of combining class (hence <) | |
249 | * and we want stability within combining classes (hence <=) | |
250 | */ | |
251 | if(utf32__combining_class(*a) <= utf32__combining_class(*b)) { | |
252 | *bp++ = *a++; | |
253 | --na; | |
254 | } else { | |
255 | *bp++ = *b++; | |
256 | --nb; | |
257 | } | |
258 | } | |
259 | while(na > 0) { | |
260 | *bp++ = *a++; | |
261 | --na; | |
262 | } | |
263 | while(nb > 0) { | |
264 | *bp++ = *b++; | |
265 | --nb; | |
266 | } | |
267 | memcpy(s, buffer, ns * sizeof(uint32_t)); | |
268 | return; | |
269 | } | |
270 | } | |
271 | ||
272 | /** @brief Put combining characters into canonical order | |
273 | * @param s Pointer to UTF-32 string | |
274 | * @param ns Length of @p s | |
275 | * @return 0 on success, -1 on error | |
276 | * | |
277 | * @p s is modified in-place. See Unicode 5.0 s3.11 for details of | |
278 | * the ordering. | |
279 | * | |
280 | * Currently we only support a maximum of 1024 combining characters | |
281 | * after each base character. If this limit is exceeded then -1 is | |
282 | * returned. | |
283 | */ | |
284 | static int utf32__canonical_ordering(uint32_t *s, size_t ns) { | |
285 | size_t nc; | |
286 | uint32_t buffer[1024]; | |
287 | ||
288 | /* The ordering amounts to a stable sort of each contiguous group of | |
289 | * characters with non-0 combining class. */ | |
290 | while(ns > 0) { | |
291 | /* Skip non-combining characters */ | |
292 | if(utf32__combining_class(*s) == 0) { | |
293 | ++s; | |
294 | --ns; | |
295 | continue; | |
296 | } | |
297 | /* We must now have at least one combining character; see how many | |
298 | * there are */ | |
299 | for(nc = 1; nc < ns && utf32__combining_class(s[nc]) != 0; ++nc) | |
300 | ; | |
301 | if(nc > 1024) | |
302 | return -1; | |
303 | /* Sort the array */ | |
304 | utf32__sort_ccc(s, nc, buffer); | |
305 | s += nc; | |
306 | ns -= nc; | |
307 | } | |
308 | return 0; | |
309 | } | |
310 | ||
311 | /* Magic numbers from UAX #15 s16 */ | |
312 | #define SBase 0xAC00 | |
313 | #define LBase 0x1100 | |
314 | #define VBase 0x1161 | |
315 | #define TBase 0x11A7 | |
316 | #define LCount 19 | |
317 | #define VCount 21 | |
318 | #define TCount 28 | |
319 | #define NCount (VCount * TCount) | |
320 | #define SCount (LCount * NCount) | |
321 | ||
322 | /** @brief Guts of the decomposition lookup functions */ | |
323 | #define utf32__decompose_one_generic(WHICH) do { \ | |
324 | const uint32_t *dc = \ | |
325 | (c < UNICODE_NCHARS \ | |
326 | ? unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].WHICH \ | |
327 | : 0); \ | |
328 | if(dc) { \ | |
329 | /* Found a canonical decomposition in the table */ \ | |
330 | while(*dc) \ | |
331 | utf32__decompose_one_##WHICH(d, *dc++); \ | |
332 | } else if(c >= SBase && c < SBase + SCount) { \ | |
333 | /* Mechanically decomposable Hangul syllable (UAX #15 s16) */ \ | |
334 | const uint32_t SIndex = c - SBase; \ | |
335 | const uint32_t L = LBase + SIndex / NCount; \ | |
336 | const uint32_t V = VBase + (SIndex % NCount) / TCount; \ | |
337 | const uint32_t T = TBase + SIndex % TCount; \ | |
338 | dynstr_ucs4_append(d, L); \ | |
339 | dynstr_ucs4_append(d, V); \ | |
340 | if(T != TBase) \ | |
341 | dynstr_ucs4_append(d, T); \ | |
342 | } else \ | |
343 | /* Equal to own canonical decomposition */ \ | |
344 | dynstr_ucs4_append(d, c); \ | |
345 | } while(0) | |
346 | ||
347 | /** @brief Recursively compute the canonical decomposition of @p c | |
348 | * @param d Dynamic string to store decomposition in | |
349 | * @param c Code point to decompose (must be a valid!) | |
350 | * @return 0 on success, -1 on error | |
351 | */ | |
352 | static void utf32__decompose_one_canon(struct dynstr_ucs4 *d, uint32_t c) { | |
353 | utf32__decompose_one_generic(canon); | |
354 | } | |
355 | ||
356 | /** @brief Recursively compute the compatibility decomposition of @p c | |
357 | * @param d Dynamic string to store decomposition in | |
358 | * @param c Code point to decompose (must be a valid!) | |
359 | * @return 0 on success, -1 on error | |
360 | */ | |
361 | static void utf32__decompose_one_compat(struct dynstr_ucs4 *d, uint32_t c) { | |
362 | utf32__decompose_one_generic(compat); | |
363 | } | |
364 | ||
365 | /** @brief Guts of the decomposition functions */ | |
366 | #define utf32__decompose_generic(WHICH) do { \ | |
367 | struct dynstr_ucs4 d; \ | |
368 | uint32_t c; \ | |
369 | \ | |
370 | dynstr_ucs4_init(&d); \ | |
371 | while(ns) { \ | |
372 | c = *s++; \ | |
373 | if((c >= 0xDF800 && c <= 0xDFFF) || c > 0x10FFFF) \ | |
374 | goto error; \ | |
375 | utf32__decompose_one_##WHICH(&d, c); \ | |
376 | --ns; \ | |
377 | } \ | |
378 | if(utf32__canonical_ordering(d.vec, d.nvec)) \ | |
379 | goto error; \ | |
380 | dynstr_ucs4_terminate(&d); \ | |
381 | if(ndp) \ | |
382 | *ndp = d.nvec; \ | |
383 | return d.vec; \ | |
384 | error: \ | |
385 | xfree(d.vec); \ | |
386 | return 0; \ | |
387 | } while(0) | |
388 | ||
389 | /** @brief Canonically decompose @p [s,s+ns) | |
390 | * @param s Pointer to string | |
391 | * @param ns Length of string | |
392 | * @param ndp Where to store length of result | |
393 | * @return Pointer to result string, or NULL | |
394 | * | |
395 | * Computes the canonical decomposition of a string and stably sorts combining | |
396 | * characters into canonical order. The result is in Normalization Form D and | |
397 | * (at the time of writing!) passes the NFD tests defined in Unicode 5.0's | |
398 | * NormalizationTest.txt. | |
399 | * | |
400 | * Returns NULL if the string is not valid for either of the following | |
401 | * reasons: | |
402 | * - it codes for a UTF-16 surrogate | |
403 | * - it codes for a value outside the unicode code space | |
404 | */ | |
405 | uint32_t *utf32_decompose_canon(const uint32_t *s, size_t ns, size_t *ndp) { | |
406 | utf32__decompose_generic(canon); | |
407 | } | |
408 | ||
409 | /** @brief Compatibility decompose @p [s,s+ns) | |
410 | * @param s Pointer to string | |
411 | * @param ns Length of string | |
412 | * @param ndp Where to store length of result | |
413 | * @return Pointer to result string, or NULL | |
414 | * | |
415 | * Computes the compatibility decomposition of a string and stably sorts | |
416 | * combining characters into canonical order. The result is in Normalization | |
417 | * Form KD and (at the time of writing!) passes the NFKD tests defined in | |
418 | * Unicode 5.0's NormalizationTest.txt. | |
419 | * | |
420 | * Returns NULL if the string is not valid for either of the following | |
421 | * reasons: | |
422 | * - it codes for a UTF-16 surrogate | |
423 | * - it codes for a value outside the unicode code space | |
424 | */ | |
425 | uint32_t *utf32_decompose_compat(const uint32_t *s, size_t ns, size_t *ndp) { | |
426 | utf32__decompose_generic(compat); | |
427 | } | |
428 | ||
429 | /** @brief Case-fold @p C | |
430 | * @param D String to append to | |
431 | * @param C Character to fold | |
432 | */ | |
433 | static inline void utf32__casefold_one_canon(struct dynstr_ucs4 *d, uint32_t c) { | |
434 | const uint32_t *cf = | |
435 | (c < UNICODE_NCHARS | |
436 | ? unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].casefold | |
437 | : 0); | |
438 | if(cf) { | |
439 | /* Found a case-fold mapping in the table */ | |
440 | while(*cf) | |
441 | utf32__decompose_one_canon(d, *cf++); | |
442 | } else | |
443 | utf32__decompose_one_canon(d, c); | |
444 | } | |
445 | ||
446 | /** @brief Case-fold @p [s,s+ns) | |
447 | * @param s Pointer to string | |
448 | * @param ns Length of string | |
449 | * @param ndp Where to store length of result | |
450 | * @return Pointer to result string, or NULL | |
451 | * | |
452 | * Case-fold the string at @p s according to full default case-folding rules | |
453 | * (s3.13). The result will be in NFD. | |
454 | * | |
455 | * Returns NULL if the string is not valid for either of the following | |
456 | * reasons: | |
457 | * - it codes for a UTF-16 surrogate | |
458 | * - it codes for a value outside the unicode code space | |
459 | */ | |
460 | uint32_t *utf32_casefold_canon(const uint32_t *s, size_t ns, size_t *ndp) { | |
461 | struct dynstr_ucs4 d; | |
462 | uint32_t c; | |
463 | size_t n; | |
464 | uint32_t *ss = 0; | |
465 | ||
466 | /* If the canonical decomposition of the string includes any combining | |
467 | * character that case-folds to a non-combining character then we must | |
468 | * normalize before we fold. In Unicode 5.0.0 this means 0345 COMBINING | |
469 | * GREEK YPOGEGRAMMENI in its decomposition and the various characters that | |
470 | * canonically decompose to it. */ | |
471 | for(n = 0; n < ns; ++n) { | |
472 | c = s[n]; | |
473 | if(c < UNICODE_NCHARS | |
474 | && (unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].flags | |
475 | & unicode_normalize_before_casefold)) | |
476 | break; | |
477 | } | |
478 | if(n < ns) { | |
479 | /* We need a preliminary decomposition */ | |
480 | if(!(ss = utf32_decompose_canon(s, ns, &ns))) | |
481 | return 0; | |
482 | s = ss; | |
483 | } | |
484 | dynstr_ucs4_init(&d); | |
485 | while(ns) { | |
486 | c = *s++; | |
487 | if((c >= 0xDF800 && c <= 0xDFFF) || c > 0x10FFFF) | |
488 | goto error; | |
489 | utf32__casefold_one_canon(&d, c); | |
490 | --ns; | |
491 | } | |
492 | if(utf32__canonical_ordering(d.vec, d.nvec)) | |
493 | goto error; | |
494 | dynstr_ucs4_terminate(&d); | |
495 | if(ndp) | |
496 | *ndp = d.nvec; | |
497 | return d.vec; | |
498 | error: | |
499 | xfree(d.vec); | |
500 | xfree(ss); | |
501 | return 0; | |
502 | } | |
503 | ||
504 | /** @brief Order a pair of UTF-32 strings | |
505 | * @param a First 0-terminated string | |
506 | * @param b Second 0-terminated string | |
507 | * @return -1, 0 or 1 for a less than, equal to or greater than b | |
508 | * | |
509 | * "Comparable to strcmp() at its best." | |
510 | */ | |
511 | int utf32_cmp(const uint32_t *a, const uint32_t *b) { | |
512 | while(*a && *b && *a == *b) { | |
513 | ++a; | |
514 | ++b; | |
515 | } | |
516 | return *a < *b ? -1 : (*a > *b ? 1 : 0); | |
517 | } | |
518 | ||
519 | /*@}*/ | |
520 | /** @defgroup Functions that operate on UTF-8 strings */ | |
521 | /*@{*/ | |
522 | ||
523 | /** @brief Wrapper to transform a UTF-8 string using the UTF-32 function */ | |
524 | #define utf8__transform(FN) do { \ | |
525 | uint32_t *to32 = 0, *decomp32 = 0; \ | |
526 | size_t nto32, ndecomp32; \ | |
527 | char *decomp8 = 0; \ | |
528 | \ | |
529 | if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error; \ | |
530 | if(!(decomp32 = FN(to32, nto32, &ndecomp32))) goto error; \ | |
531 | decomp8 = utf32_to_utf8(decomp32, ndecomp32, ndp); \ | |
532 | error: \ | |
533 | xfree(to32); \ | |
534 | xfree(decomp32); \ | |
535 | return decomp8; \ | |
536 | } while(0) | |
537 | ||
538 | /** @brief Canonically decompose @p [s,s+ns) | |
539 | * @param s Pointer to string | |
540 | * @param ns Length of string | |
541 | * @param ndp Where to store length of result | |
542 | * @return Pointer to result string, or NULL | |
543 | * | |
544 | * Computes the canonical decomposition of a string and stably sorts combining | |
545 | * characters into canonical order. The result is in Normalization Form D and | |
546 | * (at the time of writing!) passes the NFD tests defined in Unicode 5.0's | |
547 | * NormalizationTest.txt. | |
548 | * | |
549 | * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why | |
550 | * this might be. | |
551 | * | |
552 | * See also utf32_decompose_canon(). | |
553 | */ | |
554 | char *utf8_decompose_canon(const char *s, size_t ns, size_t *ndp) { | |
555 | utf8__transform(utf32_decompose_canon); | |
556 | } | |
557 | ||
558 | /** @brief Compatibility decompose @p [s,s+ns) | |
559 | * @param s Pointer to string | |
560 | * @param ns Length of string | |
561 | * @param ndp Where to store length of result | |
562 | * @return Pointer to result string, or NULL | |
563 | * | |
564 | * Computes the compatibility decomposition of a string and stably sorts | |
565 | * combining characters into canonical order. The result is in Normalization | |
566 | * Form KD and (at the time of writing!) passes the NFKD tests defined in | |
567 | * Unicode 5.0's NormalizationTest.txt. | |
568 | * | |
569 | * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why | |
570 | * this might be. | |
571 | * | |
572 | * See also utf32_decompose_compat(). | |
573 | */ | |
574 | char *utf8_decompose_compat(const char *s, size_t ns, size_t *ndp) { | |
575 | utf8__transform(utf32_decompose_compat); | |
576 | } | |
577 | ||
578 | /** @brief Case-fold @p [s,s+ns) | |
579 | * @param s Pointer to string | |
580 | * @param ns Length of string | |
581 | * @param ndp Where to store length of result | |
582 | * @return Pointer to result string, or NULL | |
583 | * | |
584 | * Case-fold the string at @p s according to full default case-folding rules | |
585 | * (s3.13). The result will be in NFD. | |
586 | * | |
587 | * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why | |
588 | * this might be. | |
589 | */ | |
590 | char *utf8_casefold_canon(const char *s, size_t ns, size_t *ndp) { | |
591 | utf8__transform(utf32_casefold_canon); | |
592 | } | |
593 | ||
594 | /** @brief Compatibility case-fold @p [s,s+ns) | |
595 | * @param s Pointer to string | |
596 | * @param ns Length of string | |
597 | * @param ndp Where to store length of result | |
598 | * @return Pointer to result string, or NULL | |
599 | * | |
600 | * Case-fold the string at @p s according to full default case-folding rules | |
601 | * (s3.13). The result will be in NFKD. | |
602 | * | |
603 | * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why | |
604 | * this might be. | |
605 | */ | |
606 | #if 0 | |
607 | char *utf8_casefold_compat(const char *s, size_t ns, size_t *ndp) { | |
608 | utf8__transform(utf32_casefold_compat); | |
609 | } | |
610 | #endif | |
611 | ||
612 | /*@}*/ | |
613 | ||
614 | /* | |
615 | Local Variables: | |
616 | c-basic-offset:2 | |
617 | comment-column:40 | |
618 | fill-column:79 | |
619 | indent-tabs-mode:nil | |
620 | End: | |
621 | */ |