chiark / gitweb /
Start of Unicode support rewrite
[disorder] / lib / unicode.c
CommitLineData
e5a5a138
RK
1/*
2 * This file is part of DisOrder
3 * Copyright (C) 2007 Richard Kettlewell
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18 * USA
19 */
20/** @file lib/unicode.c
21 * @brief Unicode support functions
22 *
23 * Here by UTF-8 and UTF-8 we mean the encoding forms of those names (not the
24 * encoding schemes).
25 *
26 * The idea is that all the strings that hit the database will be in a
27 * particular normalization form, and for the search and tags database
28 * in case-folded form, so they can be naively compared within the
29 * database code.
30 *
31 * As the code stands this guarantee is not well met!
32 */
33
34#include <config.h>
35#include "types.h"
36
37#include <string.h>
38#include <stdio.h> /* TODO */
39
40#include "mem.h"
41#include "vector.h"
42#include "unicode.h"
43#include "unidata.h"
44
45/** @defgroup utftransform Functions that transform between different Unicode encoding forms */
46/*@{*/
47
48/** @brief Convert UTF-32 to UTF-8
49 * @param s Source string
50 * @param ns Length of source string in code points
51 * @param ndp Where to store length of destination string (or NULL)
52 * @return Newly allocated destination string or NULL on error
53 *
54 * If the UTF-32 is not valid then NULL is returned. A UTF-32 code
55 * point is invalid if:
56 * - it codes for a UTF-16 surrogate
57 * - it codes for a value outside the unicode code space
58 *
59 * The return value is always 0-terminated. The value returned via @p
60 * *ndp does not include the terminator.
61 */
62char *utf32_to_utf8(const uint32_t *s, size_t ns, size_t *ndp) {
63 struct dynstr d;
64 uint32_t c;
65
66 dynstr_init(&d);
67 while(ns > 0) {
68 c = *s++;
69 if(c < 0x80)
70 dynstr_append(&d, c);
71 else if(c < 0x0800) {
72 dynstr_append(&d, 0xC0 | (c >> 6));
73 dynstr_append(&d, 0x80 | (c & 0x3F));
74 } else if(c < 0x10000) {
75 if(c >= 0xDF800 && c <= 0xDFFF)
76 goto error;
77 dynstr_append(&d, 0xE0 | (c >> 12));
78 dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
79 dynstr_append(&d, 0x80 | (c & 0x3F));
80 } else if(c < 0x110000) {
81 dynstr_append(&d, 0xF0 | (c >> 18));
82 dynstr_append(&d, 0x80 | ((c >> 12) & 0x3F));
83 dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
84 dynstr_append(&d, 0x80 | (c & 0x3F));
85 } else
86 goto error;
87 --ns;
88 }
89 dynstr_terminate(&d);
90 if(ndp)
91 *ndp = d.nvec;
92 return d.vec;
93error:
94 xfree(d.vec);
95 return 0;
96}
97
98/** @brief Convert UTF-8 to UTF-32
99 * @param s Source string
100 * @param ns Length of source string in code points
101 * @param ndp Where to store length of destination string (or NULL)
102 * @return Newly allocated destination string or NULL
103 *
104 * The return value is always 0-terminated. The value returned via @p
105 * *ndp does not include the terminator.
106 *
107 * If the UTF-8 is not valid then NULL is returned. A UTF-8 sequence
108 * for a code point is invalid if:
109 * - it is not the shortest possible sequence for the code point
110 * - it codes for a UTF-16 surrogate
111 * - it codes for a value outside the unicode code space
112 */
113uint32_t *utf8_to_utf32(const char *s, size_t ns, size_t *ndp) {
114 struct dynstr_ucs4 d;
115 uint32_t c32, c;
116 const uint8_t *ss = (const uint8_t *)s;
117
118 dynstr_ucs4_init(&d);
119 while(ns > 0) {
120 c = *ss++;
121 --ns;
122 /*
123 * Acceptable UTF-8 is:
124 *
125 * 0xxxxxxx
126 * 7 data bits gives 0x00 - 0x7F and all are acceptable
127 *
128 * 110xxxxx 10xxxxxx
129 * 11 data bits gives 0x0000 - 0x07FF but only 0x0080 - 0x07FF acceptable
130 *
131 * 1110xxxx 10xxxxxx 10xxxxxx
132 * 16 data bits gives 0x0000 - 0xFFFF but only 0x0800 - 0xFFFF acceptable
133 * (and UTF-16 surrogates are not acceptable)
134 *
135 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
136 * 21 data bits gives 0x00000000 - 0x001FFFFF
137 * but only 0x00010000 - 0x0010FFFF are acceptable
138 *
139 * It is NOT always the case that the data bits in the first byte
140 * are always non-0 for the acceptable values, so we do a separate
141 * check after decoding.
142 */
143 if(c < 0x80)
144 c32 = c;
145 else if(c <= 0xDF) {
146 if(ns < 1) goto error;
147 c32 = c & 0x1F;
148 c = *ss++;
149 if((c & 0xC0) != 0x80) goto error;
150 c32 = (c32 << 6) | (c & 0x3F);
151 if(c32 < 0x80) goto error;
152 } else if(c <= 0xEF) {
153 if(ns < 2) goto error;
154 c32 = c & 0x0F;
155 c = *ss++;
156 if((c & 0xC0) != 0x80) goto error;
157 c32 = (c32 << 6) | (c & 0x3F);
158 c = *ss++;
159 if((c & 0xC0) != 0x80) goto error;
160 c32 = (c32 << 6) | (c & 0x3F);
161 if(c32 < 0x0800 || (c32 >= 0xD800 && c32 <= 0xDFFF)) goto error;
162 } else if(c <= 0xF7) {
163 if(ns < 3) goto error;
164 c32 = c & 0x07;
165 c = *ss++;
166 if((c & 0xC0) != 0x80) goto error;
167 c32 = (c32 << 6) | (c & 0x3F);
168 c = *ss++;
169 if((c & 0xC0) != 0x80) goto error;
170 c32 = (c32 << 6) | (c & 0x3F);
171 c = *ss++;
172 if((c & 0xC0) != 0x80) goto error;
173 c32 = (c32 << 6) | (c & 0x3F);
174 if(c32 < 0x00010000 || c32 > 0x0010FFFF) goto error;
175 } else
176 goto error;
177 dynstr_ucs4_append(&d, c32);
178 }
179 dynstr_ucs4_terminate(&d);
180 if(ndp)
181 *ndp = d.nvec;
182 return d.vec;
183error:
184 xfree(d.vec);
185 return 0;
186}
187
188/*@}*/
189/** @defgroup utf32 Functions that operate on UTF-32 strings */
190/*@{*/
191
192/** @brief Return the length of a 0-terminated UTF-32 string
193 * @param s Pointer to 0-terminated string
194 * @return Length of string in code points (excluding terminator)
195 *
196 * Unlike the conversion functions no validity checking is done on the
197 * string.
198 */
199size_t utf32_len(const uint32_t *s) {
200 const uint32_t *t = s;
201
202 while(*t)
203 ++t;
204 return (size_t)(t - s);
205}
206
207/** @brief Return the combining class of @p c
208 * @param c Code point
209 * @return Combining class of @p c
210 */
211static inline int utf32__combining_class(uint32_t c) {
212 if(c < UNICODE_NCHARS)
213 return unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].ccc;
214 return 0;
215}
216
217/** @brief Stably sort [s,s+ns) into descending order of combining class
218 * @param s Start of array
219 * @param ns Number of elements, must be at least 1
220 * @param buffer Buffer of at least @p ns elements
221 */
222static void utf32__sort_ccc(uint32_t *s, size_t ns, uint32_t *buffer) {
223 uint32_t *a, *b, *bp;
224 size_t na, nb;
225
226 switch(ns) {
227 case 1: /* 1-element array is always sorted */
228 return;
229 case 2: /* 2-element arrays are trivial to sort */
230 if(utf32__combining_class(s[0]) > utf32__combining_class(s[1])) {
231 uint32_t tmp = s[0];
232 s[0] = s[1];
233 s[1] = tmp;
234 }
235 return;
236 default:
237 /* Partition the array */
238 na = ns / 2;
239 nb = ns - na;
240 a = s;
241 b = s + na;
242 /* Sort the two halves of the array */
243 utf32__sort_ccc(a, na, buffer);
244 utf32__sort_ccc(b, nb, buffer);
245 /* Merge them back into one, via the buffer */
246 bp = buffer;
247 while(na > 0 && nb > 0) {
248 /* We want descending order of combining class (hence <)
249 * and we want stability within combining classes (hence <=)
250 */
251 if(utf32__combining_class(*a) <= utf32__combining_class(*b)) {
252 *bp++ = *a++;
253 --na;
254 } else {
255 *bp++ = *b++;
256 --nb;
257 }
258 }
259 while(na > 0) {
260 *bp++ = *a++;
261 --na;
262 }
263 while(nb > 0) {
264 *bp++ = *b++;
265 --nb;
266 }
267 memcpy(s, buffer, ns * sizeof(uint32_t));
268 return;
269 }
270}
271
272/** @brief Put combining characters into canonical order
273 * @param s Pointer to UTF-32 string
274 * @param ns Length of @p s
275 * @return 0 on success, -1 on error
276 *
277 * @p s is modified in-place. See Unicode 5.0 s3.11 for details of
278 * the ordering.
279 *
280 * Currently we only support a maximum of 1024 combining characters
281 * after each base character. If this limit is exceeded then -1 is
282 * returned.
283 */
284static int utf32__canonical_ordering(uint32_t *s, size_t ns) {
285 size_t nc;
286 uint32_t buffer[1024];
287
288 /* The ordering amounts to a stable sort of each contiguous group of
289 * characters with non-0 combining class. */
290 while(ns > 0) {
291 /* Skip non-combining characters */
292 if(utf32__combining_class(*s) == 0) {
293 ++s;
294 --ns;
295 continue;
296 }
297 /* We must now have at least one combining character; see how many
298 * there are */
299 for(nc = 1; nc < ns && utf32__combining_class(s[nc]) != 0; ++nc)
300 ;
301 if(nc > 1024)
302 return -1;
303 /* Sort the array */
304 utf32__sort_ccc(s, nc, buffer);
305 s += nc;
306 ns -= nc;
307 }
308 return 0;
309}
310
311/* Magic numbers from UAX #15 s16 */
312#define SBase 0xAC00
313#define LBase 0x1100
314#define VBase 0x1161
315#define TBase 0x11A7
316#define LCount 19
317#define VCount 21
318#define TCount 28
319#define NCount (VCount * TCount)
320#define SCount (LCount * NCount)
321
322/** @brief Guts of the decomposition lookup functions */
323#define utf32__decompose_one_generic(WHICH) do { \
324 const uint32_t *dc = \
325 (c < UNICODE_NCHARS \
326 ? unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].WHICH \
327 : 0); \
328 if(dc) { \
329 /* Found a canonical decomposition in the table */ \
330 while(*dc) \
331 utf32__decompose_one_##WHICH(d, *dc++); \
332 } else if(c >= SBase && c < SBase + SCount) { \
333 /* Mechanically decomposable Hangul syllable (UAX #15 s16) */ \
334 const uint32_t SIndex = c - SBase; \
335 const uint32_t L = LBase + SIndex / NCount; \
336 const uint32_t V = VBase + (SIndex % NCount) / TCount; \
337 const uint32_t T = TBase + SIndex % TCount; \
338 dynstr_ucs4_append(d, L); \
339 dynstr_ucs4_append(d, V); \
340 if(T != TBase) \
341 dynstr_ucs4_append(d, T); \
342 } else \
343 /* Equal to own canonical decomposition */ \
344 dynstr_ucs4_append(d, c); \
345} while(0)
346
347/** @brief Recursively compute the canonical decomposition of @p c
348 * @param d Dynamic string to store decomposition in
349 * @param c Code point to decompose (must be a valid!)
350 * @return 0 on success, -1 on error
351 */
352static void utf32__decompose_one_canon(struct dynstr_ucs4 *d, uint32_t c) {
353 utf32__decompose_one_generic(canon);
354}
355
356/** @brief Recursively compute the compatibility decomposition of @p c
357 * @param d Dynamic string to store decomposition in
358 * @param c Code point to decompose (must be a valid!)
359 * @return 0 on success, -1 on error
360 */
361static void utf32__decompose_one_compat(struct dynstr_ucs4 *d, uint32_t c) {
362 utf32__decompose_one_generic(compat);
363}
364
365/** @brief Guts of the decomposition functions */
366#define utf32__decompose_generic(WHICH) do { \
367 struct dynstr_ucs4 d; \
368 uint32_t c; \
369 \
370 dynstr_ucs4_init(&d); \
371 while(ns) { \
372 c = *s++; \
373 if((c >= 0xDF800 && c <= 0xDFFF) || c > 0x10FFFF) \
374 goto error; \
375 utf32__decompose_one_##WHICH(&d, c); \
376 --ns; \
377 } \
378 if(utf32__canonical_ordering(d.vec, d.nvec)) \
379 goto error; \
380 dynstr_ucs4_terminate(&d); \
381 if(ndp) \
382 *ndp = d.nvec; \
383 return d.vec; \
384error: \
385 xfree(d.vec); \
386 return 0; \
387} while(0)
388
389/** @brief Canonically decompose @p [s,s+ns)
390 * @param s Pointer to string
391 * @param ns Length of string
392 * @param ndp Where to store length of result
393 * @return Pointer to result string, or NULL
394 *
395 * Computes the canonical decomposition of a string and stably sorts combining
396 * characters into canonical order. The result is in Normalization Form D and
397 * (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
398 * NormalizationTest.txt.
399 *
400 * Returns NULL if the string is not valid for either of the following
401 * reasons:
402 * - it codes for a UTF-16 surrogate
403 * - it codes for a value outside the unicode code space
404 */
405uint32_t *utf32_decompose_canon(const uint32_t *s, size_t ns, size_t *ndp) {
406 utf32__decompose_generic(canon);
407}
408
409/** @brief Compatibility decompose @p [s,s+ns)
410 * @param s Pointer to string
411 * @param ns Length of string
412 * @param ndp Where to store length of result
413 * @return Pointer to result string, or NULL
414 *
415 * Computes the compatibility decomposition of a string and stably sorts
416 * combining characters into canonical order. The result is in Normalization
417 * Form KD and (at the time of writing!) passes the NFKD tests defined in
418 * Unicode 5.0's NormalizationTest.txt.
419 *
420 * Returns NULL if the string is not valid for either of the following
421 * reasons:
422 * - it codes for a UTF-16 surrogate
423 * - it codes for a value outside the unicode code space
424 */
425uint32_t *utf32_decompose_compat(const uint32_t *s, size_t ns, size_t *ndp) {
426 utf32__decompose_generic(compat);
427}
428
429/** @brief Case-fold @p C
430 * @param D String to append to
431 * @param C Character to fold
432 */
433static inline void utf32__casefold_one_canon(struct dynstr_ucs4 *d, uint32_t c) {
434 const uint32_t *cf =
435 (c < UNICODE_NCHARS
436 ? unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].casefold
437 : 0);
438 if(cf) {
439 /* Found a case-fold mapping in the table */
440 while(*cf)
441 utf32__decompose_one_canon(d, *cf++);
442 } else
443 utf32__decompose_one_canon(d, c);
444}
445
446/** @brief Case-fold @p [s,s+ns)
447 * @param s Pointer to string
448 * @param ns Length of string
449 * @param ndp Where to store length of result
450 * @return Pointer to result string, or NULL
451 *
452 * Case-fold the string at @p s according to full default case-folding rules
453 * (s3.13). The result will be in NFD.
454 *
455 * Returns NULL if the string is not valid for either of the following
456 * reasons:
457 * - it codes for a UTF-16 surrogate
458 * - it codes for a value outside the unicode code space
459 */
460uint32_t *utf32_casefold_canon(const uint32_t *s, size_t ns, size_t *ndp) {
461 struct dynstr_ucs4 d;
462 uint32_t c;
463 size_t n;
464 uint32_t *ss = 0;
465
466 /* If the canonical decomposition of the string includes any combining
467 * character that case-folds to a non-combining character then we must
468 * normalize before we fold. In Unicode 5.0.0 this means 0345 COMBINING
469 * GREEK YPOGEGRAMMENI in its decomposition and the various characters that
470 * canonically decompose to it. */
471 for(n = 0; n < ns; ++n) {
472 c = s[n];
473 if(c < UNICODE_NCHARS
474 && (unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].flags
475 & unicode_normalize_before_casefold))
476 break;
477 }
478 if(n < ns) {
479 /* We need a preliminary decomposition */
480 if(!(ss = utf32_decompose_canon(s, ns, &ns)))
481 return 0;
482 s = ss;
483 }
484 dynstr_ucs4_init(&d);
485 while(ns) {
486 c = *s++;
487 if((c >= 0xDF800 && c <= 0xDFFF) || c > 0x10FFFF)
488 goto error;
489 utf32__casefold_one_canon(&d, c);
490 --ns;
491 }
492 if(utf32__canonical_ordering(d.vec, d.nvec))
493 goto error;
494 dynstr_ucs4_terminate(&d);
495 if(ndp)
496 *ndp = d.nvec;
497 return d.vec;
498error:
499 xfree(d.vec);
500 xfree(ss);
501 return 0;
502}
503
504/** @brief Order a pair of UTF-32 strings
505 * @param a First 0-terminated string
506 * @param b Second 0-terminated string
507 * @return -1, 0 or 1 for a less than, equal to or greater than b
508 *
509 * "Comparable to strcmp() at its best."
510 */
511int utf32_cmp(const uint32_t *a, const uint32_t *b) {
512 while(*a && *b && *a == *b) {
513 ++a;
514 ++b;
515 }
516 return *a < *b ? -1 : (*a > *b ? 1 : 0);
517}
518
519/*@}*/
520/** @defgroup Functions that operate on UTF-8 strings */
521/*@{*/
522
523/** @brief Wrapper to transform a UTF-8 string using the UTF-32 function */
524#define utf8__transform(FN) do { \
525 uint32_t *to32 = 0, *decomp32 = 0; \
526 size_t nto32, ndecomp32; \
527 char *decomp8 = 0; \
528 \
529 if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error; \
530 if(!(decomp32 = FN(to32, nto32, &ndecomp32))) goto error; \
531 decomp8 = utf32_to_utf8(decomp32, ndecomp32, ndp); \
532error: \
533 xfree(to32); \
534 xfree(decomp32); \
535 return decomp8; \
536} while(0)
537
538/** @brief Canonically decompose @p [s,s+ns)
539 * @param s Pointer to string
540 * @param ns Length of string
541 * @param ndp Where to store length of result
542 * @return Pointer to result string, or NULL
543 *
544 * Computes the canonical decomposition of a string and stably sorts combining
545 * characters into canonical order. The result is in Normalization Form D and
546 * (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
547 * NormalizationTest.txt.
548 *
549 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
550 * this might be.
551 *
552 * See also utf32_decompose_canon().
553 */
554char *utf8_decompose_canon(const char *s, size_t ns, size_t *ndp) {
555 utf8__transform(utf32_decompose_canon);
556}
557
558/** @brief Compatibility decompose @p [s,s+ns)
559 * @param s Pointer to string
560 * @param ns Length of string
561 * @param ndp Where to store length of result
562 * @return Pointer to result string, or NULL
563 *
564 * Computes the compatibility decomposition of a string and stably sorts
565 * combining characters into canonical order. The result is in Normalization
566 * Form KD and (at the time of writing!) passes the NFKD tests defined in
567 * Unicode 5.0's NormalizationTest.txt.
568 *
569 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
570 * this might be.
571 *
572 * See also utf32_decompose_compat().
573 */
574char *utf8_decompose_compat(const char *s, size_t ns, size_t *ndp) {
575 utf8__transform(utf32_decompose_compat);
576}
577
578/** @brief Case-fold @p [s,s+ns)
579 * @param s Pointer to string
580 * @param ns Length of string
581 * @param ndp Where to store length of result
582 * @return Pointer to result string, or NULL
583 *
584 * Case-fold the string at @p s according to full default case-folding rules
585 * (s3.13). The result will be in NFD.
586 *
587 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
588 * this might be.
589 */
590char *utf8_casefold_canon(const char *s, size_t ns, size_t *ndp) {
591 utf8__transform(utf32_casefold_canon);
592}
593
594/** @brief Compatibility case-fold @p [s,s+ns)
595 * @param s Pointer to string
596 * @param ns Length of string
597 * @param ndp Where to store length of result
598 * @return Pointer to result string, or NULL
599 *
600 * Case-fold the string at @p s according to full default case-folding rules
601 * (s3.13). The result will be in NFKD.
602 *
603 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
604 * this might be.
605 */
606#if 0
607char *utf8_casefold_compat(const char *s, size_t ns, size_t *ndp) {
608 utf8__transform(utf32_casefold_compat);
609}
610#endif
611
612/*@}*/
613
614/*
615Local Variables:
616c-basic-offset:2
617comment-column:40
618fill-column:79
619indent-tabs-mode:nil
620End:
621*/