chiark / gitweb /
untested utf32_is_word_boundary() and associated table changes
[disorder] / lib / unicode.c
CommitLineData
e5a5a138
RK
1/*
2 * This file is part of DisOrder
3 * Copyright (C) 2007 Richard Kettlewell
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18 * USA
19 */
20/** @file lib/unicode.c
21 * @brief Unicode support functions
22 *
23 * Here by UTF-8 and UTF-8 we mean the encoding forms of those names (not the
35b651f0
RK
24 * encoding schemes). The primary encoding form is UTF-32 but convenience
25 * wrappers using UTF-8 are provided for a number of functions.
e5a5a138
RK
26 *
27 * The idea is that all the strings that hit the database will be in a
28 * particular normalization form, and for the search and tags database
29 * in case-folded form, so they can be naively compared within the
30 * database code.
31 *
32 * As the code stands this guarantee is not well met!
33 */
34
35#include <config.h>
36#include "types.h"
37
38#include <string.h>
39#include <stdio.h> /* TODO */
40
41#include "mem.h"
42#include "vector.h"
43#include "unicode.h"
44#include "unidata.h"
45
46/** @defgroup utftransform Functions that transform between different Unicode encoding forms */
47/*@{*/
48
49/** @brief Convert UTF-32 to UTF-8
50 * @param s Source string
51 * @param ns Length of source string in code points
52 * @param ndp Where to store length of destination string (or NULL)
53 * @return Newly allocated destination string or NULL on error
54 *
56fd389c
RK
55 * If the UTF-32 is not valid then NULL is returned. A UTF-32 code point is
56 * invalid if:
e5a5a138
RK
57 * - it codes for a UTF-16 surrogate
58 * - it codes for a value outside the unicode code space
59 *
56fd389c
RK
60 * The return value is always 0-terminated. The value returned via @p *ndp
61 * does not include the terminator.
e5a5a138
RK
62 */
63char *utf32_to_utf8(const uint32_t *s, size_t ns, size_t *ndp) {
64 struct dynstr d;
65 uint32_t c;
66
67 dynstr_init(&d);
68 while(ns > 0) {
69 c = *s++;
70 if(c < 0x80)
71 dynstr_append(&d, c);
72 else if(c < 0x0800) {
73 dynstr_append(&d, 0xC0 | (c >> 6));
74 dynstr_append(&d, 0x80 | (c & 0x3F));
75 } else if(c < 0x10000) {
56fd389c 76 if(c >= 0xD800 && c <= 0xDFFF)
e5a5a138
RK
77 goto error;
78 dynstr_append(&d, 0xE0 | (c >> 12));
79 dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
80 dynstr_append(&d, 0x80 | (c & 0x3F));
81 } else if(c < 0x110000) {
82 dynstr_append(&d, 0xF0 | (c >> 18));
83 dynstr_append(&d, 0x80 | ((c >> 12) & 0x3F));
84 dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
85 dynstr_append(&d, 0x80 | (c & 0x3F));
86 } else
87 goto error;
88 --ns;
89 }
90 dynstr_terminate(&d);
91 if(ndp)
92 *ndp = d.nvec;
93 return d.vec;
94error:
95 xfree(d.vec);
96 return 0;
97}
98
99/** @brief Convert UTF-8 to UTF-32
100 * @param s Source string
101 * @param ns Length of source string in code points
102 * @param ndp Where to store length of destination string (or NULL)
103 * @return Newly allocated destination string or NULL
104 *
56fd389c
RK
105 * The return value is always 0-terminated. The value returned via @p *ndp
106 * does not include the terminator.
e5a5a138
RK
107 *
108 * If the UTF-8 is not valid then NULL is returned. A UTF-8 sequence
109 * for a code point is invalid if:
110 * - it is not the shortest possible sequence for the code point
111 * - it codes for a UTF-16 surrogate
112 * - it codes for a value outside the unicode code space
113 */
114uint32_t *utf8_to_utf32(const char *s, size_t ns, size_t *ndp) {
115 struct dynstr_ucs4 d;
116 uint32_t c32, c;
117 const uint8_t *ss = (const uint8_t *)s;
118
119 dynstr_ucs4_init(&d);
120 while(ns > 0) {
121 c = *ss++;
122 --ns;
56fd389c
RK
123 /* Acceptable UTF-8 is that which codes for Unicode Scalar Values
124 * (Unicode 5.0.0 s3.9 D76)
e5a5a138
RK
125 *
126 * 0xxxxxxx
127 * 7 data bits gives 0x00 - 0x7F and all are acceptable
128 *
129 * 110xxxxx 10xxxxxx
130 * 11 data bits gives 0x0000 - 0x07FF but only 0x0080 - 0x07FF acceptable
131 *
132 * 1110xxxx 10xxxxxx 10xxxxxx
133 * 16 data bits gives 0x0000 - 0xFFFF but only 0x0800 - 0xFFFF acceptable
134 * (and UTF-16 surrogates are not acceptable)
135 *
136 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
137 * 21 data bits gives 0x00000000 - 0x001FFFFF
138 * but only 0x00010000 - 0x0010FFFF are acceptable
139 *
56fd389c
RK
140 * It is NOT always the case that the data bits in the first byte are
141 * always non-0 for the acceptable values, so we do a separate check after
142 * decoding.
e5a5a138
RK
143 */
144 if(c < 0x80)
145 c32 = c;
146 else if(c <= 0xDF) {
147 if(ns < 1) goto error;
148 c32 = c & 0x1F;
149 c = *ss++;
150 if((c & 0xC0) != 0x80) goto error;
151 c32 = (c32 << 6) | (c & 0x3F);
152 if(c32 < 0x80) goto error;
153 } else if(c <= 0xEF) {
154 if(ns < 2) goto error;
155 c32 = c & 0x0F;
156 c = *ss++;
157 if((c & 0xC0) != 0x80) goto error;
158 c32 = (c32 << 6) | (c & 0x3F);
159 c = *ss++;
160 if((c & 0xC0) != 0x80) goto error;
161 c32 = (c32 << 6) | (c & 0x3F);
162 if(c32 < 0x0800 || (c32 >= 0xD800 && c32 <= 0xDFFF)) goto error;
163 } else if(c <= 0xF7) {
164 if(ns < 3) goto error;
165 c32 = c & 0x07;
166 c = *ss++;
167 if((c & 0xC0) != 0x80) goto error;
168 c32 = (c32 << 6) | (c & 0x3F);
169 c = *ss++;
170 if((c & 0xC0) != 0x80) goto error;
171 c32 = (c32 << 6) | (c & 0x3F);
172 c = *ss++;
173 if((c & 0xC0) != 0x80) goto error;
174 c32 = (c32 << 6) | (c & 0x3F);
175 if(c32 < 0x00010000 || c32 > 0x0010FFFF) goto error;
176 } else
177 goto error;
178 dynstr_ucs4_append(&d, c32);
179 }
180 dynstr_ucs4_terminate(&d);
181 if(ndp)
182 *ndp = d.nvec;
183 return d.vec;
184error:
185 xfree(d.vec);
186 return 0;
187}
188
189/*@}*/
190/** @defgroup utf32 Functions that operate on UTF-32 strings */
191/*@{*/
192
193/** @brief Return the length of a 0-terminated UTF-32 string
194 * @param s Pointer to 0-terminated string
195 * @return Length of string in code points (excluding terminator)
196 *
56fd389c 197 * Unlike the conversion functions no validity checking is done on the string.
e5a5a138
RK
198 */
199size_t utf32_len(const uint32_t *s) {
200 const uint32_t *t = s;
201
202 while(*t)
203 ++t;
204 return (size_t)(t - s);
205}
206
207/** @brief Return the combining class of @p c
208 * @param c Code point
209 * @return Combining class of @p c
210 */
211static inline int utf32__combining_class(uint32_t c) {
212 if(c < UNICODE_NCHARS)
213 return unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].ccc;
214 return 0;
215}
216
217/** @brief Stably sort [s,s+ns) into descending order of combining class
218 * @param s Start of array
219 * @param ns Number of elements, must be at least 1
220 * @param buffer Buffer of at least @p ns elements
221 */
222static void utf32__sort_ccc(uint32_t *s, size_t ns, uint32_t *buffer) {
223 uint32_t *a, *b, *bp;
224 size_t na, nb;
225
226 switch(ns) {
227 case 1: /* 1-element array is always sorted */
228 return;
229 case 2: /* 2-element arrays are trivial to sort */
230 if(utf32__combining_class(s[0]) > utf32__combining_class(s[1])) {
231 uint32_t tmp = s[0];
232 s[0] = s[1];
233 s[1] = tmp;
234 }
235 return;
236 default:
237 /* Partition the array */
238 na = ns / 2;
239 nb = ns - na;
240 a = s;
241 b = s + na;
242 /* Sort the two halves of the array */
243 utf32__sort_ccc(a, na, buffer);
244 utf32__sort_ccc(b, nb, buffer);
245 /* Merge them back into one, via the buffer */
246 bp = buffer;
247 while(na > 0 && nb > 0) {
248 /* We want descending order of combining class (hence <)
249 * and we want stability within combining classes (hence <=)
250 */
251 if(utf32__combining_class(*a) <= utf32__combining_class(*b)) {
252 *bp++ = *a++;
253 --na;
254 } else {
255 *bp++ = *b++;
256 --nb;
257 }
258 }
259 while(na > 0) {
260 *bp++ = *a++;
261 --na;
262 }
263 while(nb > 0) {
264 *bp++ = *b++;
265 --nb;
266 }
267 memcpy(s, buffer, ns * sizeof(uint32_t));
268 return;
269 }
270}
271
272/** @brief Put combining characters into canonical order
273 * @param s Pointer to UTF-32 string
274 * @param ns Length of @p s
275 * @return 0 on success, -1 on error
276 *
56fd389c
RK
277 * @p s is modified in-place. See Unicode 5.0 s3.11 for details of the
278 * ordering.
e5a5a138 279 *
56fd389c
RK
280 * Currently we only support a maximum of 1024 combining characters after each
281 * base character. If this limit is exceeded then -1 is returned.
e5a5a138
RK
282 */
283static int utf32__canonical_ordering(uint32_t *s, size_t ns) {
284 size_t nc;
285 uint32_t buffer[1024];
286
287 /* The ordering amounts to a stable sort of each contiguous group of
288 * characters with non-0 combining class. */
289 while(ns > 0) {
290 /* Skip non-combining characters */
291 if(utf32__combining_class(*s) == 0) {
292 ++s;
293 --ns;
294 continue;
295 }
296 /* We must now have at least one combining character; see how many
297 * there are */
298 for(nc = 1; nc < ns && utf32__combining_class(s[nc]) != 0; ++nc)
299 ;
300 if(nc > 1024)
301 return -1;
302 /* Sort the array */
303 utf32__sort_ccc(s, nc, buffer);
304 s += nc;
305 ns -= nc;
306 }
307 return 0;
308}
309
310/* Magic numbers from UAX #15 s16 */
311#define SBase 0xAC00
312#define LBase 0x1100
313#define VBase 0x1161
314#define TBase 0x11A7
315#define LCount 19
316#define VCount 21
317#define TCount 28
318#define NCount (VCount * TCount)
319#define SCount (LCount * NCount)
320
321/** @brief Guts of the decomposition lookup functions */
322#define utf32__decompose_one_generic(WHICH) do { \
323 const uint32_t *dc = \
324 (c < UNICODE_NCHARS \
325 ? unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].WHICH \
326 : 0); \
327 if(dc) { \
328 /* Found a canonical decomposition in the table */ \
329 while(*dc) \
330 utf32__decompose_one_##WHICH(d, *dc++); \
331 } else if(c >= SBase && c < SBase + SCount) { \
332 /* Mechanically decomposable Hangul syllable (UAX #15 s16) */ \
333 const uint32_t SIndex = c - SBase; \
334 const uint32_t L = LBase + SIndex / NCount; \
335 const uint32_t V = VBase + (SIndex % NCount) / TCount; \
336 const uint32_t T = TBase + SIndex % TCount; \
337 dynstr_ucs4_append(d, L); \
338 dynstr_ucs4_append(d, V); \
339 if(T != TBase) \
340 dynstr_ucs4_append(d, T); \
341 } else \
342 /* Equal to own canonical decomposition */ \
343 dynstr_ucs4_append(d, c); \
344} while(0)
345
346/** @brief Recursively compute the canonical decomposition of @p c
347 * @param d Dynamic string to store decomposition in
348 * @param c Code point to decompose (must be a valid!)
349 * @return 0 on success, -1 on error
350 */
351static void utf32__decompose_one_canon(struct dynstr_ucs4 *d, uint32_t c) {
352 utf32__decompose_one_generic(canon);
353}
354
355/** @brief Recursively compute the compatibility decomposition of @p c
356 * @param d Dynamic string to store decomposition in
357 * @param c Code point to decompose (must be a valid!)
358 * @return 0 on success, -1 on error
359 */
360static void utf32__decompose_one_compat(struct dynstr_ucs4 *d, uint32_t c) {
361 utf32__decompose_one_generic(compat);
362}
363
364/** @brief Guts of the decomposition functions */
365#define utf32__decompose_generic(WHICH) do { \
366 struct dynstr_ucs4 d; \
367 uint32_t c; \
368 \
369 dynstr_ucs4_init(&d); \
370 while(ns) { \
371 c = *s++; \
56fd389c 372 if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF) \
e5a5a138
RK
373 goto error; \
374 utf32__decompose_one_##WHICH(&d, c); \
375 --ns; \
376 } \
377 if(utf32__canonical_ordering(d.vec, d.nvec)) \
378 goto error; \
379 dynstr_ucs4_terminate(&d); \
380 if(ndp) \
381 *ndp = d.nvec; \
382 return d.vec; \
383error: \
384 xfree(d.vec); \
385 return 0; \
386} while(0)
387
388/** @brief Canonically decompose @p [s,s+ns)
389 * @param s Pointer to string
390 * @param ns Length of string
391 * @param ndp Where to store length of result
392 * @return Pointer to result string, or NULL
393 *
394 * Computes the canonical decomposition of a string and stably sorts combining
395 * characters into canonical order. The result is in Normalization Form D and
396 * (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
397 * NormalizationTest.txt.
398 *
56fd389c 399 * Returns NULL if the string is not valid for either of the following reasons:
e5a5a138
RK
400 * - it codes for a UTF-16 surrogate
401 * - it codes for a value outside the unicode code space
402 */
403uint32_t *utf32_decompose_canon(const uint32_t *s, size_t ns, size_t *ndp) {
404 utf32__decompose_generic(canon);
405}
406
407/** @brief Compatibility decompose @p [s,s+ns)
408 * @param s Pointer to string
409 * @param ns Length of string
410 * @param ndp Where to store length of result
411 * @return Pointer to result string, or NULL
412 *
413 * Computes the compatibility decomposition of a string and stably sorts
414 * combining characters into canonical order. The result is in Normalization
415 * Form KD and (at the time of writing!) passes the NFKD tests defined in
416 * Unicode 5.0's NormalizationTest.txt.
417 *
56fd389c 418 * Returns NULL if the string is not valid for either of the following reasons:
e5a5a138
RK
419 * - it codes for a UTF-16 surrogate
420 * - it codes for a value outside the unicode code space
421 */
422uint32_t *utf32_decompose_compat(const uint32_t *s, size_t ns, size_t *ndp) {
423 utf32__decompose_generic(compat);
424}
425
56fd389c
RK
426/** @brief Single-character case-fold and decompose operation */
427#define utf32__casefold_one(WHICH) do { \
428 const uint32_t *cf = \
429 (c < UNICODE_NCHARS \
430 ? unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].casefold \
431 : 0); \
432 if(cf) { \
433 /* Found a case-fold mapping in the table */ \
434 while(*cf) \
435 utf32__decompose_one_##WHICH(&d, *cf++); \
436 } else \
437 utf32__decompose_one_##WHICH(&d, c); \
438} while(0)
e5a5a138
RK
439
440/** @brief Case-fold @p [s,s+ns)
441 * @param s Pointer to string
442 * @param ns Length of string
443 * @param ndp Where to store length of result
444 * @return Pointer to result string, or NULL
445 *
446 * Case-fold the string at @p s according to full default case-folding rules
56fd389c 447 * (s3.13) for caseless matching. The result will be in NFD.
e5a5a138 448 *
56fd389c 449 * Returns NULL if the string is not valid for either of the following reasons:
e5a5a138
RK
450 * - it codes for a UTF-16 surrogate
451 * - it codes for a value outside the unicode code space
452 */
453uint32_t *utf32_casefold_canon(const uint32_t *s, size_t ns, size_t *ndp) {
454 struct dynstr_ucs4 d;
455 uint32_t c;
456 size_t n;
457 uint32_t *ss = 0;
458
459 /* If the canonical decomposition of the string includes any combining
460 * character that case-folds to a non-combining character then we must
461 * normalize before we fold. In Unicode 5.0.0 this means 0345 COMBINING
462 * GREEK YPOGEGRAMMENI in its decomposition and the various characters that
463 * canonically decompose to it. */
464 for(n = 0; n < ns; ++n) {
465 c = s[n];
466 if(c < UNICODE_NCHARS
467 && (unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].flags
468 & unicode_normalize_before_casefold))
469 break;
470 }
471 if(n < ns) {
472 /* We need a preliminary decomposition */
473 if(!(ss = utf32_decompose_canon(s, ns, &ns)))
474 return 0;
475 s = ss;
476 }
477 dynstr_ucs4_init(&d);
478 while(ns) {
479 c = *s++;
56fd389c 480 if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF)
e5a5a138 481 goto error;
56fd389c 482 utf32__casefold_one(canon);
e5a5a138
RK
483 --ns;
484 }
485 if(utf32__canonical_ordering(d.vec, d.nvec))
486 goto error;
487 dynstr_ucs4_terminate(&d);
488 if(ndp)
489 *ndp = d.nvec;
490 return d.vec;
491error:
492 xfree(d.vec);
493 xfree(ss);
494 return 0;
495}
496
56fd389c
RK
497/** @brief Compatibilit case-fold @p [s,s+ns)
498 * @param s Pointer to string
499 * @param ns Length of string
500 * @param ndp Where to store length of result
501 * @return Pointer to result string, or NULL
502 *
503 * Case-fold the string at @p s according to full default case-folding rules
504 * (s3.13) for compatibility caseless matching. The result will be in NFKD.
505 *
506 * Returns NULL if the string is not valid for either of the following reasons:
507 * - it codes for a UTF-16 surrogate
508 * - it codes for a value outside the unicode code space
509 */
510uint32_t *utf32_casefold_compat(const uint32_t *s, size_t ns, size_t *ndp) {
511 struct dynstr_ucs4 d;
512 uint32_t c;
513 size_t n;
514 uint32_t *ss = 0;
515
516 for(n = 0; n < ns; ++n) {
517 c = s[n];
518 if(c < UNICODE_NCHARS
519 && (unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].flags
520 & unicode_normalize_before_casefold))
521 break;
522 }
523 if(n < ns) {
524 /* We need a preliminary _canonical_ decomposition */
525 if(!(ss = utf32_decompose_canon(s, ns, &ns)))
526 return 0;
527 s = ss;
528 }
529 /* This computes NFKD(toCaseFold(s)) */
530#define compat_casefold_middle() do { \
531 dynstr_ucs4_init(&d); \
532 while(ns) { \
533 c = *s++; \
534 if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF) \
535 goto error; \
536 utf32__casefold_one(compat); \
537 --ns; \
538 } \
539 if(utf32__canonical_ordering(d.vec, d.nvec)) \
540 goto error; \
541} while(0)
542 /* Do the inner (NFKD o toCaseFold) */
543 compat_casefold_middle();
544 /* We can do away with the NFD'd copy of the input now */
545 xfree(ss);
546 s = ss = d.vec;
547 ns = d.nvec;
548 /* Do the outer (NFKD o toCaseFold) */
549 compat_casefold_middle();
550 /* That's all */
551 dynstr_ucs4_terminate(&d);
552 if(ndp)
553 *ndp = d.nvec;
554 return d.vec;
555error:
556 xfree(d.vec);
557 xfree(ss);
558 return 0;
559}
560
e5a5a138
RK
561/** @brief Order a pair of UTF-32 strings
562 * @param a First 0-terminated string
563 * @param b Second 0-terminated string
564 * @return -1, 0 or 1 for a less than, equal to or greater than b
565 *
566 * "Comparable to strcmp() at its best."
567 */
568int utf32_cmp(const uint32_t *a, const uint32_t *b) {
569 while(*a && *b && *a == *b) {
570 ++a;
571 ++b;
572 }
573 return *a < *b ? -1 : (*a > *b ? 1 : 0);
574}
575
35b651f0
RK
576/** @brief Return the General_Category value for @p c
577 * @param Code point
578 * @return General_Category property value
579 */
580static inline enum unicode_gc_cat utf32__general_category(uint32_t c) {
581 if(c < UNICODE_NCHARS) {
582 const struct unidata *const ud = &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
583 return ud->gc;
584 } else
585 return unicode_gc_Cn;
586}
587
588/** @brief Check Grapheme_Cluster_Break property
589 * @param c Code point
590 * @return 0 if it is as described, 1 otherwise
591 */
592static int utf32__is_control_or_cr_or_lf(uint32_t c) {
593 switch(utf32__general_category(c)) {
594 default:
595 return 0;
596 case unicode_gc_Zl:
597 case unicode_gc_Zp:
598 case unicode_gc_Cc:
599 return 1;
600 case unicode_gc_Cf:
601 if(c == 0x200C || c == 0x200D)
602 return 0;
603 return 1;
604 }
605}
606
607#define Hangul_Syllable_Type_NA 0
608#define Hangul_Syllable_Type_L 0x1100
609#define Hangul_Syllable_Type_V 0x1160
610#define Hangul_Syllable_Type_T 0x11A8
611#define Hangul_Syllable_Type_LV 0xAC00
612#define Hangul_Syllable_Type_LVT 0xAC01
613
614/** @brief Determine Hangul_Syllable_Type of @p c
615 * @param c Code point
616 * @return Equivalance class of @p c, or Hangul_Syllable_Type_NA
617 *
618 * If this is a Hangul character then a representative member of its
619 * equivalence class is returned. Otherwise Hangul_Syllable_Type_NA is
620 * returned.
621 */
622static uint32_t utf32__hangul_syllable_type(uint32_t c) {
623 /* Dispose of the bulk of the non-Hangul code points first */
624 if(c < 0x1100) return Hangul_Syllable_Type_NA;
625 if(c > 0x1200 && c < 0xAC00) return Hangul_Syllable_Type_NA;
626 if(c >= 0xD800) return Hangul_Syllable_Type_NA;
627 /* Now we pick out the assigned Hangul code points */
628 if((c >= 0x1100 && c <= 0x1159) || c == 0x115F) return Hangul_Syllable_Type_L;
629 if(c >= 0x1160 && c <= 0x11A2) return Hangul_Syllable_Type_V;
630 if(c >= 0x11A8 && c <= 0x11F9) return Hangul_Syllable_Type_T;
631 if(c >= 0xAC00 && c <= 0xD7A3) {
632 if(c % 28 == 16)
633 return Hangul_Syllable_Type_LV;
634 else
635 return Hangul_Syllable_Type_LVT;
636 }
637 return Hangul_Syllable_Type_NA;
638}
639
0b7052da
RK
640/** @brief Determine Word_Break property
641 * @param c Code point
642 * @return Word_Break property value of @p c
643 */
644static enum unicode_Word_Break utf32__word_break(uint32_t c) {
645 if(c < 0xAC00 || c > 0xD7A3) {
646 if(c < UNICODE_NCHARS)
647 return unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].word_break;
648 else
649 return unicode_Word_Break_Other;
650 } else
651 return unicode_Word_Break_ALetter;
652}
653
35b651f0
RK
654/** @brief Identify a grapheme cluster boundary
655 * @param s Start of string (must be NFD)
656 * @param ns Length of string
657 * @param n Index within string (in [0,ns].)
658 * @return 1 at a grapheme cluster boundary, 0 otherwise
659 *
660 * This function identifies default grapheme cluster boundaries as described in
661 * UAX #29 s3. It returns 1 if @p n points at the code point just after a
662 * grapheme cluster boundary (including the hypothetical code point just after
663 * the end of the string).
35b651f0
RK
664 */
665int utf32_is_gcb(const uint32_t *s, size_t ns, size_t n) {
666 uint32_t before, after;
667 uint32_t hbefore, hafter;
668 /* GB1 and GB2 */
669 if(n == 0 || n == ns)
670 return 1;
671 /* Now we know that s[n-1] and s[n] are safe to inspect */
672 /* GB3 */
673 before = s[n-1];
674 after = s[n];
675 if(before == 0x000D && after == 0x000A)
676 return 0;
677 /* GB4 and GB5 */
678 if(utf32__is_control_or_cr_or_lf(before)
679 || utf32__is_control_or_cr_or_lf(after))
680 return 1;
681 hbefore = utf32__hangul_syllable_type(before);
682 hafter = utf32__hangul_syllable_type(after);
683 /* GB6 */
684 if(hbefore == Hangul_Syllable_Type_L
e2452add
RK
685 && (hafter == Hangul_Syllable_Type_L
686 || hafter == Hangul_Syllable_Type_V
687 || hafter == Hangul_Syllable_Type_LV
688 || hafter == Hangul_Syllable_Type_LVT))
35b651f0
RK
689 return 0;
690 /* GB7 */
691 if((hbefore == Hangul_Syllable_Type_LV
692 || hbefore == Hangul_Syllable_Type_V)
693 && (hafter == Hangul_Syllable_Type_V
694 || hafter == Hangul_Syllable_Type_T))
695 return 0;
696 /* GB8 */
697 if((hbefore == Hangul_Syllable_Type_LVT
698 || hbefore == Hangul_Syllable_Type_T)
699 && hafter == Hangul_Syllable_Type_T)
700 return 0;
701 /* GB9 */
0b7052da 702 if(utf32__word_break(after) == unicode_Word_Break_Extend)
35b651f0
RK
703 return 0;
704 /* GB10 */
705 return 1;
706}
707
0b7052da
RK
708/** @brief Return true if code point @p n is part of an initial sequence of Format/Extend
709 * @param s Start of string
710 * @param ns Length of string
711 * @param n Start position
712 * @return True if it is, false otherwise
713 *
714 * This whole stack is not very efficient; we assume we don't see many of the
715 * problematic characters.
716 */
717static int utf32__is_initial_sequence(const uint32_t *s,
718 size_t attribute((unused)) ns,
719 size_t n) {
720 enum unicode_Word_Break wb;
721
722 while(n > 0) {
723 --n;
724 wb = utf32__word_break(s[n]);
725 if(wb != unicode_Word_Break_Extend
726 && wb != unicode_Word_Break_Format)
727 return 0;
728 }
729 return 1;
730}
731
732/** @brief Return the index of the first non-Extend/Format character from n
733 * @param s Start of string
734 * @param ns Length of string
735 * @param n Start position
736 * @return Index of first suitable character or @p ns
737 */
738static size_t utf32__first_not_ignorable(const uint32_t *s, size_t ns,
739 size_t n) {
740 while(n < ns) {
741 const enum unicode_Word_Break wb = utf32__word_break(s[n]);
742 if((wb != unicode_Word_Break_Extend
743 && wb != unicode_Word_Break_Format)
744 || utf32__is_initial_sequence(s, ns, n))
745 return n;
746 ++n;
747 }
748 return ns;
749}
750
751/** @brief Return the index of the last non-Extend/Format character from n
752 * @param s Start of string
753 * @param ns Length of string
754 * @param n Start position
755 * @return Index of first suitable character or (size_t)-1
756 */
757static size_t utf32__last_not_ignorable(const uint32_t *s, size_t ns,
758 size_t n) {
759 do {
760 const enum unicode_Word_Break wb = utf32__word_break(s[n]);
761 if((wb != unicode_Word_Break_Extend
762 && wb != unicode_Word_Break_Format)
763 || utf32__is_initial_sequence(s, ns, n))
764 return n;
765 } while(n--);
766 return n; /* will be (size_t)-1 */
767}
768
769/** @brief Identify a word boundary
770 * @param s Start of string (must be NFD)
771 * @param ns Length of string
772 * @param n Index within string (in [0,ns].)
773 * @return 1 at a word boundary, 0 otherwise
774 *
775 * This function identifies default word boundaries as described in UAX #29 s4.
776 * It returns 1 if @p n points at the code point just after a word boundary
777 * (including the hypothetical code point just after the end of the string).
778 */
779int utf32_is_word_boundary(const uint32_t *s, size_t ns, size_t n) {
780 enum unicode_Word_Break twobefore, before, after, twoafter;
781 size_t nn;
782
783 /* WB1 and WB2 */
784 if(n == 0 || n == ns)
785 return 1;
786 /* WB3 */
787 if(s[n-1] == 0x000D && s[n] == 0x000A)
788 return 0;
789 /* WB4 */
790 /* The stated requirement here is to ignore code points with a Word_Break
791 * value of _Extend or _Format wherever they may appear unless they are part
792 * of an initial sequence of such characters. */
793 twobefore = before = after = twoafter = unicode_Word_Break_Other;
794 nn = utf32__last_not_ignorable(s, ns, n - 1/* > 0 */);
795 if(nn != (size_t)-1) {
796 before = utf32__word_break(s[nn]);
797 if(nn != 0) {
798 nn = utf32__last_not_ignorable(s, ns, nn - 1);
799 if(nn != (size_t)-1)
800 twobefore = utf32__word_break(s[nn]);
801 }
802 }
803 nn = utf32__first_not_ignorable(s, ns, n);
804 if(nn < ns) {
805 after = utf32__word_break(s[nn]);
806 if(nn < ns) {
807 nn = utf32__first_not_ignorable(s, ns, nn + 1);
808 if(nn < ns)
809 twoafter = utf32__word_break(s[nn]);
810 }
811 }
812 /* WB5 */
813 if(before == unicode_Word_Break_ALetter
814 && after == unicode_Word_Break_ALetter)
815 return 0;
816 /* WB6 */
817 if(before == unicode_Word_Break_ALetter
818 && after == unicode_Word_Break_MidLetter
819 && twoafter == unicode_Word_Break_ALetter)
820 return 0;
821 /* WB7 */
822 if(twobefore == unicode_Word_Break_ALetter
823 && before == unicode_Word_Break_MidLetter
824 && after == unicode_Word_Break_ALetter)
825 return 0;
826 /* WB8 */
827 if(before == unicode_Word_Break_Numeric
828 && after == unicode_Word_Break_Numeric)
829 return 0;
830 /* WB9 */
831 if(before == unicode_Word_Break_ALetter
832 && after == unicode_Word_Break_Numeric)
833 return 0;
834 /* WB10 */
835 if(before == unicode_Word_Break_Numeric
836 && after == unicode_Word_Break_ALetter)
837 return 0;
838 /* WB11 */
839 if(twobefore == unicode_Word_Break_Numeric
840 && before == unicode_Word_Break_MidNum
841 && after == unicode_Word_Break_Numeric)
842 return 0;
843 /* WB12 */
844 if(before == unicode_Word_Break_Numeric
845 && after == unicode_Word_Break_MidNum
846 && twoafter == unicode_Word_Break_Numeric)
847 return 0;
848 /* WB13 */
849 if(before == unicode_Word_Break_Katakana
850 && after == unicode_Word_Break_Katakana)
851 return 0;
852 /* WB13a */
853 if((before == unicode_Word_Break_ALetter
854 || before == unicode_Word_Break_Numeric
855 || before == unicode_Word_Break_Katakana
856 || before == unicode_Word_Break_ExtendNumLet)
857 && after == unicode_Word_Break_ExtendNumLet)
858 return 0;
859 /* WB13b */
860 if(before == unicode_Word_Break_ExtendNumLet
861 && (after == unicode_Word_Break_ALetter
862 || after == unicode_Word_Break_Numeric
863 || after == unicode_Word_Break_Katakana))
864 return 0;
865 /* WB14 */
866 return 1;
867}
868
e5a5a138
RK
869/*@}*/
870/** @defgroup Functions that operate on UTF-8 strings */
871/*@{*/
872
873/** @brief Wrapper to transform a UTF-8 string using the UTF-32 function */
874#define utf8__transform(FN) do { \
875 uint32_t *to32 = 0, *decomp32 = 0; \
876 size_t nto32, ndecomp32; \
877 char *decomp8 = 0; \
878 \
879 if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error; \
880 if(!(decomp32 = FN(to32, nto32, &ndecomp32))) goto error; \
881 decomp8 = utf32_to_utf8(decomp32, ndecomp32, ndp); \
882error: \
883 xfree(to32); \
884 xfree(decomp32); \
885 return decomp8; \
886} while(0)
887
888/** @brief Canonically decompose @p [s,s+ns)
889 * @param s Pointer to string
890 * @param ns Length of string
891 * @param ndp Where to store length of result
892 * @return Pointer to result string, or NULL
893 *
894 * Computes the canonical decomposition of a string and stably sorts combining
895 * characters into canonical order. The result is in Normalization Form D and
896 * (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
897 * NormalizationTest.txt.
898 *
899 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
900 * this might be.
901 *
902 * See also utf32_decompose_canon().
903 */
904char *utf8_decompose_canon(const char *s, size_t ns, size_t *ndp) {
905 utf8__transform(utf32_decompose_canon);
906}
907
908/** @brief Compatibility decompose @p [s,s+ns)
909 * @param s Pointer to string
910 * @param ns Length of string
911 * @param ndp Where to store length of result
912 * @return Pointer to result string, or NULL
913 *
914 * Computes the compatibility decomposition of a string and stably sorts
915 * combining characters into canonical order. The result is in Normalization
916 * Form KD and (at the time of writing!) passes the NFKD tests defined in
917 * Unicode 5.0's NormalizationTest.txt.
918 *
919 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
920 * this might be.
921 *
922 * See also utf32_decompose_compat().
923 */
924char *utf8_decompose_compat(const char *s, size_t ns, size_t *ndp) {
925 utf8__transform(utf32_decompose_compat);
926}
927
928/** @brief Case-fold @p [s,s+ns)
929 * @param s Pointer to string
930 * @param ns Length of string
931 * @param ndp Where to store length of result
932 * @return Pointer to result string, or NULL
933 *
934 * Case-fold the string at @p s according to full default case-folding rules
935 * (s3.13). The result will be in NFD.
936 *
937 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
938 * this might be.
939 */
940char *utf8_casefold_canon(const char *s, size_t ns, size_t *ndp) {
941 utf8__transform(utf32_casefold_canon);
942}
943
944/** @brief Compatibility case-fold @p [s,s+ns)
945 * @param s Pointer to string
946 * @param ns Length of string
947 * @param ndp Where to store length of result
948 * @return Pointer to result string, or NULL
949 *
950 * Case-fold the string at @p s according to full default case-folding rules
951 * (s3.13). The result will be in NFKD.
952 *
953 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
954 * this might be.
955 */
e5a5a138
RK
956char *utf8_casefold_compat(const char *s, size_t ns, size_t *ndp) {
957 utf8__transform(utf32_casefold_compat);
958}
e5a5a138
RK
959
960/*@}*/
961
962/*
963Local Variables:
964c-basic-offset:2
965comment-column:40
966fill-column:79
967indent-tabs-mode:nil
968End:
969*/