chiark / gitweb /
more idiomatic grapheme breaking
[disorder] / lib / unicode.c
CommitLineData
e5a5a138
RK
1/*
2 * This file is part of DisOrder
3 * Copyright (C) 2007 Richard Kettlewell
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18 * USA
19 */
20/** @file lib/unicode.c
21 * @brief Unicode support functions
22 *
23 * Here by UTF-8 and UTF-8 we mean the encoding forms of those names (not the
35b651f0
RK
24 * encoding schemes). The primary encoding form is UTF-32 but convenience
25 * wrappers using UTF-8 are provided for a number of functions.
e5a5a138
RK
26 *
27 * The idea is that all the strings that hit the database will be in a
28 * particular normalization form, and for the search and tags database
29 * in case-folded form, so they can be naively compared within the
30 * database code.
31 *
32 * As the code stands this guarantee is not well met!
33 */
34
35#include <config.h>
36#include "types.h"
37
38#include <string.h>
39#include <stdio.h> /* TODO */
40
41#include "mem.h"
42#include "vector.h"
43#include "unicode.h"
44#include "unidata.h"
45
46/** @defgroup utftransform Functions that transform between different Unicode encoding forms */
47/*@{*/
48
49/** @brief Convert UTF-32 to UTF-8
50 * @param s Source string
51 * @param ns Length of source string in code points
52 * @param ndp Where to store length of destination string (or NULL)
53 * @return Newly allocated destination string or NULL on error
54 *
56fd389c
RK
55 * If the UTF-32 is not valid then NULL is returned. A UTF-32 code point is
56 * invalid if:
e5a5a138
RK
57 * - it codes for a UTF-16 surrogate
58 * - it codes for a value outside the unicode code space
59 *
56fd389c
RK
60 * The return value is always 0-terminated. The value returned via @p *ndp
61 * does not include the terminator.
e5a5a138
RK
62 */
63char *utf32_to_utf8(const uint32_t *s, size_t ns, size_t *ndp) {
64 struct dynstr d;
65 uint32_t c;
66
67 dynstr_init(&d);
68 while(ns > 0) {
69 c = *s++;
70 if(c < 0x80)
71 dynstr_append(&d, c);
72 else if(c < 0x0800) {
73 dynstr_append(&d, 0xC0 | (c >> 6));
74 dynstr_append(&d, 0x80 | (c & 0x3F));
75 } else if(c < 0x10000) {
56fd389c 76 if(c >= 0xD800 && c <= 0xDFFF)
e5a5a138
RK
77 goto error;
78 dynstr_append(&d, 0xE0 | (c >> 12));
79 dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
80 dynstr_append(&d, 0x80 | (c & 0x3F));
81 } else if(c < 0x110000) {
82 dynstr_append(&d, 0xF0 | (c >> 18));
83 dynstr_append(&d, 0x80 | ((c >> 12) & 0x3F));
84 dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
85 dynstr_append(&d, 0x80 | (c & 0x3F));
86 } else
87 goto error;
88 --ns;
89 }
90 dynstr_terminate(&d);
91 if(ndp)
92 *ndp = d.nvec;
93 return d.vec;
94error:
95 xfree(d.vec);
96 return 0;
97}
98
99/** @brief Convert UTF-8 to UTF-32
100 * @param s Source string
101 * @param ns Length of source string in code points
102 * @param ndp Where to store length of destination string (or NULL)
103 * @return Newly allocated destination string or NULL
104 *
56fd389c
RK
105 * The return value is always 0-terminated. The value returned via @p *ndp
106 * does not include the terminator.
e5a5a138
RK
107 *
108 * If the UTF-8 is not valid then NULL is returned. A UTF-8 sequence
109 * for a code point is invalid if:
110 * - it is not the shortest possible sequence for the code point
111 * - it codes for a UTF-16 surrogate
112 * - it codes for a value outside the unicode code space
113 */
114uint32_t *utf8_to_utf32(const char *s, size_t ns, size_t *ndp) {
115 struct dynstr_ucs4 d;
116 uint32_t c32, c;
117 const uint8_t *ss = (const uint8_t *)s;
118
119 dynstr_ucs4_init(&d);
120 while(ns > 0) {
121 c = *ss++;
122 --ns;
56fd389c
RK
123 /* Acceptable UTF-8 is that which codes for Unicode Scalar Values
124 * (Unicode 5.0.0 s3.9 D76)
e5a5a138
RK
125 *
126 * 0xxxxxxx
127 * 7 data bits gives 0x00 - 0x7F and all are acceptable
128 *
129 * 110xxxxx 10xxxxxx
130 * 11 data bits gives 0x0000 - 0x07FF but only 0x0080 - 0x07FF acceptable
131 *
132 * 1110xxxx 10xxxxxx 10xxxxxx
133 * 16 data bits gives 0x0000 - 0xFFFF but only 0x0800 - 0xFFFF acceptable
134 * (and UTF-16 surrogates are not acceptable)
135 *
136 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
137 * 21 data bits gives 0x00000000 - 0x001FFFFF
138 * but only 0x00010000 - 0x0010FFFF are acceptable
139 *
56fd389c
RK
140 * It is NOT always the case that the data bits in the first byte are
141 * always non-0 for the acceptable values, so we do a separate check after
142 * decoding.
e5a5a138
RK
143 */
144 if(c < 0x80)
145 c32 = c;
146 else if(c <= 0xDF) {
147 if(ns < 1) goto error;
148 c32 = c & 0x1F;
149 c = *ss++;
150 if((c & 0xC0) != 0x80) goto error;
151 c32 = (c32 << 6) | (c & 0x3F);
152 if(c32 < 0x80) goto error;
153 } else if(c <= 0xEF) {
154 if(ns < 2) goto error;
155 c32 = c & 0x0F;
156 c = *ss++;
157 if((c & 0xC0) != 0x80) goto error;
158 c32 = (c32 << 6) | (c & 0x3F);
159 c = *ss++;
160 if((c & 0xC0) != 0x80) goto error;
161 c32 = (c32 << 6) | (c & 0x3F);
162 if(c32 < 0x0800 || (c32 >= 0xD800 && c32 <= 0xDFFF)) goto error;
163 } else if(c <= 0xF7) {
164 if(ns < 3) goto error;
165 c32 = c & 0x07;
166 c = *ss++;
167 if((c & 0xC0) != 0x80) goto error;
168 c32 = (c32 << 6) | (c & 0x3F);
169 c = *ss++;
170 if((c & 0xC0) != 0x80) goto error;
171 c32 = (c32 << 6) | (c & 0x3F);
172 c = *ss++;
173 if((c & 0xC0) != 0x80) goto error;
174 c32 = (c32 << 6) | (c & 0x3F);
175 if(c32 < 0x00010000 || c32 > 0x0010FFFF) goto error;
176 } else
177 goto error;
178 dynstr_ucs4_append(&d, c32);
179 }
180 dynstr_ucs4_terminate(&d);
181 if(ndp)
182 *ndp = d.nvec;
183 return d.vec;
184error:
185 xfree(d.vec);
186 return 0;
187}
188
189/*@}*/
190/** @defgroup utf32 Functions that operate on UTF-32 strings */
191/*@{*/
192
193/** @brief Return the length of a 0-terminated UTF-32 string
194 * @param s Pointer to 0-terminated string
195 * @return Length of string in code points (excluding terminator)
196 *
56fd389c 197 * Unlike the conversion functions no validity checking is done on the string.
e5a5a138
RK
198 */
199size_t utf32_len(const uint32_t *s) {
200 const uint32_t *t = s;
201
202 while(*t)
203 ++t;
204 return (size_t)(t - s);
205}
206
bcf9ed7f
RK
207/** @brief Return the @ref unidata structure for code point @p c
208 *
209 * @p c can be any 32-bit value, a sensible value will be returned regardless.
210 */
211static const struct unidata *utf32__unidata(uint32_t c) {
1a05e381
RK
212 /* The bottom half of the table contains almost everything of interest
213 * and we can just return the right thing straight away */
214 if(c < UNICODE_BREAK_START)
bcf9ed7f 215 return &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
1a05e381
RK
216 /* Within the break everything is unassigned */
217 if(c < UNICODE_BREAK_END)
218 return utf32__unidata(0xFFFF); /* guaranteed to be Cn */
219 /* Planes 15 and 16 are (mostly) private use */
220 if((c >= 0xF0000 && c <= 0xFFFFD)
221 || (c >= 0x100000 && c <= 0x10FFFD))
222 return utf32__unidata(0xE000); /* first Co code point */
223 /* Everything else above the break top is unassigned */
224 if(c >= UNICODE_BREAK_TOP)
225 return utf32__unidata(0xFFFF); /* guaranteed to be Cn */
226 /* Currently the rest is language tags and variation selectors */
227 c -= (UNICODE_BREAK_END - UNICODE_BREAK_START);
228 return &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
bcf9ed7f
RK
229}
230
e5a5a138
RK
231/** @brief Return the combining class of @p c
232 * @param c Code point
233 * @return Combining class of @p c
234 */
235static inline int utf32__combining_class(uint32_t c) {
bcf9ed7f 236 return utf32__unidata(c)->ccc;
e5a5a138
RK
237}
238
239/** @brief Stably sort [s,s+ns) into descending order of combining class
240 * @param s Start of array
241 * @param ns Number of elements, must be at least 1
242 * @param buffer Buffer of at least @p ns elements
243 */
244static void utf32__sort_ccc(uint32_t *s, size_t ns, uint32_t *buffer) {
245 uint32_t *a, *b, *bp;
246 size_t na, nb;
247
248 switch(ns) {
249 case 1: /* 1-element array is always sorted */
250 return;
251 case 2: /* 2-element arrays are trivial to sort */
252 if(utf32__combining_class(s[0]) > utf32__combining_class(s[1])) {
253 uint32_t tmp = s[0];
254 s[0] = s[1];
255 s[1] = tmp;
256 }
257 return;
258 default:
259 /* Partition the array */
260 na = ns / 2;
261 nb = ns - na;
262 a = s;
263 b = s + na;
264 /* Sort the two halves of the array */
265 utf32__sort_ccc(a, na, buffer);
266 utf32__sort_ccc(b, nb, buffer);
267 /* Merge them back into one, via the buffer */
268 bp = buffer;
269 while(na > 0 && nb > 0) {
270 /* We want descending order of combining class (hence <)
271 * and we want stability within combining classes (hence <=)
272 */
273 if(utf32__combining_class(*a) <= utf32__combining_class(*b)) {
274 *bp++ = *a++;
275 --na;
276 } else {
277 *bp++ = *b++;
278 --nb;
279 }
280 }
281 while(na > 0) {
282 *bp++ = *a++;
283 --na;
284 }
285 while(nb > 0) {
286 *bp++ = *b++;
287 --nb;
288 }
289 memcpy(s, buffer, ns * sizeof(uint32_t));
290 return;
291 }
292}
293
294/** @brief Put combining characters into canonical order
295 * @param s Pointer to UTF-32 string
296 * @param ns Length of @p s
297 * @return 0 on success, -1 on error
298 *
56fd389c
RK
299 * @p s is modified in-place. See Unicode 5.0 s3.11 for details of the
300 * ordering.
e5a5a138 301 *
56fd389c
RK
302 * Currently we only support a maximum of 1024 combining characters after each
303 * base character. If this limit is exceeded then -1 is returned.
e5a5a138
RK
304 */
305static int utf32__canonical_ordering(uint32_t *s, size_t ns) {
306 size_t nc;
307 uint32_t buffer[1024];
308
309 /* The ordering amounts to a stable sort of each contiguous group of
310 * characters with non-0 combining class. */
311 while(ns > 0) {
312 /* Skip non-combining characters */
313 if(utf32__combining_class(*s) == 0) {
314 ++s;
315 --ns;
316 continue;
317 }
318 /* We must now have at least one combining character; see how many
319 * there are */
320 for(nc = 1; nc < ns && utf32__combining_class(s[nc]) != 0; ++nc)
321 ;
322 if(nc > 1024)
323 return -1;
324 /* Sort the array */
325 utf32__sort_ccc(s, nc, buffer);
326 s += nc;
327 ns -= nc;
328 }
329 return 0;
330}
331
332/* Magic numbers from UAX #15 s16 */
333#define SBase 0xAC00
334#define LBase 0x1100
335#define VBase 0x1161
336#define TBase 0x11A7
337#define LCount 19
338#define VCount 21
339#define TCount 28
340#define NCount (VCount * TCount)
341#define SCount (LCount * NCount)
342
343/** @brief Guts of the decomposition lookup functions */
344#define utf32__decompose_one_generic(WHICH) do { \
bcf9ed7f 345 const uint32_t *dc = utf32__unidata(c)->WHICH; \
e5a5a138
RK
346 if(dc) { \
347 /* Found a canonical decomposition in the table */ \
348 while(*dc) \
349 utf32__decompose_one_##WHICH(d, *dc++); \
350 } else if(c >= SBase && c < SBase + SCount) { \
351 /* Mechanically decomposable Hangul syllable (UAX #15 s16) */ \
352 const uint32_t SIndex = c - SBase; \
353 const uint32_t L = LBase + SIndex / NCount; \
354 const uint32_t V = VBase + (SIndex % NCount) / TCount; \
355 const uint32_t T = TBase + SIndex % TCount; \
356 dynstr_ucs4_append(d, L); \
357 dynstr_ucs4_append(d, V); \
358 if(T != TBase) \
359 dynstr_ucs4_append(d, T); \
360 } else \
361 /* Equal to own canonical decomposition */ \
362 dynstr_ucs4_append(d, c); \
363} while(0)
364
365/** @brief Recursively compute the canonical decomposition of @p c
366 * @param d Dynamic string to store decomposition in
367 * @param c Code point to decompose (must be a valid!)
368 * @return 0 on success, -1 on error
369 */
370static void utf32__decompose_one_canon(struct dynstr_ucs4 *d, uint32_t c) {
371 utf32__decompose_one_generic(canon);
372}
373
374/** @brief Recursively compute the compatibility decomposition of @p c
375 * @param d Dynamic string to store decomposition in
376 * @param c Code point to decompose (must be a valid!)
377 * @return 0 on success, -1 on error
378 */
379static void utf32__decompose_one_compat(struct dynstr_ucs4 *d, uint32_t c) {
380 utf32__decompose_one_generic(compat);
381}
382
383/** @brief Guts of the decomposition functions */
384#define utf32__decompose_generic(WHICH) do { \
385 struct dynstr_ucs4 d; \
386 uint32_t c; \
387 \
388 dynstr_ucs4_init(&d); \
389 while(ns) { \
390 c = *s++; \
56fd389c 391 if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF) \
e5a5a138
RK
392 goto error; \
393 utf32__decompose_one_##WHICH(&d, c); \
394 --ns; \
395 } \
396 if(utf32__canonical_ordering(d.vec, d.nvec)) \
397 goto error; \
398 dynstr_ucs4_terminate(&d); \
399 if(ndp) \
400 *ndp = d.nvec; \
401 return d.vec; \
402error: \
403 xfree(d.vec); \
404 return 0; \
405} while(0)
406
407/** @brief Canonically decompose @p [s,s+ns)
408 * @param s Pointer to string
409 * @param ns Length of string
410 * @param ndp Where to store length of result
411 * @return Pointer to result string, or NULL
412 *
413 * Computes the canonical decomposition of a string and stably sorts combining
414 * characters into canonical order. The result is in Normalization Form D and
415 * (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
416 * NormalizationTest.txt.
417 *
56fd389c 418 * Returns NULL if the string is not valid for either of the following reasons:
e5a5a138
RK
419 * - it codes for a UTF-16 surrogate
420 * - it codes for a value outside the unicode code space
421 */
422uint32_t *utf32_decompose_canon(const uint32_t *s, size_t ns, size_t *ndp) {
423 utf32__decompose_generic(canon);
424}
425
426/** @brief Compatibility decompose @p [s,s+ns)
427 * @param s Pointer to string
428 * @param ns Length of string
429 * @param ndp Where to store length of result
430 * @return Pointer to result string, or NULL
431 *
432 * Computes the compatibility decomposition of a string and stably sorts
433 * combining characters into canonical order. The result is in Normalization
434 * Form KD and (at the time of writing!) passes the NFKD tests defined in
435 * Unicode 5.0's NormalizationTest.txt.
436 *
56fd389c 437 * Returns NULL if the string is not valid for either of the following reasons:
e5a5a138
RK
438 * - it codes for a UTF-16 surrogate
439 * - it codes for a value outside the unicode code space
440 */
441uint32_t *utf32_decompose_compat(const uint32_t *s, size_t ns, size_t *ndp) {
442 utf32__decompose_generic(compat);
443}
444
56fd389c
RK
445/** @brief Single-character case-fold and decompose operation */
446#define utf32__casefold_one(WHICH) do { \
bcf9ed7f 447 const uint32_t *cf = utf32__unidata(c)->casefold; \
56fd389c
RK
448 if(cf) { \
449 /* Found a case-fold mapping in the table */ \
450 while(*cf) \
451 utf32__decompose_one_##WHICH(&d, *cf++); \
452 } else \
453 utf32__decompose_one_##WHICH(&d, c); \
454} while(0)
e5a5a138
RK
455
456/** @brief Case-fold @p [s,s+ns)
457 * @param s Pointer to string
458 * @param ns Length of string
459 * @param ndp Where to store length of result
460 * @return Pointer to result string, or NULL
461 *
462 * Case-fold the string at @p s according to full default case-folding rules
56fd389c 463 * (s3.13) for caseless matching. The result will be in NFD.
e5a5a138 464 *
56fd389c 465 * Returns NULL if the string is not valid for either of the following reasons:
e5a5a138
RK
466 * - it codes for a UTF-16 surrogate
467 * - it codes for a value outside the unicode code space
468 */
469uint32_t *utf32_casefold_canon(const uint32_t *s, size_t ns, size_t *ndp) {
470 struct dynstr_ucs4 d;
471 uint32_t c;
472 size_t n;
473 uint32_t *ss = 0;
474
475 /* If the canonical decomposition of the string includes any combining
476 * character that case-folds to a non-combining character then we must
477 * normalize before we fold. In Unicode 5.0.0 this means 0345 COMBINING
478 * GREEK YPOGEGRAMMENI in its decomposition and the various characters that
479 * canonically decompose to it. */
bcf9ed7f
RK
480 for(n = 0; n < ns; ++n)
481 if(utf32__unidata(s[n])->flags & unicode_normalize_before_casefold)
e5a5a138 482 break;
e5a5a138
RK
483 if(n < ns) {
484 /* We need a preliminary decomposition */
485 if(!(ss = utf32_decompose_canon(s, ns, &ns)))
486 return 0;
487 s = ss;
488 }
489 dynstr_ucs4_init(&d);
490 while(ns) {
491 c = *s++;
56fd389c 492 if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF)
e5a5a138 493 goto error;
56fd389c 494 utf32__casefold_one(canon);
e5a5a138
RK
495 --ns;
496 }
497 if(utf32__canonical_ordering(d.vec, d.nvec))
498 goto error;
499 dynstr_ucs4_terminate(&d);
500 if(ndp)
501 *ndp = d.nvec;
502 return d.vec;
503error:
504 xfree(d.vec);
505 xfree(ss);
506 return 0;
507}
508
56fd389c
RK
509/** @brief Compatibilit case-fold @p [s,s+ns)
510 * @param s Pointer to string
511 * @param ns Length of string
512 * @param ndp Where to store length of result
513 * @return Pointer to result string, or NULL
514 *
515 * Case-fold the string at @p s according to full default case-folding rules
516 * (s3.13) for compatibility caseless matching. The result will be in NFKD.
517 *
518 * Returns NULL if the string is not valid for either of the following reasons:
519 * - it codes for a UTF-16 surrogate
520 * - it codes for a value outside the unicode code space
521 */
522uint32_t *utf32_casefold_compat(const uint32_t *s, size_t ns, size_t *ndp) {
523 struct dynstr_ucs4 d;
524 uint32_t c;
525 size_t n;
526 uint32_t *ss = 0;
527
bcf9ed7f
RK
528 for(n = 0; n < ns; ++n)
529 if(utf32__unidata(s[n])->flags & unicode_normalize_before_casefold)
56fd389c 530 break;
56fd389c
RK
531 if(n < ns) {
532 /* We need a preliminary _canonical_ decomposition */
533 if(!(ss = utf32_decompose_canon(s, ns, &ns)))
534 return 0;
535 s = ss;
536 }
537 /* This computes NFKD(toCaseFold(s)) */
538#define compat_casefold_middle() do { \
539 dynstr_ucs4_init(&d); \
540 while(ns) { \
541 c = *s++; \
542 if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF) \
543 goto error; \
544 utf32__casefold_one(compat); \
545 --ns; \
546 } \
547 if(utf32__canonical_ordering(d.vec, d.nvec)) \
548 goto error; \
549} while(0)
550 /* Do the inner (NFKD o toCaseFold) */
551 compat_casefold_middle();
552 /* We can do away with the NFD'd copy of the input now */
553 xfree(ss);
554 s = ss = d.vec;
555 ns = d.nvec;
556 /* Do the outer (NFKD o toCaseFold) */
557 compat_casefold_middle();
558 /* That's all */
559 dynstr_ucs4_terminate(&d);
560 if(ndp)
561 *ndp = d.nvec;
562 return d.vec;
563error:
564 xfree(d.vec);
565 xfree(ss);
566 return 0;
567}
568
e5a5a138
RK
569/** @brief Order a pair of UTF-32 strings
570 * @param a First 0-terminated string
571 * @param b Second 0-terminated string
572 * @return -1, 0 or 1 for a less than, equal to or greater than b
573 *
574 * "Comparable to strcmp() at its best."
575 */
576int utf32_cmp(const uint32_t *a, const uint32_t *b) {
577 while(*a && *b && *a == *b) {
578 ++a;
579 ++b;
580 }
581 return *a < *b ? -1 : (*a > *b ? 1 : 0);
582}
583
35b651f0
RK
584/** @brief Return the General_Category value for @p c
585 * @param Code point
586 * @return General_Category property value
587 */
14523635 588static inline enum unicode_General_Category utf32__general_category(uint32_t c) {
bcf9ed7f 589 return utf32__unidata(c)->general_category;
35b651f0
RK
590}
591
1625e11a 592/** @brief Determine Grapheme_Break property
35b651f0 593 * @param c Code point
1625e11a 594 * @return Grapheme_Break property value of @p c
35b651f0 595 */
0ee05b26 596static inline enum unicode_Grapheme_Break utf32__grapheme_break(uint32_t c) {
1625e11a 597 return utf32__unidata(c)->grapheme_break;
35b651f0
RK
598}
599
0b7052da
RK
600/** @brief Determine Word_Break property
601 * @param c Code point
602 * @return Word_Break property value of @p c
603 */
0ee05b26 604static inline enum unicode_Word_Break utf32__word_break(uint32_t c) {
0e843521 605 return utf32__unidata(c)->word_break;
0b7052da
RK
606}
607
35b651f0
RK
608/** @brief Identify a grapheme cluster boundary
609 * @param s Start of string (must be NFD)
610 * @param ns Length of string
611 * @param n Index within string (in [0,ns].)
612 * @return 1 at a grapheme cluster boundary, 0 otherwise
613 *
614 * This function identifies default grapheme cluster boundaries as described in
615 * UAX #29 s3. It returns 1 if @p n points at the code point just after a
616 * grapheme cluster boundary (including the hypothetical code point just after
617 * the end of the string).
35b651f0 618 */
1625e11a 619int utf32_is_grapheme_boundary(const uint32_t *s, size_t ns, size_t n) {
35b651f0 620 uint32_t before, after;
1625e11a 621 enum unicode_Grapheme_Break gbbefore, gbafter;
35b651f0
RK
622 /* GB1 and GB2 */
623 if(n == 0 || n == ns)
624 return 1;
625 /* Now we know that s[n-1] and s[n] are safe to inspect */
626 /* GB3 */
627 before = s[n-1];
628 after = s[n];
629 if(before == 0x000D && after == 0x000A)
630 return 0;
1625e11a
RK
631 gbbefore = utf32__grapheme_break(before);
632 gbafter = utf32__grapheme_break(after);
633 /* GB4 */
634 if(gbbefore == unicode_Grapheme_Break_Control
635 || before == 0x000D
636 || before == 0x000A)
637 return 1;
638 /* GB5 */
639 if(gbafter == unicode_Grapheme_Break_Control
640 || after == 0x000D
641 || after == 0x000A)
35b651f0 642 return 1;
35b651f0 643 /* GB6 */
1625e11a
RK
644 if(gbbefore == unicode_Grapheme_Break_L
645 && (gbafter == unicode_Grapheme_Break_L
646 || gbafter == unicode_Grapheme_Break_V
647 || gbafter == unicode_Grapheme_Break_LV
648 || gbafter == unicode_Grapheme_Break_LVT))
35b651f0
RK
649 return 0;
650 /* GB7 */
1625e11a
RK
651 if((gbbefore == unicode_Grapheme_Break_LV
652 || gbbefore == unicode_Grapheme_Break_V)
653 && (gbafter == unicode_Grapheme_Break_V
654 || gbafter == unicode_Grapheme_Break_T))
35b651f0
RK
655 return 0;
656 /* GB8 */
1625e11a
RK
657 if((gbbefore == unicode_Grapheme_Break_LVT
658 || gbbefore == unicode_Grapheme_Break_T)
659 && gbafter == unicode_Grapheme_Break_T)
35b651f0
RK
660 return 0;
661 /* GB9 */
0ee05b26 662 if(gbafter == unicode_Grapheme_Break_Extend)
35b651f0
RK
663 return 0;
664 /* GB10 */
665 return 1;
666}
667
bb48024f
RK
668/** @brief Return true if @p c is ignorable for boundary specifications */
669static inline int utf32__boundary_ignorable(enum unicode_Word_Break wb) {
670 return (wb == unicode_Word_Break_Extend
671 || wb == unicode_Word_Break_Format);
0b7052da
RK
672}
673
674/** @brief Identify a word boundary
675 * @param s Start of string (must be NFD)
676 * @param ns Length of string
677 * @param n Index within string (in [0,ns].)
678 * @return 1 at a word boundary, 0 otherwise
679 *
680 * This function identifies default word boundaries as described in UAX #29 s4.
681 * It returns 1 if @p n points at the code point just after a word boundary
682 * (including the hypothetical code point just after the end of the string).
683 */
684int utf32_is_word_boundary(const uint32_t *s, size_t ns, size_t n) {
685 enum unicode_Word_Break twobefore, before, after, twoafter;
686 size_t nn;
687
688 /* WB1 and WB2 */
689 if(n == 0 || n == ns)
690 return 1;
691 /* WB3 */
692 if(s[n-1] == 0x000D && s[n] == 0x000A)
693 return 0;
694 /* WB4 */
bb48024f
RK
695 /* (!Sep) x (Extend|Format) as in UAX #29 s6.2 */
696 switch(s[n-1]) { /* bit of a bodge */
697 case 0x000A:
698 case 0x000D:
699 case 0x0085:
700 case 0x2028:
701 case 0x2029:
702 break;
703 default:
704 if(utf32__boundary_ignorable(utf32__word_break(s[n])))
705 return 0;
706 break;
0b7052da 707 }
bb48024f
RK
708 /* Gather the property values we'll need for the rest of the test taking the
709 * s6.2 changes into account */
710 /* First we look at the code points after the proposed boundary */
711 nn = n; /* <ns */
712 after = utf32__word_break(s[nn++]);
713 if(!utf32__boundary_ignorable(after)) {
714 /* X (Extend|Format)* -> X */
715 while(nn < ns && utf32__boundary_ignorable(utf32__word_break(s[nn])))
716 ++nn;
717 }
718 /* It's possible now that nn=ns */
719 if(nn < ns)
720 twoafter = utf32__word_break(s[nn]);
721 else
722 twoafter = unicode_Word_Break_Other;
723
724 /* Next we look at the code points before the proposed boundary. This is a
725 * bit fiddlier. */
726 nn = n;
727 while(nn > 0 && utf32__boundary_ignorable(utf32__word_break(s[nn - 1])))
728 --nn;
729 if(nn == 0) {
730 /* s[nn] must be ignorable */
731 before = utf32__word_break(s[nn]);
732 twobefore = unicode_Word_Break_Other;
733 } else {
734 /* s[nn] is ignorable or after the proposed boundary; but s[nn-1] is not
735 * ignorable. */
736 before = utf32__word_break(s[nn - 1]);
737 --nn;
738 /* Repeat the exercise */
739 while(nn > 0 && utf32__boundary_ignorable(utf32__word_break(s[nn - 1])))
740 --nn;
741 if(nn == 0)
742 twobefore = utf32__word_break(s[nn]);
743 else
744 twobefore = utf32__word_break(s[nn - 1]);
0b7052da 745 }
bb48024f 746
0b7052da
RK
747 /* WB5 */
748 if(before == unicode_Word_Break_ALetter
749 && after == unicode_Word_Break_ALetter)
750 return 0;
751 /* WB6 */
752 if(before == unicode_Word_Break_ALetter
753 && after == unicode_Word_Break_MidLetter
754 && twoafter == unicode_Word_Break_ALetter)
755 return 0;
756 /* WB7 */
757 if(twobefore == unicode_Word_Break_ALetter
758 && before == unicode_Word_Break_MidLetter
759 && after == unicode_Word_Break_ALetter)
760 return 0;
761 /* WB8 */
762 if(before == unicode_Word_Break_Numeric
763 && after == unicode_Word_Break_Numeric)
764 return 0;
765 /* WB9 */
766 if(before == unicode_Word_Break_ALetter
767 && after == unicode_Word_Break_Numeric)
768 return 0;
769 /* WB10 */
770 if(before == unicode_Word_Break_Numeric
771 && after == unicode_Word_Break_ALetter)
772 return 0;
773 /* WB11 */
774 if(twobefore == unicode_Word_Break_Numeric
775 && before == unicode_Word_Break_MidNum
776 && after == unicode_Word_Break_Numeric)
777 return 0;
778 /* WB12 */
779 if(before == unicode_Word_Break_Numeric
780 && after == unicode_Word_Break_MidNum
781 && twoafter == unicode_Word_Break_Numeric)
782 return 0;
783 /* WB13 */
784 if(before == unicode_Word_Break_Katakana
785 && after == unicode_Word_Break_Katakana)
786 return 0;
787 /* WB13a */
788 if((before == unicode_Word_Break_ALetter
789 || before == unicode_Word_Break_Numeric
790 || before == unicode_Word_Break_Katakana
791 || before == unicode_Word_Break_ExtendNumLet)
792 && after == unicode_Word_Break_ExtendNumLet)
793 return 0;
794 /* WB13b */
795 if(before == unicode_Word_Break_ExtendNumLet
796 && (after == unicode_Word_Break_ALetter
797 || after == unicode_Word_Break_Numeric
798 || after == unicode_Word_Break_Katakana))
799 return 0;
800 /* WB14 */
801 return 1;
802}
803
e5a5a138 804/*@}*/
349b7b74 805/** @defgroup utf8 Functions that operate on UTF-8 strings */
e5a5a138
RK
806/*@{*/
807
808/** @brief Wrapper to transform a UTF-8 string using the UTF-32 function */
809#define utf8__transform(FN) do { \
810 uint32_t *to32 = 0, *decomp32 = 0; \
811 size_t nto32, ndecomp32; \
812 char *decomp8 = 0; \
813 \
814 if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error; \
815 if(!(decomp32 = FN(to32, nto32, &ndecomp32))) goto error; \
816 decomp8 = utf32_to_utf8(decomp32, ndecomp32, ndp); \
817error: \
818 xfree(to32); \
819 xfree(decomp32); \
820 return decomp8; \
821} while(0)
822
823/** @brief Canonically decompose @p [s,s+ns)
824 * @param s Pointer to string
825 * @param ns Length of string
826 * @param ndp Where to store length of result
827 * @return Pointer to result string, or NULL
828 *
829 * Computes the canonical decomposition of a string and stably sorts combining
830 * characters into canonical order. The result is in Normalization Form D and
831 * (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
832 * NormalizationTest.txt.
833 *
834 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
835 * this might be.
836 *
837 * See also utf32_decompose_canon().
838 */
839char *utf8_decompose_canon(const char *s, size_t ns, size_t *ndp) {
840 utf8__transform(utf32_decompose_canon);
841}
842
843/** @brief Compatibility decompose @p [s,s+ns)
844 * @param s Pointer to string
845 * @param ns Length of string
846 * @param ndp Where to store length of result
847 * @return Pointer to result string, or NULL
848 *
849 * Computes the compatibility decomposition of a string and stably sorts
850 * combining characters into canonical order. The result is in Normalization
851 * Form KD and (at the time of writing!) passes the NFKD tests defined in
852 * Unicode 5.0's NormalizationTest.txt.
853 *
854 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
855 * this might be.
856 *
857 * See also utf32_decompose_compat().
858 */
859char *utf8_decompose_compat(const char *s, size_t ns, size_t *ndp) {
860 utf8__transform(utf32_decompose_compat);
861}
862
863/** @brief Case-fold @p [s,s+ns)
864 * @param s Pointer to string
865 * @param ns Length of string
866 * @param ndp Where to store length of result
867 * @return Pointer to result string, or NULL
868 *
869 * Case-fold the string at @p s according to full default case-folding rules
870 * (s3.13). The result will be in NFD.
871 *
872 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
873 * this might be.
874 */
875char *utf8_casefold_canon(const char *s, size_t ns, size_t *ndp) {
876 utf8__transform(utf32_casefold_canon);
877}
878
879/** @brief Compatibility case-fold @p [s,s+ns)
880 * @param s Pointer to string
881 * @param ns Length of string
882 * @param ndp Where to store length of result
883 * @return Pointer to result string, or NULL
884 *
885 * Case-fold the string at @p s according to full default case-folding rules
886 * (s3.13). The result will be in NFKD.
887 *
888 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
889 * this might be.
890 */
e5a5a138
RK
891char *utf8_casefold_compat(const char *s, size_t ns, size_t *ndp) {
892 utf8__transform(utf32_casefold_compat);
893}
e5a5a138
RK
894
895/*@}*/
896
897/*
898Local Variables:
899c-basic-offset:2
900comment-column:40
901fill-column:79
902indent-tabs-mode:nil
903End:
904*/