chiark / gitweb /
Merge branch 'master' of git.distorted.org.uk:~mdw/publish/public-git/disorder
[disorder] / lib / unicode.c
CommitLineData
e5a5a138
RK
1/*
2 * This file is part of DisOrder
bb5c7798 3 * Copyright (C) 2007, 2009, 2013 Richard Kettlewell
e5a5a138 4 *
e7eb3a27 5 * This program is free software: you can redistribute it and/or modify
e5a5a138 6 * it under the terms of the GNU General Public License as published by
e7eb3a27 7 * the Free Software Foundation, either version 3 of the License, or
e5a5a138 8 * (at your option) any later version.
e7eb3a27
RK
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
e5a5a138 15 * You should have received a copy of the GNU General Public License
e7eb3a27 16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
e5a5a138
RK
17 */
18/** @file lib/unicode.c
19 * @brief Unicode support functions
20 *
21 * Here by UTF-8 and UTF-32 we mean the encoding forms of those names (not the
35b651f0
RK
22 * encoding schemes). The primary encoding form is UTF-32 but convenience
23 * wrappers using UTF-8 are provided for a number of functions.
e5a5a138
RK
24 *
25 * The idea is that all the strings that hit the database will be in a
26 * particular normalization form, and for the search and tags database
27 * in case-folded form, so they can be naively compared within the
28 * database code.
29 *
30 * As the code stands this guarantee is not well met!
0ae60b83
RK
31 *
32 * Subpages:
33 * - @ref utf32props
34 * - @ref utftransform
35 * - @ref utf32iterator
36 * - @ref utf32
37 * - @ref utf8
e5a5a138
RK
38 */
39
05b75f8d 40#include "common.h"
e5a5a138
RK
41
42#include "mem.h"
43#include "vector.h"
44#include "unicode.h"
45#include "unidata.h"
46
092f426f
RK
47/** @defgroup utf32props Unicode Code Point Properties */
48/*@{*/
49
50static const struct unidata *utf32__unidata_hard(uint32_t c);
51
52/** @brief Find definition of code point @p c
53 * @param c Code point
54 * @return Pointer to @ref unidata structure for @p c
55 *
56 * @p c can be any 32-bit value, a sensible value will be returned regardless.
57 * The returned pointer is NOT guaranteed to be unique to @p c.
58 */
59static inline const struct unidata *utf32__unidata(uint32_t c) {
60 /* The bottom half of the table contains almost everything of interest
61 * and we can just return the right thing straight away */
62 if(c < UNICODE_BREAK_START)
63 return &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
64 else
65 return utf32__unidata_hard(c);
66}
67
68/** @brief Find definition of code point @p c
69 * @param c Code point
70 * @return Pointer to @ref unidata structure for @p c
71 *
72 * @p c can be any 32-bit value, a sensible value will be returned regardless.
73 * The returned pointer is NOT guaranteed to be unique to @p c.
74 *
75 * Don't use this function (although it will work fine) - use utf32__unidata()
76 * instead.
77 */
78static const struct unidata *utf32__unidata_hard(uint32_t c) {
79 if(c < UNICODE_BREAK_START)
80 return &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
81 /* Within the break everything is unassigned */
82 if(c < UNICODE_BREAK_END)
83 return utf32__unidata(0xFFFF); /* guaranteed to be Cn */
84 /* Planes 15 and 16 are (mostly) private use */
85 if((c >= 0xF0000 && c <= 0xFFFFD)
86 || (c >= 0x100000 && c <= 0x10FFFD))
87 return utf32__unidata(0xE000); /* first Co code point */
88 /* Everything else above the break top is unassigned */
89 if(c >= UNICODE_BREAK_TOP)
90 return utf32__unidata(0xFFFF); /* guaranteed to be Cn */
91 /* Currently the rest is language tags and variation selectors */
92 c -= (UNICODE_BREAK_END - UNICODE_BREAK_START);
93 return &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
94}
95
96/** @brief Return the combining class of @p c
97 * @param c Code point
98 * @return Combining class of @p c
99 *
100 * @p c can be any 32-bit value, a sensible value will be returned regardless.
101 */
102static inline int utf32__combining_class(uint32_t c) {
103 return utf32__unidata(c)->ccc;
104}
105
3c82b504
RK
/** @brief Fetch the canonical combining class of @p c
 * @param c Code point
 * @return Combining class recorded for @p c
 *
 * Externally visible counterpart of utf32__combining_class().  Any 32-bit
 * value is accepted for @p c.
 */
int utf32_combining_class(uint32_t c) {
  const int ccc = utf32__combining_class(c);

  return ccc;
}
115
092f426f 116/** @brief Return the General_Category value for @p c
0ae60b83 117 * @param c Code point
092f426f
RK
118 * @return General_Category property value
119 *
120 * @p c can be any 32-bit value, a sensible value will be returned regardless.
121 */
122static inline enum unicode_General_Category utf32__general_category(uint32_t c) {
123 return utf32__unidata(c)->general_category;
124}
125
126/** @brief Determine Grapheme_Break property
127 * @param c Code point
128 * @return Grapheme_Break property value of @p c
129 *
130 * @p c can be any 32-bit value, a sensible value will be returned regardless.
131 */
132static inline enum unicode_Grapheme_Break utf32__grapheme_break(uint32_t c) {
133 return utf32__unidata(c)->grapheme_break;
134}
135
136/** @brief Determine Word_Break property
137 * @param c Code point
138 * @return Word_Break property value of @p c
139 *
140 * @p c can be any 32-bit value, a sensible value will be returned regardless.
141 */
142static inline enum unicode_Word_Break utf32__word_break(uint32_t c) {
143 return utf32__unidata(c)->word_break;
144}
145
146/** @brief Determine Sentence_Break property
147 * @param c Code point
148 * @return Word_Break property value of @p c
149 *
150 * @p c can be any 32-bit value, a sensible value will be returned regardless.
151 */
152static inline enum unicode_Sentence_Break utf32__sentence_break(uint32_t c) {
153 return utf32__unidata(c)->sentence_break;
154}
155
156/** @brief Return true if @p c is ignorable for boundary specifications
157 * @param wb Word break property value
158 * @return non-0 if @p wb is unicode_Word_Break_Extend or unicode_Word_Break_Format
159 */
160static inline int utf32__boundary_ignorable(enum unicode_Word_Break wb) {
161 return (wb == unicode_Word_Break_Extend
162 || wb == unicode_Word_Break_Format);
163}
164
f98fcddb
RK
165/** @brief Return the canonical decomposition of @p c
166 * @param c Code point
167 * @return 0-terminated canonical decomposition, or 0
168 */
169static inline const uint32_t *utf32__decomposition_canon(uint32_t c) {
170 const struct unidata *const data = utf32__unidata(c);
171 const uint32_t *const decomp = data->decomp;
172
173 if(decomp && !(data->flags & unicode_compatibility_decomposition))
174 return decomp;
175 else
176 return 0;
177}
178
179/** @brief Return the compatibility decomposition of @p c
180 * @param c Code point
181 * @return 0-terminated decomposition, or 0
182 */
183static inline const uint32_t *utf32__decomposition_compat(uint32_t c) {
184 return utf32__unidata(c)->decomp;
185}
186
092f426f 187/*@}*/
e5a5a138
RK
188/** @defgroup utftransform Functions that transform between different Unicode encoding forms */
189/*@{*/
190
191/** @brief Convert UTF-32 to UTF-8
192 * @param s Source string
193 * @param ns Length of source string in code points
194 * @param ndp Where to store length of destination string (or NULL)
195 * @return Newly allocated destination string or NULL on error
196 *
56fd389c
RK
197 * If the UTF-32 is not valid then NULL is returned. A UTF-32 code point is
198 * invalid if:
e5a5a138
RK
199 * - it codes for a UTF-16 surrogate
200 * - it codes for a value outside the unicode code space
201 *
56fd389c
RK
202 * The return value is always 0-terminated. The value returned via @p *ndp
203 * does not include the terminator.
e5a5a138
RK
204 */
205char *utf32_to_utf8(const uint32_t *s, size_t ns, size_t *ndp) {
206 struct dynstr d;
207 uint32_t c;
208
209 dynstr_init(&d);
210 while(ns > 0) {
211 c = *s++;
212 if(c < 0x80)
213 dynstr_append(&d, c);
214 else if(c < 0x0800) {
215 dynstr_append(&d, 0xC0 | (c >> 6));
216 dynstr_append(&d, 0x80 | (c & 0x3F));
217 } else if(c < 0x10000) {
56fd389c 218 if(c >= 0xD800 && c <= 0xDFFF)
e5a5a138
RK
219 goto error;
220 dynstr_append(&d, 0xE0 | (c >> 12));
221 dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
222 dynstr_append(&d, 0x80 | (c & 0x3F));
223 } else if(c < 0x110000) {
224 dynstr_append(&d, 0xF0 | (c >> 18));
225 dynstr_append(&d, 0x80 | ((c >> 12) & 0x3F));
226 dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
227 dynstr_append(&d, 0x80 | (c & 0x3F));
228 } else
229 goto error;
230 --ns;
231 }
232 dynstr_terminate(&d);
233 if(ndp)
234 *ndp = d.nvec;
235 return d.vec;
236error:
237 xfree(d.vec);
238 return 0;
239}
240
241/** @brief Convert UTF-8 to UTF-32
242 * @param s Source string
243 * @param ns Length of source string in code points
244 * @param ndp Where to store length of destination string (or NULL)
f98fcddb 245 * @return Newly allocated destination string or NULL on error
e5a5a138 246 *
56fd389c
RK
247 * The return value is always 0-terminated. The value returned via @p *ndp
248 * does not include the terminator.
e5a5a138
RK
249 *
250 * If the UTF-8 is not valid then NULL is returned. A UTF-8 sequence
251 * for a code point is invalid if:
252 * - it is not the shortest possible sequence for the code point
253 * - it codes for a UTF-16 surrogate
254 * - it codes for a value outside the unicode code space
255 */
256uint32_t *utf8_to_utf32(const char *s, size_t ns, size_t *ndp) {
257 struct dynstr_ucs4 d;
32b158f2 258 uint32_t c32;
e5a5a138 259 const uint8_t *ss = (const uint8_t *)s;
32b158f2 260 int n;
e5a5a138
RK
261
262 dynstr_ucs4_init(&d);
263 while(ns > 0) {
32b158f2
RK
264 const struct unicode_utf8_row *const r = &unicode_utf8_valid[*ss];
265 if(r->count <= ns) {
266 switch(r->count) {
267 case 1:
268 c32 = *ss;
269 break;
270 case 2:
271 if(ss[1] < r->min2 || ss[1] > r->max2)
272 goto error;
273 c32 = *ss & 0x1F;
274 break;
275 case 3:
276 if(ss[1] < r->min2 || ss[1] > r->max2)
277 goto error;
278 c32 = *ss & 0x0F;
279 break;
280 case 4:
281 if(ss[1] < r->min2 || ss[1] > r->max2)
282 goto error;
283 c32 = *ss & 0x07;
284 break;
285 default:
286 goto error;
287 }
e5a5a138
RK
288 } else
289 goto error;
32b158f2
RK
290 for(n = 1; n < r->count; ++n) {
291 if(ss[n] < 0x80 || ss[n] > 0xBF)
292 goto error;
293 c32 = (c32 << 6) | (ss[n] & 0x3F);
294 }
e5a5a138 295 dynstr_ucs4_append(&d, c32);
32b158f2
RK
296 ss += r->count;
297 ns -= r->count;
e5a5a138
RK
298 }
299 dynstr_ucs4_terminate(&d);
300 if(ndp)
301 *ndp = d.nvec;
302 return d.vec;
303error:
304 xfree(d.vec);
305 return 0;
306}
307
bb5c7798
RK
308/** @brief Convert UTF-16 to UTF-8
309 * @param s Source string
310 * @param ns Length of source string in code points
311 * @param ndp Where to store length of destination string (or NULL)
312 * @return Newly allocated destination string or NULL on error
313 *
314 * If the UTF-16 is not valid then NULL is returned. A UTF-16 sequence t is
315 * invalid if it contains an incomplete surrogate.
316 *
317 * The return value is always 0-terminated. The value returned via @p *ndp
318 * does not include the terminator.
319 */
320char *utf16_to_utf8(const uint16_t *s, size_t ns, size_t *ndp) {
321 struct dynstr d;
322 uint32_t c;
323
324 dynstr_init(&d);
325 while(ns > 0) {
326 c = *s++;
327 --ns;
328 if(c >= 0xD800 && c <= 0xDBFF) {
329 if(ns && *s >= 0xDC00 && c <= 0xDFFF)
330 c = ((c - 0xD800) << 10) + (*s++ - 0xDC00) + 0x10000;
331 else
332 goto error;
333 } else if(c >= 0xDC00 && c <= 0xDFFF)
334 goto error;
335 if(c < 0x80)
336 dynstr_append(&d, c);
337 else if(c < 0x0800) {
338 dynstr_append(&d, 0xC0 | (c >> 6));
339 dynstr_append(&d, 0x80 | (c & 0x3F));
340 } else if(c < 0x10000) {
341 if(c >= 0xD800 && c <= 0xDFFF)
342 goto error;
343 dynstr_append(&d, 0xE0 | (c >> 12));
344 dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
345 dynstr_append(&d, 0x80 | (c & 0x3F));
346 } else if(c < 0x110000) {
347 dynstr_append(&d, 0xF0 | (c >> 18));
348 dynstr_append(&d, 0x80 | ((c >> 12) & 0x3F));
349 dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
350 dynstr_append(&d, 0x80 | (c & 0x3F));
351 } else
352 goto error;
353 }
354 dynstr_terminate(&d);
355 if(ndp)
356 *ndp = d.nvec;
357 return d.vec;
358error:
359 xfree(d.vec);
360 return 0;
361}
362
363/** @brief Convert UTF-8 to UTF-16
364 * @param s Source string
365 * @param ns Length of source string in code points
366 * @param ndp Where to store length of destination string (or NULL)
367 * @return Newly allocated destination string or NULL on error
368 *
369 * The return value is always 0-terminated. The value returned via @p *ndp
370 * does not include the terminator.
371 *
372 * If the UTF-8 is not valid then NULL is returned. A UTF-8 sequence
373 * for a code point is invalid if:
374 * - it is not the shortest possible sequence for the code point
375 * - it codes for a UTF-16 surrogate
376 * - it codes for a value outside the unicode code space
377 */
378uint16_t *utf8_to_utf16(const char *s, size_t ns, size_t *ndp) {
379 struct dynstr_utf16 d;
380 uint32_t c32;
381 const uint8_t *ss = (const uint8_t *)s;
382 int n;
383
384 dynstr_utf16_init(&d);
385 while(ns > 0) {
386 const struct unicode_utf8_row *const r = &unicode_utf8_valid[*ss];
387 if(r->count <= ns) {
388 switch(r->count) {
389 case 1:
390 c32 = *ss;
391 break;
392 case 2:
393 if(ss[1] < r->min2 || ss[1] > r->max2)
394 goto error;
395 c32 = *ss & 0x1F;
396 break;
397 case 3:
398 if(ss[1] < r->min2 || ss[1] > r->max2)
399 goto error;
400 c32 = *ss & 0x0F;
401 break;
402 case 4:
403 if(ss[1] < r->min2 || ss[1] > r->max2)
404 goto error;
405 c32 = *ss & 0x07;
406 break;
407 default:
408 goto error;
409 }
410 } else
411 goto error;
412 for(n = 1; n < r->count; ++n) {
413 if(ss[n] < 0x80 || ss[n] > 0xBF)
414 goto error;
415 c32 = (c32 << 6) | (ss[n] & 0x3F);
416 }
417 if(c32 >= 0x10000) {
418 c32 -= 0x10000;
419 dynstr_utf16_append(&d, 0xD800 + (c32 >> 10));
420 dynstr_utf16_append(&d, 0xDC00 + (c32 & 0x03FF));
421 } else
422 dynstr_utf16_append(&d, c32);
423 ss += r->count;
424 ns -= r->count;
425 }
426 dynstr_utf16_terminate(&d);
427 if(ndp)
428 *ndp = d.nvec;
429 return d.vec;
430error:
431 xfree(d.vec);
432 return 0;
433}
434
18cda350
RK
435/** @brief Test whether [s,s+ns) is valid UTF-8
436 * @param s Start of string
437 * @param ns Length of string
438 * @return non-0 if @p s is valid UTF-8, 0 if it is not valid
439 *
440 * This function is intended to be much faster than calling utf8_to_utf32() and
441 * throwing away the result.
442 */
443int utf8_valid(const char *s, size_t ns) {
444 const uint8_t *ss = (const uint8_t *)s;
445 while(ns > 0) {
446 const struct unicode_utf8_row *const r = &unicode_utf8_valid[*ss];
447 if(r->count <= ns) {
448 switch(r->count) {
449 case 1:
450 break;
451 case 2:
452 if(ss[1] < r->min2 || ss[1] > r->max2)
453 return 0;
454 break;
455 case 3:
456 if(ss[1] < r->min2 || ss[1] > r->max2)
457 return 0;
458 if(ss[2] < 0x80 || ss[2] > 0xBF)
459 return 0;
460 break;
461 case 4:
462 if(ss[1] < r->min2 || ss[1] > r->max2)
463 return 0;
464 if(ss[2] < 0x80 || ss[2] > 0xBF)
465 return 0;
466 if(ss[3] < 0x80 || ss[3] > 0xBF)
467 return 0;
468 break;
469 default:
470 return 0;
471 }
472 } else
473 return 0;
474 ss += r->count;
475 ns -= r->count;
476 }
477 return 1;
478}
479
092f426f
RK
480/*@}*/
481/** @defgroup utf32iterator UTF-32 string iterators */
482/*@{*/
483
484struct utf32_iterator_data {
485 /** @brief Start of string */
486 const uint32_t *s;
487
488 /** @brief Length of string */
489 size_t ns;
490
491 /** @brief Current position */
492 size_t n;
493
494 /** @brief Last two non-ignorable characters or (uint32_t)-1
495 *
496 * last[1] is the non-Extend/Format character just before position @p n;
497 * last[0] is the one just before that.
498 *
499 * Exception 1: if there is no such non-Extend/Format character then an
500 * Extend/Format character is accepted instead.
501 *
502 * Exception 2: if there is no such character even taking that into account
503 * the value is (uint32_t)-1.
504 */
505 uint32_t last[2];
092f426f 506
c85b7022
RK
507 /** @brief Tailoring for Word_Break */
508 unicode_property_tailor *word_break;
509};
092f426f
RK
510
511/** @brief Initialize an internal private iterator
512 * @param it Iterator
513 * @param s Start of string
514 * @param ns Length of string
515 * @param n Absolute position
516 */
517static void utf32__iterator_init(utf32_iterator it,
518 const uint32_t *s, size_t ns, size_t n) {
519 it->s = s;
520 it->ns = ns;
521 it->n = 0;
522 it->last[0] = it->last[1] = -1;
c85b7022 523 it->word_break = 0;
b21a155c 524 utf32_iterator_set(it, n);
092f426f
RK
525}
526
c85b7022
RK
527/** @brief Create a new iterator pointing at the start of a string
528 * @param s Start of string
529 * @param ns Length of string
530 * @return New iterator
531 */
532utf32_iterator utf32_iterator_new(const uint32_t *s, size_t ns) {
533 utf32_iterator it = xmalloc(sizeof *it);
534 utf32__iterator_init(it, s, ns, 0);
535 return it;
536}
537
538/** @brief Tailor this iterator's interpretation of the Word_Break property.
539 * @param it Iterator
540 * @param pt Property tailor function or NULL
541 *
542 * After calling this the iterator will call @p pt to determine the Word_Break
543 * property of each code point. If it returns -1 the default value will be
544 * used otherwise the returned value will be used.
545 *
546 * @p pt can be NULL to revert to the default value of the property.
547 *
548 * It is safe to call this function at any time; the iterator's internal state
549 * will be reset to suit the new tailoring.
550 */
551void utf32_iterator_tailor_word_break(utf32_iterator it,
552 unicode_property_tailor *pt) {
553 it->word_break = pt;
554 utf32_iterator_set(it, it->n);
555}
556
557static inline enum unicode_Word_Break utf32__iterator_word_break(utf32_iterator it,
558 uint32_t c) {
559 if(!it->word_break)
560 return utf32__word_break(c);
561 else {
562 const int t = it->word_break(c);
563
564 if(t < 0)
565 return utf32__word_break(c);
566 else
567 return t;
568 }
569}
570
092f426f
RK
571/** @brief Destroy an iterator
572 * @param it Iterator
573 */
574void utf32_iterator_destroy(utf32_iterator it) {
575 xfree(it);
576}
577
578/** @brief Find the current position of an interator
579 * @param it Iterator
580 */
581size_t utf32_iterator_where(utf32_iterator it) {
582 return it->n;
583}
584
585/** @brief Set an iterator's absolute position
586 * @param it Iterator
587 * @param n Absolute position
588 * @return 0 on success, non-0 on error
589 *
590 * It is an error to position the iterator outside the string (but acceptable
591 * to point it at the hypothetical post-final character). If an invalid value
592 * of @p n is specified then the iterator is not changed.
f98fcddb
RK
593 *
594 * This function works by backing up and then advancing to reconstruct the
595 * iterator's internal state for position @p n. The worst case will be O(n)
596 * time complexity (with a worse constant factor that utf32_iterator_advance())
597 * but the typical case is essentially constant-time.
092f426f
RK
598 */
599int utf32_iterator_set(utf32_iterator it, size_t n) {
5617aaff
RK
600 /* We can't just jump to position @p n; the @p last[] values will be wrong.
601 * What we need is to jump a bit behind @p n and then advance forward,
602 * updating @p last[] along the way. How far back? We need to cross two
603 * non-ignorable code points as we advance forwards, so we'd better pass two
604 * such characters on the way back (if such are available).
605 */
b21a155c 606 size_t m;
5617aaff
RK
607
608 if(n > it->ns) /* range check */
092f426f 609 return -1;
b21a155c
RK
610 /* Walk backwards skipping ignorable code points */
611 m = n;
c85b7022
RK
612 while(m > 0
613 && (utf32__boundary_ignorable(utf32__iterator_word_break(it,
614 it->s[m-1]))))
b21a155c
RK
615 --m;
616 /* Either m=0 or s[m-1] is not ignorable */
617 if(m > 0) {
618 --m;
619 /* s[m] is our first non-ignorable code; look for a second in the same
620 way **/
c85b7022
RK
621 while(m > 0
622 && (utf32__boundary_ignorable(utf32__iterator_word_break(it,
623 it->s[m-1]))))
5617aaff 624 --m;
b21a155c
RK
625 /* Either m=0 or s[m-1] is not ignorable */
626 if(m > 0)
627 --m;
628 }
629 it->last[0] = it->last[1] = -1;
5617aaff
RK
630 it->n = m;
631 return utf32_iterator_advance(it, n - m);
092f426f
RK
632}
633
634/** @brief Advance an iterator
635 * @param it Iterator
636 * @param count Number of code points to advance by
637 * @return 0 on success, non-0 on error
638 *
639 * It is an error to advance an iterator beyond the hypothetical post-final
640 * character of the string. If an invalid value of @p n is specified then the
641 * iterator is not changed.
642 *
643 * This function has O(n) time complexity: it works by advancing naively
644 * forwards through the string.
645 */
646int utf32_iterator_advance(utf32_iterator it, size_t count) {
647 if(count <= it->ns - it->n) {
648 while(count > 0) {
649 const uint32_t c = it->s[it->n];
c85b7022 650 const enum unicode_Word_Break wb = utf32__iterator_word_break(it, c);
092f426f
RK
651 if(it->last[1] == (uint32_t)-1
652 || !utf32__boundary_ignorable(wb)) {
653 it->last[0] = it->last[1];
654 it->last[1] = c;
655 }
656 ++it->n;
657 --count;
658 }
659 return 0;
660 } else
661 return -1;
662}
663
664/** @brief Find the current code point
665 * @param it Iterator
666 * @return Current code point or 0
667 *
668 * If the iterator points at the hypothetical post-final character of the
669 * string then 0 is returned. NB that this doesn't mean that there aren't any
670 * 0 code points inside the string!
671 */
672uint32_t utf32_iterator_code(utf32_iterator it) {
673 if(it->n < it->ns)
674 return it->s[it->n];
675 else
676 return 0;
677}
678
679/** @brief Test for a grapheme boundary
680 * @param it Iterator
681 * @return Non-0 if pointing just after a grapheme boundary, otherwise 0
f98fcddb
RK
682 *
683 * This function identifies default grapheme cluster boundaries as described in
684 * UAX #29 s3. It returns non-0 if @p it points at the code point just after a
685 * grapheme cluster boundary (including the hypothetical code point just after
686 * the end of the string).
092f426f
RK
687 */
688int utf32_iterator_grapheme_boundary(utf32_iterator it) {
689 uint32_t before, after;
690 enum unicode_Grapheme_Break gbbefore, gbafter;
691 /* GB1 and GB2 */
692 if(it->n == 0 || it->n == it->ns)
693 return 1;
694 /* Now we know that s[n-1] and s[n] are safe to inspect */
695 /* GB3 */
696 before = it->s[it->n-1];
697 after = it->s[it->n];
698 if(before == 0x000D && after == 0x000A)
699 return 0;
700 gbbefore = utf32__grapheme_break(before);
701 gbafter = utf32__grapheme_break(after);
702 /* GB4 */
703 if(gbbefore == unicode_Grapheme_Break_Control
704 || before == 0x000D
705 || before == 0x000A)
706 return 1;
707 /* GB5 */
708 if(gbafter == unicode_Grapheme_Break_Control
709 || after == 0x000D
710 || after == 0x000A)
711 return 1;
712 /* GB6 */
713 if(gbbefore == unicode_Grapheme_Break_L
714 && (gbafter == unicode_Grapheme_Break_L
715 || gbafter == unicode_Grapheme_Break_V
716 || gbafter == unicode_Grapheme_Break_LV
717 || gbafter == unicode_Grapheme_Break_LVT))
718 return 0;
719 /* GB7 */
720 if((gbbefore == unicode_Grapheme_Break_LV
721 || gbbefore == unicode_Grapheme_Break_V)
722 && (gbafter == unicode_Grapheme_Break_V
723 || gbafter == unicode_Grapheme_Break_T))
724 return 0;
725 /* GB8 */
726 if((gbbefore == unicode_Grapheme_Break_LVT
727 || gbbefore == unicode_Grapheme_Break_T)
728 && gbafter == unicode_Grapheme_Break_T)
729 return 0;
730 /* GB9 */
731 if(gbafter == unicode_Grapheme_Break_Extend)
732 return 0;
e2e88ad8
RK
733 /* GB9a */
734 if(gbafter == unicode_Grapheme_Break_SpacingMark)
735 return 0;
736 /* GB9b */
737 if(gbbefore == unicode_Grapheme_Break_Prepend)
738 return 0;
092f426f
RK
739 /* GB10 */
740 return 1;
741
742}
743
744/** @brief Test for a word boundary
745 * @param it Iterator
746 * @return Non-0 if pointing just after a word boundary, otherwise 0
f98fcddb
RK
747 *
748 * This function identifies default word boundaries as described in UAX #29 s4.
749 * It returns non-0 if @p it points at the code point just after a word
750 * boundary (including the hypothetical code point just after the end of the
751 * string) and 0 otherwise.
092f426f
RK
752 */
753int utf32_iterator_word_boundary(utf32_iterator it) {
36f522a4 754 uint32_t before, after;
2dc0bc24 755 enum unicode_Word_Break wbtwobefore, wbbefore, wbafter, wbtwoafter;
092f426f
RK
756 size_t nn;
757
758 /* WB1 and WB2 */
759 if(it->n == 0 || it->n == it->ns)
760 return 1;
36f522a4
RK
761 before = it->s[it->n-1];
762 after = it->s[it->n];
092f426f 763 /* WB3 */
36f522a4 764 if(before == 0x000D && after == 0x000A)
092f426f 765 return 0;
fb4c61da 766 /* WB3a */
36f522a4
RK
767 if(utf32__iterator_word_break(it, before) == unicode_Word_Break_Newline
768 || before == 0x000D
769 || before == 0x000A)
fb4c61da
RK
770 return 1;
771 /* WB3b */
36f522a4
RK
772 if(utf32__iterator_word_break(it, after) == unicode_Word_Break_Newline
773 || after == 0x000D
774 || after == 0x000A)
fb4c61da 775 return 1;
092f426f
RK
776 /* WB4 */
777 /* (!Sep) x (Extend|Format) as in UAX #29 s6.2 */
36f522a4
RK
778 if(utf32__sentence_break(before) != unicode_Sentence_Break_Sep
779 && utf32__boundary_ignorable(utf32__iterator_word_break(it, after)))
092f426f
RK
780 return 0;
781 /* Gather the property values we'll need for the rest of the test taking the
782 * s6.2 changes into account */
783 /* First we look at the code points after the proposed boundary */
784 nn = it->n; /* <it->ns */
2dc0bc24
RK
785 wbafter = utf32__iterator_word_break(it, it->s[nn++]);
786 if(!utf32__boundary_ignorable(wbafter)) {
092f426f
RK
787 /* X (Extend|Format)* -> X */
788 while(nn < it->ns
c85b7022
RK
789 && utf32__boundary_ignorable(utf32__iterator_word_break(it,
790 it->s[nn])))
092f426f
RK
791 ++nn;
792 }
793 /* It's possible now that nn=ns */
794 if(nn < it->ns)
2dc0bc24 795 wbtwoafter = utf32__iterator_word_break(it, it->s[nn]);
092f426f 796 else
2dc0bc24 797 wbtwoafter = unicode_Word_Break_Other;
092f426f
RK
798
799 /* We've already recorded the non-ignorable code points before the proposed
800 * boundary */
2dc0bc24
RK
801 wbbefore = utf32__iterator_word_break(it, it->last[1]);
802 wbtwobefore = utf32__iterator_word_break(it, it->last[0]);
092f426f
RK
803
804 /* WB5 */
2dc0bc24
RK
805 if(wbbefore == unicode_Word_Break_ALetter
806 && wbafter == unicode_Word_Break_ALetter)
092f426f
RK
807 return 0;
808 /* WB6 */
2dc0bc24
RK
809 if(wbbefore == unicode_Word_Break_ALetter
810 && (wbafter == unicode_Word_Break_MidLetter
811 || wbafter == unicode_Word_Break_MidNumLet)
812 && wbtwoafter == unicode_Word_Break_ALetter)
092f426f
RK
813 return 0;
814 /* WB7 */
2dc0bc24
RK
815 if(wbtwobefore == unicode_Word_Break_ALetter
816 && (wbbefore == unicode_Word_Break_MidLetter
817 || wbbefore == unicode_Word_Break_MidNumLet)
818 && wbafter == unicode_Word_Break_ALetter)
092f426f 819 return 0;
c85b7022 820 /* WB8 */
2dc0bc24
RK
821 if(wbbefore == unicode_Word_Break_Numeric
822 && wbafter == unicode_Word_Break_Numeric)
092f426f
RK
823 return 0;
824 /* WB9 */
2dc0bc24
RK
825 if(wbbefore == unicode_Word_Break_ALetter
826 && wbafter == unicode_Word_Break_Numeric)
092f426f
RK
827 return 0;
828 /* WB10 */
2dc0bc24
RK
829 if(wbbefore == unicode_Word_Break_Numeric
830 && wbafter == unicode_Word_Break_ALetter)
092f426f
RK
831 return 0;
832 /* WB11 */
2dc0bc24
RK
833 if(wbtwobefore == unicode_Word_Break_Numeric
834 && (wbbefore == unicode_Word_Break_MidNum
835 || wbbefore == unicode_Word_Break_MidNumLet)
836 && wbafter == unicode_Word_Break_Numeric)
092f426f
RK
837 return 0;
838 /* WB12 */
2dc0bc24
RK
839 if(wbbefore == unicode_Word_Break_Numeric
840 && (wbafter == unicode_Word_Break_MidNum
841 || wbafter == unicode_Word_Break_MidNumLet)
842 && wbtwoafter == unicode_Word_Break_Numeric)
092f426f
RK
843 return 0;
844 /* WB13 */
2dc0bc24
RK
845 if(wbbefore == unicode_Word_Break_Katakana
846 && wbafter == unicode_Word_Break_Katakana)
092f426f
RK
847 return 0;
848 /* WB13a */
2dc0bc24
RK
849 if((wbbefore == unicode_Word_Break_ALetter
850 || wbbefore == unicode_Word_Break_Numeric
851 || wbbefore == unicode_Word_Break_Katakana
852 || wbbefore == unicode_Word_Break_ExtendNumLet)
853 && wbafter == unicode_Word_Break_ExtendNumLet)
092f426f
RK
854 return 0;
855 /* WB13b */
2dc0bc24
RK
856 if(wbbefore == unicode_Word_Break_ExtendNumLet
857 && (wbafter == unicode_Word_Break_ALetter
858 || wbafter == unicode_Word_Break_Numeric
859 || wbafter == unicode_Word_Break_Katakana))
092f426f
RK
860 return 0;
861 /* WB14 */
862 return 1;
863}
864
e5a5a138
RK
865/*@}*/
866/** @defgroup utf32 Functions that operate on UTF-32 strings */
867/*@{*/
868
/** @brief Measure a 0-terminated UTF-32 string
 * @param s Pointer to 0-terminated string
 * @return Length of string in code points (excluding terminator)
 *
 * The code points are only counted, not validated.
 */
size_t utf32_len(const uint32_t *s) {
  size_t n = 0;

  while(s[n] != 0)
    ++n;
  return n;
}
882
e5a5a138
RK
/** @brief Stably sort [s,s+ns) into ascending order of combining class
 * @param s Start of array
 * @param ns Number of elements
 * @param buffer Scratch buffer of at least @p ns elements
 *
 * This is a recursive merge sort keyed on utf32__combining_class().
 * Elements of equal combining class keep their relative order.  An empty
 * or single-element array is returned unchanged.
 */
static void utf32__sort_ccc(uint32_t *s, size_t ns, uint32_t *buffer) {
  uint32_t *a, *b, *bp;
  size_t na, nb;

  switch(ns) {
  case 0:                       /* nothing to sort; without this case the
                                 * partition below would recurse on two
                                 * empty halves forever */
  case 1:                       /* 1-element array is always sorted */
    return;
  case 2:                       /* 2-element arrays are trivial to sort */
    /* Strict > so that equal classes are left in their original order */
    if(utf32__combining_class(s[0]) > utf32__combining_class(s[1])) {
      uint32_t tmp = s[0];
      s[0] = s[1];
      s[1] = tmp;
    }
    return;
  default:
    /* Partition the array; for ns >= 3 both halves are non-empty */
    na = ns / 2;
    nb = ns - na;
    a = s;
    b = s + na;
    /* Sort the two halves of the array */
    utf32__sort_ccc(a, na, buffer);
    utf32__sort_ccc(b, nb, buffer);
    /* Merge them back into one, via the buffer */
    bp = buffer;
    while(na > 0 && nb > 0) {
      /* Taking from the first half on ties (<=) gives ascending order of
       * combining class and keeps the sort stable */
      if(utf32__combining_class(*a) <= utf32__combining_class(*b)) {
        *bp++ = *a++;
        --na;
      } else {
        *bp++ = *b++;
        --nb;
      }
    }
    /* At most one of the halves still has elements; append them */
    while(na > 0) {
      *bp++ = *a++;
      --na;
    }
    while(nb > 0) {
      *bp++ = *b++;
      --nb;
    }
    memcpy(s, buffer, ns * sizeof(uint32_t));
    return;
  }
}
937
/** @brief Put combining characters into canonical order
 * @param s Pointer to UTF-32 string
 * @param ns Length of @p s
 * @return 0 on success, -1 on error
 *
 * @p s is modified in-place.  See Unicode 5.0 s3.11 for details of the
 * ordering, which amounts to a stable sort of each contiguous run of code
 * points with non-0 combining class.
 *
 * Runs of more than 1024 such code points are not supported; -1 is returned
 * when one is found.  Runs before it will already have been reordered.
 */
static int utf32__canonical_ordering(uint32_t *s, size_t ns) {
  uint32_t buffer[1024];
  size_t nc;

  /* Each iteration consumes nc code points: either one code point of
   * combining class 0 or one whole run of combining characters */
  for(; ns > 0; s += nc, ns -= nc) {
    nc = 1;
    if(utf32__combining_class(*s) == 0)
      continue;
    /* s[0] is a combining character; measure the run it starts */
    while(nc < ns && utf32__combining_class(s[nc]) != 0)
      ++nc;
    /* The run must fit in the scratch buffer */
    if(nc > sizeof buffer / sizeof *buffer)
      return -1;
    utf32__sort_ccc(s, nc, buffer);
  }
  return 0;
}
975
/* Magic numbers from UAX #15 s16, used for the arithmetic mapping between
 * precomposed Hangul syllables and their L, V, T jamo. */
#define SBase 0xAC00                    /* first precomposed syllable */
#define LBase 0x1100                    /* first leading jamo (L) */
#define VBase 0x1161                    /* first vowel jamo (V) */
#define TBase 0x11A7                    /* trailing jamo base; TBase itself
                                         * means "no trailing jamo" */
#define LCount 19                       /* number of L values */
#define VCount 21                       /* number of V values */
#define TCount 28                       /* number of T values incl. "none" */
#define NCount (VCount * TCount)        /* syllables per leading jamo */
#define SCount (LCount * NCount)        /* total precomposed syllables */
986
/** @brief Guts of the decomposition lookup functions
 * @param WHICH @c canon or @c compat to choose the decomposition table
 *
 * Expands inside a function that has @c d (a <code>struct dynstr_ucs4 *</code>)
 * and @c c (the code point) in scope.  The decomposition of @c c is appended
 * to @c d:
 * - if the table has an entry for @c c, each element of that entry is
 *   decomposed recursively via utf32__decompose_one_WHICH()
 * - otherwise, if @c c lies in [SBase, SBase + SCount), it is split
 *   arithmetically into L, V and (if present) T jamo
 * - otherwise @c c is appended unchanged
 */
#define utf32__decompose_one_generic(WHICH) do {                        \
  const uint32_t *dc = utf32__decomposition_##WHICH(c);                 \
  if(dc) {                                                              \
    /* Found a decomposition in the table; it is 0-terminated */        \
    while(*dc)                                                          \
      utf32__decompose_one_##WHICH(d, *dc++);                           \
  } else if(c >= SBase && c < SBase + SCount) {                         \
    /* Mechanically decomposable Hangul syllable (UAX #15 s16) */       \
    const uint32_t SIndex = c - SBase;                                  \
    const uint32_t L = LBase + SIndex / NCount;                         \
    const uint32_t V = VBase + (SIndex % NCount) / TCount;              \
    const uint32_t T = TBase + SIndex % TCount;                         \
    dynstr_ucs4_append(d, L);                                           \
    dynstr_ucs4_append(d, V);                                           \
    /* T == TBase means the syllable has no trailing jamo */            \
    if(T != TBase)                                                      \
      dynstr_ucs4_append(d, T);                                         \
  } else                                                                \
    /* Equal to own decomposition */                                    \
    dynstr_ucs4_append(d, c);                                           \
} while(0)
1008
/** @brief Recursively compute the canonical decomposition of @p c
 * @param d Dynamic string to append the decomposition to
 * @param c Code point to decompose (must be valid; it is not checked here)
 *
 * Nothing is returned; the decomposition is appended to @p d.
 */
static void utf32__decompose_one_canon(struct dynstr_ucs4 *d, uint32_t c) {
  utf32__decompose_one_generic(canon);
}
1017
/** @brief Recursively compute the compatibility decomposition of @p c
 * @param d Dynamic string to append the decomposition to
 * @param c Code point to decompose (must be valid; it is not checked here)
 *
 * Nothing is returned; the decomposition is appended to @p d.
 */
static void utf32__decompose_one_compat(struct dynstr_ucs4 *d, uint32_t c) {
  utf32__decompose_one_generic(compat);
}
1026
16506c9d
RK
/** @brief Magic utf32__compositions() return value for Hangul Choseong
 *
 * Only the address of this array matters: utf32__compositions() returns it as
 * a sentinel and utf32__compose() compares against it.
 */
static const uint32_t utf32__hangul_L[1];
1029
1030/** @brief Return the list of compositions that @p c starts
1031 * @param c Starter code point
1032 * @return Composition list or NULL
1033 *
1034 * For Hangul leading (Choseong) jamo we return the special value
1035 * utf32__hangul_L. These code points are not listed as the targets of
1036 * canonical decompositions (make-unidata checks) so there is no confusion with
1037 * real decompositions here.
1038 */
1039static const uint32_t *utf32__compositions(uint32_t c) {
1040 const uint32_t *compositions = utf32__unidata(c)->composed;
1041
1042 if(compositions)
1043 return compositions;
1044 /* Special-casing for Hangul */
1045 switch(utf32__grapheme_break(c)) {
1046 default:
1047 return 0;
1048 case unicode_Grapheme_Break_L:
1049 return utf32__hangul_L;
1050 }
1051}
1052
1053/** @brief Composition step
1054 * @param s Start of string
1055 * @param ns Length of string
1056 * @return New length of string
1057 *
1058 * This is called from utf32__decompose_generic() to compose the result string
1059 * in place.
1060 */
1061static size_t utf32__compose(uint32_t *s, size_t ns) {
1062 const uint32_t *compositions;
1063 uint32_t *start = s, *t = s, *tt, cc;
1064
1065 while(ns > 0) {
1066 uint32_t starter = *s++;
1067 int block_starters = 0;
1068 --ns;
1069 /* We don't attempt to compose the following things:
1070 * - final characters whatever kind they are
1071 * - non-starter characters
1072 * - starters that don't take part in a canonical decomposition mapping
1073 */
1074 if(ns == 0
1075 || utf32__combining_class(starter)
1076 || !(compositions = utf32__compositions(starter))) {
1077 *t++ = starter;
1078 continue;
1079 }
1080 if(compositions != utf32__hangul_L) {
1081 /* Where we'll put the eventual starter */
1082 tt = t++;
1083 do {
1084 /* See if we can find composition of starter+*s */
1085 const uint32_t cchar = *s, *cp = compositions;
1086 while((cc = *cp++)) {
1087 const uint32_t *decomp = utf32__decomposition_canon(cc);
1088 /* We know decomp[0] == starter */
1089 if(decomp[1] == cchar)
1090 break;
1091 }
1092 if(cc) {
1093 /* Found a composition: cc decomposes to starter,*s */
1094 starter = cc;
1095 compositions = utf32__compositions(starter);
1096 ++s;
1097 --ns;
1098 } else {
1099 /* No composition found. */
1100 const int class = utf32__combining_class(*s);
1101 if(class) {
1102 /* Transfer the uncomposable combining character to the output */
1103 *t++ = *s++;
1104 --ns;
1105 /* All the combining characters of the same class of the
1106 * uncomposable character are blocked by it, but there may be
1107 * others of higher class later. We eat the uncomposable and
1108 * blocked characters and go back round the loop for that higher
1109 * class. */
1110 while(ns > 0 && utf32__combining_class(*s) == class) {
1111 *t++ = *s++;
1112 --ns;
1113 }
1114 /* Block any subsequent starters */
1115 block_starters = 1;
1116 } else {
1117 /* The uncombinable character is itself a starter, so we don't
1118 * transfer it to the output but instead go back round the main
1119 * loop. */
1120 break;
1121 }
1122 }
1123 /* Keep going while there are still characters and the starter takes
1124 * part in some composition */
1125 } while(ns > 0 && compositions
1126 && (!block_starters || utf32__combining_class(*s)));
1127 /* Store any remaining combining characters */
1128 while(ns > 0 && utf32__combining_class(*s)) {
1129 *t++ = *s++;
1130 --ns;
1131 }
1132 /* Store the resulting starter */
1133 *tt = starter;
1134 } else {
1135 /* Special-casing for Hangul
1136 *
1137 * If there are combining characters between the L and the V then they
1138 * will block the V and so no composition happens. Similarly combining
1139 * characters between V and T will block the T and so we only get as far
1140 * as LV.
1141 */
1142 if(utf32__grapheme_break(*s) == unicode_Grapheme_Break_V) {
1143 const uint32_t V = *s++;
1144 const uint32_t LIndex = starter - LBase;
1145 const uint32_t VIndex = V - VBase;
1146 uint32_t TIndex;
1147 --ns;
1148 if(ns > 0
1149 && utf32__grapheme_break(*s) == unicode_Grapheme_Break_T) {
1150 /* We have an L V T sequence */
1151 const uint32_t T = *s++;
1152 TIndex = T - TBase;
1153 --ns;
1154 } else
1155 /* It's just L V */
1156 TIndex = 0;
1157 /* Compose to LVT or LV as appropriate */
1158 starter = (LIndex * VCount + VIndex) * TCount + TIndex + SBase;
1159 } /* else we only have L or LV and no V or T */
1160 *t++ = starter;
1161 /* There could be some combining characters that belong to the V or T.
1162 * These will be treated as non-starter characters at the top of the loop
1163 * and thuss transferred to the output. */
1164 }
1165 }
1166 return t - start;
1167}
1168
/** @brief Guts of the composition and decomposition functions
 * @param WHICH @c canon or @c compat to choose decomposition
 * @param COMPOSE @c 0 or @c 1 to compose
 *
 * Expands to the whole body of a function with parameters @c s, @c ns and
 * @c ndp, and returns from that function: the result vector on success or 0
 * on error.  Errors are a surrogate or out-of-range code point in the input,
 * or failure of utf32__canonical_ordering().
 */
#define utf32__decompose_generic(WHICH, COMPOSE) do {   \
  struct dynstr_ucs4 d;                                 \
  uint32_t c;                                           \
                                                        \
  dynstr_ucs4_init(&d);                                 \
  /* Decompose each input code point in turn */         \
  while(ns) {                                           \
    c = *s++;                                           \
    /* Reject surrogates and values past U+10FFFF */    \
    if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF)    \
      goto error;                                       \
    utf32__decompose_one_##WHICH(&d, c);                \
    --ns;                                               \
  }                                                     \
  if(utf32__canonical_ordering(d.vec, d.nvec))          \
    goto error;                                         \
  /* Composition works in place and may shorten d */    \
  if(COMPOSE)                                           \
    d.nvec = utf32__compose(d.vec, d.nvec);             \
  dynstr_ucs4_terminate(&d);                            \
  if(ndp)                                               \
    *ndp = d.nvec;                                      \
  return d.vec;                                         \
error:                                                  \
  xfree(d.vec);                                         \
  return 0;                                             \
} while(0)
1197
/** @brief Canonically decompose @p [s,s+ns)
 * @param s Pointer to string
 * @param ns Length of string
 * @param ndp Where to store length of result, or NULL
 * @return Pointer to result string, or NULL on error
 *
 * Computes NFD (Normalization Form D) of the string at @p s.  This implies
 * performing all canonical decompositions and then normalizing the order of
 * combining characters.
 *
 * Returns NULL if the string cannot be processed for any of the following
 * reasons:
 * - it codes for a UTF-16 surrogate
 * - it codes for a value outside the unicode code space
 * - after decomposition it contains a run of more than 1024 consecutive
 *   combining characters (see utf32__canonical_ordering())
 *
 * See also:
 * - utf32_decompose_compat()
 * - utf32_compose_canon()
 */
uint32_t *utf32_decompose_canon(const uint32_t *s, size_t ns, size_t *ndp) {
  utf32__decompose_generic(canon, 0);
}
1219
/** @brief Compatibility decompose @p [s,s+ns)
 * @param s Pointer to string
 * @param ns Length of string
 * @param ndp Where to store length of result, or NULL
 * @return Pointer to result string, or NULL on error
 *
 * Computes NFKD (Normalization Form KD) of the string at @p s.  This implies
 * performing all canonical and compatibility decompositions and then
 * normalizing the order of combining characters.
 *
 * Returns NULL if the string cannot be processed for any of the following
 * reasons:
 * - it codes for a UTF-16 surrogate
 * - it codes for a value outside the unicode code space
 * - after decomposition it contains a run of more than 1024 consecutive
 *   combining characters (see utf32__canonical_ordering())
 *
 * See also:
 * - utf32_decompose_canon()
 * - utf32_compose_compat()
 */
uint32_t *utf32_decompose_compat(const uint32_t *s, size_t ns, size_t *ndp) {
  utf32__decompose_generic(compat, 0);
}
1241
/** @brief Canonically compose @p [s,s+ns)
 * @param s Pointer to string
 * @param ns Length of string
 * @param ndp Where to store length of result, or NULL
 * @return Pointer to result string, or NULL on error
 *
 * Computes NFC (Normalization Form C) of the string at @p s.  This implies
 * performing all canonical decompositions, normalizing the order of combining
 * characters and then composing all unblocked primary compositables.
 *
 * Returns NULL if the string cannot be processed for any of the following
 * reasons:
 * - it codes for a UTF-16 surrogate
 * - it codes for a value outside the unicode code space
 * - after decomposition it contains a run of more than 1024 consecutive
 *   combining characters (see utf32__canonical_ordering())
 *
 * See also:
 * - utf32_compose_compat()
 * - utf32_decompose_canon()
 */
uint32_t *utf32_compose_canon(const uint32_t *s, size_t ns, size_t *ndp) {
  utf32__decompose_generic(canon, 1);
}
1263
/** @brief Compatibility compose @p [s,s+ns)
 * @param s Pointer to string
 * @param ns Length of string
 * @param ndp Where to store length of result, or NULL
 * @return Pointer to result string, or NULL on error
 *
 * Computes NFKC (Normalization Form KC) of the string at @p s.  This implies
 * performing all canonical and compatibility decompositions, normalizing the
 * order of combining characters and then composing all unblocked primary
 * compositables.
 *
 * Returns NULL if the string cannot be processed for any of the following
 * reasons:
 * - it codes for a UTF-16 surrogate
 * - it codes for a value outside the unicode code space
 * - after decomposition it contains a run of more than 1024 consecutive
 *   combining characters (see utf32__canonical_ordering())
 *
 * See also:
 * - utf32_compose_canon()
 * - utf32_decompose_compat()
 */
uint32_t *utf32_compose_compat(const uint32_t *s, size_t ns, size_t *ndp) {
  utf32__decompose_generic(compat, 1);
}
1286
56fd389c
RK
/** @brief Single-character case-fold and decompose operation
 * @param WHICH @c canon or @c compat to choose decomposition
 *
 * Expands inside a function that has @c c (the code point) and @c d (a
 * <code>struct dynstr_ucs4</code>, not a pointer) in scope.  If @c c has a
 * case-fold mapping then each element of the mapping is decomposed into
 * @c d; otherwise @c c itself is decomposed into @c d.
 */
#define utf32__casefold_one(WHICH) do {                                 \
  const uint32_t *cf = utf32__unidata(c)->casefold;                     \
  if(cf) {                                                              \
    /* Found a case-fold mapping in the table; it is 0-terminated */    \
    while(*cf)                                                          \
      utf32__decompose_one_##WHICH(&d, *cf++);                          \
  } else                                                                \
    /* No mapping: c folds to itself */                                 \
    utf32__decompose_one_##WHICH(&d, c);                                \
} while(0)
e5a5a138
RK
1297
1298/** @brief Case-fold @p [s,s+ns)
1299 * @param s Pointer to string
1300 * @param ns Length of string
1301 * @param ndp Where to store length of result
f98fcddb 1302 * @return Pointer to result string, or NULL on error
e5a5a138
RK
1303 *
1304 * Case-fold the string at @p s according to full default case-folding rules
56fd389c 1305 * (s3.13) for caseless matching. The result will be in NFD.
e5a5a138 1306 *
56fd389c 1307 * Returns NULL if the string is not valid for either of the following reasons:
e5a5a138
RK
1308 * - it codes for a UTF-16 surrogate
1309 * - it codes for a value outside the unicode code space
1310 */
1311uint32_t *utf32_casefold_canon(const uint32_t *s, size_t ns, size_t *ndp) {
1312 struct dynstr_ucs4 d;
1313 uint32_t c;
1314 size_t n;
1315 uint32_t *ss = 0;
1316
1317 /* If the canonical decomposition of the string includes any combining
1318 * character that case-folds to a non-combining character then we must
1319 * normalize before we fold. In Unicode 5.0.0 this means 0345 COMBINING
1320 * GREEK YPOGEGRAMMENI in its decomposition and the various characters that
1321 * canonically decompose to it. */
bcf9ed7f
RK
1322 for(n = 0; n < ns; ++n)
1323 if(utf32__unidata(s[n])->flags & unicode_normalize_before_casefold)
e5a5a138 1324 break;
e5a5a138
RK
1325 if(n < ns) {
1326 /* We need a preliminary decomposition */
1327 if(!(ss = utf32_decompose_canon(s, ns, &ns)))
1328 return 0;
1329 s = ss;
1330 }
1331 dynstr_ucs4_init(&d);
1332 while(ns) {
1333 c = *s++;
56fd389c 1334 if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF)
e5a5a138 1335 goto error;
56fd389c 1336 utf32__casefold_one(canon);
e5a5a138
RK
1337 --ns;
1338 }
1339 if(utf32__canonical_ordering(d.vec, d.nvec))
1340 goto error;
1341 dynstr_ucs4_terminate(&d);
1342 if(ndp)
1343 *ndp = d.nvec;
1344 return d.vec;
1345error:
1346 xfree(d.vec);
1347 xfree(ss);
1348 return 0;
1349}
1350
f98fcddb 1351/** @brief Compatibility case-fold @p [s,s+ns)
56fd389c
RK
1352 * @param s Pointer to string
1353 * @param ns Length of string
1354 * @param ndp Where to store length of result
f98fcddb 1355 * @return Pointer to result string, or NULL on error
56fd389c
RK
1356 *
1357 * Case-fold the string at @p s according to full default case-folding rules
1358 * (s3.13) for compatibility caseless matching. The result will be in NFKD.
1359 *
1360 * Returns NULL if the string is not valid for either of the following reasons:
1361 * - it codes for a UTF-16 surrogate
1362 * - it codes for a value outside the unicode code space
1363 */
1364uint32_t *utf32_casefold_compat(const uint32_t *s, size_t ns, size_t *ndp) {
1365 struct dynstr_ucs4 d;
1366 uint32_t c;
1367 size_t n;
1368 uint32_t *ss = 0;
1369
bcf9ed7f
RK
1370 for(n = 0; n < ns; ++n)
1371 if(utf32__unidata(s[n])->flags & unicode_normalize_before_casefold)
56fd389c 1372 break;
56fd389c
RK
1373 if(n < ns) {
1374 /* We need a preliminary _canonical_ decomposition */
1375 if(!(ss = utf32_decompose_canon(s, ns, &ns)))
1376 return 0;
1377 s = ss;
1378 }
1379 /* This computes NFKD(toCaseFold(s)) */
1380#define compat_casefold_middle() do { \
1381 dynstr_ucs4_init(&d); \
1382 while(ns) { \
1383 c = *s++; \
1384 if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF) \
1385 goto error; \
1386 utf32__casefold_one(compat); \
1387 --ns; \
1388 } \
1389 if(utf32__canonical_ordering(d.vec, d.nvec)) \
1390 goto error; \
1391} while(0)
1392 /* Do the inner (NFKD o toCaseFold) */
1393 compat_casefold_middle();
1394 /* We can do away with the NFD'd copy of the input now */
1395 xfree(ss);
1396 s = ss = d.vec;
1397 ns = d.nvec;
1398 /* Do the outer (NFKD o toCaseFold) */
1399 compat_casefold_middle();
1400 /* That's all */
1401 dynstr_ucs4_terminate(&d);
1402 if(ndp)
1403 *ndp = d.nvec;
1404 return d.vec;
1405error:
1406 xfree(d.vec);
1407 xfree(ss);
1408 return 0;
1409}
1410
e5a5a138
RK
/** @brief Compare two 0-terminated UTF-32 strings
 * @param a First 0-terminated string
 * @param b Second 0-terminated string
 * @return -1, 0 or 1 for a less than, equal to or greater than b
 *
 * Comparison is by code point value at the first position where the strings
 * differ; a string that is a proper prefix of another orders before it.
 *
 * "Comparable to strcmp() at its best."
 */
int utf32_cmp(const uint32_t *a, const uint32_t *b) {
  for(;;) {
    const uint32_t ca = *a++;
    const uint32_t cb = *b++;

    /* The first difference (including one string ending) decides */
    if(ca != cb)
      return ca < cb ? -1 : 1;
    /* Equal and both terminators: the strings are identical */
    if(ca == 0)
      return 0;
  }
}
1425
35b651f0
RK
1426/** @brief Identify a grapheme cluster boundary
1427 * @param s Start of string (must be NFD)
1428 * @param ns Length of string
1429 * @param n Index within string (in [0,ns].)
1430 * @return 1 at a grapheme cluster boundary, 0 otherwise
1431 *
1432 * This function identifies default grapheme cluster boundaries as described in
f98fcddb 1433 * UAX #29 s3. It returns non-0 if @p n points at the code point just after a
35b651f0
RK
1434 * grapheme cluster boundary (including the hypothetical code point just after
1435 * the end of the string).
f98fcddb
RK
1436 *
1437 * This function uses utf32_iterator_set() internally; see that function for
1438 * remarks on performance.
35b651f0 1439 */
1625e11a 1440int utf32_is_grapheme_boundary(const uint32_t *s, size_t ns, size_t n) {
092f426f 1441 struct utf32_iterator_data it[1];
35b651f0 1442
092f426f
RK
1443 utf32__iterator_init(it, s, ns, n);
1444 return utf32_iterator_grapheme_boundary(it);
0b7052da
RK
1445}
1446
1447/** @brief Identify a word boundary
1448 * @param s Start of string (must be NFD)
1449 * @param ns Length of string
1450 * @param n Index within string (in [0,ns].)
1451 * @return 1 at a word boundary, 0 otherwise
1452 *
1453 * This function identifies default word boundaries as described in UAX #29 s4.
f98fcddb 1454 * It returns non-0 if @p n points at the code point just after a word boundary
0b7052da 1455 * (including the hypothetical code point just after the end of the string).
f98fcddb
RK
1456 *
1457 * This function uses utf32_iterator_set() internally; see that function for
1458 * remarks on performance.
0b7052da
RK
1459 */
1460int utf32_is_word_boundary(const uint32_t *s, size_t ns, size_t n) {
092f426f 1461 struct utf32_iterator_data it[1];
0b7052da 1462
092f426f
RK
1463 utf32__iterator_init(it, s, ns, n);
1464 return utf32_iterator_word_boundary(it);
0b7052da
RK
1465}
1466
8818b7fc
RK
1467/** @brief Split [s,ns) into multiple words
1468 * @param s Pointer to start of string
1469 * @param ns Length of string
1470 * @param nwp Where to store word count, or NULL
c85b7022 1471 * @param wbreak Word_Break property tailor, or NULL
8818b7fc
RK
1472 * @return Pointer to array of pointers to words
1473 *
1474 * The returned array is terminated by a NULL pointer and individual
1475 * strings are 0-terminated.
1476 */
c85b7022
RK
1477uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp,
1478 unicode_property_tailor *wbreak) {
8818b7fc
RK
1479 struct utf32_iterator_data it[1];
1480 size_t b1 = 0, b2 = 0 ,i;
1481 int isword;
1482 struct vector32 v32[1];
1483 uint32_t *w;
1484
1485 vector32_init(v32);
1486 utf32__iterator_init(it, s, ns, 0);
c85b7022 1487 it->word_break = wbreak;
8818b7fc
RK
1488 /* Work our way through the string stopping at each word break. */
1489 do {
1490 if(utf32_iterator_word_boundary(it)) {
1491 /* We've found a new boundary */
1492 b1 = b2;
1493 b2 = it->n;
1494 /*fprintf(stderr, "[%zu, %zu) is a candidate word\n", b1, b2);*/
1495 /* Inspect the characters between the boundary and form an opinion as to
1496 * whether they are a word or not */
1497 isword = 0;
1498 for(i = b1; i < b2; ++i) {
c85b7022 1499 switch(utf32__iterator_word_break(it, it->s[i])) {
8818b7fc
RK
1500 case unicode_Word_Break_ALetter:
1501 case unicode_Word_Break_Numeric:
1502 case unicode_Word_Break_Katakana:
1503 isword = 1;
1504 break;
1505 default:
1506 break;
1507 }
1508 }
1509 /* If it's a word add it to the list of results */
1510 if(isword) {
8e93ddd1
RK
1511 const size_t len = b2 - b1;
1512 w = xcalloc_noptr(len + 1, sizeof(uint32_t));
1513 memcpy(w, it->s + b1, len * sizeof (uint32_t));
1514 w[len] = 0;
8818b7fc
RK
1515 vector32_append(v32, w);
1516 }
1517 }
1518 } while(!utf32_iterator_advance(it, 1));
1519 vector32_terminate(v32);
1520 if(nwp)
1521 *nwp = v32->nvec;
1522 return v32->vec;
1523}
1524
e5a5a138 1525/*@}*/
349b7b74 1526/** @defgroup utf8 Functions that operate on UTF-8 strings */
e5a5a138
RK
1527/*@{*/
1528
/** @brief Wrapper to transform a UTF-8 string using the UTF-32 function
 * @param FN UTF-32 transformation, called as FN(string, length, &newlength)
 *
 * Expands to the whole body of a function with parameters @c s, @c ns and
 * @c ndp, and returns from that function.  The input is converted to UTF-32,
 * transformed with @p FN and converted back to UTF-8.  If either of the first
 * two steps fails the enclosing function returns 0; otherwise it returns
 * whatever utf32_to_utf8() produced.  Both UTF-32 intermediates are freed on
 * every path.
 */
#define utf8__transform(FN) do {                                \
  uint32_t *to32 = 0, *decomp32 = 0;                            \
  size_t nto32, ndecomp32;                                      \
  char *decomp8 = 0;                                            \
                                                                \
  if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error;        \
  if(!(decomp32 = FN(to32, nto32, &ndecomp32))) goto error;     \
  decomp8 = utf32_to_utf8(decomp32, ndecomp32, ndp);            \
  /* Success falls through into the shared cleanup */           \
error:                                                          \
  xfree(to32);                                                  \
  xfree(decomp32);                                              \
  return decomp8;                                               \
} while(0)
1543
/** @brief Canonically decompose @p [s,s+ns)
 * @param s Pointer to string
 * @param ns Length of string
 * @param ndp Where to store length of result
 * @return Pointer to result string, or NULL on error
 *
 * Computes NFD (Normalization Form D) of the string at @p s.  This implies
 * performing all canonical decompositions and then normalizing the order of
 * combining characters.
 *
 * The work is done by converting to UTF-32, calling utf32_decompose_canon()
 * and converting the result back to UTF-8.
 *
 * Returns NULL if the string is not valid (see utf8_to_utf32() for reasons why
 * this might be) or if utf32_decompose_canon() fails.
 *
 * See also:
 * - utf32_decompose_canon()
 * - utf8_decompose_compat()
 * - utf8_compose_canon()
 */
char *utf8_decompose_canon(const char *s, size_t ns, size_t *ndp) {
  utf8__transform(utf32_decompose_canon);
}
1565
/** @brief Compatibility decompose @p [s,s+ns)
 * @param s Pointer to string
 * @param ns Length of string
 * @param ndp Where to store length of result
 * @return Pointer to result string, or NULL on error
 *
 * Computes NFKD (Normalization Form KD) of the string at @p s.  This implies
 * performing all canonical and compatibility decompositions and then
 * normalizing the order of combining characters.
 *
 * The work is done by converting to UTF-32, calling utf32_decompose_compat()
 * and converting the result back to UTF-8.
 *
 * Returns NULL if the string is not valid (see utf8_to_utf32() for reasons why
 * this might be) or if utf32_decompose_compat() fails.
 *
 * See also:
 * - utf32_decompose_compat()
 * - utf8_decompose_canon()
 * - utf8_compose_compat()
 */
char *utf8_decompose_compat(const char *s, size_t ns, size_t *ndp) {
  utf8__transform(utf32_decompose_compat);
}
1587
0ae60b83
RK
/** @brief Canonically compose @p [s,s+ns)
 * @param s Pointer to string
 * @param ns Length of string
 * @param ndp Where to store length of result
 * @return Pointer to result string, or NULL on error
 *
 * Computes NFC (Normalization Form C) of the string at @p s.  This implies
 * performing all canonical decompositions, normalizing the order of combining
 * characters and then composing all unblocked primary compositables.
 *
 * The work is done by converting to UTF-32, calling utf32_compose_canon() and
 * converting the result back to UTF-8.
 *
 * Returns NULL if the string is not valid (see utf8_to_utf32() for reasons why
 * this might be) or if utf32_compose_canon() fails.
 *
 * See also:
 * - utf32_compose_canon()
 * - utf8_compose_compat()
 * - utf8_decompose_canon()
 */
char *utf8_compose_canon(const char *s, size_t ns, size_t *ndp) {
  utf8__transform(utf32_compose_canon);
}
1609
/** @brief Compatibility compose @p [s,s+ns)
 * @param s Pointer to string
 * @param ns Length of string
 * @param ndp Where to store length of result
 * @return Pointer to result string, or NULL on error
 *
 * Computes NFKC (Normalization Form KC) of the string at @p s.  This implies
 * performing all canonical and compatibility decompositions, normalizing the
 * order of combining characters and then composing all unblocked primary
 * compositables.
 *
 * The work is done by converting to UTF-32, calling utf32_compose_compat()
 * and converting the result back to UTF-8.
 *
 * Returns NULL if the string is not valid (see utf8_to_utf32() for reasons why
 * this might be) or if utf32_compose_compat() fails.
 *
 * See also:
 * - utf32_compose_compat()
 * - utf8_compose_canon()
 * - utf8_decompose_compat()
 */
char *utf8_compose_compat(const char *s, size_t ns, size_t *ndp) {
  utf8__transform(utf32_compose_compat);
}
1632
e5a5a138
RK
/** @brief Case-fold @p [s,s+ns)
 * @param s Pointer to string
 * @param ns Length of string
 * @param ndp Where to store length of result
 * @return Pointer to result string, or NULL on error
 *
 * Case-fold the string at @p s according to full default case-folding rules
 * (s3.13).  The result will be in NFD.
 *
 * The work is done by converting to UTF-32, calling utf32_casefold_canon()
 * and converting the result back to UTF-8.
 *
 * Returns NULL if the string is not valid (see utf8_to_utf32() for reasons why
 * this might be) or if utf32_casefold_canon() fails.
 */
char *utf8_casefold_canon(const char *s, size_t ns, size_t *ndp) {
  utf8__transform(utf32_casefold_canon);
}
1648
/** @brief Compatibility case-fold @p [s,s+ns)
 * @param s Pointer to string
 * @param ns Length of string
 * @param ndp Where to store length of result
 * @return Pointer to result string, or NULL on error
 *
 * Case-fold the string at @p s according to full default case-folding rules
 * (s3.13) for compatibility caseless matching.  The result will be in NFKD.
 *
 * The work is done by converting to UTF-32, calling utf32_casefold_compat()
 * and converting the result back to UTF-8.
 *
 * Returns NULL if the string is not valid (see utf8_to_utf32() for reasons why
 * this might be) or if utf32_casefold_compat() fails.
 */
char *utf8_casefold_compat(const char *s, size_t ns, size_t *ndp) {
  utf8__transform(utf32_casefold_compat);
}
e5a5a138 1664
8818b7fc
RK
1665/** @brief Split [s,ns) into multiple words
1666 * @param s Pointer to start of string
1667 * @param ns Length of string
1668 * @param nwp Where to store word count, or NULL
c85b7022 1669 * @param wbreak Word_Break property tailor, or NULL
8818b7fc
RK
1670 * @return Pointer to array of pointers to words
1671 *
1672 * The returned array is terminated by a NULL pointer and individual
1673 * strings are 0-terminated.
1674 */
c85b7022
RK
1675char **utf8_word_split(const char *s, size_t ns, size_t *nwp,
1676 unicode_property_tailor *wbreak) {
8818b7fc
RK
1677 uint32_t *to32 = 0, **v32 = 0;
1678 size_t nto32, nv, n;
1679 char **v8 = 0, **ret = 0;
c85b7022 1680
8818b7fc 1681 if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error;
c85b7022 1682 if(!(v32 = utf32_word_split(to32, nto32, &nv, wbreak))) goto error;
8818b7fc
RK
1683 v8 = xcalloc(sizeof (char *), nv + 1);
1684 for(n = 0; n < nv; ++n)
1685 if(!(v8[n] = utf32_to_utf8(v32[n], utf32_len(v32[n]), 0)))
1686 goto error;
1687 ret = v8;
1688 *nwp = nv;
1689 v8 = 0; /* don't free */
c85b7022 1690error:
8818b7fc
RK
1691 if(v8) {
1692 for(n = 0; n < nv; ++n)
1693 xfree(v8[n]);
1694 xfree(v8);
1695 }
1696 if(v32) {
1697 for(n = 0; n < nv; ++n)
1698 xfree(v32[n]);
1699 xfree(v32);
1700 }
1701 xfree(to32);
1702 return ret;
1703}
1704
1705
e5a5a138
RK
1706/*@}*/
1707
bb5c7798
RK
/** @brief Measure a 0-terminated UTF-16 string
 * @param s Pointer to 0-terminated string
 * @return Number of 16-bit units before the terminator
 *
 * Every 16-bit unit counts as one, so a surrogate pair contributes two to the
 * result.  Unlike the conversion functions no validity checking is done on
 * the string.
 */
size_t utf16_len(const uint16_t *s) {
  size_t units = 0;

  while(s[units] != 0)
    ++units;
  return units;
}
1721
e5a5a138
RK
1722/*
1723Local Variables:
1724c-basic-offset:2
1725comment-column:40
1726fill-column:79
1727indent-tabs-mode:nil
1728End:
1729*/