chiark - git - mdw - disorder/blob - lib/unicode.c

   1 /*
   2  * This file is part of DisOrder
   3  * Copyright (C) 2007 Richard Kettlewell
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  18  * USA
  19  */
  20 /** @file lib/unicode.c
  21  * @brief Unicode support functions
  22  *
  23  * Here by UTF-8 and UTF-8 we mean the encoding forms of those names (not the
  24  * encoding schemes).  The primary encoding form is UTF-32 but convenience
  25  * wrappers using UTF-8 are provided for a number of functions.
  26  *
  27  * The idea is that all the strings that hit the database will be in a
  28  * particular normalization form, and for the search and tags database
  29  * in case-folded form, so they can be naively compared within the
  30  * database code.
  31  *
  32  * As the code stands this guarantee is not well met!
  33  */
  34
  35 #include <config.h>
  36 #include "types.h"
  37
  38 #include <string.h>
  39 #include <stdio.h>              /* TODO */
  40
  41 #include "mem.h"
  42 #include "vector.h"
  43 #include "unicode.h"
  44 #include "unidata.h"
  45
  46 /** @defgroup utftransform Functions that transform between different Unicode encoding forms */
  47 /*@{*/
  48
  49 /** @brief Convert UTF-32 to UTF-8
  50  * @param s Source string
  51  * @param ns Length of source string in code points
  52  * @param ndp Where to store length of destination string (or NULL)
  53  * @return Newly allocated destination string or NULL on error
  54  *
  55  * If the UTF-32 is not valid then NULL is returned.  A UTF-32 code point is
  56  * invalid if:
  57  * - it codes for a UTF-16 surrogate
  58  * - it codes for a value outside the unicode code space
  59  *
  60  * The return value is always 0-terminated.  The value returned via @p *ndp
  61  * does not include the terminator.
  62  */
  63 char *utf32_to_utf8(const uint32_t *s, size_t ns, size_t *ndp) {
  64   struct dynstr d;
  65   uint32_t c;
  66
  67   dynstr_init(&d);
  68   while(ns > 0) {
  69     c = *s++;
  70     if(c < 0x80)
  71       dynstr_append(&d, c);
  72     else if(c < 0x0800) {
  73       dynstr_append(&d, 0xC0 | (c >> 6));
  74       dynstr_append(&d, 0x80 | (c & 0x3F));
  75     } else if(c < 0x10000) {
  76       if(c >= 0xD800 && c <= 0xDFFF)
  77         goto error;
  78       dynstr_append(&d, 0xE0 | (c >> 12));
  79       dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
  80       dynstr_append(&d, 0x80 | (c & 0x3F));
  81     } else if(c < 0x110000) {
  82       dynstr_append(&d, 0xF0 | (c >> 18));
  83       dynstr_append(&d, 0x80 | ((c >> 12) & 0x3F));
  84       dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
  85       dynstr_append(&d, 0x80 | (c & 0x3F));
  86     } else
  87       goto error;
  88     --ns;
  89   }
  90   dynstr_terminate(&d);
  91   if(ndp)
  92     *ndp = d.nvec;
  93   return d.vec;
  94 error:
  95   xfree(d.vec);
  96   return 0;
  97 }
  98
  99 /** @brief Convert UTF-8 to UTF-32
 100  * @param s Source string
 101  * @param ns Length of source string in code points
 102  * @param ndp Where to store length of destination string (or NULL)
 103  * @return Newly allocated destination string or NULL
 104  *
 105  * The return value is always 0-terminated.  The value returned via @p *ndp
 106  * does not include the terminator.
 107  *
 108  * If the UTF-8 is not valid then NULL is returned.  A UTF-8 sequence
 109  * for a code point is invalid if:
 110  * - it is not the shortest possible sequence for the code point
 111  * - it codes for a UTF-16 surrogate
 112  * - it codes for a value outside the unicode code space
 113  */
 114 uint32_t *utf8_to_utf32(const char *s, size_t ns, size_t *ndp) {
 115   struct dynstr_ucs4 d;
 116   uint32_t c32, c;
 117   const uint8_t *ss = (const uint8_t *)s;
 118
 119   dynstr_ucs4_init(&d);
 120   while(ns > 0) {
 121     c = *ss++;
 122     --ns;
 123     /* Acceptable UTF-8 is that which codes for Unicode Scalar Values
 124      * (Unicode 5.0.0 s3.9 D76)
 125      *
 126      * 0xxxxxxx
 127      * 7 data bits gives 0x00 - 0x7F and all are acceptable
 128      *
 129      * 110xxxxx 10xxxxxx
 130      * 11 data bits gives 0x0000 - 0x07FF but only 0x0080 - 0x07FF acceptable
 131      *
 132      * 1110xxxx 10xxxxxx 10xxxxxx
 133      * 16 data bits gives 0x0000 - 0xFFFF but only 0x0800 - 0xFFFF acceptable
 134      * (and UTF-16 surrogates are not acceptable)
 135      *
 136      * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 137      * 21 data bits gives 0x00000000 - 0x001FFFFF
 138      * but only           0x00010000 - 0x0010FFFF are acceptable
 139      *
 140      * It is NOT always the case that the data bits in the first byte are
 141      * always non-0 for the acceptable values, so we do a separate check after
 142      * decoding.
 143      */
 144     if(c < 0x80)
 145       c32 = c;
 146     else if(c <= 0xDF) {
 147       if(ns < 1) goto error;
 148       c32 = c & 0x1F;
 149       c = *ss++;
 150       if((c & 0xC0) != 0x80) goto error;
 151       c32 = (c32 << 6) | (c & 0x3F);
 152       if(c32 < 0x80) goto error;
 153     } else if(c <= 0xEF) {
 154       if(ns < 2) goto error;
 155       c32 = c & 0x0F;
 156       c = *ss++;
 157       if((c & 0xC0) != 0x80) goto error;
 158       c32 = (c32 << 6) | (c & 0x3F);
 159       c = *ss++;
 160       if((c & 0xC0) != 0x80) goto error;
 161       c32 = (c32 << 6) | (c & 0x3F);
 162       if(c32 < 0x0800 || (c32 >= 0xD800 && c32 <= 0xDFFF)) goto error;
 163     } else if(c <= 0xF7) {
 164       if(ns < 3) goto error;
 165       c32 = c & 0x07;
 166       c = *ss++;
 167       if((c & 0xC0) != 0x80) goto error;
 168       c32 = (c32 << 6) | (c & 0x3F);
 169       c = *ss++;
 170       if((c & 0xC0) != 0x80) goto error;
 171       c32 = (c32 << 6) | (c & 0x3F);
 172       c = *ss++;
 173       if((c & 0xC0) != 0x80) goto error;
 174       c32 = (c32 << 6) | (c & 0x3F);
 175       if(c32 < 0x00010000 || c32 > 0x0010FFFF) goto error;
 176     } else
 177       goto error;
 178     dynstr_ucs4_append(&d, c32);
 179   }
 180   dynstr_ucs4_terminate(&d);
 181   if(ndp)
 182     *ndp = d.nvec;
 183   return d.vec;
 184 error:
 185   xfree(d.vec);
 186   return 0;
 187 }
 188
 189 /*@}*/
 190 /** @defgroup utf32 Functions that operate on UTF-32 strings */
 191 /*@{*/
 192
 193 /** @brief Return the length of a 0-terminated UTF-32 string
 194  * @param s Pointer to 0-terminated string
 195  * @return Length of string in code points (excluding terminator)
 196  *
 197  * Unlike the conversion functions no validity checking is done on the string.
 198  */
 199 size_t utf32_len(const uint32_t *s) {
 200   const uint32_t *t = s;
 201
 202   while(*t)
 203     ++t;
 204   return (size_t)(t - s);
 205 }
 206
 207 /** @brief Return the combining class of @p c
 208  * @param c Code point
 209  * @return Combining class of @p c
 210  */
 211 static inline int utf32__combining_class(uint32_t c) {
 212   if(c < UNICODE_NCHARS)
 213     return unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].ccc;
 214   return 0;
 215 }
 216
 217 /** @brief Stably sort [s,s+ns) into descending order of combining class
 218  * @param s Start of array
 219  * @param ns Number of elements, must be at least 1
 220  * @param buffer Buffer of at least @p ns elements
 221  */
 222 static void utf32__sort_ccc(uint32_t *s, size_t ns, uint32_t *buffer) {
 223   uint32_t *a, *b, *bp;
 224   size_t na, nb;
 225
 226   switch(ns) {
 227   case 1:                       /* 1-element array is always sorted */
 228     return;
 229   case 2:                       /* 2-element arrays are trivial to sort */
 230     if(utf32__combining_class(s[0]) > utf32__combining_class(s[1])) {
 231       uint32_t tmp = s[0];
 232       s[0] = s[1];
 233       s[1] = tmp;
 234     }
 235     return;
 236   default:
 237     /* Partition the array */
 238     na = ns / 2;
 239     nb = ns - na;
 240     a = s;
 241     b = s + na;
 242     /* Sort the two halves of the array */
 243     utf32__sort_ccc(a, na, buffer);
 244     utf32__sort_ccc(b, nb, buffer);
 245     /* Merge them back into one, via the buffer */
 246     bp = buffer;
 247     while(na > 0 && nb > 0) {
 248       /* We want descending order of combining class (hence <)
 249        * and we want stability within combining classes (hence <=)
 250        */
 251       if(utf32__combining_class(*a) <= utf32__combining_class(*b)) {
 252         *bp++ = *a++;
 253         --na;
 254       } else {
 255         *bp++ = *b++;
 256         --nb;
 257       }
 258     }
 259     while(na > 0) {
 260       *bp++ = *a++;
 261       --na;
 262     }
 263     while(nb > 0) {
 264       *bp++ = *b++;
 265       --nb;
 266     }
 267     memcpy(s, buffer,  ns * sizeof(uint32_t));
 268     return;
 269   }
 270 }
 271
 272 /** @brief Put combining characters into canonical order
 273  * @param s Pointer to UTF-32 string
 274  * @param ns Length of @p s
 275  * @return 0 on success, -1 on error
 276  *
 277  * @p s is modified in-place.  See Unicode 5.0 s3.11 for details of the
 278  * ordering.
 279  *
 280  * Currently we only support a maximum of 1024 combining characters after each
 281  * base character.  If this limit is exceeded then -1 is returned.
 282  */
 283 static int utf32__canonical_ordering(uint32_t *s, size_t ns) {
 284   size_t nc;
 285   uint32_t buffer[1024];
 286
 287   /* The ordering amounts to a stable sort of each contiguous group of
 288    * characters with non-0 combining class. */
 289   while(ns > 0) {
 290     /* Skip non-combining characters */
 291     if(utf32__combining_class(*s) == 0) {
 292       ++s;
 293       --ns;
 294       continue;
 295     }
 296     /* We must now have at least one combining character; see how many
 297      * there are */
 298     for(nc = 1; nc < ns && utf32__combining_class(s[nc]) != 0; ++nc)
 299       ;
 300     if(nc > 1024)
 301       return -1;
 302     /* Sort the array */
 303     utf32__sort_ccc(s, nc, buffer);
 304     s += nc;
 305     ns -= nc;
 306   }
 307   return 0;
 308 }
 309
 310 /* Magic numbers from UAX #15 s16 */
 311 #define SBase 0xAC00
 312 #define LBase 0x1100
 313 #define VBase 0x1161
 314 #define TBase 0x11A7
 315 #define LCount 19
 316 #define VCount 21
 317 #define TCount 28
 318 #define NCount (VCount * TCount)
 319 #define SCount (LCount * NCount)
 320
 321 /** @brief Guts of the decomposition lookup functions */
 322 #define utf32__decompose_one_generic(WHICH) do {                        \
 323   const uint32_t *dc =                                                  \
 324     (c < UNICODE_NCHARS                                                 \
 325      ? unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].WHICH          \
 326      : 0);                                                              \
 327   if(dc) {                                                              \
 328     /* Found a canonical decomposition in the table */                  \
 329     while(*dc)                                                          \
 330       utf32__decompose_one_##WHICH(d, *dc++);                           \
 331   } else if(c >= SBase && c < SBase + SCount) {                         \
 332     /* Mechanically decomposable Hangul syllable (UAX #15 s16) */       \
 333     const uint32_t SIndex = c - SBase;                                  \
 334     const uint32_t L = LBase + SIndex / NCount;                         \
 335     const uint32_t V = VBase + (SIndex % NCount) / TCount;              \
 336     const uint32_t T = TBase + SIndex % TCount;                         \
 337     dynstr_ucs4_append(d, L);                                           \
 338     dynstr_ucs4_append(d, V);                                           \
 339     if(T != TBase)                                                      \
 340       dynstr_ucs4_append(d, T);                                         \
 341   } else                                                                \
 342     /* Equal to own canonical decomposition */                          \
 343     dynstr_ucs4_append(d, c);                                           \
 344 } while(0)
 345
 346 /** @brief Recursively compute the canonical decomposition of @p c
 347  * @param d Dynamic string to store decomposition in
 348  * @param c Code point to decompose (must be a valid!)
 349  * @return 0 on success, -1 on error
 350  */
 351 static void utf32__decompose_one_canon(struct dynstr_ucs4 *d, uint32_t c) {
 352   utf32__decompose_one_generic(canon);
 353 }
 354
 355 /** @brief Recursively compute the compatibility decomposition of @p c
 356  * @param d Dynamic string to store decomposition in
 357  * @param c Code point to decompose (must be a valid!)
 358  * @return 0 on success, -1 on error
 359  */
 360 static void utf32__decompose_one_compat(struct dynstr_ucs4 *d, uint32_t c) {
 361   utf32__decompose_one_generic(compat);
 362 }
 363
 364 /** @brief Guts of the decomposition functions */
 365 #define utf32__decompose_generic(WHICH) do {            \
 366   struct dynstr_ucs4 d;                                 \
 367   uint32_t c;                                           \
 368                                                         \
 369   dynstr_ucs4_init(&d);                                 \
 370   while(ns) {                                           \
 371     c = *s++;                                           \
 372     if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF)    \
 373       goto error;                                       \
 374     utf32__decompose_one_##WHICH(&d, c);                \
 375     --ns;                                               \
 376   }                                                     \
 377   if(utf32__canonical_ordering(d.vec, d.nvec))          \
 378     goto error;                                         \
 379   dynstr_ucs4_terminate(&d);                            \
 380   if(ndp)                                               \
 381     *ndp = d.nvec;                                      \
 382   return d.vec;                                         \
 383 error:                                                  \
 384   xfree(d.vec);                                         \
 385   return 0;                                             \
 386 } while(0)
 387
 388 /** @brief Canonically decompose @p [s,s+ns)
 389  * @param s Pointer to string
 390  * @param ns Length of string
 391  * @param ndp Where to store length of result
 392  * @return Pointer to result string, or NULL
 393  *
 394  * Computes the canonical decomposition of a string and stably sorts combining
 395  * characters into canonical order.  The result is in Normalization Form D and
 396  * (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
 397  * NormalizationTest.txt.
 398  *
 399  * Returns NULL if the string is not valid for either of the following reasons:
 400  * - it codes for a UTF-16 surrogate
 401  * - it codes for a value outside the unicode code space
 402  */
 403 uint32_t *utf32_decompose_canon(const uint32_t *s, size_t ns, size_t *ndp) {
 404   utf32__decompose_generic(canon);
 405 }
 406
 407 /** @brief Compatibility decompose @p [s,s+ns)
 408  * @param s Pointer to string
 409  * @param ns Length of string
 410  * @param ndp Where to store length of result
 411  * @return Pointer to result string, or NULL
 412  *
 413  * Computes the compatibility decomposition of a string and stably sorts
 414  * combining characters into canonical order.  The result is in Normalization
 415  * Form KD and (at the time of writing!) passes the NFKD tests defined in
 416  * Unicode 5.0's NormalizationTest.txt.
 417  *
 418  * Returns NULL if the string is not valid for either of the following reasons:
 419  * - it codes for a UTF-16 surrogate
 420  * - it codes for a value outside the unicode code space
 421  */
 422 uint32_t *utf32_decompose_compat(const uint32_t *s, size_t ns, size_t *ndp) {
 423   utf32__decompose_generic(compat);
 424 }
 425
 426 /** @brief Single-character case-fold and decompose operation */
 427 #define utf32__casefold_one(WHICH) do {                                 \
 428   const uint32_t *cf =                                                  \
 429      (c < UNICODE_NCHARS                                                \
 430       ? unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].casefold      \
 431       : 0);                                                             \
 432   if(cf) {                                                              \
 433     /* Found a case-fold mapping in the table */                        \
 434     while(*cf)                                                          \
 435       utf32__decompose_one_##WHICH(&d, *cf++);                          \
 436   } else                                                                \
 437     utf32__decompose_one_##WHICH(&d, c);                                \
 438 } while(0)
 439
 440 /** @brief Case-fold @p [s,s+ns)
 441  * @param s Pointer to string
 442  * @param ns Length of string
 443  * @param ndp Where to store length of result
 444  * @return Pointer to result string, or NULL
 445  *
 446  * Case-fold the string at @p s according to full default case-folding rules
 447  * (s3.13) for caseless matching.  The result will be in NFD.
 448  *
 449  * Returns NULL if the string is not valid for either of the following reasons:
 450  * - it codes for a UTF-16 surrogate
 451  * - it codes for a value outside the unicode code space
 452  */
 453 uint32_t *utf32_casefold_canon(const uint32_t *s, size_t ns, size_t *ndp) {
 454   struct dynstr_ucs4 d;
 455   uint32_t c;
 456   size_t n;
 457   uint32_t *ss = 0;
 458
 459   /* If the canonical decomposition of the string includes any combining
 460    * character that case-folds to a non-combining character then we must
 461    * normalize before we fold.  In Unicode 5.0.0 this means 0345 COMBINING
 462    * GREEK YPOGEGRAMMENI in its decomposition and the various characters that
 463    * canonically decompose to it. */
 464   for(n = 0; n < ns; ++n) {
 465     c = s[n];
 466     if(c < UNICODE_NCHARS
 467        && (unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].flags
 468            & unicode_normalize_before_casefold))
 469       break;
 470   }
 471   if(n < ns) {
 472     /* We need a preliminary decomposition */
 473     if(!(ss = utf32_decompose_canon(s, ns, &ns)))
 474       return 0;
 475     s = ss;
 476   }
 477   dynstr_ucs4_init(&d);
 478   while(ns) {
 479     c = *s++;
 480     if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF)
 481       goto error;
 482     utf32__casefold_one(canon);
 483     --ns;
 484   }
 485   if(utf32__canonical_ordering(d.vec, d.nvec))
 486     goto error;
 487   dynstr_ucs4_terminate(&d);
 488   if(ndp)
 489     *ndp = d.nvec;
 490   return d.vec;
 491 error:
 492   xfree(d.vec);
 493   xfree(ss);
 494   return 0;
 495 }
 496
 497 /** @brief Compatibilit case-fold @p [s,s+ns)
 498  * @param s Pointer to string
 499  * @param ns Length of string
 500  * @param ndp Where to store length of result
 501  * @return Pointer to result string, or NULL
 502  *
 503  * Case-fold the string at @p s according to full default case-folding rules
 504  * (s3.13) for compatibility caseless matching.  The result will be in NFKD.
 505  *
 506  * Returns NULL if the string is not valid for either of the following reasons:
 507  * - it codes for a UTF-16 surrogate
 508  * - it codes for a value outside the unicode code space
 509  */
 510 uint32_t *utf32_casefold_compat(const uint32_t *s, size_t ns, size_t *ndp) {
 511   struct dynstr_ucs4 d;
 512   uint32_t c;
 513   size_t n;
 514   uint32_t *ss = 0;
 515
 516   for(n = 0; n < ns; ++n) {
 517     c = s[n];
 518     if(c < UNICODE_NCHARS
 519        && (unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].flags
 520            & unicode_normalize_before_casefold))
 521       break;
 522   }
 523   if(n < ns) {
 524     /* We need a preliminary _canonical_ decomposition */
 525     if(!(ss = utf32_decompose_canon(s, ns, &ns)))
 526       return 0;
 527     s = ss;
 528   }
 529   /* This computes NFKD(toCaseFold(s)) */
 530 #define compat_casefold_middle() do {                   \
 531   dynstr_ucs4_init(&d);                                 \
 532   while(ns) {                                           \
 533     c = *s++;                                           \
 534     if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF)    \
 535       goto error;                                       \
 536     utf32__casefold_one(compat);                        \
 537     --ns;                                               \
 538   }                                                     \
 539   if(utf32__canonical_ordering(d.vec, d.nvec))          \
 540     goto error;                                         \
 541 } while(0)
 542   /* Do the inner (NFKD o toCaseFold) */
 543   compat_casefold_middle();
 544   /* We can do away with the NFD'd copy of the input now */
 545   xfree(ss);
 546   s = ss = d.vec;
 547   ns = d.nvec;
 548   /* Do the outer (NFKD o toCaseFold) */
 549   compat_casefold_middle();
 550   /* That's all */
 551   dynstr_ucs4_terminate(&d);
 552   if(ndp)
 553     *ndp = d.nvec;
 554   return d.vec;
 555 error:
 556   xfree(d.vec);
 557   xfree(ss);
 558   return 0;
 559 }
 560
 561 /** @brief Order a pair of UTF-32 strings
 562  * @param a First 0-terminated string
 563  * @param b Second 0-terminated string
 564  * @return -1, 0 or 1 for a less than, equal to or greater than b
 565  *
 566  * "Comparable to strcmp() at its best."
 567  */
 568 int utf32_cmp(const uint32_t *a, const uint32_t *b) {
 569   while(*a && *b && *a == *b) {
 570     ++a;
 571     ++b;
 572   }
 573   return *a < *b ? -1 : (*a > *b ? 1 : 0);
 574 }
 575
 576 /** @brief Return the General_Category value for @p c
 577  * @param Code point
 578  * @return General_Category property value
 579  */
 580 static inline enum unicode_General_Category utf32__general_category(uint32_t c) {
 581   if(c < UNICODE_NCHARS) {
 582     const struct unidata *const ud = &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
 583     return ud->general_category;
 584   } else
 585     return unicode_General_Category_Cn;
 586 }
 587
 588 /** @brief Check Grapheme_Cluster_Break property
 589  * @param c Code point
 590  * @return 0 if it is as described, 1 otherwise
 591  */
 592 static int utf32__is_control_or_cr_or_lf(uint32_t c) {
 593   switch(utf32__general_category(c)) {
 594   default:
 595     return 0;
 596   case unicode_General_Category_Zl:
 597   case unicode_General_Category_Zp:
 598   case unicode_General_Category_Cc:
 599     return 1;
 600   case unicode_General_Category_Cf:
 601     if(c == 0x200C || c == 0x200D)
 602       return 0;
 603     return 1;
 604   }
 605 }
 606
 607 #define Hangul_Syllable_Type_NA 0
 608 #define Hangul_Syllable_Type_L 0x1100
 609 #define Hangul_Syllable_Type_V 0x1160
 610 #define Hangul_Syllable_Type_T 0x11A8
 611 #define Hangul_Syllable_Type_LV 0xAC00
 612 #define Hangul_Syllable_Type_LVT 0xAC01
 613
 614 /** @brief Determine Hangul_Syllable_Type of @p c
 615  * @param c Code point
 616  * @return Equivalance class of @p c, or Hangul_Syllable_Type_NA
 617  *
 618  * If this is a Hangul character then a representative member of its
 619  * equivalence class is returned.  Otherwise Hangul_Syllable_Type_NA is
 620  * returned.
 621  */
 622 static uint32_t utf32__hangul_syllable_type(uint32_t c) {
 623   /* Dispose of the bulk of the non-Hangul code points first */
 624   if(c < 0x1100) return Hangul_Syllable_Type_NA;
 625   if(c > 0x1200 && c < 0xAC00) return Hangul_Syllable_Type_NA;
 626   if(c >= 0xD800) return Hangul_Syllable_Type_NA;
 627   /* Now we pick out the assigned Hangul code points */
 628   if((c >= 0x1100 && c <= 0x1159) || c == 0x115F) return Hangul_Syllable_Type_L;
 629   if(c >= 0x1160 && c <= 0x11A2) return Hangul_Syllable_Type_V;
 630   if(c >= 0x11A8 && c <= 0x11F9) return Hangul_Syllable_Type_T;
 631   if(c >= 0xAC00 && c <= 0xD7A3) {
 632     if(c % 28 == 16)
 633       return Hangul_Syllable_Type_LV;
 634     else
 635       return Hangul_Syllable_Type_LVT;
 636   }
 637   return Hangul_Syllable_Type_NA;
 638 }
 639
 640 /** @brief Determine Word_Break property
 641  * @param c Code point
 642  * @return Word_Break property value of @p c
 643  */
 644 static enum unicode_Word_Break utf32__word_break(uint32_t c) {
 645   if(c < 0xAC00 || c > 0xD7A3) {
 646     if(c < UNICODE_NCHARS)
 647       return unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].word_break;
 648     else
 649       return unicode_Word_Break_Other;
 650   } else
 651     return unicode_Word_Break_ALetter;
 652 }
 653
 654 /** @brief Identify a grapheme cluster boundary
 655  * @param s Start of string (must be NFD)
 656  * @param ns Length of string
 657  * @param n Index within string (in [0,ns].)
 658  * @return 1 at a grapheme cluster boundary, 0 otherwise
 659  *
 660  * This function identifies default grapheme cluster boundaries as described in
 661  * UAX #29 s3.  It returns 1 if @p n points at the code point just after a
 662  * grapheme cluster boundary (including the hypothetical code point just after
 663  * the end of the string).
 664  */
 665 int utf32_is_gcb(const uint32_t *s, size_t ns, size_t n) {
 666   uint32_t before, after;
 667   uint32_t hbefore, hafter;
 668   /* GB1 and GB2 */
 669   if(n == 0 || n == ns)
 670     return 1;
 671   /* Now we know that s[n-1] and s[n] are safe to inspect */
 672   /* GB3 */
 673   before = s[n-1];
 674   after = s[n];
 675   if(before == 0x000D && after == 0x000A)
 676     return 0;
 677   /* GB4 and GB5 */
 678   if(utf32__is_control_or_cr_or_lf(before)
 679      || utf32__is_control_or_cr_or_lf(after))
 680     return 1;
 681   hbefore = utf32__hangul_syllable_type(before);
 682   hafter = utf32__hangul_syllable_type(after);
 683   /* GB6 */
 684   if(hbefore == Hangul_Syllable_Type_L
 685      && (hafter == Hangul_Syllable_Type_L
 686          || hafter == Hangul_Syllable_Type_V
 687          || hafter == Hangul_Syllable_Type_LV
 688          || hafter == Hangul_Syllable_Type_LVT))
 689     return 0;
 690   /* GB7 */
 691   if((hbefore == Hangul_Syllable_Type_LV
 692       || hbefore == Hangul_Syllable_Type_V)
 693      && (hafter == Hangul_Syllable_Type_V
 694          || hafter == Hangul_Syllable_Type_T))
 695     return 0;
 696   /* GB8 */
 697   if((hbefore == Hangul_Syllable_Type_LVT
 698       || hbefore == Hangul_Syllable_Type_T)
 699      && hafter == Hangul_Syllable_Type_T)
 700     return 0;
 701   /* GB9 */
 702   if(utf32__word_break(after) == unicode_Word_Break_Extend)
 703     return 0;
 704   /* GB10 */
 705   return 1;
 706 }
 707
 708 /** @brief Return true if @p c is ignorable for boundary specifications */
 709 static inline int utf32__boundary_ignorable(enum unicode_Word_Break wb) {
 710   return (wb == unicode_Word_Break_Extend
 711           || wb == unicode_Word_Break_Format);
 712 }
 713
 714 /** @brief Identify a word boundary
 715  * @param s Start of string (must be NFD)
 716  * @param ns Length of string
 717  * @param n Index within string (in [0,ns].)
 718  * @return 1 at a word boundary, 0 otherwise
 719  *
 720  * This function identifies default word boundaries as described in UAX #29 s4.
 721  * It returns 1 if @p n points at the code point just after a word boundary
 722  * (including the hypothetical code point just after the end of the string).
 723  */
 724 int utf32_is_word_boundary(const uint32_t *s, size_t ns, size_t n) {
 725   enum unicode_Word_Break twobefore, before, after, twoafter;
 726   size_t nn;
 727
 728   /* WB1 and WB2 */
 729   if(n == 0 || n == ns)
 730     return 1;
 731   /* WB3 */
 732   if(s[n-1] == 0x000D && s[n] == 0x000A)
 733     return 0;
 734   /* WB4 */
 735   /* (!Sep) x (Extend|Format) as in UAX #29 s6.2 */
 736   switch(s[n-1]) {                      /* bit of a bodge */
 737   case 0x000A:
 738   case 0x000D:
 739   case 0x0085:
 740   case 0x2028:
 741   case 0x2029:
 742     break;
 743   default:
 744     if(utf32__boundary_ignorable(utf32__word_break(s[n])))
 745       return 0;
 746     break;
 747   }
 748   /* Gather the property values we'll need for the rest of the test taking the
 749    * s6.2 changes into account */
 750   /* First we look at the code points after the proposed boundary */
 751   nn = n;                               /* <ns */
 752   after = utf32__word_break(s[nn++]);
 753   if(!utf32__boundary_ignorable(after)) {
 754     /* X (Extend|Format)* -> X */
 755     while(nn < ns && utf32__boundary_ignorable(utf32__word_break(s[nn])))
 756       ++nn;
 757   }
 758   /* It's possible now that nn=ns */
 759   if(nn < ns)
 760     twoafter = utf32__word_break(s[nn]);
 761   else
 762     twoafter = unicode_Word_Break_Other;
 763
 764   /* Next we look at the code points before the proposed boundary.  This is a
 765    * bit fiddlier. */
 766   nn = n;
 767   while(nn > 0 && utf32__boundary_ignorable(utf32__word_break(s[nn - 1])))
 768     --nn;
 769   if(nn == 0) {
 770     /* s[nn] must be ignorable */
 771     before = utf32__word_break(s[nn]);
 772     twobefore = unicode_Word_Break_Other;
 773   } else {
 774     /* s[nn] is ignorable or after the proposed boundary; but s[nn-1] is not
 775      * ignorable. */
 776     before = utf32__word_break(s[nn - 1]);
 777     --nn;
 778     /* Repeat the exercise */
 779     while(nn > 0 && utf32__boundary_ignorable(utf32__word_break(s[nn - 1])))
 780       --nn;
 781     if(nn == 0)
 782       twobefore = utf32__word_break(s[nn]);
 783     else
 784       twobefore = utf32__word_break(s[nn - 1]);
 785   }
 786
 787   /* WB5 */
 788   if(before == unicode_Word_Break_ALetter
 789      && after == unicode_Word_Break_ALetter)
 790     return 0;
 791   /* WB6 */
 792   if(before == unicode_Word_Break_ALetter
 793      && after == unicode_Word_Break_MidLetter
 794      && twoafter == unicode_Word_Break_ALetter)
 795     return 0;
 796   /* WB7 */
 797   if(twobefore == unicode_Word_Break_ALetter
 798      && before == unicode_Word_Break_MidLetter
 799      && after == unicode_Word_Break_ALetter)
 800     return 0;
 801   /* WB8 */
 802   if(before == unicode_Word_Break_Numeric
 803      && after == unicode_Word_Break_Numeric)
 804     return 0;
 805   /* WB9 */
 806   if(before == unicode_Word_Break_ALetter
 807      && after == unicode_Word_Break_Numeric)
 808     return 0;
 809   /* WB10 */
 810   if(before == unicode_Word_Break_Numeric
 811      && after == unicode_Word_Break_ALetter)
 812     return 0;
 813    /* WB11 */
 814   if(twobefore == unicode_Word_Break_Numeric
 815      && before == unicode_Word_Break_MidNum
 816      && after == unicode_Word_Break_Numeric)
 817     return 0;
 818   /* WB12 */
 819   if(before == unicode_Word_Break_Numeric
 820      && after == unicode_Word_Break_MidNum
 821      && twoafter == unicode_Word_Break_Numeric)
 822     return 0;
 823   /* WB13 */
 824   if(before == unicode_Word_Break_Katakana
 825      && after == unicode_Word_Break_Katakana)
 826     return 0;
 827   /* WB13a */
 828   if((before == unicode_Word_Break_ALetter
 829       || before == unicode_Word_Break_Numeric
 830       || before == unicode_Word_Break_Katakana
 831       || before == unicode_Word_Break_ExtendNumLet)
 832      && after == unicode_Word_Break_ExtendNumLet)
 833     return 0;
 834   /* WB13b */
 835   if(before == unicode_Word_Break_ExtendNumLet
 836      && (after == unicode_Word_Break_ALetter
 837          || after == unicode_Word_Break_Numeric
 838          || after == unicode_Word_Break_Katakana))
 839     return 0;
 840   /* WB14 */
 841   return 1;
 842 }
 843
 844 /*@}*/
 845 /** @defgroup utf8 Functions that operate on UTF-8 strings */
 846 /*@{*/
 847
 848 /** @brief Wrapper to transform a UTF-8 string using the UTF-32 function */
 849 #define utf8__transform(FN) do {                                \
 850   uint32_t *to32 = 0, *decomp32 = 0;                            \
 851   size_t nto32, ndecomp32;                                      \
 852   char *decomp8 = 0;                                            \
 853                                                                 \
 854   if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error;        \
 855   if(!(decomp32 = FN(to32, nto32, &ndecomp32))) goto error;     \
 856   decomp8 = utf32_to_utf8(decomp32, ndecomp32, ndp);            \
 857 error:                                                          \
 858   xfree(to32);                                                  \
 859   xfree(decomp32);                                              \
 860   return decomp8;                                               \
 861 } while(0)
 862
 863 /** @brief Canonically decompose @p [s,s+ns)
 864  * @param s Pointer to string
 865  * @param ns Length of string
 866  * @param ndp Where to store length of result
 867  * @return Pointer to result string, or NULL
 868  *
 869  * Computes the canonical decomposition of a string and stably sorts combining
 870  * characters into canonical order.  The result is in Normalization Form D and
 871  * (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
 872  * NormalizationTest.txt.
 873  *
 874  * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
 875  * this might be.
 876  *
 877  * See also utf32_decompose_canon().
 878  */
 879 char *utf8_decompose_canon(const char *s, size_t ns, size_t *ndp) {
 880   utf8__transform(utf32_decompose_canon);
 881 }
 882
 883 /** @brief Compatibility decompose @p [s,s+ns)
 884  * @param s Pointer to string
 885  * @param ns Length of string
 886  * @param ndp Where to store length of result
 887  * @return Pointer to result string, or NULL
 888  *
 889  * Computes the compatibility decomposition of a string and stably sorts
 890  * combining characters into canonical order.  The result is in Normalization
 891  * Form KD and (at the time of writing!) passes the NFKD tests defined in
 892  * Unicode 5.0's NormalizationTest.txt.
 893  *
 894  * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
 895  * this might be.
 896  *
 897  * See also utf32_decompose_compat().
 898  */
 899 char *utf8_decompose_compat(const char *s, size_t ns, size_t *ndp) {
 900   utf8__transform(utf32_decompose_compat);
 901 }
 902
 903 /** @brief Case-fold @p [s,s+ns)
 904  * @param s Pointer to string
 905  * @param ns Length of string
 906  * @param ndp Where to store length of result
 907  * @return Pointer to result string, or NULL
 908  *
 909  * Case-fold the string at @p s according to full default case-folding rules
 910  * (s3.13).  The result will be in NFD.
 911  *
 912  * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
 913  * this might be.
 914  */
 915 char *utf8_casefold_canon(const char *s, size_t ns, size_t *ndp) {
 916   utf8__transform(utf32_casefold_canon);
 917 }
 918
 919 /** @brief Compatibility case-fold @p [s,s+ns)
 920  * @param s Pointer to string
 921  * @param ns Length of string
 922  * @param ndp Where to store length of result
 923  * @return Pointer to result string, or NULL
 924  *
 925  * Case-fold the string at @p s according to full default case-folding rules
 926  * (s3.13).  The result will be in NFKD.
 927  *
 928  * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
 929  * this might be.
 930  */
 931 char *utf8_casefold_compat(const char *s, size_t ns, size_t *ndp) {
 932   utf8__transform(utf32_casefold_compat);
 933 }
 934
 935 /*@}*/
 936
 937 /*
 938 Local Variables:
 939 c-basic-offset:2
 940 comment-column:40
 941 fill-column:79
 942 indent-tabs-mode:nil
 943 End:
 944 */