chiark - git - mdw - disorder/blob - lib/unicode.c

   1 /*
   2  * This file is part of DisOrder
   3  * Copyright (C) 2007 Richard Kettlewell
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  18  * USA
  19  */
  20 /** @file lib/unicode.c
  21  * @brief Unicode support functions
  22  *
  23  * Here by UTF-8 and UTF-8 we mean the encoding forms of those names (not the
  24  * encoding schemes).
  25  *
  26  * The idea is that all the strings that hit the database will be in a
  27  * particular normalization form, and for the search and tags database
  28  * in case-folded form, so they can be naively compared within the
  29  * database code.
  30  *
  31  * As the code stands this guarantee is not well met!
  32  */
  33
  34 #include <config.h>
  35 #include "types.h"
  36
  37 #include <string.h>
  38 #include <stdio.h>              /* TODO */
  39
  40 #include "mem.h"
  41 #include "vector.h"
  42 #include "unicode.h"
  43 #include "unidata.h"
  44
  45 /** @defgroup utftransform Functions that transform between different Unicode encoding forms */
  46 /*@{*/
  47
  48 /** @brief Convert UTF-32 to UTF-8
  49  * @param s Source string
  50  * @param ns Length of source string in code points
  51  * @param ndp Where to store length of destination string (or NULL)
  52  * @return Newly allocated destination string or NULL on error
  53  *
  54  * If the UTF-32 is not valid then NULL is returned.  A UTF-32 code point is
  55  * invalid if:
  56  * - it codes for a UTF-16 surrogate
  57  * - it codes for a value outside the unicode code space
  58  *
  59  * The return value is always 0-terminated.  The value returned via @p *ndp
  60  * does not include the terminator.
  61  */
  62 char *utf32_to_utf8(const uint32_t *s, size_t ns, size_t *ndp) {
  63   struct dynstr d;
  64   uint32_t c;
  65
  66   dynstr_init(&d);
  67   while(ns > 0) {
  68     c = *s++;
  69     if(c < 0x80)
  70       dynstr_append(&d, c);
  71     else if(c < 0x0800) {
  72       dynstr_append(&d, 0xC0 | (c >> 6));
  73       dynstr_append(&d, 0x80 | (c & 0x3F));
  74     } else if(c < 0x10000) {
  75       if(c >= 0xD800 && c <= 0xDFFF)
  76         goto error;
  77       dynstr_append(&d, 0xE0 | (c >> 12));
  78       dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
  79       dynstr_append(&d, 0x80 | (c & 0x3F));
  80     } else if(c < 0x110000) {
  81       dynstr_append(&d, 0xF0 | (c >> 18));
  82       dynstr_append(&d, 0x80 | ((c >> 12) & 0x3F));
  83       dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
  84       dynstr_append(&d, 0x80 | (c & 0x3F));
  85     } else
  86       goto error;
  87     --ns;
  88   }
  89   dynstr_terminate(&d);
  90   if(ndp)
  91     *ndp = d.nvec;
  92   return d.vec;
  93 error:
  94   xfree(d.vec);
  95   return 0;
  96 }
  97
  98 /** @brief Convert UTF-8 to UTF-32
  99  * @param s Source string
 100  * @param ns Length of source string in code points
 101  * @param ndp Where to store length of destination string (or NULL)
 102  * @return Newly allocated destination string or NULL
 103  *
 104  * The return value is always 0-terminated.  The value returned via @p *ndp
 105  * does not include the terminator.
 106  *
 107  * If the UTF-8 is not valid then NULL is returned.  A UTF-8 sequence
 108  * for a code point is invalid if:
 109  * - it is not the shortest possible sequence for the code point
 110  * - it codes for a UTF-16 surrogate
 111  * - it codes for a value outside the unicode code space
 112  */
 113 uint32_t *utf8_to_utf32(const char *s, size_t ns, size_t *ndp) {
 114   struct dynstr_ucs4 d;
 115   uint32_t c32, c;
 116   const uint8_t *ss = (const uint8_t *)s;
 117
 118   dynstr_ucs4_init(&d);
 119   while(ns > 0) {
 120     c = *ss++;
 121     --ns;
 122     /* Acceptable UTF-8 is that which codes for Unicode Scalar Values
 123      * (Unicode 5.0.0 s3.9 D76)
 124      *
 125      * 0xxxxxxx
 126      * 7 data bits gives 0x00 - 0x7F and all are acceptable
 127      *
 128      * 110xxxxx 10xxxxxx
 129      * 11 data bits gives 0x0000 - 0x07FF but only 0x0080 - 0x07FF acceptable
 130      *
 131      * 1110xxxx 10xxxxxx 10xxxxxx
 132      * 16 data bits gives 0x0000 - 0xFFFF but only 0x0800 - 0xFFFF acceptable
 133      * (and UTF-16 surrogates are not acceptable)
 134      *
 135      * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 136      * 21 data bits gives 0x00000000 - 0x001FFFFF
 137      * but only           0x00010000 - 0x0010FFFF are acceptable
 138      *
 139      * It is NOT always the case that the data bits in the first byte are
 140      * always non-0 for the acceptable values, so we do a separate check after
 141      * decoding.
 142      */
 143     if(c < 0x80)
 144       c32 = c;
 145     else if(c <= 0xDF) {
 146       if(ns < 1) goto error;
 147       c32 = c & 0x1F;
 148       c = *ss++;
 149       if((c & 0xC0) != 0x80) goto error;
 150       c32 = (c32 << 6) | (c & 0x3F);
 151       if(c32 < 0x80) goto error;
 152     } else if(c <= 0xEF) {
 153       if(ns < 2) goto error;
 154       c32 = c & 0x0F;
 155       c = *ss++;
 156       if((c & 0xC0) != 0x80) goto error;
 157       c32 = (c32 << 6) | (c & 0x3F);
 158       c = *ss++;
 159       if((c & 0xC0) != 0x80) goto error;
 160       c32 = (c32 << 6) | (c & 0x3F);
 161       if(c32 < 0x0800 || (c32 >= 0xD800 && c32 <= 0xDFFF)) goto error;
 162     } else if(c <= 0xF7) {
 163       if(ns < 3) goto error;
 164       c32 = c & 0x07;
 165       c = *ss++;
 166       if((c & 0xC0) != 0x80) goto error;
 167       c32 = (c32 << 6) | (c & 0x3F);
 168       c = *ss++;
 169       if((c & 0xC0) != 0x80) goto error;
 170       c32 = (c32 << 6) | (c & 0x3F);
 171       c = *ss++;
 172       if((c & 0xC0) != 0x80) goto error;
 173       c32 = (c32 << 6) | (c & 0x3F);
 174       if(c32 < 0x00010000 || c32 > 0x0010FFFF) goto error;
 175     } else
 176       goto error;
 177     dynstr_ucs4_append(&d, c32);
 178   }
 179   dynstr_ucs4_terminate(&d);
 180   if(ndp)
 181     *ndp = d.nvec;
 182   return d.vec;
 183 error:
 184   xfree(d.vec);
 185   return 0;
 186 }
 187
 188 /*@}*/
 189 /** @defgroup utf32 Functions that operate on UTF-32 strings */
 190 /*@{*/
 191
 192 /** @brief Return the length of a 0-terminated UTF-32 string
 193  * @param s Pointer to 0-terminated string
 194  * @return Length of string in code points (excluding terminator)
 195  *
 196  * Unlike the conversion functions no validity checking is done on the string.
 197  */
 198 size_t utf32_len(const uint32_t *s) {
 199   const uint32_t *t = s;
 200
 201   while(*t)
 202     ++t;
 203   return (size_t)(t - s);
 204 }
 205
 206 /** @brief Return the combining class of @p c
 207  * @param c Code point
 208  * @return Combining class of @p c
 209  */
 210 static inline int utf32__combining_class(uint32_t c) {
 211   if(c < UNICODE_NCHARS)
 212     return unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].ccc;
 213   return 0;
 214 }
 215
 216 /** @brief Stably sort [s,s+ns) into descending order of combining class
 217  * @param s Start of array
 218  * @param ns Number of elements, must be at least 1
 219  * @param buffer Buffer of at least @p ns elements
 220  */
 221 static void utf32__sort_ccc(uint32_t *s, size_t ns, uint32_t *buffer) {
 222   uint32_t *a, *b, *bp;
 223   size_t na, nb;
 224
 225   switch(ns) {
 226   case 1:                       /* 1-element array is always sorted */
 227     return;
 228   case 2:                       /* 2-element arrays are trivial to sort */
 229     if(utf32__combining_class(s[0]) > utf32__combining_class(s[1])) {
 230       uint32_t tmp = s[0];
 231       s[0] = s[1];
 232       s[1] = tmp;
 233     }
 234     return;
 235   default:
 236     /* Partition the array */
 237     na = ns / 2;
 238     nb = ns - na;
 239     a = s;
 240     b = s + na;
 241     /* Sort the two halves of the array */
 242     utf32__sort_ccc(a, na, buffer);
 243     utf32__sort_ccc(b, nb, buffer);
 244     /* Merge them back into one, via the buffer */
 245     bp = buffer;
 246     while(na > 0 && nb > 0) {
 247       /* We want descending order of combining class (hence <)
 248        * and we want stability within combining classes (hence <=)
 249        */
 250       if(utf32__combining_class(*a) <= utf32__combining_class(*b)) {
 251         *bp++ = *a++;
 252         --na;
 253       } else {
 254         *bp++ = *b++;
 255         --nb;
 256       }
 257     }
 258     while(na > 0) {
 259       *bp++ = *a++;
 260       --na;
 261     }
 262     while(nb > 0) {
 263       *bp++ = *b++;
 264       --nb;
 265     }
 266     memcpy(s, buffer,  ns * sizeof(uint32_t));
 267     return;
 268   }
 269 }
 270
 271 /** @brief Put combining characters into canonical order
 272  * @param s Pointer to UTF-32 string
 273  * @param ns Length of @p s
 274  * @return 0 on success, -1 on error
 275  *
 276  * @p s is modified in-place.  See Unicode 5.0 s3.11 for details of the
 277  * ordering.
 278  *
 279  * Currently we only support a maximum of 1024 combining characters after each
 280  * base character.  If this limit is exceeded then -1 is returned.
 281  */
 282 static int utf32__canonical_ordering(uint32_t *s, size_t ns) {
 283   size_t nc;
 284   uint32_t buffer[1024];
 285
 286   /* The ordering amounts to a stable sort of each contiguous group of
 287    * characters with non-0 combining class. */
 288   while(ns > 0) {
 289     /* Skip non-combining characters */
 290     if(utf32__combining_class(*s) == 0) {
 291       ++s;
 292       --ns;
 293       continue;
 294     }
 295     /* We must now have at least one combining character; see how many
 296      * there are */
 297     for(nc = 1; nc < ns && utf32__combining_class(s[nc]) != 0; ++nc)
 298       ;
 299     if(nc > 1024)
 300       return -1;
 301     /* Sort the array */
 302     utf32__sort_ccc(s, nc, buffer);
 303     s += nc;
 304     ns -= nc;
 305   }
 306   return 0;
 307 }
 308
 309 /* Magic numbers from UAX #15 s16 */
 310 #define SBase 0xAC00
 311 #define LBase 0x1100
 312 #define VBase 0x1161
 313 #define TBase 0x11A7
 314 #define LCount 19
 315 #define VCount 21
 316 #define TCount 28
 317 #define NCount (VCount * TCount)
 318 #define SCount (LCount * NCount)
 319
 320 /** @brief Guts of the decomposition lookup functions */
 321 #define utf32__decompose_one_generic(WHICH) do {                        \
 322   const uint32_t *dc =                                                  \
 323     (c < UNICODE_NCHARS                                                 \
 324      ? unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].WHICH          \
 325      : 0);                                                              \
 326   if(dc) {                                                              \
 327     /* Found a canonical decomposition in the table */                  \
 328     while(*dc)                                                          \
 329       utf32__decompose_one_##WHICH(d, *dc++);                           \
 330   } else if(c >= SBase && c < SBase + SCount) {                         \
 331     /* Mechanically decomposable Hangul syllable (UAX #15 s16) */       \
 332     const uint32_t SIndex = c - SBase;                                  \
 333     const uint32_t L = LBase + SIndex / NCount;                         \
 334     const uint32_t V = VBase + (SIndex % NCount) / TCount;              \
 335     const uint32_t T = TBase + SIndex % TCount;                         \
 336     dynstr_ucs4_append(d, L);                                           \
 337     dynstr_ucs4_append(d, V);                                           \
 338     if(T != TBase)                                                      \
 339       dynstr_ucs4_append(d, T);                                         \
 340   } else                                                                \
 341     /* Equal to own canonical decomposition */                          \
 342     dynstr_ucs4_append(d, c);                                           \
 343 } while(0)
 344
 345 /** @brief Recursively compute the canonical decomposition of @p c
 346  * @param d Dynamic string to store decomposition in
 347  * @param c Code point to decompose (must be a valid!)
 348  * @return 0 on success, -1 on error
 349  */
 350 static void utf32__decompose_one_canon(struct dynstr_ucs4 *d, uint32_t c) {
 351   utf32__decompose_one_generic(canon);
 352 }
 353
 354 /** @brief Recursively compute the compatibility decomposition of @p c
 355  * @param d Dynamic string to store decomposition in
 356  * @param c Code point to decompose (must be a valid!)
 357  * @return 0 on success, -1 on error
 358  */
 359 static void utf32__decompose_one_compat(struct dynstr_ucs4 *d, uint32_t c) {
 360   utf32__decompose_one_generic(compat);
 361 }
 362
 363 /** @brief Guts of the decomposition functions */
 364 #define utf32__decompose_generic(WHICH) do {            \
 365   struct dynstr_ucs4 d;                                 \
 366   uint32_t c;                                           \
 367                                                         \
 368   dynstr_ucs4_init(&d);                                 \
 369   while(ns) {                                           \
 370     c = *s++;                                           \
 371     if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF)    \
 372       goto error;                                       \
 373     utf32__decompose_one_##WHICH(&d, c);                \
 374     --ns;                                               \
 375   }                                                     \
 376   if(utf32__canonical_ordering(d.vec, d.nvec))          \
 377     goto error;                                         \
 378   dynstr_ucs4_terminate(&d);                            \
 379   if(ndp)                                               \
 380     *ndp = d.nvec;                                      \
 381   return d.vec;                                         \
 382 error:                                                  \
 383   xfree(d.vec);                                         \
 384   return 0;                                             \
 385 } while(0)
 386
 387 /** @brief Canonically decompose @p [s,s+ns)
 388  * @param s Pointer to string
 389  * @param ns Length of string
 390  * @param ndp Where to store length of result
 391  * @return Pointer to result string, or NULL
 392  *
 393  * Computes the canonical decomposition of a string and stably sorts combining
 394  * characters into canonical order.  The result is in Normalization Form D and
 395  * (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
 396  * NormalizationTest.txt.
 397  *
 398  * Returns NULL if the string is not valid for either of the following reasons:
 399  * - it codes for a UTF-16 surrogate
 400  * - it codes for a value outside the unicode code space
 401  */
 402 uint32_t *utf32_decompose_canon(const uint32_t *s, size_t ns, size_t *ndp) {
 403   utf32__decompose_generic(canon);
 404 }
 405
 406 /** @brief Compatibility decompose @p [s,s+ns)
 407  * @param s Pointer to string
 408  * @param ns Length of string
 409  * @param ndp Where to store length of result
 410  * @return Pointer to result string, or NULL
 411  *
 412  * Computes the compatibility decomposition of a string and stably sorts
 413  * combining characters into canonical order.  The result is in Normalization
 414  * Form KD and (at the time of writing!) passes the NFKD tests defined in
 415  * Unicode 5.0's NormalizationTest.txt.
 416  *
 417  * Returns NULL if the string is not valid for either of the following reasons:
 418  * - it codes for a UTF-16 surrogate
 419  * - it codes for a value outside the unicode code space
 420  */
 421 uint32_t *utf32_decompose_compat(const uint32_t *s, size_t ns, size_t *ndp) {
 422   utf32__decompose_generic(compat);
 423 }
 424
 425 /** @brief Single-character case-fold and decompose operation */
 426 #define utf32__casefold_one(WHICH) do {                                 \
 427   const uint32_t *cf =                                                  \
 428      (c < UNICODE_NCHARS                                                \
 429       ? unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].casefold      \
 430       : 0);                                                             \
 431   if(cf) {                                                              \
 432     /* Found a case-fold mapping in the table */                        \
 433     while(*cf)                                                          \
 434       utf32__decompose_one_##WHICH(&d, *cf++);                          \
 435   } else                                                                \
 436     utf32__decompose_one_##WHICH(&d, c);                                \
 437 } while(0)
 438
 439 /** @brief Case-fold @p [s,s+ns)
 440  * @param s Pointer to string
 441  * @param ns Length of string
 442  * @param ndp Where to store length of result
 443  * @return Pointer to result string, or NULL
 444  *
 445  * Case-fold the string at @p s according to full default case-folding rules
 446  * (s3.13) for caseless matching.  The result will be in NFD.
 447  *
 448  * Returns NULL if the string is not valid for either of the following reasons:
 449  * - it codes for a UTF-16 surrogate
 450  * - it codes for a value outside the unicode code space
 451  */
 452 uint32_t *utf32_casefold_canon(const uint32_t *s, size_t ns, size_t *ndp) {
 453   struct dynstr_ucs4 d;
 454   uint32_t c;
 455   size_t n;
 456   uint32_t *ss = 0;
 457
 458   /* If the canonical decomposition of the string includes any combining
 459    * character that case-folds to a non-combining character then we must
 460    * normalize before we fold.  In Unicode 5.0.0 this means 0345 COMBINING
 461    * GREEK YPOGEGRAMMENI in its decomposition and the various characters that
 462    * canonically decompose to it. */
 463   for(n = 0; n < ns; ++n) {
 464     c = s[n];
 465     if(c < UNICODE_NCHARS
 466        && (unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].flags
 467            & unicode_normalize_before_casefold))
 468       break;
 469   }
 470   if(n < ns) {
 471     /* We need a preliminary decomposition */
 472     if(!(ss = utf32_decompose_canon(s, ns, &ns)))
 473       return 0;
 474     s = ss;
 475   }
 476   dynstr_ucs4_init(&d);
 477   while(ns) {
 478     c = *s++;
 479     if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF)
 480       goto error;
 481     utf32__casefold_one(canon);
 482     --ns;
 483   }
 484   if(utf32__canonical_ordering(d.vec, d.nvec))
 485     goto error;
 486   dynstr_ucs4_terminate(&d);
 487   if(ndp)
 488     *ndp = d.nvec;
 489   return d.vec;
 490 error:
 491   xfree(d.vec);
 492   xfree(ss);
 493   return 0;
 494 }
 495
 496 /** @brief Compatibilit case-fold @p [s,s+ns)
 497  * @param s Pointer to string
 498  * @param ns Length of string
 499  * @param ndp Where to store length of result
 500  * @return Pointer to result string, or NULL
 501  *
 502  * Case-fold the string at @p s according to full default case-folding rules
 503  * (s3.13) for compatibility caseless matching.  The result will be in NFKD.
 504  *
 505  * Returns NULL if the string is not valid for either of the following reasons:
 506  * - it codes for a UTF-16 surrogate
 507  * - it codes for a value outside the unicode code space
 508  */
 509 uint32_t *utf32_casefold_compat(const uint32_t *s, size_t ns, size_t *ndp) {
 510   struct dynstr_ucs4 d;
 511   uint32_t c;
 512   size_t n;
 513   uint32_t *ss = 0;
 514
 515   for(n = 0; n < ns; ++n) {
 516     c = s[n];
 517     if(c < UNICODE_NCHARS
 518        && (unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].flags
 519            & unicode_normalize_before_casefold))
 520       break;
 521   }
 522   if(n < ns) {
 523     /* We need a preliminary _canonical_ decomposition */
 524     if(!(ss = utf32_decompose_canon(s, ns, &ns)))
 525       return 0;
 526     s = ss;
 527   }
 528   /* This computes NFKD(toCaseFold(s)) */
 529 #define compat_casefold_middle() do {                   \
 530   dynstr_ucs4_init(&d);                                 \
 531   while(ns) {                                           \
 532     c = *s++;                                           \
 533     if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF)    \
 534       goto error;                                       \
 535     utf32__casefold_one(compat);                        \
 536     --ns;                                               \
 537   }                                                     \
 538   if(utf32__canonical_ordering(d.vec, d.nvec))          \
 539     goto error;                                         \
 540 } while(0)
 541   /* Do the inner (NFKD o toCaseFold) */
 542   compat_casefold_middle();
 543   /* We can do away with the NFD'd copy of the input now */
 544   xfree(ss);
 545   s = ss = d.vec;
 546   ns = d.nvec;
 547   /* Do the outer (NFKD o toCaseFold) */
 548   compat_casefold_middle();
 549   /* That's all */
 550   dynstr_ucs4_terminate(&d);
 551   if(ndp)
 552     *ndp = d.nvec;
 553   return d.vec;
 554 error:
 555   xfree(d.vec);
 556   xfree(ss);
 557   return 0;
 558 }
 559
 560 /** @brief Order a pair of UTF-32 strings
 561  * @param a First 0-terminated string
 562  * @param b Second 0-terminated string
 563  * @return -1, 0 or 1 for a less than, equal to or greater than b
 564  *
 565  * "Comparable to strcmp() at its best."
 566  */
 567 int utf32_cmp(const uint32_t *a, const uint32_t *b) {
 568   while(*a && *b && *a == *b) {
 569     ++a;
 570     ++b;
 571   }
 572   return *a < *b ? -1 : (*a > *b ? 1 : 0);
 573 }
 574
 575 /*@}*/
 576 /** @defgroup Functions that operate on UTF-8 strings */
 577 /*@{*/
 578
 579 /** @brief Wrapper to transform a UTF-8 string using the UTF-32 function */
 580 #define utf8__transform(FN) do {                                \
 581   uint32_t *to32 = 0, *decomp32 = 0;                            \
 582   size_t nto32, ndecomp32;                                      \
 583   char *decomp8 = 0;                                            \
 584                                                                 \
 585   if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error;        \
 586   if(!(decomp32 = FN(to32, nto32, &ndecomp32))) goto error;     \
 587   decomp8 = utf32_to_utf8(decomp32, ndecomp32, ndp);            \
 588 error:                                                          \
 589   xfree(to32);                                                  \
 590   xfree(decomp32);                                              \
 591   return decomp8;                                               \
 592 } while(0)
 593
 594 /** @brief Canonically decompose @p [s,s+ns)
 595  * @param s Pointer to string
 596  * @param ns Length of string
 597  * @param ndp Where to store length of result
 598  * @return Pointer to result string, or NULL
 599  *
 600  * Computes the canonical decomposition of a string and stably sorts combining
 601  * characters into canonical order.  The result is in Normalization Form D and
 602  * (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
 603  * NormalizationTest.txt.
 604  *
 605  * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
 606  * this might be.
 607  *
 608  * See also utf32_decompose_canon().
 609  */
 610 char *utf8_decompose_canon(const char *s, size_t ns, size_t *ndp) {
 611   utf8__transform(utf32_decompose_canon);
 612 }
 613
 614 /** @brief Compatibility decompose @p [s,s+ns)
 615  * @param s Pointer to string
 616  * @param ns Length of string
 617  * @param ndp Where to store length of result
 618  * @return Pointer to result string, or NULL
 619  *
 620  * Computes the compatibility decomposition of a string and stably sorts
 621  * combining characters into canonical order.  The result is in Normalization
 622  * Form KD and (at the time of writing!) passes the NFKD tests defined in
 623  * Unicode 5.0's NormalizationTest.txt.
 624  *
 625  * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
 626  * this might be.
 627  *
 628  * See also utf32_decompose_compat().
 629  */
 630 char *utf8_decompose_compat(const char *s, size_t ns, size_t *ndp) {
 631   utf8__transform(utf32_decompose_compat);
 632 }
 633
 634 /** @brief Case-fold @p [s,s+ns)
 635  * @param s Pointer to string
 636  * @param ns Length of string
 637  * @param ndp Where to store length of result
 638  * @return Pointer to result string, or NULL
 639  *
 640  * Case-fold the string at @p s according to full default case-folding rules
 641  * (s3.13).  The result will be in NFD.
 642  *
 643  * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
 644  * this might be.
 645  */
 646 char *utf8_casefold_canon(const char *s, size_t ns, size_t *ndp) {
 647   utf8__transform(utf32_casefold_canon);
 648 }
 649
 650 /** @brief Compatibility case-fold @p [s,s+ns)
 651  * @param s Pointer to string
 652  * @param ns Length of string
 653  * @param ndp Where to store length of result
 654  * @return Pointer to result string, or NULL
 655  *
 656  * Case-fold the string at @p s according to full default case-folding rules
 657  * (s3.13).  The result will be in NFKD.
 658  *
 659  * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
 660  * this might be.
 661  */
 662 char *utf8_casefold_compat(const char *s, size_t ns, size_t *ndp) {
 663   utf8__transform(utf32_casefold_compat);
 664 }
 665
 666 /*@}*/
 667
 668 /*
 669 Local Variables:
 670 c-basic-offset:2
 671 comment-column:40
 672 fill-column:79
 673 indent-tabs-mode:nil
 674 End:
 675 */