[disorder] / lib / unicode.c

/*
 * This file is part of DisOrder
 * Copyright (C) 2007 Richard Kettlewell
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 */
/** @file lib/unicode.c
 * @brief Unicode support functions
 *
 * Here by UTF-8 and UTF-32 we mean the encoding forms of those names (not the
 * encoding schemes).
 *
 * The idea is that all the strings that hit the database will be in a
 * particular normalization form, and for the search and tags database
 * in case-folded form, so they can be naively compared within the
 * database code.
 *
 * As the code stands this guarantee is not well met!
 */

#include <config.h>
#include "types.h"

#include <string.h>
#include <stdio.h>		/* TODO */

#include "mem.h"
#include "vector.h"
#include "unicode.h"
#include "unidata.h"

/** @defgroup utftransform Functions that transform between different Unicode encoding forms */
/*@{*/

/** @brief Convert UTF-32 to UTF-8
 * @param s Source string
 * @param ns Length of source string in code points
 * @param ndp Where to store length of destination string (or NULL)
 * @return Newly allocated destination string or NULL on error
 *
 * If the UTF-32 is not valid then NULL is returned.  A UTF-32 code
 * point is invalid if:
 * - it codes for a UTF-16 surrogate
 * - it codes for a value outside the unicode code space
 *
 * The return value is always 0-terminated.  The value returned via @p
 * *ndp does not include the terminator.  @p *ndp is only written on
 * success.
 */
char *utf32_to_utf8(const uint32_t *s, size_t ns, size_t *ndp) {
  struct dynstr d;
  uint32_t c;

  dynstr_init(&d);
  while(ns > 0) {
    c = *s++;
    if(c < 0x80)
      /* 0xxxxxxx */
      dynstr_append(&d, c);
    else if(c < 0x0800) {
      /* 110xxxxx 10xxxxxx */
      dynstr_append(&d, 0xC0 | (c >> 6));
      dynstr_append(&d, 0x80 | (c & 0x3F));
    } else if(c < 0x10000) {
      /* U+D800..U+DFFF are UTF-16 surrogates and must not be encoded */
      if(c >= 0xD800 && c <= 0xDFFF)
        goto error;
      /* 1110xxxx 10xxxxxx 10xxxxxx */
      dynstr_append(&d, 0xE0 | (c >> 12));
      dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
      dynstr_append(&d, 0x80 | (c & 0x3F));
    } else if(c < 0x110000) {
      /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
      dynstr_append(&d, 0xF0 | (c >> 18));
      dynstr_append(&d, 0x80 | ((c >> 12) & 0x3F));
      dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
      dynstr_append(&d, 0x80 | (c & 0x3F));
    } else
      /* Beyond U+10FFFF */
      goto error;
    --ns;
  }
  dynstr_terminate(&d);
  if(ndp)
    *ndp = d.nvec;
  return d.vec;
error:
  xfree(d.vec);
  return 0;
}

/** @brief Convert UTF-8 to UTF-32
 * @param s Source string
 * @param ns Length of source string in bytes
 * @param ndp Where to store length of destination string (or NULL)
 * @return Newly allocated destination string or NULL
 *
 * The return value is always 0-terminated.  The value returned via @p
 * *ndp does not include the terminator.  @p *ndp is only written on
 * success.
 *
 * If the UTF-8 is not valid then NULL is returned.  A UTF-8 sequence
 * for a code point is invalid if:
 * - it is not the shortest possible sequence for the code point
 * - it codes for a UTF-16 surrogate
 * - it codes for a value outside the unicode code space
 * - it starts with a continuation byte (10xxxxxx)
 * - it is truncated, i.e. it would extend beyond the @p ns input bytes
 */
uint32_t *utf8_to_utf32(const char *s, size_t ns, size_t *ndp) {
  struct dynstr_ucs4 d;
  uint32_t c32, c, lowest;
  size_t ncont;
  const uint8_t *ss = (const uint8_t *)s;

  dynstr_ucs4_init(&d);
  while(ns > 0) {
    c = *ss++;
    --ns;
    /*
     * Acceptable UTF-8 is:
     *
     * 0xxxxxxx
     * 7 data bits gives 0x00 - 0x7F and all are acceptable
     *
     * 110xxxxx 10xxxxxx
     * 11 data bits gives 0x0000 - 0x07FF but only 0x0080 - 0x07FF acceptable
     *
     * 1110xxxx 10xxxxxx 10xxxxxx
     * 16 data bits gives 0x0000 - 0xFFFF but only 0x0800 - 0xFFFF acceptable
     * (and UTF-16 surrogates are not acceptable)
     *
     * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 21 data bits gives 0x00000000 - 0x001FFFFF
     * but only           0x00010000 - 0x0010FFFF are acceptable
     *
     * The lead byte alone does not tell us whether the value is in the
     * acceptable range, so the range check is done after decoding.
     */
    if(c < 0x80) {
      c32 = c;
      ncont = 0;
      lowest = 0;
    } else if(c < 0xC0) {
      /* 10xxxxxx is a continuation byte and cannot start a sequence */
      goto error;
    } else if(c <= 0xDF) {
      c32 = c & 0x1F;
      ncont = 1;
      lowest = 0x80;
    } else if(c <= 0xEF) {
      c32 = c & 0x0F;
      ncont = 2;
      lowest = 0x0800;
    } else if(c <= 0xF7) {
      c32 = c & 0x07;
      ncont = 3;
      lowest = 0x00010000;
    } else
      goto error;
    /* Make sure the whole sequence lies inside the input, and account for
     * the continuation bytes so that we never read past the end */
    if(ns < ncont) goto error;
    ns -= ncont;
    while(ncont > 0) {
      c = *ss++;
      if((c & 0xC0) != 0x80) goto error;
      c32 = (c32 << 6) | (c & 0x3F);
      --ncont;
    }
    /* Reject over-long encodings, surrogates and out-of-range values */
    if(c32 < lowest) goto error;
    if(c32 >= 0xD800 && c32 <= 0xDFFF) goto error;
    if(c32 > 0x0010FFFF) goto error;
    dynstr_ucs4_append(&d, c32);
  }
  dynstr_ucs4_terminate(&d);
  if(ndp)
    *ndp = d.nvec;
  return d.vec;
error:
  xfree(d.vec);
  return 0;
}

/*@}*/
/** @defgroup utf32 Functions that operate on UTF-32 strings */
/*@{*/

/** @brief Count the code points in a 0-terminated UTF-32 string
 * @param s Pointer to 0-terminated string
 * @return Number of code points before the terminator
 *
 * The string is not validated in any way; every non-zero element
 * counts as one code point.
 */
size_t utf32_len(const uint32_t *s) {
  size_t n = 0;

  while(s[n] != 0)
    n++;
  return n;
}

/** @brief Look up the combining class of @p c
 * @param c Code point
 * @return Combining class recorded for @p c in @c unidata, or 0 if @p c
 * lies beyond the end of the table
 */
static inline int utf32__combining_class(uint32_t c) {
  return (c < UNICODE_NCHARS
          ? unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].ccc
          : 0);
}

/** @brief Stably sort [s,s+ns) into ascending order of combining class
 * @param s Start of array
 * @param ns Number of elements
 * @param buffer Scratch buffer of at least @p ns elements
 *
 * This is a recursive merge sort.  Elements with the same combining class
 * keep their original relative order.  An empty array is accepted and left
 * alone.
 */
static void utf32__sort_ccc(uint32_t *s, size_t ns, uint32_t *buffer) {
  uint32_t *a, *b, *bp;
  size_t na, nb;

  switch(ns) {
  case 0:                       /* nothing to do; without this case the
                                 * default branch would recurse forever */
  case 1:                       /* 1-element array is always sorted */
    return;
  case 2:                       /* 2-element arrays are trivial to sort */
    if(utf32__combining_class(s[0]) > utf32__combining_class(s[1])) {
      uint32_t tmp = s[0];
      s[0] = s[1];
      s[1] = tmp;
    }
    return;
  default:
    /* Partition the array */
    na = ns / 2;
    nb = ns - na;
    a = s;
    b = s + na;
    /* Sort the two halves of the array; the scratch buffer can be shared
     * because each recursive call has finished with it when it returns */
    utf32__sort_ccc(a, na, buffer);
    utf32__sort_ccc(b, nb, buffer);
    /* Merge them back into one, via the buffer */
    bp = buffer;
    while(na > 0 && nb > 0) {
      /* We want ascending order of combining class, and on a tie we take
       * from the left half first (hence <=) so that the sort is stable */
      if(utf32__combining_class(*a) <= utf32__combining_class(*b)) {
        *bp++ = *a++;
        --na;
      } else {
        *bp++ = *b++;
        --nb;
      }
    }
    /* At most one of the halves still has elements; copy them across */
    while(na > 0) {
      *bp++ = *a++;
      --na;
    }
    while(nb > 0) {
      *bp++ = *b++;
      --nb;
    }
    memcpy(s, buffer,  ns * sizeof(uint32_t));
    return;
  }
}

/** @brief Rearrange combining characters into canonical order
 * @param s Pointer to UTF-32 string
 * @param ns Length of @p s
 * @return 0 on success, -1 on error
 *
 * The string is reordered in place; see Unicode 5.0 s3.11 for the
 * definition of the ordering.
 *
 * A run of more than 1024 consecutive characters with non-zero combining
 * class is not supported; if one is found then -1 is returned.  Runs
 * earlier in the string will already have been reordered at that point.
 */
static int utf32__canonical_ordering(uint32_t *s, size_t ns) {
  uint32_t scratch[1024];
  size_t pos = 0;

  /* Each maximal run of characters with non-0 combining class is stably
   * sorted on its own; everything else stays where it is. */
  while(pos < ns) {
    size_t run;

    if(utf32__combining_class(s[pos]) == 0) {
      /* Not a combining character, nothing to reorder here */
      ++pos;
      continue;
    }
    /* s[pos] starts a run; measure how far it extends */
    run = 1;
    while(pos + run < ns && utf32__combining_class(s[pos + run]) != 0)
      ++run;
    if(run > 1024)
      return -1;
    utf32__sort_ccc(s + pos, run, scratch);
    pos += run;
  }
  return 0;
}

/* Magic numbers from UAX #15 s16 */
#define SBase 0xAC00
#define LBase 0x1100
#define VBase 0x1161
#define TBase 0x11A7
#define LCount 19
#define VCount 21
#define TCount 28
#define NCount (VCount * TCount)
#define SCount (LCount * NCount)

/** @brief Guts of the decomposition lookup functions */
#define utf32__decompose_one_generic(WHICH) do {                        \
  const uint32_t *dc =                                                  \
    (c < UNICODE_NCHARS                                                 \
     ? unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].WHICH          \
     : 0);                                                              \
  if(dc) {                                                              \
    /* Found a canonical decomposition in the table */                  \
    while(*dc)                                                          \
      utf32__decompose_one_##WHICH(d, *dc++);                           \
  } else if(c >= SBase && c < SBase + SCount) {                         \
    /* Mechanically decomposable Hangul syllable (UAX #15 s16) */       \
    const uint32_t SIndex = c - SBase;                                  \
    const uint32_t L = LBase + SIndex / NCount;                         \
    const uint32_t V = VBase + (SIndex % NCount) / TCount;              \
    const uint32_t T = TBase + SIndex % TCount;                         \
    dynstr_ucs4_append(d, L);                                           \
    dynstr_ucs4_append(d, V);                                           \
    if(T != TBase)                                                      \
      dynstr_ucs4_append(d, T);                                         \
  } else                                                                \
    /* Equal to own canonical decomposition */                          \
    dynstr_ucs4_append(d, c);                                           \
} while(0)

/** @brief Recursively compute the canonical decomposition of @p c
 * @param d Dynamic string to append the decomposition to
 * @param c Code point to decompose (must be valid!)
 *
 * Nothing is returned; the result is appended to @p d.  The work is done by
 * utf32__decompose_one_generic(), which calls back into this function for
 * each element of a table-supplied decomposition.
 */
static void utf32__decompose_one_canon(struct dynstr_ucs4 *d, uint32_t c) {
  utf32__decompose_one_generic(canon);
}

/** @brief Recursively compute the compatibility decomposition of @p c
 * @param d Dynamic string to append the decomposition to
 * @param c Code point to decompose (must be valid!)
 *
 * Nothing is returned; the result is appended to @p d.  The work is done by
 * utf32__decompose_one_generic(), which calls back into this function for
 * each element of a table-supplied decomposition.
 */
static void utf32__decompose_one_compat(struct dynstr_ucs4 *d, uint32_t c) {
  utf32__decompose_one_generic(compat);
}

/** @brief Guts of the decomposition functions
 *
 * Expands to the complete body of a function with parameters
 * <code>const uint32_t *s</code>, <code>size_t ns</code> and <code>size_t
 * *ndp</code> that returns <code>uint32_t *</code>.  @p WHICH selects the
 * utf32__decompose_one_WHICH() function used for each input code point.
 *
 * The expansion returns the 0-terminated result, storing its length via
 * @c ndp if that is not NULL, or returns 0 if the input contains a UTF-16
 * surrogate or a value above 0x10FFFF, or if utf32__canonical_ordering()
 * fails.
 */
#define utf32__decompose_generic(WHICH) do {            \
  struct dynstr_ucs4 d;                                 \
  uint32_t c;                                           \
                                                        \
  dynstr_ucs4_init(&d);                                 \
  while(ns) {                                           \
    c = *s++;                                           \
    /* Surrogates are U+D800..U+DFFF */                 \
    if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF)    \
      goto error;                                       \
    utf32__decompose_one_##WHICH(&d, c);                \
    --ns;                                               \
  }                                                     \
  if(utf32__canonical_ordering(d.vec, d.nvec))          \
    goto error;                                         \
  dynstr_ucs4_terminate(&d);                            \
  if(ndp)                                               \
    *ndp = d.nvec;                                      \
  return d.vec;                                         \
error:                                                  \
  xfree(d.vec);                                         \
  return 0;                                             \
} while(0)

/** @brief Canonically decompose @p [s,s+ns)
 * @param s Pointer to string
 * @param ns Length of string in code points
 * @param ndp Where to store length of result (or NULL)
 * @return Pointer to newly allocated 0-terminated result string, or NULL
 *
 * Computes the canonical decomposition of a string and stably sorts combining
 * characters into canonical order.  The result is in Normalization Form D and
 * (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
 * NormalizationTest.txt.
 *
 * Returns NULL if the string is not valid for either of the following
 * reasons:
 * - it codes for a UTF-16 surrogate
 * - it codes for a value outside the unicode code space
 *
 * NULL is also returned if utf32__canonical_ordering() fails, which happens
 * when the decomposition contains more than 1024 consecutive combining
 * characters.
 *
 * The whole body is supplied by utf32__decompose_generic().
 */
uint32_t *utf32_decompose_canon(const uint32_t *s, size_t ns, size_t *ndp) {
  utf32__decompose_generic(canon);
}

/** @brief Compatibility decompose @p [s,s+ns)
 * @param s Pointer to string
 * @param ns Length of string in code points
 * @param ndp Where to store length of result (or NULL)
 * @return Pointer to newly allocated 0-terminated result string, or NULL
 *
 * Computes the compatibility decomposition of a string and stably sorts
 * combining characters into canonical order.  The result is in Normalization
 * Form KD and (at the time of writing!) passes the NFKD tests defined in
 * Unicode 5.0's NormalizationTest.txt.
 *
 * Returns NULL if the string is not valid for either of the following
 * reasons:
 * - it codes for a UTF-16 surrogate
 * - it codes for a value outside the unicode code space
 *
 * NULL is also returned if utf32__canonical_ordering() fails, which happens
 * when the decomposition contains more than 1024 consecutive combining
 * characters.
 *
 * The whole body is supplied by utf32__decompose_generic().
 */
uint32_t *utf32_decompose_compat(const uint32_t *s, size_t ns, size_t *ndp) {
  utf32__decompose_generic(compat);
}

/** @brief Case-fold @p c and canonically decompose the result
 * @param d String to append to
 * @param c Character to fold
 *
 * If @c unidata has a case-fold mapping for @p c then each code point of
 * the mapping is canonically decomposed onto @p d; otherwise @p c itself
 * is.
 */
static inline void utf32__casefold_one_canon(struct dynstr_ucs4 *d, uint32_t c) {
  const uint32_t *folded = 0;

  if(c < UNICODE_NCHARS)
    folded = unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].casefold;
  if(!folded) {
    /* No case-fold mapping in the table: decompose the character as-is */
    utf32__decompose_one_canon(d, c);
    return;
  }
  /* The mapping is a 0-terminated sequence of code points */
  for(; *folded; ++folded)
    utf32__decompose_one_canon(d, *folded);
}

/** @brief Case-fold @p [s,s+ns)
 * @param s Pointer to string
 * @param ns Length of string in code points
 * @param ndp Where to store length of result (or NULL)
 * @return Pointer to newly allocated 0-terminated result string, or NULL
 *
 * Case-fold the string at @p s according to full default case-folding rules
 * (s3.13).  The result will be in NFD.
 *
 * Returns NULL if the string is not valid for either of the following
 * reasons:
 * - it codes for a UTF-16 surrogate
 * - it codes for a value outside the unicode code space
 *
 * NULL is also returned if utf32__canonical_ordering() fails, which happens
 * when the result contains more than 1024 consecutive combining characters.
 */
uint32_t *utf32_casefold_canon(const uint32_t *s, size_t ns, size_t *ndp) {
  struct dynstr_ucs4 d;
  uint32_t c;
  size_t n;
  uint32_t *ss = 0;             /* preliminary decomposition, if any */

  /* If the canonical decomposition of the string includes any combining
   * character that case-folds to a non-combining character then we must
   * normalize before we fold.  In Unicode 5.0.0 this means 0345 COMBINING
   * GREEK YPOGEGRAMMENI in its decomposition and the various characters that
   * canonically decompose to it. */
  for(n = 0; n < ns; ++n) {
    c = s[n];
    if(c < UNICODE_NCHARS
       && (unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].flags
           & unicode_normalize_before_casefold))
      break;
  }
  if(n < ns) {
    /* We need a preliminary decomposition; from here on s and ns describe
     * the decomposed copy rather than the caller's string */
    if(!(ss = utf32_decompose_canon(s, ns, &ns)))
      return 0;
    s = ss;
  }
  dynstr_ucs4_init(&d);
  while(ns) {
    c = *s++;
    /* Surrogates are U+D800..U+DFFF */
    if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF)
      goto error;
    utf32__casefold_one_canon(&d, c);
    --ns;
  }
  if(utf32__canonical_ordering(d.vec, d.nvec))
    goto error;
  dynstr_ucs4_terminate(&d);
  if(ndp)
    *ndp = d.nvec;
  /* The preliminary decomposition is no longer needed; release it here just
   * as the error path does */
  xfree(ss);
  return d.vec;
error:
  xfree(d.vec);
  xfree(ss);
  return 0;
}

/** @brief Compare two UTF-32 strings code point by code point
 * @param a First 0-terminated string
 * @param b Second 0-terminated string
 * @return -1, 0 or 1 for a less than, equal to or greater than b
 *
 * The strings are compared by the numeric value of the first code point at
 * which they differ; a string that is a proper prefix of the other orders
 * first.
 *
 * "Comparable to strcmp() at its best."
 */
int utf32_cmp(const uint32_t *a, const uint32_t *b) {
  size_t i = 0;

  for(;;) {
    const uint32_t ca = a[i], cb = b[i];

    if(ca != cb)
      return ca < cb ? -1 : 1;
    if(ca == 0)
      /* Both strings ended at the same place */
      return 0;
    ++i;
  }
}

/*@}*/
/** @defgroup utf8 Functions that operate on UTF-8 strings */
/*@{*/

/** @brief Apply a UTF-32 transformation to a UTF-8 string
 *
 * Expands to the complete body of a function with parameters
 * <code>const char *s</code>, <code>size_t ns</code> and <code>size_t
 * *ndp</code> that returns <code>char *</code>.  The input is converted
 * with utf8_to_utf32(), passed through @p FN, and the result converted back
 * with utf32_to_utf8().  Both intermediate UTF-32 strings are freed before
 * returning.  If any of the three steps fails the expansion returns 0.
 */
#define utf8__transform(FN) do {                                \
  char *result8 = 0;                                            \
  size_t nwide, nresult32;                                      \
  uint32_t *result32 = 0;                                       \
  uint32_t *wide = utf8_to_utf32(s, ns, &nwide);                \
                                                                \
  if(wide) {                                                    \
    result32 = FN(wide, nwide, &nresult32);                     \
    if(result32)                                                \
      result8 = utf32_to_utf8(result32, nresult32, ndp);        \
  }                                                             \
  xfree(wide);                                                  \
  xfree(result32);                                              \
  return result8;                                               \
} while(0)

/** @brief Canonically decompose @p [s,s+ns)
 * @param s Pointer to string
 * @param ns Length of string in bytes
 * @param ndp Where to store length of result (or NULL)
 * @return Pointer to newly allocated 0-terminated result string, or NULL
 *
 * Computes the canonical decomposition of a string and stably sorts combining
 * characters into canonical order.  The result is in Normalization Form D and
 * (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
 * NormalizationTest.txt.
 *
 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
 * this might be.  NULL is also returned if utf32_decompose_canon() fails.
 *
 * The work is done by converting to UTF-32, calling utf32_decompose_canon()
 * and converting back; see utf8__transform().
 */
char *utf8_decompose_canon(const char *s, size_t ns, size_t *ndp) {
  utf8__transform(utf32_decompose_canon);
}

/** @brief Compatibility decompose @p [s,s+ns)
 * @param s Pointer to string
 * @param ns Length of string in bytes
 * @param ndp Where to store length of result (or NULL)
 * @return Pointer to newly allocated 0-terminated result string, or NULL
 *
 * Computes the compatibility decomposition of a string and stably sorts
 * combining characters into canonical order.  The result is in Normalization
 * Form KD and (at the time of writing!) passes the NFKD tests defined in
 * Unicode 5.0's NormalizationTest.txt.
 *
 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
 * this might be.  NULL is also returned if utf32_decompose_compat() fails.
 *
 * The work is done by converting to UTF-32, calling utf32_decompose_compat()
 * and converting back; see utf8__transform().
 */
char *utf8_decompose_compat(const char *s, size_t ns, size_t *ndp) {
  utf8__transform(utf32_decompose_compat);
}

/** @brief Case-fold @p [s,s+ns)
 * @param s Pointer to string
 * @param ns Length of string in bytes
 * @param ndp Where to store length of result (or NULL)
 * @return Pointer to newly allocated 0-terminated result string, or NULL
 *
 * Case-fold the string at @p s according to full default case-folding rules
 * (s3.13).  The result will be in NFD.
 *
 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
 * this might be.  NULL is also returned if utf32_casefold_canon() fails.
 *
 * The work is done by converting to UTF-32, calling utf32_casefold_canon()
 * and converting back; see utf8__transform().
 */
char *utf8_casefold_canon(const char *s, size_t ns, size_t *ndp) {
  utf8__transform(utf32_casefold_canon);
}

/** @brief Compatibility case-fold @p [s,s+ns)
 * @param s Pointer to string
 * @param ns Length of string in bytes
 * @param ndp Where to store length of result (or NULL)
 * @return Pointer to result string, or NULL
 *
 * Case-fold the string at @p s according to full default case-folding rules
 * (s3.13).  The result will be in NFKD.
 *
 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
 * this might be.
 *
 * NOTE: this function is currently compiled out.  It relies on
 * utf32_casefold_compat(), for which no definition appears in this file --
 * presumably that has to be written before the <code>#if 0</code> can be
 * removed (TODO confirm).
 */
#if 0
char *utf8_casefold_compat(const char *s, size_t ns, size_t *ndp) {
  utf8__transform(utf32_casefold_compat);
}
#endif

/*@}*/

/*
Local Variables:
c-basic-offset:2
comment-column:40
fill-column:79
indent-tabs-mode:nil
End:
*/
Commit	Line	Data
e5a5a138 RK	1	/*
	2	* This file is part of DisOrder
	3	* Copyright (C) 2007 Richard Kettlewell
	4	*
	5	* This program is free software; you can redistribute it and/or modify
	6	* it under the terms of the GNU General Public License as published by
	7	* the Free Software Foundation; either version 2 of the License, or
	8	* (at your option) any later version.
	9	*
	10	* This program is distributed in the hope that it will be useful, but
	11	* WITHOUT ANY WARRANTY; without even the implied warranty of
	12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	* General Public License for more details.
	14	*
	15	* You should have received a copy of the GNU General Public License
	16	* along with this program; if not, write to the Free Software
	17	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
	18	* USA
	19	*/
	20	/** @file lib/unicode.c
	21	* @brief Unicode support functions
	22	*
	23	* Here by UTF-8 and UTF-8 we mean the encoding forms of those names (not the
	24	* encoding schemes).
	25	*
	26	* The idea is that all the strings that hit the database will be in a
	27	* particular normalization form, and for the search and tags database
	28	* in case-folded form, so they can be naively compared within the
	29	* database code.
	30	*
	31	* As the code stands this guarantee is not well met!
	32	*/
	33
	34	#include <config.h>
	35	#include "types.h"
	36
	37	#include <string.h>
	38	#include <stdio.h> /* TODO */
	39
	40	#include "mem.h"
	41	#include "vector.h"
	42	#include "unicode.h"
	43	#include "unidata.h"
	44
	45	/** @defgroup utftransform Functions that transform between different Unicode encoding forms */
	46	/@{/
	47
	48	/** @brief Convert UTF-32 to UTF-8
	49	* @param s Source string
	50	* @param ns Length of source string in code points
	51	* @param ndp Where to store length of destination string (or NULL)
	52	* @return Newly allocated destination string or NULL on error
	53	*
	54	* If the UTF-32 is not valid then NULL is returned. A UTF-32 code
	55	* point is invalid if:
	56	* - it codes for a UTF-16 surrogate
	57	* - it codes for a value outside the unicode code space
	58	*
	59	* The return value is always 0-terminated. The value returned via @p
	60	* *ndp does not include the terminator.
	61	*/
	62	char utf32_to_utf8(const uint32_t s, size_t ns, size_t *ndp) {
	63	struct dynstr d;
	64	uint32_t c;
65
66	dynstr_init(&d);
67	while(ns > 0) {
68	c = *s++;
69	if(c < 0x80)
70	dynstr_append(&d, c);
71	else if(c < 0x0800) {
72	dynstr_append(&d, 0xC0 \| (c >> 6));
73	dynstr_append(&d, 0x80 \| (c & 0x3F));
74	} else if(c < 0x10000) {
75	if(c >= 0xDF800 && c <= 0xDFFF)
76	goto error;
77	dynstr_append(&d, 0xE0 \| (c >> 12));
78	dynstr_append(&d, 0x80 \| ((c >> 6) & 0x3F));
79	dynstr_append(&d, 0x80 \| (c & 0x3F));
80	} else if(c < 0x110000) {
81	dynstr_append(&d, 0xF0 \| (c >> 18));
82	dynstr_append(&d, 0x80 \| ((c >> 12) & 0x3F));
83	dynstr_append(&d, 0x80 \| ((c >> 6) & 0x3F));
84	dynstr_append(&d, 0x80 \| (c & 0x3F));
85	} else
86	goto error;
87	--ns;
88	}
89	dynstr_terminate(&d);
90	if(ndp)
91	*ndp = d.nvec;
92	return d.vec;
93	error:
94	xfree(d.vec);
95	return 0;
96	}
97
98	/** @brief Convert UTF-8 to UTF-32
99	* @param s Source string
100	* @param ns Length of source string in code points
101	* @param ndp Where to store length of destination string (or NULL)
102	* @return Newly allocated destination string or NULL
103	*
104	* The return value is always 0-terminated. The value returned via @p
105	* *ndp does not include the terminator.
106	*
107	* If the UTF-8 is not valid then NULL is returned. A UTF-8 sequence
108	* for a code point is invalid if:
109	* - it is not the shortest possible sequence for the code point
110	* - it codes for a UTF-16 surrogate
111	* - it codes for a value outside the unicode code space
112	*/
113	uint32_t utf8_to_utf32(const char s, size_t ns, size_t *ndp) {
114	struct dynstr_ucs4 d;
115	uint32_t c32, c;
116	const uint8_t ss = (const uint8_t )s;
117
118	dynstr_ucs4_init(&d);
119	while(ns > 0) {
120	c = *ss++;
121	--ns;
122	/*
123	* Acceptable UTF-8 is:
124	*
125	* 0xxxxxxx
126	* 7 data bits gives 0x00 - 0x7F and all are acceptable
127	*
128	* 110xxxxx 10xxxxxx
129	* 11 data bits gives 0x0000 - 0x07FF but only 0x0080 - 0x07FF acceptable
130	*
131	* 1110xxxx 10xxxxxx 10xxxxxx
132	* 16 data bits gives 0x0000 - 0xFFFF but only 0x0800 - 0xFFFF acceptable
133	* (and UTF-16 surrogates are not acceptable)
134	*
135	* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
136	* 21 data bits gives 0x00000000 - 0x001FFFFF
137	* but only 0x00010000 - 0x0010FFFF are acceptable
138	*
139	* It is NOT always the case that the data bits in the first byte
140	* are always non-0 for the acceptable values, so we do a separate
141	* check after decoding.
142	*/
143	if(c < 0x80)
144	c32 = c;
145	else if(c <= 0xDF) {
146	if(ns < 1) goto error;
147	c32 = c & 0x1F;
148	c = *ss++;
149	if((c & 0xC0) != 0x80) goto error;
150	c32 = (c32 << 6) \| (c & 0x3F);
151	if(c32 < 0x80) goto error;
152	} else if(c <= 0xEF) {
153	if(ns < 2) goto error;
154	c32 = c & 0x0F;
155	c = *ss++;
156	if((c & 0xC0) != 0x80) goto error;
157	c32 = (c32 << 6) \| (c & 0x3F);
158	c = *ss++;
159	if((c & 0xC0) != 0x80) goto error;
160	c32 = (c32 << 6) \| (c & 0x3F);
161	if(c32 < 0x0800 \|\| (c32 >= 0xD800 && c32 <= 0xDFFF)) goto error;
162	} else if(c <= 0xF7) {
163	if(ns < 3) goto error;
164	c32 = c & 0x07;
165	c = *ss++;
166	if((c & 0xC0) != 0x80) goto error;
167	c32 = (c32 << 6) \| (c & 0x3F);
168	c = *ss++;
169	if((c & 0xC0) != 0x80) goto error;
170	c32 = (c32 << 6) \| (c & 0x3F);
171	c = *ss++;
172	if((c & 0xC0) != 0x80) goto error;
173	c32 = (c32 << 6) \| (c & 0x3F);
174	if(c32 < 0x00010000 \|\| c32 > 0x0010FFFF) goto error;
175	} else
176	goto error;
177	dynstr_ucs4_append(&d, c32);
178	}
179	dynstr_ucs4_terminate(&d);
180	if(ndp)
181	*ndp = d.nvec;
182	return d.vec;
183	error:
184	xfree(d.vec);
185	return 0;
186	}
187
188	/@}/
189	/** @defgroup utf32 Functions that operate on UTF-32 strings */
190	/@{/
191
192	/** @brief Return the length of a 0-terminated UTF-32 string
193	* @param s Pointer to 0-terminated string
194	* @return Length of string in code points (excluding terminator)
195	*
196	* Unlike the conversion functions no validity checking is done on the
197	* string.
198	*/
199	size_t utf32_len(const uint32_t *s) {
200	const uint32_t *t = s;
201
202	while(*t)
203	++t;
204	return (size_t)(t - s);
205	}
206
207	/** @brief Return the combining class of @p c
208	* @param c Code point
209	* @return Combining class of @p c
210	*/
211	static inline int utf32__combining_class(uint32_t c) {
212	if(c < UNICODE_NCHARS)
213	return unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].ccc;
214	return 0;
215	}
216
217	/** @brief Stably sort [s,s+ns) into descending order of combining class
218	* @param s Start of array
219	* @param ns Number of elements, must be at least 1
220	* @param buffer Buffer of at least @p ns elements
221	*/
222	static void utf32__sort_ccc(uint32_t s, size_t ns, uint32_t buffer) {
223	uint32_t a, b, *bp;
224	size_t na, nb;
225
226	switch(ns) {
227	case 1: /* 1-element array is always sorted */
228	return;
229	case 2: /* 2-element arrays are trivial to sort */
230	if(utf32__combining_class(s[0]) > utf32__combining_class(s[1])) {
231	uint32_t tmp = s[0];
232	s[0] = s[1];
233	s[1] = tmp;
234	}
235	return;
236	default:
237	/* Partition the array */
238	na = ns / 2;
239	nb = ns - na;
240	a = s;
241	b = s + na;
242	/* Sort the two halves of the array */
243	utf32__sort_ccc(a, na, buffer);
244	utf32__sort_ccc(b, nb, buffer);
245	/* Merge them back into one, via the buffer */
246	bp = buffer;
247	while(na > 0 && nb > 0) {
248	/* We want descending order of combining class (hence <)
249	* and we want stability within combining classes (hence <=)
250	*/
251	if(utf32__combining_class(a) <= utf32__combining_class(b)) {
252	bp++ = a++;
253	--na;
254	} else {
255	bp++ = b++;
256	--nb;
257	}
258	}
259	while(na > 0) {
260	bp++ = a++;
261	--na;
262	}
263	while(nb > 0) {
264	bp++ = b++;
265	--nb;
266	}
267	memcpy(s, buffer, ns * sizeof(uint32_t));
268	return;
269	}
270	}
271
272	/** @brief Put combining characters into canonical order
273	* @param s Pointer to UTF-32 string
274	* @param ns Length of @p s
275	* @return 0 on success, -1 on error
276	*
277	* @p s is modified in-place. See Unicode 5.0 s3.11 for details of
278	* the ordering.
279	*
280	* Currently we only support a maximum of 1024 combining characters
281	* after each base character. If this limit is exceeded then -1 is
282	* returned.
283	*/
284	static int utf32__canonical_ordering(uint32_t *s, size_t ns) {
285	size_t nc;
286	uint32_t buffer[1024];
287
288	/* The ordering amounts to a stable sort of each contiguous group of
289	* characters with non-0 combining class. */
290	while(ns > 0) {
291	/* Skip non-combining characters */
292	if(utf32__combining_class(*s) == 0) {
293	++s;
294	--ns;
295	continue;
296	}
297	/* We must now have at least one combining character; see how many
298	* there are */
299	for(nc = 1; nc < ns && utf32__combining_class(s[nc]) != 0; ++nc)
300	;
301	if(nc > 1024)
302	return -1;
303	/* Sort the array */
304	utf32__sort_ccc(s, nc, buffer);
305	s += nc;
306	ns -= nc;
307	}
308	return 0;
309	}
310
311	/* Magic numbers from UAX #15 s16 */
312	#define SBase 0xAC00
313	#define LBase 0x1100
314	#define VBase 0x1161
315	#define TBase 0x11A7
316	#define LCount 19
317	#define VCount 21
318	#define TCount 28
319	#define NCount (VCount * TCount)
320	#define SCount (LCount * NCount)
321
322	/** @brief Guts of the decomposition lookup functions */
323	#define utf32__decompose_one_generic(WHICH) do { \
324	const uint32_t *dc = \
325	(c < UNICODE_NCHARS \
326	? unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].WHICH \
327	: 0); \
328	if(dc) { \
329	/* Found a canonical decomposition in the table */ \
330	while(*dc) \
331	utf32__decompose_one_##WHICH(d, *dc++); \
332	} else if(c >= SBase && c < SBase + SCount) { \
333	/* Mechanically decomposable Hangul syllable (UAX #15 s16) */ \
334	const uint32_t SIndex = c - SBase; \
335	const uint32_t L = LBase + SIndex / NCount; \
336	const uint32_t V = VBase + (SIndex % NCount) / TCount; \
337	const uint32_t T = TBase + SIndex % TCount; \
338	dynstr_ucs4_append(d, L); \
339	dynstr_ucs4_append(d, V); \
340	if(T != TBase) \
341	dynstr_ucs4_append(d, T); \
342	} else \
343	/* Equal to own canonical decomposition */ \
344	dynstr_ucs4_append(d, c); \
345	} while(0)
346
347	/** @brief Recursively compute the canonical decomposition of @p c
348	* @param d Dynamic string to store decomposition in
349	* @param c Code point to decompose (must be a valid!)
350	* @return 0 on success, -1 on error
351	*/
352	static void utf32__decompose_one_canon(struct dynstr_ucs4 *d, uint32_t c) {
353	utf32__decompose_one_generic(canon);
354	}
355
/** @brief Recursively compute the compatibility decomposition of @p c
 * @param d Dynamic string to append the decomposition to
 * @param c Code point to decompose (must be valid!)
 *
 * The decomposition is appended to @p d; nothing is returned.  The work is
 * done by utf32__decompose_one_generic(), which calls back into this function
 * for each code point of a table-supplied decomposition.
 */
static void utf32__decompose_one_compat(struct dynstr_ucs4 *d, uint32_t c) {
  utf32__decompose_one_generic(compat);
}
364
/** @brief Guts of the decomposition functions
 * @param WHICH Decomposition to use: @c canon or @c compat
 *
 * Expands to the entire body of a decomposition function and expects that
 * function's parameters to be in scope: @c s (pointer to the input code
 * points), @c ns (their count) and @c ndp (where to store the result length,
 * or a null pointer).
 *
 * Each input code point is validated and then decomposed with
 * utf32__decompose_one_WHICH(); the accumulated result is put into canonical
 * order, terminated and returned, with its length stored via @c ndp if that
 * is not null.
 *
 * The enclosing function returns a null pointer (after freeing the partial
 * result) if any input code point is a UTF-16 surrogate (U+D800..U+DFFF) or
 * lies above U+10FFFF, or if utf32__canonical_ordering() reports an error.
 */
#define utf32__decompose_generic(WHICH) do {                    \
  struct dynstr_ucs4 d;                                         \
  uint32_t c;                                                   \
                                                                \
  dynstr_ucs4_init(&d);                                         \
  while(ns) {                                                   \
    c = *s++;                                                   \
    /* Reject surrogates and anything outside the code space */ \
    if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF)            \
      goto error;                                               \
    utf32__decompose_one_##WHICH(&d, c);                        \
    --ns;                                                       \
  }                                                             \
  if(utf32__canonical_ordering(d.vec, d.nvec))                  \
    goto error;                                                 \
  dynstr_ucs4_terminate(&d);                                    \
  if(ndp)                                                       \
    *ndp = d.nvec;                                              \
  return d.vec;                                                 \
error:                                                          \
  xfree(d.vec);                                                 \
  return 0;                                                     \
} while(0)
388
389	/** @brief Canonically decompose @p [s,s+ns)
390	* @param s Pointer to string
391	* @param ns Length of string
392	* @param ndp Where to store length of result
393	* @return Pointer to result string, or NULL
394	*
395	* Computes the canonical decomposition of a string and stably sorts combining
396	* characters into canonical order. The result is in Normalization Form D and
397	* (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
398	* NormalizationTest.txt.
399	*
400	* Returns NULL if the string is not valid for either of the following
401	* reasons:
402	* - it codes for a UTF-16 surrogate
403	* - it codes for a value outside the unicode code space
404	*/
405	uint32_t utf32_decompose_canon(const uint32_t s, size_t ns, size_t *ndp) {
406	utf32__decompose_generic(canon);
407	}
408
409	/** @brief Compatibility decompose @p [s,s+ns)
410	* @param s Pointer to string
411	* @param ns Length of string
412	* @param ndp Where to store length of result
413	* @return Pointer to result string, or NULL
414	*
415	* Computes the compatibility decomposition of a string and stably sorts
416	* combining characters into canonical order. The result is in Normalization
417	* Form KD and (at the time of writing!) passes the NFKD tests defined in
418	* Unicode 5.0's NormalizationTest.txt.
419	*
420	* Returns NULL if the string is not valid for either of the following
421	* reasons:
422	* - it codes for a UTF-16 surrogate
423	* - it codes for a value outside the unicode code space
424	*/
425	uint32_t utf32_decompose_compat(const uint32_t s, size_t ns, size_t *ndp) {
426	utf32__decompose_generic(compat);
427	}
428
429	/** @brief Case-fold @p C
430	* @param D String to append to
431	* @param C Character to fold
432	*/
433	static inline void utf32__casefold_one_canon(struct dynstr_ucs4 *d, uint32_t c) {
434	const uint32_t *cf =
435	(c < UNICODE_NCHARS
436	? unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].casefold
437	: 0);
438	if(cf) {
439	/* Found a case-fold mapping in the table */
440	while(*cf)
441	utf32__decompose_one_canon(d, *cf++);
442	} else
443	utf32__decompose_one_canon(d, c);
444	}
445
446	/** @brief Case-fold @p [s,s+ns)
447	* @param s Pointer to string
448	* @param ns Length of string
449	* @param ndp Where to store length of result
450	* @return Pointer to result string, or NULL
451	*
452	* Case-fold the string at @p s according to full default case-folding rules
453	* (s3.13). The result will be in NFD.
454	*
455	* Returns NULL if the string is not valid for either of the following
456	* reasons:
457	* - it codes for a UTF-16 surrogate
458	* - it codes for a value outside the unicode code space
459	*/
460	uint32_t utf32_casefold_canon(const uint32_t s, size_t ns, size_t *ndp) {
461	struct dynstr_ucs4 d;
462	uint32_t c;
463	size_t n;
464	uint32_t *ss = 0;
465
466	/* If the canonical decomposition of the string includes any combining
467	* character that case-folds to a non-combining character then we must
468	* normalize before we fold. In Unicode 5.0.0 this means 0345 COMBINING
469	* GREEK YPOGEGRAMMENI in its decomposition and the various characters that
470	* canonically decompose to it. */
471	for(n = 0; n < ns; ++n) {
472	c = s[n];
473	if(c < UNICODE_NCHARS
474	&& (unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].flags
475	& unicode_normalize_before_casefold))
476	break;
477	}
478	if(n < ns) {
479	/* We need a preliminary decomposition */
480	if(!(ss = utf32_decompose_canon(s, ns, &ns)))
481	return 0;
482	s = ss;
483	}
484	dynstr_ucs4_init(&d);
485	while(ns) {
486	c = *s++;
487	if((c >= 0xDF800 && c <= 0xDFFF) \|\| c > 0x10FFFF)
488	goto error;
489	utf32__casefold_one_canon(&d, c);
490	--ns;
491	}
492	if(utf32__canonical_ordering(d.vec, d.nvec))
493	goto error;
494	dynstr_ucs4_terminate(&d);
495	if(ndp)
496	*ndp = d.nvec;
497	return d.vec;
498	error:
499	xfree(d.vec);
500	xfree(ss);
501	return 0;
502	}
503
/** @brief Order a pair of UTF-32 strings
 * @param a First 0-terminated string
 * @param b Second 0-terminated string
 * @return -1, 0 or 1 for a less than, equal to or greater than b
 *
 * Strings are compared code point by code point, by numeric value; a string
 * that is a proper prefix of another orders before it.
 *
 * "Comparable to strcmp() at its best."
 */
int utf32_cmp(const uint32_t *a, const uint32_t *b) {
  /* Skip the common prefix, stopping at the terminator of either string */
  while(*a && *b && *a == *b) {
    ++a;
    ++b;
  }
  /* Here *a and *b are the first differing code points, or both are 0 */
  return *a < *b ? -1 : (*a > *b ? 1 : 0);
}
518
/*@}*/
/** @defgroup utf8 Functions that operate on UTF-8 strings */
/*@{*/
522
/** @brief Wrapper to transform a UTF-8 string using the UTF-32 function
 * @param FN UTF-32 transformation function to apply
 *
 * Expands to the entire body of a UTF-8 transformation function and expects
 * that function's parameters @c s, @c ns and @c ndp to be in scope.
 *
 * The input is converted with utf8_to_utf32(), transformed with @c FN, and
 * the result converted back with utf32_to_utf8(), which is also given
 * @c ndp.  Both UTF-32 intermediates are freed on every path.  The enclosing
 * function returns a null pointer if any of the three steps fails.
 */
#define utf8__transform(FN) do {                                \
  uint32_t *to32 = 0, *decomp32 = 0;                            \
  size_t nto32, ndecomp32;                                      \
  char *decomp8 = 0;                                            \
                                                                \
  if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error;        \
  if(!(decomp32 = FN(to32, nto32, &ndecomp32))) goto error;     \
  decomp8 = utf32_to_utf8(decomp32, ndecomp32, ndp);            \
  /* Success falls through: the cleanup below is shared */      \
error:                                                          \
  xfree(to32);                                                  \
  xfree(decomp32);                                              \
  return decomp8;                                               \
} while(0)
537
538	/** @brief Canonically decompose @p [s,s+ns)
539	* @param s Pointer to string
540	* @param ns Length of string
541	* @param ndp Where to store length of result
542	* @return Pointer to result string, or NULL
543	*
544	* Computes the canonical decomposition of a string and stably sorts combining
545	* characters into canonical order. The result is in Normalization Form D and
546	* (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
547	* NormalizationTest.txt.
548	*
549	* Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
550	* this might be.
551	*
552	* See also utf32_decompose_canon().
553	*/
554	char utf8_decompose_canon(const char s, size_t ns, size_t *ndp) {
555	utf8__transform(utf32_decompose_canon);
556	}
557
558	/** @brief Compatibility decompose @p [s,s+ns)
559	* @param s Pointer to string
560	* @param ns Length of string
561	* @param ndp Where to store length of result
562	* @return Pointer to result string, or NULL
563	*
564	* Computes the compatibility decomposition of a string and stably sorts
565	* combining characters into canonical order. The result is in Normalization
566	* Form KD and (at the time of writing!) passes the NFKD tests defined in
567	* Unicode 5.0's NormalizationTest.txt.
568	*
569	* Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
570	* this might be.
571	*
572	* See also utf32_decompose_compat().
573	*/
574	char utf8_decompose_compat(const char s, size_t ns, size_t *ndp) {
575	utf8__transform(utf32_decompose_compat);
576	}
577
578	/** @brief Case-fold @p [s,s+ns)
579	* @param s Pointer to string
580	* @param ns Length of string
581	* @param ndp Where to store length of result
582	* @return Pointer to result string, or NULL
583	*
584	* Case-fold the string at @p s according to full default case-folding rules
585	* (s3.13). The result will be in NFD.
586	*
587	* Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
588	* this might be.
589	*/
590	char utf8_casefold_canon(const char s, size_t ns, size_t *ndp) {
591	utf8__transform(utf32_casefold_canon);
592	}
593
/** @brief Compatibility case-fold @p [s,s+ns)
 * @param s Pointer to string
 * @param ns Length of string
 * @param ndp Where to store length of result
 * @return Pointer to result string, or NULL
 *
 * Case-fold the string at @p s according to full default case-folding rules
 * (s3.13).  The result will be in NFKD.
 *
 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
 * this might be.
 *
 * NOTE(review): compiled out with @c \#if @c 0; presumably waiting for a
 * utf32_casefold_compat() to be written - confirm before enabling.
 */
#if 0
char *utf8_casefold_compat(const char *s, size_t ns, size_t *ndp) {
  utf8__transform(utf32_casefold_compat);
}
#endif
611
/*@}*/
613
614	/*
615	Local Variables:
616	c-basic-offset:2
617	comment-column:40
618	fill-column:79
619	indent-tabs-mode:nil
620	End:
621	*/