chiark - git - mdw - disorder/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* This file is part of DisOrder
	3	* Copyright (C) 2007 Richard Kettlewell
	4	*
	5	* This program is free software; you can redistribute it and/or modify
	6	* it under the terms of the GNU General Public License as published by
	7	* the Free Software Foundation; either version 2 of the License, or
	8	* (at your option) any later version.
	9	*
	10	* This program is distributed in the hope that it will be useful, but
	11	* WITHOUT ANY WARRANTY; without even the implied warranty of
	12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	* General Public License for more details.
	14	*
	15	* You should have received a copy of the GNU General Public License
	16	* along with this program; if not, write to the Free Software
	17	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
	18	* USA
	19	*/
	20	/** @file lib/unicode.c
	21	* @brief Unicode support functions
	22	*
	23	* Here by UTF-8 and UTF-32 we mean the encoding forms of those names (not the
	24	* encoding schemes). The primary encoding form is UTF-32 but convenience
	25	* wrappers using UTF-8 are provided for a number of functions.
	26	*
	27	* The idea is that all the strings that hit the database will be in a
	28	* particular normalization form, and for the search and tags database
	29	* in case-folded form, so they can be naively compared within the
	30	* database code.
	31	*
	32	* As the code stands this guarantee is not well met!
	33	*/
	34
	35	#include <config.h>
	36	#include "types.h"
	37
	38	#include <string.h>
	39	#include <stdio.h> /* TODO */
	40
	41	#include "mem.h"
	42	#include "vector.h"
	43	#include "unicode.h"
	44	#include "unidata.h"
	45
	46	/** @defgroup utftransform Functions that transform between different Unicode encoding forms */
	47	/*@{*/
	48
	49	/** @brief Convert UTF-32 to UTF-8
	50	* @param s Source string
	51	* @param ns Length of source string in code points
	52	* @param ndp Where to store length of destination string (or NULL)
	53	* @return Newly allocated destination string or NULL on error
	54	*
	55	* If the UTF-32 is not valid then NULL is returned. A UTF-32 code point is
	56	* invalid if:
	57	* - it codes for a UTF-16 surrogate
	58	* - it codes for a value outside the unicode code space
	59	*
	60	* The return value is always 0-terminated. The value returned via @p *ndp
	61	* does not include the terminator.
	62	*/
	63	char utf32_to_utf8(const uint32_t s, size_t ns, size_t *ndp) {
	64	struct dynstr d;
	65	uint32_t c;
	66
	67	dynstr_init(&d);
	68	while(ns > 0) {
	69	c = *s++;
	70	if(c < 0x80)
	71	dynstr_append(&d, c);
	72	else if(c < 0x0800) {
	73	dynstr_append(&d, 0xC0 \| (c >> 6));
	74	dynstr_append(&d, 0x80 \| (c & 0x3F));
	75	} else if(c < 0x10000) {
	76	if(c >= 0xD800 && c <= 0xDFFF)
	77	goto error;
	78	dynstr_append(&d, 0xE0 \| (c >> 12));
	79	dynstr_append(&d, 0x80 \| ((c >> 6) & 0x3F));
	80	dynstr_append(&d, 0x80 \| (c & 0x3F));
	81	} else if(c < 0x110000) {
	82	dynstr_append(&d, 0xF0 \| (c >> 18));
	83	dynstr_append(&d, 0x80 \| ((c >> 12) & 0x3F));
	84	dynstr_append(&d, 0x80 \| ((c >> 6) & 0x3F));
	85	dynstr_append(&d, 0x80 \| (c & 0x3F));
	86	} else
	87	goto error;
	88	--ns;
	89	}
	90	dynstr_terminate(&d);
	91	if(ndp)
	92	*ndp = d.nvec;
	93	return d.vec;
	94	error:
	95	xfree(d.vec);
	96	return 0;
	97	}
	98
	99	/** @brief Convert UTF-8 to UTF-32
	100	* @param s Source string
	101	* @param ns Length of source string in code points
	102	* @param ndp Where to store length of destination string (or NULL)
	103	* @return Newly allocated destination string or NULL
	104	*
	105	* The return value is always 0-terminated. The value returned via @p *ndp
	106	* does not include the terminator.
	107	*
	108	* If the UTF-8 is not valid then NULL is returned. A UTF-8 sequence
	109	* for a code point is invalid if:
	110	* - it is not the shortest possible sequence for the code point
	111	* - it codes for a UTF-16 surrogate
	112	* - it codes for a value outside the unicode code space
	113	*/
	114	uint32_t utf8_to_utf32(const char s, size_t ns, size_t *ndp) {
	115	struct dynstr_ucs4 d;
	116	uint32_t c32, c;
	117	const uint8_t ss = (const uint8_t )s;
	118
	119	dynstr_ucs4_init(&d);
	120	while(ns > 0) {
	121	c = *ss++;
	122	--ns;
	123	/* Acceptable UTF-8 is that which codes for Unicode Scalar Values
	124	* (Unicode 5.0.0 s3.9 D76)
	125	*
	126	* 0xxxxxxx
	127	* 7 data bits gives 0x00 - 0x7F and all are acceptable
	128	*
	129	* 110xxxxx 10xxxxxx
	130	* 11 data bits gives 0x0000 - 0x07FF but only 0x0080 - 0x07FF acceptable
	131	*
	132	* 1110xxxx 10xxxxxx 10xxxxxx
	133	* 16 data bits gives 0x0000 - 0xFFFF but only 0x0800 - 0xFFFF acceptable
	134	* (and UTF-16 surrogates are not acceptable)
	135	*
	136	* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	137	* 21 data bits gives 0x00000000 - 0x001FFFFF
	138	* but only 0x00010000 - 0x0010FFFF are acceptable
	139	*
	140	* It is NOT always the case that the data bits in the first byte are
	141	* always non-0 for the acceptable values, so we do a separate check after
	142	* decoding.
	143	*/
	144	if(c < 0x80)
	145	c32 = c;
	146	else if(c <= 0xDF) {
	147	if(ns < 1) goto error;
	148	c32 = c & 0x1F;
	149	c = *ss++;
	150	if((c & 0xC0) != 0x80) goto error;
	151	c32 = (c32 << 6) \| (c & 0x3F);
	152	if(c32 < 0x80) goto error;
	153	} else if(c <= 0xEF) {
	154	if(ns < 2) goto error;
	155	c32 = c & 0x0F;
	156	c = *ss++;
	157	if((c & 0xC0) != 0x80) goto error;
	158	c32 = (c32 << 6) \| (c & 0x3F);
	159	c = *ss++;
	160	if((c & 0xC0) != 0x80) goto error;
	161	c32 = (c32 << 6) \| (c & 0x3F);
	162	if(c32 < 0x0800 \|\| (c32 >= 0xD800 && c32 <= 0xDFFF)) goto error;
	163	} else if(c <= 0xF7) {
	164	if(ns < 3) goto error;
	165	c32 = c & 0x07;
	166	c = *ss++;
	167	if((c & 0xC0) != 0x80) goto error;
	168	c32 = (c32 << 6) \| (c & 0x3F);
	169	c = *ss++;
	170	if((c & 0xC0) != 0x80) goto error;
	171	c32 = (c32 << 6) \| (c & 0x3F);
	172	c = *ss++;
	173	if((c & 0xC0) != 0x80) goto error;
	174	c32 = (c32 << 6) \| (c & 0x3F);
	175	if(c32 < 0x00010000 \|\| c32 > 0x0010FFFF) goto error;
	176	} else
	177	goto error;
	178	dynstr_ucs4_append(&d, c32);
	179	}
	180	dynstr_ucs4_terminate(&d);
	181	if(ndp)
	182	*ndp = d.nvec;
	183	return d.vec;
	184	error:
	185	xfree(d.vec);
	186	return 0;
	187	}
	188
	189	/*@}*/
	190	/** @defgroup utf32 Functions that operate on UTF-32 strings */
	191	/*@{*/
	192
	/** @brief Return the length of a 0-terminated UTF-32 string
	 * @param s Pointer to 0-terminated string
	 * @return Length of string in code points (excluding terminator)
	 *
	 * Unlike the conversion functions no validity checking is done on the string.
	 */
	size_t utf32_len(const uint32_t *s) {
	  const uint32_t *end;

	  /* Walk forward to the terminating 0 */
	  for(end = s; *end; ++end)
	    ;
	  return (size_t)(end - s);
	}
	206
	207	/** @brief Return the @ref unidata structure for code point @p c
	208	*
	209	* @p c can be any 32-bit value, a sensible value will be returned regardless.
	210	*/
	211	static const struct unidata *utf32__unidata(uint32_t c) {
	212	if(c < UNICODE_NCHARS)
	213	return &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
	214	else if((c >= 0xF0000 && c <= 0xFFFFD)
	215	\|\| (c >= 0x100000 && c <= 0x10FFFD))
	216	return utf32__unidata(0xE000); /* Co */
	217	else
	218	return utf32__unidata(0xFFFF); /* Cn */
	219	}
	220
	221	/** @brief Return the combining class of @p c
	222	* @param c Code point
	223	* @return Combining class of @p c
	224	*/
	225	static inline int utf32__combining_class(uint32_t c) {
	226	return utf32__unidata(c)->ccc;
	227	}
	228
	/** @brief Stably sort [s,s+ns) into ascending order of combining class
	 * @param s Start of array
	 * @param ns Number of elements
	 * @param buffer Scratch buffer of at least @p ns elements
	 *
	 * This is a merge sort.  Elements with equal combining class keep their
	 * relative order.  An empty array is accepted and left alone.
	 */
	static void utf32__sort_ccc(uint32_t *s, size_t ns, uint32_t *buffer) {
	  uint32_t *a, *b, *bp;
	  size_t na, nb;

	  switch(ns) {
	  case 0:                               /* nothing to sort */
	  case 1:                               /* 1-element array is always sorted */
	    return;
	  case 2:                               /* 2-element arrays are trivial to sort */
	    if(utf32__combining_class(s[0]) > utf32__combining_class(s[1])) {
	      uint32_t tmp = s[0];
	      s[0] = s[1];
	      s[1] = tmp;
	    }
	    return;
	  default:
	    /* Partition the array */
	    na = ns / 2;
	    nb = ns - na;
	    a = s;
	    b = s + na;
	    /* Sort the two halves of the array */
	    utf32__sort_ccc(a, na, buffer);
	    utf32__sort_ccc(b, nb, buffer);
	    /* Merge them back into one, via the buffer */
	    bp = buffer;
	    while(na > 0 && nb > 0) {
	      /* We want ascending order of combining class, and on a tie we take from
	       * the first half to keep the sort stable (hence <=) */
	      if(utf32__combining_class(*a) <= utf32__combining_class(*b)) {
	        *bp++ = *a++;
	        --na;
	      } else {
	        *bp++ = *b++;
	        --nb;
	      }
	    }
	    /* Whichever half is left over goes on the end */
	    while(na > 0) {
	      *bp++ = *a++;
	      --na;
	    }
	    while(nb > 0) {
	      *bp++ = *b++;
	      --nb;
	    }
	    memcpy(s, buffer, ns * sizeof(uint32_t));
	    return;
	  }
	}
	283
	/** @brief Put combining characters into canonical order
	 * @param s Pointer to UTF-32 string
	 * @param ns Length of @p s
	 * @return 0 on success, -1 on error
	 *
	 * @p s is modified in-place.  See Unicode 5.0 s3.11 for details of the
	 * ordering.
	 *
	 * Currently we only support a maximum of 1024 combining characters after each
	 * base character.  If this limit is exceeded then -1 is returned.
	 */
	static int utf32__canonical_ordering(uint32_t *s, size_t ns) {
	  size_t nc;
	  uint32_t buffer[1024];
	  /* The limit on a run of combining characters is the scratch buffer size */
	  const size_t max_run = sizeof buffer / sizeof buffer[0];

	  /* The ordering amounts to a stable sort of each contiguous group of
	   * characters with non-0 combining class. */
	  while(ns > 0) {
	    /* Skip non-combining characters */
	    if(utf32__combining_class(*s) == 0) {
	      ++s;
	      --ns;
	      continue;
	    }
	    /* We must now have at least one combining character; see how many
	     * there are */
	    for(nc = 1; nc < ns && utf32__combining_class(s[nc]) != 0; ++nc)
	      ;
	    if(nc > max_run)
	      return -1;
	    /* Sort the array */
	    utf32__sort_ccc(s, nc, buffer);
	    s += nc;
	    ns -= nc;
	  }
	  return 0;
	}
	321
	/* Magic numbers from UAX #15 s16, used for the arithmetic decomposition of
	 * Hangul syllables: the *Base values are first code points and the *Count
	 * values are range sizes. */
	#define SBase 0xAC00
	#define LBase 0x1100
	#define VBase 0x1161
	#define TBase 0x11A7
	#define LCount 19
	#define VCount 21
	#define TCount 28
	#define NCount (VCount * TCount)
	#define SCount (LCount * NCount)
	332
	/** @brief Guts of the decomposition lookup functions
	 * @param WHICH Field of @ref unidata to consult (@c canon or @c compat)
	 *
	 * Expects @c d (dynamic string pointer) and @c c (code point) to be in scope.
	 * Table decompositions are expanded recursively via
	 * utf32__decompose_one_WHICH().
	 */
	#define utf32__decompose_one_generic(WHICH) do { \
	  const uint32_t *dc = utf32__unidata(c)->WHICH; \
	  if(dc) { \
	    /* Found a decomposition in the table */ \
	    while(*dc) \
	      utf32__decompose_one_##WHICH(d, *dc++); \
	  } else if(c >= SBase && c < SBase + SCount) { \
	    /* Mechanically decomposable Hangul syllable (UAX #15 s16) */ \
	    const uint32_t SIndex = c - SBase; \
	    const uint32_t L = LBase + SIndex / NCount; \
	    const uint32_t V = VBase + (SIndex % NCount) / TCount; \
	    const uint32_t T = TBase + SIndex % TCount; \
	    dynstr_ucs4_append(d, L); \
	    dynstr_ucs4_append(d, V); \
	    /* T == TBase means there is no trailing consonant */ \
	    if(T != TBase) \
	      dynstr_ucs4_append(d, T); \
	  } else \
	    /* Equal to own decomposition */ \
	    dynstr_ucs4_append(d, c); \
	} while(0)
	354
	355	/** @brief Recursively compute the canonical decomposition of @p c
	356	* @param d Dynamic string to store decomposition in
	357	* @param c Code point to decompose (must be a valid!)
	358	* @return 0 on success, -1 on error
	359	*/
	360	static void utf32__decompose_one_canon(struct dynstr_ucs4 *d, uint32_t c) {
	361	utf32__decompose_one_generic(canon);
	362	}
	363
	364	/** @brief Recursively compute the compatibility decomposition of @p c
	365	* @param d Dynamic string to store decomposition in
	366	* @param c Code point to decompose (must be a valid!)
	367	* @return 0 on success, -1 on error
	368	*/
	369	static void utf32__decompose_one_compat(struct dynstr_ucs4 *d, uint32_t c) {
	370	utf32__decompose_one_generic(compat);
	371	}
	372
	/** @brief Guts of the decomposition functions
	 * @param WHICH Kind of decomposition (@c canon or @c compat)
	 *
	 * Forms the whole body of a function with parameters @c s, @c ns and @c ndp:
	 * it decomposes each code point, canonically orders the result and returns
	 * it, or returns 0 on invalid input.
	 */
	#define utf32__decompose_generic(WHICH) do { \
	  struct dynstr_ucs4 d; \
	  uint32_t c; \
	  \
	  dynstr_ucs4_init(&d); \
	  while(ns) { \
	    c = *s++; \
	    /* Surrogates and values beyond U+10FFFF are not valid */ \
	    if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF) \
	      goto error; \
	    utf32__decompose_one_##WHICH(&d, c); \
	    --ns; \
	  } \
	  if(utf32__canonical_ordering(d.vec, d.nvec)) \
	    goto error; \
	  dynstr_ucs4_terminate(&d); \
	  if(ndp) \
	    *ndp = d.nvec; \
	  return d.vec; \
	error: \
	  xfree(d.vec); \
	  return 0; \
	} while(0)
	396
	397	/** @brief Canonically decompose @p [s,s+ns)
	398	* @param s Pointer to string
	399	* @param ns Length of string
	400	* @param ndp Where to store length of result
	401	* @return Pointer to result string, or NULL
	402	*
	403	* Computes the canonical decomposition of a string and stably sorts combining
	404	* characters into canonical order. The result is in Normalization Form D and
	405	* (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
	406	* NormalizationTest.txt.
	407	*
	408	* Returns NULL if the string is not valid for either of the following reasons:
	409	* - it codes for a UTF-16 surrogate
	410	* - it codes for a value outside the unicode code space
	411	*/
	412	uint32_t utf32_decompose_canon(const uint32_t s, size_t ns, size_t *ndp) {
	413	utf32__decompose_generic(canon);
	414	}
	415
	416	/** @brief Compatibility decompose @p [s,s+ns)
	417	* @param s Pointer to string
	418	* @param ns Length of string
	419	* @param ndp Where to store length of result
	420	* @return Pointer to result string, or NULL
	421	*
	422	* Computes the compatibility decomposition of a string and stably sorts
	423	* combining characters into canonical order. The result is in Normalization
	424	* Form KD and (at the time of writing!) passes the NFKD tests defined in
	425	* Unicode 5.0's NormalizationTest.txt.
	426	*
	427	* Returns NULL if the string is not valid for either of the following reasons:
	428	* - it codes for a UTF-16 surrogate
	429	* - it codes for a value outside the unicode code space
	430	*/
	431	uint32_t utf32_decompose_compat(const uint32_t s, size_t ns, size_t *ndp) {
	432	utf32__decompose_generic(compat);
	433	}
	434
	/** @brief Single-character case-fold and decompose operation
	 * @param WHICH Kind of decomposition to apply (@c canon or @c compat)
	 *
	 * Expects @c d (a dynamic string, not a pointer) and @c c (code point) to be
	 * in scope.  If @c c has a case-fold mapping then each code point of the
	 * mapping is decomposed into @c d, otherwise @c c itself is.
	 */
	#define utf32__casefold_one(WHICH) do { \
	  const uint32_t *cf = utf32__unidata(c)->casefold; \
	  if(cf) { \
	    /* Found a case-fold mapping in the table */ \
	    while(*cf) \
	      utf32__decompose_one_##WHICH(&d, *cf++); \
	  } else \
	    utf32__decompose_one_##WHICH(&d, c); \
	} while(0)
	445
	446	/** @brief Case-fold @p [s,s+ns)
	447	* @param s Pointer to string
	448	* @param ns Length of string
	449	* @param ndp Where to store length of result
	450	* @return Pointer to result string, or NULL
	451	*
	452	* Case-fold the string at @p s according to full default case-folding rules
	453	* (s3.13) for caseless matching. The result will be in NFD.
	454	*
	455	* Returns NULL if the string is not valid for either of the following reasons:
	456	* - it codes for a UTF-16 surrogate
	457	* - it codes for a value outside the unicode code space
	458	*/
	459	uint32_t utf32_casefold_canon(const uint32_t s, size_t ns, size_t *ndp) {
	460	struct dynstr_ucs4 d;
	461	uint32_t c;
	462	size_t n;
	463	uint32_t *ss = 0;
	464
	465	/* If the canonical decomposition of the string includes any combining
	466	* character that case-folds to a non-combining character then we must
	467	* normalize before we fold. In Unicode 5.0.0 this means 0345 COMBINING
	468	* GREEK YPOGEGRAMMENI in its decomposition and the various characters that
	469	* canonically decompose to it. */
	470	for(n = 0; n < ns; ++n)
	471	if(utf32__unidata(s[n])->flags & unicode_normalize_before_casefold)
	472	break;
	473	if(n < ns) {
	474	/* We need a preliminary decomposition */
	475	if(!(ss = utf32_decompose_canon(s, ns, &ns)))
	476	return 0;
	477	s = ss;
	478	}
	479	dynstr_ucs4_init(&d);
	480	while(ns) {
	481	c = *s++;
	482	if((c >= 0xD800 && c <= 0xDFFF) \|\| c > 0x10FFFF)
	483	goto error;
	484	utf32__casefold_one(canon);
	485	--ns;
	486	}
	487	if(utf32__canonical_ordering(d.vec, d.nvec))
	488	goto error;
	489	dynstr_ucs4_terminate(&d);
	490	if(ndp)
	491	*ndp = d.nvec;
	492	return d.vec;
	493	error:
	494	xfree(d.vec);
	495	xfree(ss);
	496	return 0;
	497	}
	498
	499	/** @brief Compatibilit case-fold @p [s,s+ns)
	500	* @param s Pointer to string
	501	* @param ns Length of string
	502	* @param ndp Where to store length of result
	503	* @return Pointer to result string, or NULL
	504	*
	505	* Case-fold the string at @p s according to full default case-folding rules
	506	* (s3.13) for compatibility caseless matching. The result will be in NFKD.
	507	*
	508	* Returns NULL if the string is not valid for either of the following reasons:
	509	* - it codes for a UTF-16 surrogate
	510	* - it codes for a value outside the unicode code space
	511	*/
	512	uint32_t utf32_casefold_compat(const uint32_t s, size_t ns, size_t *ndp) {
	513	struct dynstr_ucs4 d;
	514	uint32_t c;
	515	size_t n;
	516	uint32_t *ss = 0;
	517
	518	for(n = 0; n < ns; ++n)
	519	if(utf32__unidata(s[n])->flags & unicode_normalize_before_casefold)
	520	break;
	521	if(n < ns) {
	522	/* We need a preliminary _canonical_ decomposition */
	523	if(!(ss = utf32_decompose_canon(s, ns, &ns)))
	524	return 0;
	525	s = ss;
	526	}
	527	/* This computes NFKD(toCaseFold(s)) */
	528	#define compat_casefold_middle() do { \
	529	dynstr_ucs4_init(&d); \
	530	while(ns) { \
	531	c = *s++; \
	532	if((c >= 0xD800 && c <= 0xDFFF) \|\| c > 0x10FFFF) \
	533	goto error; \
	534	utf32__casefold_one(compat); \
	535	--ns; \
	536	} \
	537	if(utf32__canonical_ordering(d.vec, d.nvec)) \
	538	goto error; \
	539	} while(0)
	540	/* Do the inner (NFKD o toCaseFold) */
	541	compat_casefold_middle();
	542	/* We can do away with the NFD'd copy of the input now */
	543	xfree(ss);
	544	s = ss = d.vec;
	545	ns = d.nvec;
	546	/* Do the outer (NFKD o toCaseFold) */
	547	compat_casefold_middle();
	548	/* That's all */
	549	dynstr_ucs4_terminate(&d);
	550	if(ndp)
	551	*ndp = d.nvec;
	552	return d.vec;
	553	error:
	554	xfree(d.vec);
	555	xfree(ss);
	556	return 0;
	557	}
	558
	/** @brief Order a pair of UTF-32 strings
	 * @param a First 0-terminated string
	 * @param b Second 0-terminated string
	 * @return -1, 0 or 1 for a less than, equal to or greater than b
	 *
	 * Comparison is by code point value; a string that is a proper prefix of
	 * another orders before it.
	 *
	 * "Comparable to strcmp() at its best."
	 */
	int utf32_cmp(const uint32_t *a, const uint32_t *b) {
	  /* Skip the common prefix */
	  while(*a && *b && *a == *b) {
	    ++a;
	    ++b;
	  }
	  /* Either we hit a difference or at least one string ended */
	  return *a < *b ? -1 : (*a > *b ? 1 : 0);
	}
	573
	574	/** @brief Return the General_Category value for @p c
	575	* @param Code point
	576	* @return General_Category property value
	577	*/
	578	static inline enum unicode_General_Category utf32__general_category(uint32_t c) {
	579	return utf32__unidata(c)->general_category;
	580	}
	581
	582	/** @brief Check Grapheme_Cluster_Break property
	583	* @param c Code point
	584	* @return 0 if it is as described, 1 otherwise
	585	*/
	586	static int utf32__is_control_or_cr_or_lf(uint32_t c) {
	587	switch(utf32__general_category(c)) {
	588	default:
	589	return 0;
	590	case unicode_General_Category_Zl:
	591	case unicode_General_Category_Zp:
	592	case unicode_General_Category_Cc:
	593	return 1;
	594	case unicode_General_Category_Cf:
	595	if(c == 0x200C \|\| c == 0x200D)
	596	return 0;
	597	return 1;
	598	}
	599	}
	600
	/* Values returned by utf32__hangul_syllable_type().  Each non-NA value is a
	 * representative code point of the class it names. */
	#define Hangul_Syllable_Type_NA 0
	#define Hangul_Syllable_Type_L 0x1100
	#define Hangul_Syllable_Type_V 0x1160
	#define Hangul_Syllable_Type_T 0x11A8
	#define Hangul_Syllable_Type_LV 0xAC00
	#define Hangul_Syllable_Type_LVT 0xAC01

	/** @brief Determine Hangul_Syllable_Type of @p c
	 * @param c Code point
	 * @return Equivalence class of @p c, or Hangul_Syllable_Type_NA
	 *
	 * If this is a Hangul character then a representative member of its
	 * equivalence class is returned.  Otherwise Hangul_Syllable_Type_NA is
	 * returned.
	 */
	static uint32_t utf32__hangul_syllable_type(uint32_t c) {
	  /* Dispose of the bulk of the non-Hangul code points first */
	  if(c < 0x1100) return Hangul_Syllable_Type_NA;
	  if(c > 0x1200 && c < 0xAC00) return Hangul_Syllable_Type_NA;
	  if(c >= 0xD800) return Hangul_Syllable_Type_NA;
	  /* Now we pick out the assigned Hangul code points */
	  if((c >= 0x1100 && c <= 0x1159) || c == 0x115F) return Hangul_Syllable_Type_L;
	  if(c >= 0x1160 && c <= 0x11A2) return Hangul_Syllable_Type_V;
	  if(c >= 0x11A8 && c <= 0x11F9) return Hangul_Syllable_Type_T;
	  if(c >= 0xAC00 && c <= 0xD7A3) {
	    /* 0xAC00 % 28 == 16, so this picks every 28th syllable counting from
	     * U+AC00; those are the ones with no trailing consonant */
	    if(c % 28 == 16)
	      return Hangul_Syllable_Type_LV;
	    else
	      return Hangul_Syllable_Type_LVT;
	  }
	  return Hangul_Syllable_Type_NA;
	}
	633
	634	/** @brief Determine Word_Break property
	635	* @param c Code point
	636	* @return Word_Break property value of @p c
	637	*/
	638	static enum unicode_Word_Break utf32__word_break(uint32_t c) {
	639	if(c < 0xAC00 \|\| c > 0xD7A3)
	640	return utf32__unidata(c)->word_break;
	641	else
	642	return unicode_Word_Break_ALetter;
	643	}
	644
	645	/** @brief Identify a grapheme cluster boundary
	646	* @param s Start of string (must be NFD)
	647	* @param ns Length of string
	648	* @param n Index within string (in [0,ns].)
	649	* @return 1 at a grapheme cluster boundary, 0 otherwise
	650	*
	651	* This function identifies default grapheme cluster boundaries as described in
	652	* UAX #29 s3. It returns 1 if @p n points at the code point just after a
	653	* grapheme cluster boundary (including the hypothetical code point just after
	654	* the end of the string).
	655	*/
	656	int utf32_is_gcb(const uint32_t *s, size_t ns, size_t n) {
	657	uint32_t before, after;
	658	uint32_t hbefore, hafter;
	659	/* GB1 and GB2 */
	660	if(n == 0 \|\| n == ns)
	661	return 1;
	662	/* Now we know that s[n-1] and s[n] are safe to inspect */
	663	/* GB3 */
	664	before = s[n-1];
	665	after = s[n];
	666	if(before == 0x000D && after == 0x000A)
	667	return 0;
	668	/* GB4 and GB5 */
	669	if(utf32__is_control_or_cr_or_lf(before)
	670	\|\| utf32__is_control_or_cr_or_lf(after))
	671	return 1;
	672	hbefore = utf32__hangul_syllable_type(before);
	673	hafter = utf32__hangul_syllable_type(after);
	674	/* GB6 */
	675	if(hbefore == Hangul_Syllable_Type_L
	676	&& (hafter == Hangul_Syllable_Type_L
	677	\|\| hafter == Hangul_Syllable_Type_V
	678	\|\| hafter == Hangul_Syllable_Type_LV
	679	\|\| hafter == Hangul_Syllable_Type_LVT))
	680	return 0;
	681	/* GB7 */
	682	if((hbefore == Hangul_Syllable_Type_LV
	683	\|\| hbefore == Hangul_Syllable_Type_V)
	684	&& (hafter == Hangul_Syllable_Type_V
	685	\|\| hafter == Hangul_Syllable_Type_T))
	686	return 0;
	687	/* GB8 */
	688	if((hbefore == Hangul_Syllable_Type_LVT
	689	\|\| hbefore == Hangul_Syllable_Type_T)
	690	&& hafter == Hangul_Syllable_Type_T)
	691	return 0;
	692	/* GB9 */
	693	if(utf32__word_break(after) == unicode_Word_Break_Extend)
	694	return 0;
	695	/* GB10 */
	696	return 1;
	697	}
	698
	699	/** @brief Return true if @p c is ignorable for boundary specifications */
	700	static inline int utf32__boundary_ignorable(enum unicode_Word_Break wb) {
	701	return (wb == unicode_Word_Break_Extend
	702	\|\| wb == unicode_Word_Break_Format);
	703	}
	704
	705	/** @brief Identify a word boundary
	706	* @param s Start of string (must be NFD)
	707	* @param ns Length of string
	708	* @param n Index within string (in [0,ns].)
	709	* @return 1 at a word boundary, 0 otherwise
	710	*
	711	* This function identifies default word boundaries as described in UAX #29 s4.
	712	* It returns 1 if @p n points at the code point just after a word boundary
	713	* (including the hypothetical code point just after the end of the string).
	714	*/
	715	int utf32_is_word_boundary(const uint32_t *s, size_t ns, size_t n) {
	716	enum unicode_Word_Break twobefore, before, after, twoafter;
	717	size_t nn;
	718
	719	/* WB1 and WB2 */
	720	if(n == 0 \|\| n == ns)
	721	return 1;
	722	/* WB3 */
	723	if(s[n-1] == 0x000D && s[n] == 0x000A)
	724	return 0;
	725	/* WB4 */
	726	/* (!Sep) x (Extend\|Format) as in UAX #29 s6.2 */
	727	switch(s[n-1]) { /* bit of a bodge */
	728	case 0x000A:
	729	case 0x000D:
	730	case 0x0085:
	731	case 0x2028:
	732	case 0x2029:
	733	break;
	734	default:
	735	if(utf32__boundary_ignorable(utf32__word_break(s[n])))
	736	return 0;
	737	break;
	738	}
	739	/* Gather the property values we'll need for the rest of the test taking the
	740	* s6.2 changes into account */
	741	/* First we look at the code points after the proposed boundary */
	742	nn = n; /* <ns */
	743	after = utf32__word_break(s[nn++]);
	744	if(!utf32__boundary_ignorable(after)) {
	745	/* X (Extend\|Format)* -> X */
	746	while(nn < ns && utf32__boundary_ignorable(utf32__word_break(s[nn])))
	747	++nn;
	748	}
	749	/* It's possible now that nn=ns */
	750	if(nn < ns)
	751	twoafter = utf32__word_break(s[nn]);
	752	else
	753	twoafter = unicode_Word_Break_Other;
	754
	755	/* Next we look at the code points before the proposed boundary. This is a
	756	* bit fiddlier. */
	757	nn = n;
	758	while(nn > 0 && utf32__boundary_ignorable(utf32__word_break(s[nn - 1])))
	759	--nn;
	760	if(nn == 0) {
	761	/* s[nn] must be ignorable */
	762	before = utf32__word_break(s[nn]);
	763	twobefore = unicode_Word_Break_Other;
	764	} else {
	765	/* s[nn] is ignorable or after the proposed boundary; but s[nn-1] is not
	766	* ignorable. */
	767	before = utf32__word_break(s[nn - 1]);
	768	--nn;
	769	/* Repeat the exercise */
	770	while(nn > 0 && utf32__boundary_ignorable(utf32__word_break(s[nn - 1])))
	771	--nn;
	772	if(nn == 0)
	773	twobefore = utf32__word_break(s[nn]);
	774	else
	775	twobefore = utf32__word_break(s[nn - 1]);
	776	}
	777
	778	/* WB5 */
	779	if(before == unicode_Word_Break_ALetter
	780	&& after == unicode_Word_Break_ALetter)
	781	return 0;
	782	/* WB6 */
	783	if(before == unicode_Word_Break_ALetter
	784	&& after == unicode_Word_Break_MidLetter
	785	&& twoafter == unicode_Word_Break_ALetter)
	786	return 0;
	787	/* WB7 */
	788	if(twobefore == unicode_Word_Break_ALetter
	789	&& before == unicode_Word_Break_MidLetter
	790	&& after == unicode_Word_Break_ALetter)
	791	return 0;
	792	/* WB8 */
	793	if(before == unicode_Word_Break_Numeric
	794	&& after == unicode_Word_Break_Numeric)
	795	return 0;
	796	/* WB9 */
	797	if(before == unicode_Word_Break_ALetter
	798	&& after == unicode_Word_Break_Numeric)
	799	return 0;
	800	/* WB10 */
	801	if(before == unicode_Word_Break_Numeric
	802	&& after == unicode_Word_Break_ALetter)
	803	return 0;
	804	/* WB11 */
	805	if(twobefore == unicode_Word_Break_Numeric
	806	&& before == unicode_Word_Break_MidNum
	807	&& after == unicode_Word_Break_Numeric)
	808	return 0;
	809	/* WB12 */
	810	if(before == unicode_Word_Break_Numeric
	811	&& after == unicode_Word_Break_MidNum
	812	&& twoafter == unicode_Word_Break_Numeric)
	813	return 0;
	814	/* WB13 */
	815	if(before == unicode_Word_Break_Katakana
	816	&& after == unicode_Word_Break_Katakana)
	817	return 0;
	818	/* WB13a */
	819	if((before == unicode_Word_Break_ALetter
	820	\|\| before == unicode_Word_Break_Numeric
	821	\|\| before == unicode_Word_Break_Katakana
	822	\|\| before == unicode_Word_Break_ExtendNumLet)
	823	&& after == unicode_Word_Break_ExtendNumLet)
	824	return 0;
	825	/* WB13b */
	826	if(before == unicode_Word_Break_ExtendNumLet
	827	&& (after == unicode_Word_Break_ALetter
	828	\|\| after == unicode_Word_Break_Numeric
	829	\|\| after == unicode_Word_Break_Katakana))
	830	return 0;
	831	/* WB14 */
	832	return 1;
	833	}
	834
	835	/*@}*/
	836	/** @defgroup utf8 Functions that operate on UTF-8 strings */
	837	/*@{*/
	838
	/** @brief Wrapper to transform a UTF-8 string using the UTF-32 function
	 * @param FN UTF-32 transformation function to apply
	 *
	 * Forms the whole body of a function with parameters @c s, @c ns and @c ndp.
	 * The input is converted to UTF-32, passed through @p FN and converted back;
	 * both intermediate strings are released before returning.  If any step
	 * fails then 0 is returned.
	 */
	#define utf8__transform(FN) do { \
	  uint32_t *to32 = 0, *decomp32 = 0; \
	  size_t nto32, ndecomp32; \
	  char *decomp8 = 0; \
	  \
	  if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error; \
	  if(!(decomp32 = FN(to32, nto32, &ndecomp32))) goto error; \
	  decomp8 = utf32_to_utf8(decomp32, ndecomp32, ndp); \
	error: \
	  /* Reached on success too: the intermediates are always released */ \
	  xfree(to32); \
	  xfree(decomp32); \
	  return decomp8; \
	} while(0)
	853
	854	/** @brief Canonically decompose @p [s,s+ns)
	855	* @param s Pointer to string
	856	* @param ns Length of string
	857	* @param ndp Where to store length of result
	858	* @return Pointer to result string, or NULL
	859	*
	860	* Computes the canonical decomposition of a string and stably sorts combining
	861	* characters into canonical order. The result is in Normalization Form D and
	862	* (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
	863	* NormalizationTest.txt.
	864	*
	865	* Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
	866	* this might be.
	867	*
	868	* See also utf32_decompose_canon().
	869	*/
	870	char utf8_decompose_canon(const char s, size_t ns, size_t *ndp) {
	871	utf8__transform(utf32_decompose_canon);
	872	}
	873
	874	/** @brief Compatibility decompose @p [s,s+ns)
	875	* @param s Pointer to string
	876	* @param ns Length of string
	877	* @param ndp Where to store length of result
	878	* @return Pointer to result string, or NULL
	879	*
	880	* Computes the compatibility decomposition of a string and stably sorts
	881	* combining characters into canonical order. The result is in Normalization
	882	* Form KD and (at the time of writing!) passes the NFKD tests defined in
	883	* Unicode 5.0's NormalizationTest.txt.
	884	*
	885	* Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
	886	* this might be.
	887	*
	888	* See also utf32_decompose_compat().
	889	*/
	890	char utf8_decompose_compat(const char s, size_t ns, size_t *ndp) {
	891	utf8__transform(utf32_decompose_compat);
	892	}
	893
	894	/** @brief Case-fold @p [s,s+ns)
	895	* @param s Pointer to string
	896	* @param ns Length of string
	897	* @param ndp Where to store length of result
	898	* @return Pointer to result string, or NULL
	899	*
	900	* Case-fold the string at @p s according to full default case-folding rules
	901	* (s3.13). The result will be in NFD.
	902	*
	903	* Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
	904	* this might be.
	905	*/
	906	char utf8_casefold_canon(const char s, size_t ns, size_t *ndp) {
	907	utf8__transform(utf32_casefold_canon);
	908	}
	909
	910	/** @brief Compatibility case-fold @p [s,s+ns)
	911	* @param s Pointer to string
	912	* @param ns Length of string
	913	* @param ndp Where to store length of result
	914	* @return Pointer to result string, or NULL
	915	*
	916	* Case-fold the string at @p s according to full default case-folding rules
	917	* (s3.13). The result will be in NFKD.
	918	*
	919	* Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
	920	* this might be.
	921	*/
	922	char utf8_casefold_compat(const char s, size_t ns, size_t *ndp) {
	923	utf8__transform(utf32_casefold_compat);
	924	}
	925
	926	/*@}*/
	927
	928	/*
	929	Local Variables:
	930	c-basic-offset:2
	931	comment-column:40
	932	fill-column:79
	933	indent-tabs-mode:nil
	934	End:
	935	*/