chiark - git - mdw - disorder/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* This file is part of DisOrder
	3	* Copyright (C) 2007 Richard Kettlewell
	4	*
	5	* This program is free software; you can redistribute it and/or modify
	6	* it under the terms of the GNU General Public License as published by
	7	* the Free Software Foundation; either version 2 of the License, or
	8	* (at your option) any later version.
	9	*
	10	* This program is distributed in the hope that it will be useful, but
	11	* WITHOUT ANY WARRANTY; without even the implied warranty of
	12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	* General Public License for more details.
	14	*
	15	* You should have received a copy of the GNU General Public License
	16	* along with this program; if not, write to the Free Software
	17	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
	18	* USA
	19	*/
	20	/** @file lib/unicode.c
	21	* @brief Unicode support functions
	22	*
	23	* Here by UTF-8 and UTF-32 we mean the encoding forms of those names (not the
	24	* encoding schemes). The primary encoding form is UTF-32 but convenience
	25	* wrappers using UTF-8 are provided for a number of functions.
	26	*
	27	* The idea is that all the strings that hit the database will be in a
	28	* particular normalization form, and for the search and tags database
	29	* in case-folded form, so they can be naively compared within the
	30	* database code.
	31	*
	32	* As the code stands this guarantee is not well met!
	33	*/
	34
	35	#include <config.h>
	36	#include "types.h"
	37
	38	#include <string.h>
	39	#include <stdio.h> /* TODO */
	40
	41	#include "mem.h"
	42	#include "vector.h"
	43	#include "unicode.h"
	44	#include "unidata.h"
	45
	46	/** @defgroup utftransform Functions that transform between different Unicode encoding forms */
	47	/*@{*/
	48
	49	/** @brief Convert UTF-32 to UTF-8
	50	* @param s Source string
	51	* @param ns Length of source string in code points
	52	* @param ndp Where to store length of destination string (or NULL)
	53	* @return Newly allocated destination string or NULL on error
	54	*
	55	* If the UTF-32 is not valid then NULL is returned. A UTF-32 code point is
	56	* invalid if:
	57	* - it codes for a UTF-16 surrogate
	58	* - it codes for a value outside the unicode code space
	59	*
	60	* The return value is always 0-terminated. The value returned via @p *ndp
	61	* does not include the terminator.
	62	*/
	63	char utf32_to_utf8(const uint32_t s, size_t ns, size_t *ndp) {
	64	struct dynstr d;
	65	uint32_t c;
	66
	67	dynstr_init(&d);
	68	while(ns > 0) {
	69	c = *s++;
	70	if(c < 0x80)
	71	dynstr_append(&d, c);
	72	else if(c < 0x0800) {
	73	dynstr_append(&d, 0xC0 \| (c >> 6));
	74	dynstr_append(&d, 0x80 \| (c & 0x3F));
	75	} else if(c < 0x10000) {
	76	if(c >= 0xD800 && c <= 0xDFFF)
	77	goto error;
	78	dynstr_append(&d, 0xE0 \| (c >> 12));
	79	dynstr_append(&d, 0x80 \| ((c >> 6) & 0x3F));
	80	dynstr_append(&d, 0x80 \| (c & 0x3F));
	81	} else if(c < 0x110000) {
	82	dynstr_append(&d, 0xF0 \| (c >> 18));
	83	dynstr_append(&d, 0x80 \| ((c >> 12) & 0x3F));
	84	dynstr_append(&d, 0x80 \| ((c >> 6) & 0x3F));
	85	dynstr_append(&d, 0x80 \| (c & 0x3F));
	86	} else
	87	goto error;
	88	--ns;
	89	}
	90	dynstr_terminate(&d);
	91	if(ndp)
	92	*ndp = d.nvec;
	93	return d.vec;
	94	error:
	95	xfree(d.vec);
	96	return 0;
	97	}
	98
	99	/** @brief Convert UTF-8 to UTF-32
	100	* @param s Source string
	101	* @param ns Length of source string in code points
	102	* @param ndp Where to store length of destination string (or NULL)
	103	* @return Newly allocated destination string or NULL
	104	*
	105	* The return value is always 0-terminated. The value returned via @p *ndp
	106	* does not include the terminator.
	107	*
	108	* If the UTF-8 is not valid then NULL is returned. A UTF-8 sequence
	109	* for a code point is invalid if:
	110	* - it is not the shortest possible sequence for the code point
	111	* - it codes for a UTF-16 surrogate
	112	* - it codes for a value outside the unicode code space
	113	*/
	114	uint32_t utf8_to_utf32(const char s, size_t ns, size_t *ndp) {
	115	struct dynstr_ucs4 d;
	116	uint32_t c32, c;
	117	const uint8_t ss = (const uint8_t )s;
	118
	119	dynstr_ucs4_init(&d);
	120	while(ns > 0) {
	121	c = *ss++;
	122	--ns;
	123	/* Acceptable UTF-8 is that which codes for Unicode Scalar Values
	124	* (Unicode 5.0.0 s3.9 D76)
	125	*
	126	* 0xxxxxxx
	127	* 7 data bits gives 0x00 - 0x7F and all are acceptable
	128	*
	129	* 110xxxxx 10xxxxxx
	130	* 11 data bits gives 0x0000 - 0x07FF but only 0x0080 - 0x07FF acceptable
	131	*
	132	* 1110xxxx 10xxxxxx 10xxxxxx
	133	* 16 data bits gives 0x0000 - 0xFFFF but only 0x0800 - 0xFFFF acceptable
	134	* (and UTF-16 surrogates are not acceptable)
	135	*
	136	* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	137	* 21 data bits gives 0x00000000 - 0x001FFFFF
	138	* but only 0x00010000 - 0x0010FFFF are acceptable
	139	*
	140	* It is NOT always the case that the data bits in the first byte are
	141	* always non-0 for the acceptable values, so we do a separate check after
	142	* decoding.
	143	*/
	144	if(c < 0x80)
	145	c32 = c;
	146	else if(c <= 0xDF) {
	147	if(ns < 1) goto error;
	148	c32 = c & 0x1F;
	149	c = *ss++;
	150	if((c & 0xC0) != 0x80) goto error;
	151	c32 = (c32 << 6) \| (c & 0x3F);
	152	if(c32 < 0x80) goto error;
	153	} else if(c <= 0xEF) {
	154	if(ns < 2) goto error;
	155	c32 = c & 0x0F;
	156	c = *ss++;
	157	if((c & 0xC0) != 0x80) goto error;
	158	c32 = (c32 << 6) \| (c & 0x3F);
	159	c = *ss++;
	160	if((c & 0xC0) != 0x80) goto error;
	161	c32 = (c32 << 6) \| (c & 0x3F);
	162	if(c32 < 0x0800 \|\| (c32 >= 0xD800 && c32 <= 0xDFFF)) goto error;
	163	} else if(c <= 0xF7) {
	164	if(ns < 3) goto error;
	165	c32 = c & 0x07;
	166	c = *ss++;
	167	if((c & 0xC0) != 0x80) goto error;
	168	c32 = (c32 << 6) \| (c & 0x3F);
	169	c = *ss++;
	170	if((c & 0xC0) != 0x80) goto error;
	171	c32 = (c32 << 6) \| (c & 0x3F);
	172	c = *ss++;
	173	if((c & 0xC0) != 0x80) goto error;
	174	c32 = (c32 << 6) \| (c & 0x3F);
	175	if(c32 < 0x00010000 \|\| c32 > 0x0010FFFF) goto error;
	176	} else
	177	goto error;
	178	dynstr_ucs4_append(&d, c32);
	179	}
	180	dynstr_ucs4_terminate(&d);
	181	if(ndp)
	182	*ndp = d.nvec;
	183	return d.vec;
	184	error:
	185	xfree(d.vec);
	186	return 0;
	187	}
	188
	189	/*@}*/
	190	/** @defgroup utf32 Functions that operate on UTF-32 strings */
	191	/*@{*/
	192
	/** @brief Count the code points in a 0-terminated UTF-32 string
	 * @param s Pointer to 0-terminated string
	 * @return Number of code points before the terminator
	 *
	 * The contents are not validated in any way; only the terminating 0 is
	 * looked for.
	 */
	size_t utf32_len(const uint32_t *s) {
	  size_t count = 0;

	  while(s[count] != 0)
	    ++count;
	  return count;
	}
	206
	207	/** @brief Return the @ref unidata structure for code point @p c
	208	*
	209	* @p c can be any 32-bit value, a sensible value will be returned regardless.
	210	*/
	211	static const struct unidata *utf32__unidata(uint32_t c) {
	212	/* The bottom half of the table contains almost everything of interest
	213	* and we can just return the right thing straight away */
	214	if(c < UNICODE_BREAK_START)
	215	return &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
	216	/* Within the break everything is unassigned */
	217	if(c < UNICODE_BREAK_END)
	218	return utf32__unidata(0xFFFF); /* guaranteed to be Cn */
	219	/* Planes 15 and 16 are (mostly) private use */
	220	if((c >= 0xF0000 && c <= 0xFFFFD)
	221	\|\| (c >= 0x100000 && c <= 0x10FFFD))
	222	return utf32__unidata(0xE000); /* first Co code point */
	223	/* Everything else above the break top is unassigned */
	224	if(c >= UNICODE_BREAK_TOP)
	225	return utf32__unidata(0xFFFF); /* guaranteed to be Cn */
	226	/* Currently the rest is language tags and variation selectors */
	227	c -= (UNICODE_BREAK_END - UNICODE_BREAK_START);
	228	return &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
	229	}
	230
	231	/** @brief Return the combining class of @p c
	232	* @param c Code point
	233	* @return Combining class of @p c
	234	*/
	235	static inline int utf32__combining_class(uint32_t c) {
	236	return utf32__unidata(c)->ccc;
	237	}
	238
	/** @brief Stably sort [s,s+ns) into ascending order of combining class
	 * @param s Start of array
	 * @param ns Number of elements
	 * @param buffer Scratch buffer of at least @p ns elements
	 *
	 * This is a merge sort: the two halves are sorted recursively and then merged
	 * through @p buffer.  Elements with equal combining class keep their relative
	 * order.
	 */
	static void utf32__sort_ccc(uint32_t *s, size_t ns, uint32_t *buffer) {
	  uint32_t *a, *b, *bp;
	  size_t na, nb;

	  /* Arrays of 0 or 1 elements are already sorted.  Catching 0 here also keeps
	   * the recursion below from splitting an empty array forever. */
	  if(ns < 2)
	    return;
	  /* 2-element arrays are trivial to sort */
	  if(ns == 2) {
	    if(utf32__combining_class(s[0]) > utf32__combining_class(s[1])) {
	      const uint32_t tmp = s[0];

	      s[0] = s[1];
	      s[1] = tmp;
	    }
	    return;
	  }
	  /* Partition the array */
	  na = ns / 2;
	  nb = ns - na;
	  a = s;
	  b = s + na;
	  /* Sort the two halves of the array */
	  utf32__sort_ccc(a, na, buffer);
	  utf32__sort_ccc(b, nb, buffer);
	  /* Merge them back into one, via the buffer */
	  bp = buffer;
	  while(na > 0 && nb > 0) {
	    /* Taking from the first half on a tie (<=) is what makes the sort
	     * stable within a combining class */
	    if(utf32__combining_class(*a) <= utf32__combining_class(*b)) {
	      *bp++ = *a++;
	      --na;
	    } else {
	      *bp++ = *b++;
	      --nb;
	    }
	  }
	  /* At most one of the halves still has elements left */
	  while(na > 0) {
	    *bp++ = *a++;
	    --na;
	  }
	  while(nb > 0) {
	    *bp++ = *b++;
	    --nb;
	  }
	  memcpy(s, buffer, ns * sizeof(uint32_t));
	}
	293
	/** @brief Put combining characters into canonical order
	 * @param s Pointer to UTF-32 string
	 * @param ns Length of @p s
	 * @return 0 on success, -1 on error
	 *
	 * @p s is rewritten in place.  See Unicode 5.0 s3.11 for details of the
	 * ordering.
	 *
	 * A run of combining characters longer than the 1024-element scratch buffer
	 * is not supported; -1 is returned when one is found.
	 */
	static int utf32__canonical_ordering(uint32_t *s, size_t ns) {
	  uint32_t scratch[1024];
	  size_t pos = 0;

	  /* The ordering amounts to a stable sort of each contiguous group of
	   * characters with non-0 combining class. */
	  while(pos < ns) {
	    size_t run;

	    /* Characters with combining class 0 stay where they are */
	    if(utf32__combining_class(s[pos]) == 0) {
	      ++pos;
	      continue;
	    }
	    /* s[pos] starts a run; measure how far it extends */
	    run = 1;
	    while(pos + run < ns && utf32__combining_class(s[pos + run]) != 0)
	      ++run;
	    if(run > sizeof scratch / sizeof *scratch)
	      return -1;
	    utf32__sort_ccc(s + pos, run, scratch);
	    pos += run;
	  }
	  return 0;
	}
	331
	/* Hangul composition constants from UAX #15 s16 */
	#define SBase 0xAC00
	#define LBase 0x1100
	#define VBase 0x1161
	#define TBase 0x11A7
	#define LCount 19
	#define VCount 21
	#define TCount 28
	#define NCount (VCount * TCount)
	#define SCount (LCount * NCount)

	/** @brief Guts of the decomposition lookup functions
	 *
	 * Expands inside a function that has a code point @c c and a
	 * <code>struct dynstr_ucs4 *</code> called @c d in scope.  @c WHICH names both
	 * the @ref unidata field holding the decomposition and the suffix of the
	 * utf32__decompose_one_ function used to recurse.
	 */
	#define utf32__decompose_one_generic(WHICH) do {                        \
	  const uint32_t *mapping = utf32__unidata(c)->WHICH;                   \
	  if(mapping) {                                                         \
	    /* The table has a decomposition; decompose each element in turn */ \
	    for(; *mapping; ++mapping)                                          \
	      utf32__decompose_one_##WHICH(d, *mapping);                        \
	  } else if(c >= SBase && c < SBase + SCount) {                         \
	    /* Hangul syllables decompose arithmetically (UAX #15 s16) */       \
	    const uint32_t syllable = c - SBase;                                \
	    const uint32_t leading = LBase + syllable / NCount;                 \
	    const uint32_t vowel = VBase + (syllable % NCount) / TCount;        \
	    const uint32_t trailing = TBase + syllable % TCount;                \
	    dynstr_ucs4_append(d, leading);                                     \
	    dynstr_ucs4_append(d, vowel);                                       \
	    /* TBase itself means "no trailing consonant" */                    \
	    if(trailing != TBase)                                               \
	      dynstr_ucs4_append(d, trailing);                                  \
	  } else {                                                              \
	    /* The code point is its own decomposition */                       \
	    dynstr_ucs4_append(d, c);                                           \
	  }                                                                     \
	} while(0)
	364
	365	/** @brief Recursively compute the canonical decomposition of @p c
	366	* @param d Dynamic string to store decomposition in
	367	* @param c Code point to decompose (must be a valid!)
	368	* @return 0 on success, -1 on error
	369	*/
	370	static void utf32__decompose_one_canon(struct dynstr_ucs4 *d, uint32_t c) {
	371	utf32__decompose_one_generic(canon);
	372	}
	373
	374	/** @brief Recursively compute the compatibility decomposition of @p c
	375	* @param d Dynamic string to store decomposition in
	376	* @param c Code point to decompose (must be a valid!)
	377	* @return 0 on success, -1 on error
	378	*/
	379	static void utf32__decompose_one_compat(struct dynstr_ucs4 *d, uint32_t c) {
	380	utf32__decompose_one_generic(compat);
	381	}
	382
	/** @brief Guts of the decomposition functions
	 *
	 * Expands to the whole body of a function with parameters @c s, @c ns and
	 * @c ndp.  Each code point of [s,s+ns) is validated and decomposed with
	 * utf32__decompose_one_WHICH(), the result is put into canonical order and
	 * then returned; on any error the partial result is freed and 0 is returned.
	 */
	#define utf32__decompose_generic(WHICH) do {                    \
	  struct dynstr_ucs4 d;                                         \
	  uint32_t c;                                                   \
	                                                                \
	  dynstr_ucs4_init(&d);                                         \
	  while(ns) {                                                   \
	    c = *s++;                                                   \
	    /* Surrogates and out-of-range values are not decomposable */ \
	    if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF)            \
	      goto error;                                               \
	    utf32__decompose_one_##WHICH(&d, c);                        \
	    --ns;                                                       \
	  }                                                             \
	  if(utf32__canonical_ordering(d.vec, d.nvec))                  \
	    goto error;                                                 \
	  dynstr_ucs4_terminate(&d);                                    \
	  if(ndp)                                                       \
	    *ndp = d.nvec;                                              \
	  return d.vec;                                                 \
	error:                                                          \
	  xfree(d.vec);                                                 \
	  return 0;                                                     \
	} while(0)
	406
	407	/** @brief Canonically decompose @p [s,s+ns)
	408	* @param s Pointer to string
	409	* @param ns Length of string
	410	* @param ndp Where to store length of result
	411	* @return Pointer to result string, or NULL
	412	*
	413	* Computes the canonical decomposition of a string and stably sorts combining
	414	* characters into canonical order. The result is in Normalization Form D and
	415	* (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
	416	* NormalizationTest.txt.
	417	*
	418	* Returns NULL if the string is not valid for either of the following reasons:
	419	* - it codes for a UTF-16 surrogate
	420	* - it codes for a value outside the unicode code space
	421	*/
	422	uint32_t utf32_decompose_canon(const uint32_t s, size_t ns, size_t *ndp) {
	423	utf32__decompose_generic(canon);
	424	}
	425
	426	/** @brief Compatibility decompose @p [s,s+ns)
	427	* @param s Pointer to string
	428	* @param ns Length of string
	429	* @param ndp Where to store length of result
	430	* @return Pointer to result string, or NULL
	431	*
	432	* Computes the compatibility decomposition of a string and stably sorts
	433	* combining characters into canonical order. The result is in Normalization
	434	* Form KD and (at the time of writing!) passes the NFKD tests defined in
	435	* Unicode 5.0's NormalizationTest.txt.
	436	*
	437	* Returns NULL if the string is not valid for either of the following reasons:
	438	* - it codes for a UTF-16 surrogate
	439	* - it codes for a value outside the unicode code space
	440	*/
	441	uint32_t utf32_decompose_compat(const uint32_t s, size_t ns, size_t *ndp) {
	442	utf32__decompose_generic(compat);
	443	}
	444
	/** @brief Single-character case-fold and decompose operation
	 *
	 * Expands inside a function that has a code point @c c and a
	 * <code>struct dynstr_ucs4</code> called @c d in scope.  The case-folded form
	 * of @c c (or @c c itself when the table has no folding for it) is decomposed
	 * with utf32__decompose_one_WHICH() and appended to @c d.
	 */
	#define utf32__casefold_one(WHICH) do {                         \
	  const uint32_t *folded = utf32__unidata(c)->casefold;         \
	  if(!folded) {                                                 \
	    /* No case-fold mapping: decompose the code point as-is */  \
	    utf32__decompose_one_##WHICH(&d, c);                        \
	  } else {                                                      \
	    /* Decompose each element of the 0-terminated mapping */    \
	    for(; *folded; ++folded)                                    \
	      utf32__decompose_one_##WHICH(&d, *folded);                \
	  }                                                             \
	} while(0)
	455
	456	/** @brief Case-fold @p [s,s+ns)
	457	* @param s Pointer to string
	458	* @param ns Length of string
	459	* @param ndp Where to store length of result
	460	* @return Pointer to result string, or NULL
	461	*
	462	* Case-fold the string at @p s according to full default case-folding rules
	463	* (s3.13) for caseless matching. The result will be in NFD.
	464	*
	465	* Returns NULL if the string is not valid for either of the following reasons:
	466	* - it codes for a UTF-16 surrogate
	467	* - it codes for a value outside the unicode code space
	468	*/
	469	uint32_t utf32_casefold_canon(const uint32_t s, size_t ns, size_t *ndp) {
	470	struct dynstr_ucs4 d;
	471	uint32_t c;
	472	size_t n;
	473	uint32_t *ss = 0;
	474
	475	/* If the canonical decomposition of the string includes any combining
	476	* character that case-folds to a non-combining character then we must
	477	* normalize before we fold. In Unicode 5.0.0 this means 0345 COMBINING
	478	* GREEK YPOGEGRAMMENI in its decomposition and the various characters that
	479	* canonically decompose to it. */
	480	for(n = 0; n < ns; ++n)
	481	if(utf32__unidata(s[n])->flags & unicode_normalize_before_casefold)
	482	break;
	483	if(n < ns) {
	484	/* We need a preliminary decomposition */
	485	if(!(ss = utf32_decompose_canon(s, ns, &ns)))
	486	return 0;
	487	s = ss;
	488	}
	489	dynstr_ucs4_init(&d);
	490	while(ns) {
	491	c = *s++;
	492	if((c >= 0xD800 && c <= 0xDFFF) \|\| c > 0x10FFFF)
	493	goto error;
	494	utf32__casefold_one(canon);
	495	--ns;
	496	}
	497	if(utf32__canonical_ordering(d.vec, d.nvec))
	498	goto error;
	499	dynstr_ucs4_terminate(&d);
	500	if(ndp)
	501	*ndp = d.nvec;
	502	return d.vec;
	503	error:
	504	xfree(d.vec);
	505	xfree(ss);
	506	return 0;
	507	}
	508
	509	/** @brief Compatibilit case-fold @p [s,s+ns)
	510	* @param s Pointer to string
	511	* @param ns Length of string
	512	* @param ndp Where to store length of result
	513	* @return Pointer to result string, or NULL
	514	*
	515	* Case-fold the string at @p s according to full default case-folding rules
	516	* (s3.13) for compatibility caseless matching. The result will be in NFKD.
	517	*
	518	* Returns NULL if the string is not valid for either of the following reasons:
	519	* - it codes for a UTF-16 surrogate
	520	* - it codes for a value outside the unicode code space
	521	*/
	522	uint32_t utf32_casefold_compat(const uint32_t s, size_t ns, size_t *ndp) {
	523	struct dynstr_ucs4 d;
	524	uint32_t c;
	525	size_t n;
	526	uint32_t *ss = 0;
	527
	528	for(n = 0; n < ns; ++n)
	529	if(utf32__unidata(s[n])->flags & unicode_normalize_before_casefold)
	530	break;
	531	if(n < ns) {
	532	/* We need a preliminary _canonical_ decomposition */
	533	if(!(ss = utf32_decompose_canon(s, ns, &ns)))
	534	return 0;
	535	s = ss;
	536	}
	537	/* This computes NFKD(toCaseFold(s)) */
	538	#define compat_casefold_middle() do { \
	539	dynstr_ucs4_init(&d); \
	540	while(ns) { \
	541	c = *s++; \
	542	if((c >= 0xD800 && c <= 0xDFFF) \|\| c > 0x10FFFF) \
	543	goto error; \
	544	utf32__casefold_one(compat); \
	545	--ns; \
	546	} \
	547	if(utf32__canonical_ordering(d.vec, d.nvec)) \
	548	goto error; \
	549	} while(0)
	550	/* Do the inner (NFKD o toCaseFold) */
	551	compat_casefold_middle();
	552	/* We can do away with the NFD'd copy of the input now */
	553	xfree(ss);
	554	s = ss = d.vec;
	555	ns = d.nvec;
	556	/* Do the outer (NFKD o toCaseFold) */
	557	compat_casefold_middle();
	558	/* That's all */
	559	dynstr_ucs4_terminate(&d);
	560	if(ndp)
	561	*ndp = d.nvec;
	562	return d.vec;
	563	error:
	564	xfree(d.vec);
	565	xfree(ss);
	566	return 0;
	567	}
	568
	/** @brief Order a pair of UTF-32 strings
	 * @param a First 0-terminated string
	 * @param b Second 0-terminated string
	 * @return -1, 0 or 1 for a less than, equal to or greater than b
	 *
	 * "Comparable to strcmp() at its best."
	 *
	 * Code points are compared as unsigned 32-bit values; a string that is a
	 * proper prefix of the other orders first.
	 */
	int utf32_cmp(const uint32_t *a, const uint32_t *b) {
	  /* Advance over the common prefix, stopping at either terminator */
	  while(*a && *b && *a == *b) {
	    ++a;
	    ++b;
	  }
	  /* *a and *b are now the first differing code points (or both 0) */
	  return *a < *b ? -1 : (*a > *b ? 1 : 0);
	}
	583
	584	/** @brief Return the General_Category value for @p c
	585	* @param Code point
	586	* @return General_Category property value
	587	*/
	588	static inline enum unicode_General_Category utf32__general_category(uint32_t c) {
	589	return utf32__unidata(c)->general_category;
	590	}
	591
	592	/** @brief Check Grapheme_Cluster_Break property
	593	* @param c Code point
	594	* @return 0 if it is as described, 1 otherwise
	595	*/
	596	static int utf32__is_control_or_cr_or_lf(uint32_t c) {
	597	switch(utf32__general_category(c)) {
	598	default:
	599	return 0;
	600	case unicode_General_Category_Zl:
	601	case unicode_General_Category_Zp:
	602	case unicode_General_Category_Cc:
	603	return 1;
	604	case unicode_General_Category_Cf:
	605	if(c == 0x200C \|\| c == 0x200D)
	606	return 0;
	607	return 1;
	608	}
	609	}
	610
	/* Representative code point of each Hangul_Syllable_Type class */
	#define Hangul_Syllable_Type_NA 0
	#define Hangul_Syllable_Type_L 0x1100
	#define Hangul_Syllable_Type_V 0x1160
	#define Hangul_Syllable_Type_T 0x11A8
	#define Hangul_Syllable_Type_LV 0xAC00
	#define Hangul_Syllable_Type_LVT 0xAC01

	/** @brief Determine Hangul_Syllable_Type of @p c
	 * @param c Code point
	 * @return Equivalence class of @p c, or Hangul_Syllable_Type_NA
	 *
	 * If this is a Hangul character then a representative member of its
	 * equivalence class is returned.  Otherwise Hangul_Syllable_Type_NA is
	 * returned.
	 */
	static uint32_t utf32__hangul_syllable_type(uint32_t c) {
	  /* Dispose of the bulk of the non-Hangul code points first */
	  if(c < 0x1100)
	    return Hangul_Syllable_Type_NA;
	  if(c > 0x1200 && c < 0xAC00)
	    return Hangul_Syllable_Type_NA;
	  if(c >= 0xD800)
	    return Hangul_Syllable_Type_NA;
	  /* Now we pick out the assigned Hangul code points */
	  if((c >= 0x1100 && c <= 0x1159) || c == 0x115F)
	    return Hangul_Syllable_Type_L;
	  if(c >= 0x1160 && c <= 0x11A2)
	    return Hangul_Syllable_Type_V;
	  if(c >= 0x11A8 && c <= 0x11F9)
	    return Hangul_Syllable_Type_T;
	  if(c >= 0xAC00 && c <= 0xD7A3) {
	    /* Every 28th precomposed syllable, counting from 0xAC00, has no trailing
	     * consonant and so is an LV syllable; the rest are LVT */
	    if((c - 0xAC00) % 28 == 0)
	      return Hangul_Syllable_Type_LV;
	    return Hangul_Syllable_Type_LVT;
	  }
	  /* Unassigned code points inside the Hangul blocks */
	  return Hangul_Syllable_Type_NA;
	}
	643
	644	/** @brief Determine Word_Break property
	645	* @param c Code point
	646	* @return Word_Break property value of @p c
	647	*/
	648	static enum unicode_Word_Break utf32__word_break(uint32_t c) {
	649	return utf32__unidata(c)->word_break;
	650	}
	651
	652	/** @brief Identify a grapheme cluster boundary
	653	* @param s Start of string (must be NFD)
	654	* @param ns Length of string
	655	* @param n Index within string (in [0,ns].)
	656	* @return 1 at a grapheme cluster boundary, 0 otherwise
	657	*
	658	* This function identifies default grapheme cluster boundaries as described in
	659	* UAX #29 s3. It returns 1 if @p n points at the code point just after a
	660	* grapheme cluster boundary (including the hypothetical code point just after
	661	* the end of the string).
	662	*/
	663	int utf32_is_gcb(const uint32_t *s, size_t ns, size_t n) {
	664	uint32_t before, after;
	665	uint32_t hbefore, hafter;
	666	/* GB1 and GB2 */
	667	if(n == 0 \|\| n == ns)
	668	return 1;
	669	/* Now we know that s[n-1] and s[n] are safe to inspect */
	670	/* GB3 */
	671	before = s[n-1];
	672	after = s[n];
	673	if(before == 0x000D && after == 0x000A)
	674	return 0;
	675	/* GB4 and GB5 */
	676	if(utf32__is_control_or_cr_or_lf(before)
	677	\|\| utf32__is_control_or_cr_or_lf(after))
	678	return 1;
	679	hbefore = utf32__hangul_syllable_type(before);
	680	hafter = utf32__hangul_syllable_type(after);
	681	/* GB6 */
	682	if(hbefore == Hangul_Syllable_Type_L
	683	&& (hafter == Hangul_Syllable_Type_L
	684	\|\| hafter == Hangul_Syllable_Type_V
	685	\|\| hafter == Hangul_Syllable_Type_LV
	686	\|\| hafter == Hangul_Syllable_Type_LVT))
	687	return 0;
	688	/* GB7 */
	689	if((hbefore == Hangul_Syllable_Type_LV
	690	\|\| hbefore == Hangul_Syllable_Type_V)
	691	&& (hafter == Hangul_Syllable_Type_V
	692	\|\| hafter == Hangul_Syllable_Type_T))
	693	return 0;
	694	/* GB8 */
	695	if((hbefore == Hangul_Syllable_Type_LVT
	696	\|\| hbefore == Hangul_Syllable_Type_T)
	697	&& hafter == Hangul_Syllable_Type_T)
	698	return 0;
	699	/* GB9 */
	700	if(utf32__word_break(after) == unicode_Word_Break_Extend)
	701	return 0;
	702	/* GB10 */
	703	return 1;
	704	}
	705
	706	/** @brief Return true if @p c is ignorable for boundary specifications */
	707	static inline int utf32__boundary_ignorable(enum unicode_Word_Break wb) {
	708	return (wb == unicode_Word_Break_Extend
	709	\|\| wb == unicode_Word_Break_Format);
	710	}
	711
	712	/** @brief Identify a word boundary
	713	* @param s Start of string (must be NFD)
	714	* @param ns Length of string
	715	* @param n Index within string (in [0,ns].)
	716	* @return 1 at a word boundary, 0 otherwise
	717	*
	718	* This function identifies default word boundaries as described in UAX #29 s4.
	719	* It returns 1 if @p n points at the code point just after a word boundary
	720	* (including the hypothetical code point just after the end of the string).
	721	*/
	722	int utf32_is_word_boundary(const uint32_t *s, size_t ns, size_t n) {
	723	enum unicode_Word_Break twobefore, before, after, twoafter;
	724	size_t nn;
	725
	726	/* WB1 and WB2 */
	727	if(n == 0 \|\| n == ns)
	728	return 1;
	729	/* WB3 */
	730	if(s[n-1] == 0x000D && s[n] == 0x000A)
	731	return 0;
	732	/* WB4 */
	733	/* (!Sep) x (Extend\|Format) as in UAX #29 s6.2 */
	734	switch(s[n-1]) { /* bit of a bodge */
	735	case 0x000A:
	736	case 0x000D:
	737	case 0x0085:
	738	case 0x2028:
	739	case 0x2029:
	740	break;
	741	default:
	742	if(utf32__boundary_ignorable(utf32__word_break(s[n])))
	743	return 0;
	744	break;
	745	}
	746	/* Gather the property values we'll need for the rest of the test taking the
	747	* s6.2 changes into account */
	748	/* First we look at the code points after the proposed boundary */
	749	nn = n; /* <ns */
	750	after = utf32__word_break(s[nn++]);
	751	if(!utf32__boundary_ignorable(after)) {
	752	/* X (Extend\|Format)* -> X */
	753	while(nn < ns && utf32__boundary_ignorable(utf32__word_break(s[nn])))
	754	++nn;
	755	}
	756	/* It's possible now that nn=ns */
	757	if(nn < ns)
	758	twoafter = utf32__word_break(s[nn]);
	759	else
	760	twoafter = unicode_Word_Break_Other;
	761
	762	/* Next we look at the code points before the proposed boundary. This is a
	763	* bit fiddlier. */
	764	nn = n;
	765	while(nn > 0 && utf32__boundary_ignorable(utf32__word_break(s[nn - 1])))
	766	--nn;
	767	if(nn == 0) {
	768	/* s[nn] must be ignorable */
	769	before = utf32__word_break(s[nn]);
	770	twobefore = unicode_Word_Break_Other;
	771	} else {
	772	/* s[nn] is ignorable or after the proposed boundary; but s[nn-1] is not
	773	* ignorable. */
	774	before = utf32__word_break(s[nn - 1]);
	775	--nn;
	776	/* Repeat the exercise */
	777	while(nn > 0 && utf32__boundary_ignorable(utf32__word_break(s[nn - 1])))
	778	--nn;
	779	if(nn == 0)
	780	twobefore = utf32__word_break(s[nn]);
	781	else
	782	twobefore = utf32__word_break(s[nn - 1]);
	783	}
	784
	785	/* WB5 */
	786	if(before == unicode_Word_Break_ALetter
	787	&& after == unicode_Word_Break_ALetter)
	788	return 0;
	789	/* WB6 */
	790	if(before == unicode_Word_Break_ALetter
	791	&& after == unicode_Word_Break_MidLetter
	792	&& twoafter == unicode_Word_Break_ALetter)
	793	return 0;
	794	/* WB7 */
	795	if(twobefore == unicode_Word_Break_ALetter
	796	&& before == unicode_Word_Break_MidLetter
	797	&& after == unicode_Word_Break_ALetter)
	798	return 0;
	799	/* WB8 */
	800	if(before == unicode_Word_Break_Numeric
	801	&& after == unicode_Word_Break_Numeric)
	802	return 0;
	803	/* WB9 */
	804	if(before == unicode_Word_Break_ALetter
	805	&& after == unicode_Word_Break_Numeric)
	806	return 0;
	807	/* WB10 */
	808	if(before == unicode_Word_Break_Numeric
	809	&& after == unicode_Word_Break_ALetter)
	810	return 0;
	811	/* WB11 */
	812	if(twobefore == unicode_Word_Break_Numeric
	813	&& before == unicode_Word_Break_MidNum
	814	&& after == unicode_Word_Break_Numeric)
	815	return 0;
	816	/* WB12 */
	817	if(before == unicode_Word_Break_Numeric
	818	&& after == unicode_Word_Break_MidNum
	819	&& twoafter == unicode_Word_Break_Numeric)
	820	return 0;
	821	/* WB13 */
	822	if(before == unicode_Word_Break_Katakana
	823	&& after == unicode_Word_Break_Katakana)
	824	return 0;
	825	/* WB13a */
	826	if((before == unicode_Word_Break_ALetter
	827	\|\| before == unicode_Word_Break_Numeric
	828	\|\| before == unicode_Word_Break_Katakana
	829	\|\| before == unicode_Word_Break_ExtendNumLet)
	830	&& after == unicode_Word_Break_ExtendNumLet)
	831	return 0;
	832	/* WB13b */
	833	if(before == unicode_Word_Break_ExtendNumLet
	834	&& (after == unicode_Word_Break_ALetter
	835	\|\| after == unicode_Word_Break_Numeric
	836	\|\| after == unicode_Word_Break_Katakana))
	837	return 0;
	838	/* WB14 */
	839	return 1;
	840	}
	841
	842	/*@}*/
	843	/** @defgroup utf8 Functions that operate on UTF-8 strings */
	844	/*@{*/
	845
	/** @brief Wrapper to transform a UTF-8 string using the UTF-32 function
	 *
	 * Expands to the whole body of a function with parameters @c s, @c ns and
	 * @c ndp.  The input is converted to UTF-32, passed through @c FN and the
	 * result converted back to UTF-8.  Both UTF-32 intermediates are freed on
	 * every path; if any step fails the function returns 0.
	 */
	#define utf8__transform(FN) do {                                \
	  uint32_t *to32 = 0, *decomp32 = 0;                            \
	  size_t nto32, ndecomp32;                                      \
	  char *decomp8 = 0;                                            \
	                                                                \
	  if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error;        \
	  if(!(decomp32 = FN(to32, nto32, &ndecomp32))) goto error;     \
	  decomp8 = utf32_to_utf8(decomp32, ndecomp32, ndp);            \
	error:                                                          \
	  /* Reached on success too: decomp8 is still 0 if a step failed */ \
	  xfree(to32);                                                  \
	  xfree(decomp32);                                              \
	  return decomp8;                                               \
	} while(0)
	860
	861	/** @brief Canonically decompose @p [s,s+ns)
	862	* @param s Pointer to string
	863	* @param ns Length of string
	864	* @param ndp Where to store length of result
	865	* @return Pointer to result string, or NULL
	866	*
	867	* Computes the canonical decomposition of a string and stably sorts combining
	868	* characters into canonical order. The result is in Normalization Form D and
	869	* (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
	870	* NormalizationTest.txt.
	871	*
	872	* Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
	873	* this might be.
	874	*
	875	* See also utf32_decompose_canon().
	876	*/
	877	char utf8_decompose_canon(const char s, size_t ns, size_t *ndp) {
	878	utf8__transform(utf32_decompose_canon);
	879	}
	880
	881	/** @brief Compatibility decompose @p [s,s+ns)
	882	* @param s Pointer to string
	883	* @param ns Length of string
	884	* @param ndp Where to store length of result
	885	* @return Pointer to result string, or NULL
	886	*
	887	* Computes the compatibility decomposition of a string and stably sorts
	888	* combining characters into canonical order. The result is in Normalization
	889	* Form KD and (at the time of writing!) passes the NFKD tests defined in
	890	* Unicode 5.0's NormalizationTest.txt.
	891	*
	892	* Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
	893	* this might be.
	894	*
	895	* See also utf32_decompose_compat().
	896	*/
	897	char utf8_decompose_compat(const char s, size_t ns, size_t *ndp) {
	898	utf8__transform(utf32_decompose_compat);
	899	}
	900
	901	/** @brief Case-fold @p [s,s+ns)
	902	* @param s Pointer to string
	903	* @param ns Length of string
	904	* @param ndp Where to store length of result
	905	* @return Pointer to result string, or NULL
	906	*
	907	* Case-fold the string at @p s according to full default case-folding rules
	908	* (s3.13). The result will be in NFD.
	909	*
	910	* Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
	911	* this might be.
	912	*/
	913	char utf8_casefold_canon(const char s, size_t ns, size_t *ndp) {
	914	utf8__transform(utf32_casefold_canon);
	915	}
	916
	917	/** @brief Compatibility case-fold @p [s,s+ns)
	918	* @param s Pointer to string
	919	* @param ns Length of string
	920	* @param ndp Where to store length of result
	921	* @return Pointer to result string, or NULL
	922	*
	923	* Case-fold the string at @p s according to full default case-folding rules
	924	* (s3.13). The result will be in NFKD.
	925	*
	926	* Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
	927	* this might be.
	928	*/
	929	char utf8_casefold_compat(const char s, size_t ns, size_t *ndp) {
	930	utf8__transform(utf32_casefold_compat);
	931	}
	932
	933	/*@}*/
	934
	935	/*
	936	Local Variables:
	937	c-basic-offset:2
	938	comment-column:40
	939	fill-column:79
	940	indent-tabs-mode:nil
	941	End:
	942	*/