chiark - git - mdw - disorder/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* This file is part of DisOrder
	3	* Copyright (C) 2007 Richard Kettlewell
	4	*
	5	* This program is free software; you can redistribute it and/or modify
	6	* it under the terms of the GNU General Public License as published by
	7	* the Free Software Foundation; either version 2 of the License, or
	8	* (at your option) any later version.
	9	*
	10	* This program is distributed in the hope that it will be useful, but
	11	* WITHOUT ANY WARRANTY; without even the implied warranty of
	12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	* General Public License for more details.
	14	*
	15	* You should have received a copy of the GNU General Public License
	16	* along with this program; if not, write to the Free Software
	17	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
	18	* USA
	19	*/
	20	/** @file lib/unicode.c
	21	* @brief Unicode support functions
	22	*
	23	* Here by UTF-8 and UTF-32 we mean the encoding forms of those names (not the
	24	* encoding schemes). The primary encoding form is UTF-32 but convenience
	25	* wrappers using UTF-8 are provided for a number of functions.
	26	*
	27	* The idea is that all the strings that hit the database will be in a
	28	* particular normalization form, and for the search and tags database
	29	* in case-folded form, so they can be naively compared within the
	30	* database code.
	31	*
	32	* As the code stands this guarantee is not well met!
	33	*
	34	* Subpages:
	35	* - @ref utf32props
	36	* - @ref utftransform
	37	* - @ref utf32iterator
	38	* - @ref utf32
	39	* - @ref utf8
	40	*/
	41
	42	#include "common.h"
	43
	44	#include "mem.h"
	45	#include "vector.h"
	46	#include "unicode.h"
	47	#include "unidata.h"
	48
	49	/** @defgroup utf32props Unicode Code Point Properties */
	50	/*@{*/
	51
	52	static const struct unidata *utf32__unidata_hard(uint32_t c);
	53
	54	/** @brief Find definition of code point @p c
	55	* @param c Code point
	56	* @return Pointer to @ref unidata structure for @p c
	57	*
	58	* @p c can be any 32-bit value, a sensible value will be returned regardless.
	59	* The returned pointer is NOT guaranteed to be unique to @p c.
	60	*/
	61	static inline const struct unidata *utf32__unidata(uint32_t c) {
	62	/* The bottom half of the table contains almost everything of interest
	63	* and we can just return the right thing straight away */
	64	if(c < UNICODE_BREAK_START)
	65	return &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
	66	else
	67	return utf32__unidata_hard(c);
	68	}
	69
	70	/** @brief Find definition of code point @p c
	71	* @param c Code point
	72	* @return Pointer to @ref unidata structure for @p c
	73	*
	74	* @p c can be any 32-bit value, a sensible value will be returned regardless.
	75	* The returned pointer is NOT guaranteed to be unique to @p c.
	76	*
	77	* Don't use this function (although it will work fine) - use utf32__unidata()
	78	* instead.
	79	*/
	80	static const struct unidata *utf32__unidata_hard(uint32_t c) {
	81	if(c < UNICODE_BREAK_START)
	82	return &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
	83	/* Within the break everything is unassigned */
	84	if(c < UNICODE_BREAK_END)
	85	return utf32__unidata(0xFFFF); /* guaranteed to be Cn */
	86	/* Planes 15 and 16 are (mostly) private use */
	87	if((c >= 0xF0000 && c <= 0xFFFFD)
	88	\|\| (c >= 0x100000 && c <= 0x10FFFD))
	89	return utf32__unidata(0xE000); /* first Co code point */
	90	/* Everything else above the break top is unassigned */
	91	if(c >= UNICODE_BREAK_TOP)
	92	return utf32__unidata(0xFFFF); /* guaranteed to be Cn */
	93	/* Currently the rest is language tags and variation selectors */
	94	c -= (UNICODE_BREAK_END - UNICODE_BREAK_START);
	95	return &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
	96	}
	97
	98	/** @brief Return the combining class of @p c
	99	* @param c Code point
	100	* @return Combining class of @p c
	101	*
	102	* @p c can be any 32-bit value, a sensible value will be returned regardless.
	103	*/
	104	static inline int utf32__combining_class(uint32_t c) {
	105	return utf32__unidata(c)->ccc;
	106	}
	107
	108	/** @brief Return the combining class of @p c
	109	* @param c Code point
	110	* @return Combining class of @p c
	111	*
	112	* @p c can be any 32-bit value, a sensible value will be returned regardless.
	113	*/
	114	int utf32_combining_class(uint32_t c) {
	115	return utf32__combining_class(c);
	116	}
	117
	118	/** @brief Return the General_Category value for @p c
	119	* @param c Code point
	120	* @return General_Category property value
	121	*
	122	* @p c can be any 32-bit value, a sensible value will be returned regardless.
	123	*/
	124	static inline enum unicode_General_Category utf32__general_category(uint32_t c) {
	125	return utf32__unidata(c)->general_category;
	126	}
	127
	128	/** @brief Determine Grapheme_Break property
	129	* @param c Code point
	130	* @return Grapheme_Break property value of @p c
	131	*
	132	* @p c can be any 32-bit value, a sensible value will be returned regardless.
	133	*/
	134	static inline enum unicode_Grapheme_Break utf32__grapheme_break(uint32_t c) {
	135	return utf32__unidata(c)->grapheme_break;
	136	}
	137
	138	/** @brief Determine Word_Break property
	139	* @param c Code point
	140	* @return Word_Break property value of @p c
	141	*
	142	* @p c can be any 32-bit value, a sensible value will be returned regardless.
	143	*/
	144	static inline enum unicode_Word_Break utf32__word_break(uint32_t c) {
	145	return utf32__unidata(c)->word_break;
	146	}
	147
	148	/** @brief Determine Sentence_Break property
	149	* @param c Code point
	150	* @return Word_Break property value of @p c
	151	*
	152	* @p c can be any 32-bit value, a sensible value will be returned regardless.
	153	*/
	154	static inline enum unicode_Sentence_Break utf32__sentence_break(uint32_t c) {
	155	return utf32__unidata(c)->sentence_break;
	156	}
	157
	158	/** @brief Return true if @p c is ignorable for boundary specifications
	159	* @param wb Word break property value
	160	* @return non-0 if @p wb is unicode_Word_Break_Extend or unicode_Word_Break_Format
	161	*/
	162	static inline int utf32__boundary_ignorable(enum unicode_Word_Break wb) {
	163	return (wb == unicode_Word_Break_Extend
	164	\|\| wb == unicode_Word_Break_Format);
	165	}
	166
	167	/** @brief Return the canonical decomposition of @p c
	168	* @param c Code point
	169	* @return 0-terminated canonical decomposition, or 0
	170	*/
	171	static inline const uint32_t *utf32__decomposition_canon(uint32_t c) {
	172	const struct unidata *const data = utf32__unidata(c);
	173	const uint32_t *const decomp = data->decomp;
	174
	175	if(decomp && !(data->flags & unicode_compatibility_decomposition))
	176	return decomp;
	177	else
	178	return 0;
	179	}
	180
	181	/** @brief Return the compatibility decomposition of @p c
	182	* @param c Code point
	183	* @return 0-terminated decomposition, or 0
	184	*/
	185	static inline const uint32_t *utf32__decomposition_compat(uint32_t c) {
	186	return utf32__unidata(c)->decomp;
	187	}
	188
	189	/*@}*/
	190	/** @defgroup utftransform Functions that transform between different Unicode encoding forms */
	191	/*@{*/
	192
	193	/** @brief Convert UTF-32 to UTF-8
	194	* @param s Source string
	195	* @param ns Length of source string in code points
	196	* @param ndp Where to store length of destination string (or NULL)
	197	* @return Newly allocated destination string or NULL on error
	198	*
	199	* If the UTF-32 is not valid then NULL is returned. A UTF-32 code point is
	200	* invalid if:
	201	* - it codes for a UTF-16 surrogate
	202	* - it codes for a value outside the unicode code space
	203	*
	204	* The return value is always 0-terminated. The value returned via @p *ndp
	205	* does not include the terminator.
	206	*/
	207	char utf32_to_utf8(const uint32_t s, size_t ns, size_t *ndp) {
	208	struct dynstr d;
	209	uint32_t c;
	210
	211	dynstr_init(&d);
	212	while(ns > 0) {
	213	c = *s++;
	214	if(c < 0x80)
	215	dynstr_append(&d, c);
	216	else if(c < 0x0800) {
	217	dynstr_append(&d, 0xC0 \| (c >> 6));
	218	dynstr_append(&d, 0x80 \| (c & 0x3F));
	219	} else if(c < 0x10000) {
	220	if(c >= 0xD800 && c <= 0xDFFF)
	221	goto error;
	222	dynstr_append(&d, 0xE0 \| (c >> 12));
	223	dynstr_append(&d, 0x80 \| ((c >> 6) & 0x3F));
	224	dynstr_append(&d, 0x80 \| (c & 0x3F));
	225	} else if(c < 0x110000) {
	226	dynstr_append(&d, 0xF0 \| (c >> 18));
	227	dynstr_append(&d, 0x80 \| ((c >> 12) & 0x3F));
	228	dynstr_append(&d, 0x80 \| ((c >> 6) & 0x3F));
	229	dynstr_append(&d, 0x80 \| (c & 0x3F));
	230	} else
	231	goto error;
	232	--ns;
	233	}
	234	dynstr_terminate(&d);
	235	if(ndp)
	236	*ndp = d.nvec;
	237	return d.vec;
	238	error:
	239	xfree(d.vec);
	240	return 0;
	241	}
	242
	243	/** @brief Convert UTF-8 to UTF-32
	244	* @param s Source string
	245	* @param ns Length of source string in code points
	246	* @param ndp Where to store length of destination string (or NULL)
	247	* @return Newly allocated destination string or NULL on error
	248	*
	249	* The return value is always 0-terminated. The value returned via @p *ndp
	250	* does not include the terminator.
	251	*
	252	* If the UTF-8 is not valid then NULL is returned. A UTF-8 sequence
	253	* for a code point is invalid if:
	254	* - it is not the shortest possible sequence for the code point
	255	* - it codes for a UTF-16 surrogate
	256	* - it codes for a value outside the unicode code space
	257	*/
	258	uint32_t utf8_to_utf32(const char s, size_t ns, size_t *ndp) {
	259	struct dynstr_ucs4 d;
	260	uint32_t c32;
	261	const uint8_t ss = (const uint8_t )s;
	262	int n;
	263
	264	dynstr_ucs4_init(&d);
	265	while(ns > 0) {
	266	const struct unicode_utf8_row const r = &unicode_utf8_valid[ss];
	267	if(r->count <= ns) {
	268	switch(r->count) {
	269	case 1:
	270	c32 = *ss;
	271	break;
	272	case 2:
	273	if(ss[1] < r->min2 \|\| ss[1] > r->max2)
	274	goto error;
	275	c32 = *ss & 0x1F;
	276	break;
	277	case 3:
	278	if(ss[1] < r->min2 \|\| ss[1] > r->max2)
	279	goto error;
	280	c32 = *ss & 0x0F;
	281	break;
	282	case 4:
	283	if(ss[1] < r->min2 \|\| ss[1] > r->max2)
	284	goto error;
	285	c32 = *ss & 0x07;
	286	break;
	287	default:
	288	goto error;
	289	}
	290	} else
	291	goto error;
	292	for(n = 1; n < r->count; ++n) {
	293	if(ss[n] < 0x80 \|\| ss[n] > 0xBF)
	294	goto error;
	295	c32 = (c32 << 6) \| (ss[n] & 0x3F);
	296	}
	297	dynstr_ucs4_append(&d, c32);
	298	ss += r->count;
	299	ns -= r->count;
	300	}
	301	dynstr_ucs4_terminate(&d);
	302	if(ndp)
	303	*ndp = d.nvec;
	304	return d.vec;
	305	error:
	306	xfree(d.vec);
	307	return 0;
	308	}
	309
	310	/** @brief Test whether [s,s+ns) is valid UTF-8
	311	* @param s Start of string
	312	* @param ns Length of string
	313	* @return non-0 if @p s is valid UTF-8, 0 if it is not valid
	314	*
	315	* This function is intended to be much faster than calling utf8_to_utf32() and
	316	* throwing away the result.
	317	*/
	318	int utf8_valid(const char *s, size_t ns) {
	319	const uint8_t ss = (const uint8_t )s;
	320	while(ns > 0) {
	321	const struct unicode_utf8_row const r = &unicode_utf8_valid[ss];
	322	if(r->count <= ns) {
	323	switch(r->count) {
	324	case 1:
	325	break;
	326	case 2:
	327	if(ss[1] < r->min2 \|\| ss[1] > r->max2)
	328	return 0;
	329	break;
	330	case 3:
	331	if(ss[1] < r->min2 \|\| ss[1] > r->max2)
	332	return 0;
	333	if(ss[2] < 0x80 \|\| ss[2] > 0xBF)
	334	return 0;
	335	break;
	336	case 4:
	337	if(ss[1] < r->min2 \|\| ss[1] > r->max2)
	338	return 0;
	339	if(ss[2] < 0x80 \|\| ss[2] > 0xBF)
	340	return 0;
	341	if(ss[3] < 0x80 \|\| ss[3] > 0xBF)
	342	return 0;
	343	break;
	344	default:
	345	return 0;
	346	}
	347	} else
	348	return 0;
	349	ss += r->count;
	350	ns -= r->count;
	351	}
	352	return 1;
	353	}
	354
	355	/*@}*/
	356	/** @defgroup utf32iterator UTF-32 string iterators */
	357	/*@{*/
	358
	359	struct utf32_iterator_data {
	360	/** @brief Start of string */
	361	const uint32_t *s;
	362
	363	/** @brief Length of string */
	364	size_t ns;
	365
	366	/** @brief Current position */
	367	size_t n;
	368
	369	/** @brief Last two non-ignorable characters or (uint32_t)-1
	370	*
	371	* last[1] is the non-Extend/Format character just before position @p n;
	372	* last[0] is the one just before that.
	373	*
	374	* Exception 1: if there is no such non-Extend/Format character then an
	375	* Extend/Format character is accepted instead.
	376	*
	377	* Exception 2: if there is no such character even taking that into account
	378	* the value is (uint32_t)-1.
	379	*/
	380	uint32_t last[2];
	381
	382	/** @brief Tailoring for Word_Break */
	383	unicode_property_tailor *word_break;
	384	};
	385
	386	/** @brief Initialize an internal private iterator
	387	* @param it Iterator
	388	* @param s Start of string
	389	* @param ns Length of string
	390	* @param n Absolute position
	391	*/
	392	static void utf32__iterator_init(utf32_iterator it,
	393	const uint32_t *s, size_t ns, size_t n) {
	394	it->s = s;
	395	it->ns = ns;
	396	it->n = 0;
	397	it->last[0] = it->last[1] = -1;
	398	it->word_break = 0;
	399	utf32_iterator_set(it, n);
	400	}
	401
	402	/** @brief Create a new iterator pointing at the start of a string
	403	* @param s Start of string
	404	* @param ns Length of string
	405	* @return New iterator
	406	*/
	407	utf32_iterator utf32_iterator_new(const uint32_t *s, size_t ns) {
	408	utf32_iterator it = xmalloc(sizeof *it);
	409	utf32__iterator_init(it, s, ns, 0);
	410	return it;
	411	}
	412
	413	/** @brief Tailor this iterator's interpretation of the Word_Break property.
	414	* @param it Iterator
	415	* @param pt Property tailor function or NULL
	416	*
	417	* After calling this the iterator will call @p pt to determine the Word_Break
	418	* property of each code point. If it returns -1 the default value will be
	419	* used otherwise the returned value will be used.
	420	*
	421	* @p pt can be NULL to revert to the default value of the property.
	422	*
	423	* It is safe to call this function at any time; the iterator's internal state
	424	* will be reset to suit the new tailoring.
	425	*/
	426	void utf32_iterator_tailor_word_break(utf32_iterator it,
	427	unicode_property_tailor *pt) {
	428	it->word_break = pt;
	429	utf32_iterator_set(it, it->n);
	430	}
	431
	432	static inline enum unicode_Word_Break utf32__iterator_word_break(utf32_iterator it,
	433	uint32_t c) {
	434	if(!it->word_break)
	435	return utf32__word_break(c);
	436	else {
	437	const int t = it->word_break(c);
	438
	439	if(t < 0)
	440	return utf32__word_break(c);
	441	else
	442	return t;
	443	}
	444	}
	445
	446	/** @brief Destroy an iterator
	447	* @param it Iterator
	448	*/
	449	void utf32_iterator_destroy(utf32_iterator it) {
	450	xfree(it);
	451	}
	452
	453	/** @brief Find the current position of an interator
	454	* @param it Iterator
	455	*/
	456	size_t utf32_iterator_where(utf32_iterator it) {
	457	return it->n;
	458	}
	459
	460	/** @brief Set an iterator's absolute position
	461	* @param it Iterator
	462	* @param n Absolute position
	463	* @return 0 on success, non-0 on error
	464	*
	465	* It is an error to position the iterator outside the string (but acceptable
	466	* to point it at the hypothetical post-final character). If an invalid value
	467	* of @p n is specified then the iterator is not changed.
	468	*
	469	* This function works by backing up and then advancing to reconstruct the
	470	* iterator's internal state for position @p n. The worst case will be O(n)
	471	* time complexity (with a worse constant factor that utf32_iterator_advance())
	472	* but the typical case is essentially constant-time.
	473	*/
	474	int utf32_iterator_set(utf32_iterator it, size_t n) {
	475	/* We can't just jump to position @p n; the @p last[] values will be wrong.
	476	* What we need is to jump a bit behind @p n and then advance forward,
	477	* updating @p last[] along the way. How far back? We need to cross two
	478	* non-ignorable code points as we advance forwards, so we'd better pass two
	479	* such characters on the way back (if such are available).
	480	*/
	481	size_t m;
	482
	483	if(n > it->ns) /* range check */
	484	return -1;
	485	/* Walk backwards skipping ignorable code points */
	486	m = n;
	487	while(m > 0
	488	&& (utf32__boundary_ignorable(utf32__iterator_word_break(it,
	489	it->s[m-1]))))
	490	--m;
	491	/* Either m=0 or s[m-1] is not ignorable */
	492	if(m > 0) {
	493	--m;
	494	/* s[m] is our first non-ignorable code; look for a second in the same
	495	way **/
	496	while(m > 0
	497	&& (utf32__boundary_ignorable(utf32__iterator_word_break(it,
	498	it->s[m-1]))))
	499	--m;
	500	/* Either m=0 or s[m-1] is not ignorable */
	501	if(m > 0)
	502	--m;
	503	}
	504	it->last[0] = it->last[1] = -1;
	505	it->n = m;
	506	return utf32_iterator_advance(it, n - m);
	507	}
	508
	509	/** @brief Advance an iterator
	510	* @param it Iterator
	511	* @param count Number of code points to advance by
	512	* @return 0 on success, non-0 on error
	513	*
	514	* It is an error to advance an iterator beyond the hypothetical post-final
	515	* character of the string. If an invalid value of @p n is specified then the
	516	* iterator is not changed.
	517	*
	518	* This function has O(n) time complexity: it works by advancing naively
	519	* forwards through the string.
	520	*/
	521	int utf32_iterator_advance(utf32_iterator it, size_t count) {
	522	if(count <= it->ns - it->n) {
	523	while(count > 0) {
	524	const uint32_t c = it->s[it->n];
	525	const enum unicode_Word_Break wb = utf32__iterator_word_break(it, c);
	526	if(it->last[1] == (uint32_t)-1
	527	\|\| !utf32__boundary_ignorable(wb)) {
	528	it->last[0] = it->last[1];
	529	it->last[1] = c;
	530	}
	531	++it->n;
	532	--count;
	533	}
	534	return 0;
	535	} else
	536	return -1;
	537	}
	538
	539	/** @brief Find the current code point
	540	* @param it Iterator
	541	* @return Current code point or 0
	542	*
	543	* If the iterator points at the hypothetical post-final character of the
	544	* string then 0 is returned. NB that this doesn't mean that there aren't any
	545	* 0 code points inside the string!
	546	*/
	547	uint32_t utf32_iterator_code(utf32_iterator it) {
	548	if(it->n < it->ns)
	549	return it->s[it->n];
	550	else
	551	return 0;
	552	}
	553
	554	/** @brief Test for a grapheme boundary
	555	* @param it Iterator
	556	* @return Non-0 if pointing just after a grapheme boundary, otherwise 0
	557	*
	558	* This function identifies default grapheme cluster boundaries as described in
	559	* UAX #29 s3. It returns non-0 if @p it points at the code point just after a
	560	* grapheme cluster boundary (including the hypothetical code point just after
	561	* the end of the string).
	562	*/
	563	int utf32_iterator_grapheme_boundary(utf32_iterator it) {
	564	uint32_t before, after;
	565	enum unicode_Grapheme_Break gbbefore, gbafter;
	566	/* GB1 and GB2 */
	567	if(it->n == 0 \|\| it->n == it->ns)
	568	return 1;
	569	/* Now we know that s[n-1] and s[n] are safe to inspect */
	570	/* GB3 */
	571	before = it->s[it->n-1];
	572	after = it->s[it->n];
	573	if(before == 0x000D && after == 0x000A)
	574	return 0;
	575	gbbefore = utf32__grapheme_break(before);
	576	gbafter = utf32__grapheme_break(after);
	577	/* GB4 */
	578	if(gbbefore == unicode_Grapheme_Break_Control
	579	\|\| before == 0x000D
	580	\|\| before == 0x000A)
	581	return 1;
	582	/* GB5 */
	583	if(gbafter == unicode_Grapheme_Break_Control
	584	\|\| after == 0x000D
	585	\|\| after == 0x000A)
	586	return 1;
	587	/* GB6 */
	588	if(gbbefore == unicode_Grapheme_Break_L
	589	&& (gbafter == unicode_Grapheme_Break_L
	590	\|\| gbafter == unicode_Grapheme_Break_V
	591	\|\| gbafter == unicode_Grapheme_Break_LV
	592	\|\| gbafter == unicode_Grapheme_Break_LVT))
	593	return 0;
	594	/* GB7 */
	595	if((gbbefore == unicode_Grapheme_Break_LV
	596	\|\| gbbefore == unicode_Grapheme_Break_V)
	597	&& (gbafter == unicode_Grapheme_Break_V
	598	\|\| gbafter == unicode_Grapheme_Break_T))
	599	return 0;
	600	/* GB8 */
	601	if((gbbefore == unicode_Grapheme_Break_LVT
	602	\|\| gbbefore == unicode_Grapheme_Break_T)
	603	&& gbafter == unicode_Grapheme_Break_T)
	604	return 0;
	605	/* GB9 */
	606	if(gbafter == unicode_Grapheme_Break_Extend)
	607	return 0;
	608	/* GB10 */
	609	return 1;
	610
	611	}
	612
	613	/** @brief Test for a word boundary
	614	* @param it Iterator
	615	* @return Non-0 if pointing just after a word boundary, otherwise 0
	616	*
	617	* This function identifies default word boundaries as described in UAX #29 s4.
	618	* It returns non-0 if @p it points at the code point just after a word
	619	* boundary (including the hypothetical code point just after the end of the
	620	* string) and 0 otherwise.
	621	*/
	622	int utf32_iterator_word_boundary(utf32_iterator it) {
	623	enum unicode_Word_Break twobefore, before, after, twoafter;
	624	size_t nn;
	625
	626	/* WB1 and WB2 */
	627	if(it->n == 0 \|\| it->n == it->ns)
	628	return 1;
	629	/* WB3 */
	630	if(it->s[it->n-1] == 0x000D && it->s[it->n] == 0x000A)
	631	return 0;
	632	/* WB4 */
	633	/* (!Sep) x (Extend\|Format) as in UAX #29 s6.2 */
	634	if(utf32__sentence_break(it->s[it->n-1]) != unicode_Sentence_Break_Sep
	635	&& utf32__boundary_ignorable(utf32__iterator_word_break(it, it->s[it->n])))
	636	return 0;
	637	/* Gather the property values we'll need for the rest of the test taking the
	638	* s6.2 changes into account */
	639	/* First we look at the code points after the proposed boundary */
	640	nn = it->n; /* <it->ns */
	641	after = utf32__iterator_word_break(it, it->s[nn++]);
	642	if(!utf32__boundary_ignorable(after)) {
	643	/* X (Extend\|Format)* -> X */
	644	while(nn < it->ns
	645	&& utf32__boundary_ignorable(utf32__iterator_word_break(it,
	646	it->s[nn])))
	647	++nn;
	648	}
	649	/* It's possible now that nn=ns */
	650	if(nn < it->ns)
	651	twoafter = utf32__iterator_word_break(it, it->s[nn]);
	652	else
	653	twoafter = unicode_Word_Break_Other;
	654
	655	/* We've already recorded the non-ignorable code points before the proposed
	656	* boundary */
	657	before = utf32__iterator_word_break(it, it->last[1]);
	658	twobefore = utf32__iterator_word_break(it, it->last[0]);
	659
	660	/* WB5 */
	661	if(before == unicode_Word_Break_ALetter
	662	&& after == unicode_Word_Break_ALetter)
	663	return 0;
	664	/* WB6 */
	665	if(before == unicode_Word_Break_ALetter
	666	&& after == unicode_Word_Break_MidLetter
	667	&& twoafter == unicode_Word_Break_ALetter)
	668	return 0;
	669	/* WB7 */
	670	if(twobefore == unicode_Word_Break_ALetter
	671	&& before == unicode_Word_Break_MidLetter
	672	&& after == unicode_Word_Break_ALetter)
	673	return 0;
	674	/* WB8 */
	675	if(before == unicode_Word_Break_Numeric
	676	&& after == unicode_Word_Break_Numeric)
	677	return 0;
	678	/* WB9 */
	679	if(before == unicode_Word_Break_ALetter
	680	&& after == unicode_Word_Break_Numeric)
	681	return 0;
	682	/* WB10 */
	683	if(before == unicode_Word_Break_Numeric
	684	&& after == unicode_Word_Break_ALetter)
	685	return 0;
	686	/* WB11 */
	687	if(twobefore == unicode_Word_Break_Numeric
	688	&& before == unicode_Word_Break_MidNum
	689	&& after == unicode_Word_Break_Numeric)
	690	return 0;
	691	/* WB12 */
	692	if(before == unicode_Word_Break_Numeric
	693	&& after == unicode_Word_Break_MidNum
	694	&& twoafter == unicode_Word_Break_Numeric)
	695	return 0;
	696	/* WB13 */
	697	if(before == unicode_Word_Break_Katakana
	698	&& after == unicode_Word_Break_Katakana)
	699	return 0;
	700	/* WB13a */
	701	if((before == unicode_Word_Break_ALetter
	702	\|\| before == unicode_Word_Break_Numeric
	703	\|\| before == unicode_Word_Break_Katakana
	704	\|\| before == unicode_Word_Break_ExtendNumLet)
	705	&& after == unicode_Word_Break_ExtendNumLet)
	706	return 0;
	707	/* WB13b */
	708	if(before == unicode_Word_Break_ExtendNumLet
	709	&& (after == unicode_Word_Break_ALetter
	710	\|\| after == unicode_Word_Break_Numeric
	711	\|\| after == unicode_Word_Break_Katakana))
	712	return 0;
	713	/* WB14 */
	714	return 1;
	715	}
	716
	717	/*@}*/
	718	/** @defgroup utf32 Functions that operate on UTF-32 strings */
	719	/*@{*/
	720
	721	/** @brief Return the length of a 0-terminated UTF-32 string
	722	* @param s Pointer to 0-terminated string
	723	* @return Length of string in code points (excluding terminator)
	724	*
	725	* Unlike the conversion functions no validity checking is done on the string.
	726	*/
	727	size_t utf32_len(const uint32_t *s) {
	728	const uint32_t *t = s;
	729
	730	while(*t)
	731	++t;
	732	return (size_t)(t - s);
	733	}
	734
	735	/** @brief Stably sort [s,s+ns) into descending order of combining class
	736	* @param s Start of array
	737	* @param ns Number of elements, must be at least 1
	738	* @param buffer Buffer of at least @p ns elements
	739	*/
	740	static void utf32__sort_ccc(uint32_t s, size_t ns, uint32_t buffer) {
	741	uint32_t a, b, *bp;
	742	size_t na, nb;
	743
	744	switch(ns) {
	745	case 1: /* 1-element array is always sorted */
	746	return;
	747	case 2: /* 2-element arrays are trivial to sort */
	748	if(utf32__combining_class(s[0]) > utf32__combining_class(s[1])) {
	749	uint32_t tmp = s[0];
	750	s[0] = s[1];
	751	s[1] = tmp;
	752	}
	753	return;
	754	default:
	755	/* Partition the array */
	756	na = ns / 2;
	757	nb = ns - na;
	758	a = s;
	759	b = s + na;
	760	/* Sort the two halves of the array */
	761	utf32__sort_ccc(a, na, buffer);
	762	utf32__sort_ccc(b, nb, buffer);
	763	/* Merge them back into one, via the buffer */
	764	bp = buffer;
	765	while(na > 0 && nb > 0) {
	766	/* We want ascending order of combining class (hence <)
	767	* and we want stability within combining classes (hence <=)
	768	*/
	769	if(utf32__combining_class(a) <= utf32__combining_class(b)) {
	770	bp++ = a++;
	771	--na;
	772	} else {
	773	bp++ = b++;
	774	--nb;
	775	}
	776	}
	777	while(na > 0) {
	778	bp++ = a++;
	779	--na;
	780	}
	781	while(nb > 0) {
	782	bp++ = b++;
	783	--nb;
	784	}
	785	memcpy(s, buffer, ns * sizeof(uint32_t));
	786	return;
	787	}
	788	}
	789
	790	/** @brief Put combining characters into canonical order
	791	* @param s Pointer to UTF-32 string
	792	* @param ns Length of @p s
	793	* @return 0 on success, non-0 on error
	794	*
	795	* @p s is modified in-place. See Unicode 5.0 s3.11 for details of the
	796	* ordering.
	797	*
	798	* Currently we only support a maximum of 1024 combining characters after each
	799	* base character. If this limit is exceeded then a non-0 value is returned.
	800	*/
	801	static int utf32__canonical_ordering(uint32_t *s, size_t ns) {
	802	size_t nc;
	803	uint32_t buffer[1024];
	804
	805	/* The ordering amounts to a stable sort of each contiguous group of
	806	* characters with non-0 combining class. */
	807	while(ns > 0) {
	808	/* Skip non-combining characters */
	809	if(utf32__combining_class(*s) == 0) {
	810	++s;
	811	--ns;
	812	continue;
	813	}
	814	/* We must now have at least one combining character; see how many
	815	* there are */
	816	for(nc = 1; nc < ns && utf32__combining_class(s[nc]) != 0; ++nc)
	817	;
	818	if(nc > 1024)
	819	return -1;
	820	/* Sort the array */
	821	utf32__sort_ccc(s, nc, buffer);
	822	s += nc;
	823	ns -= nc;
	824	}
	825	return 0;
	826	}
	827
	828	/* Magic numbers from UAX #15 s16 */
	829	#define SBase 0xAC00
	830	#define LBase 0x1100
	831	#define VBase 0x1161
	832	#define TBase 0x11A7
	833	#define LCount 19
	834	#define VCount 21
	835	#define TCount 28
	836	#define NCount (VCount * TCount)
	837	#define SCount (LCount * NCount)
	838
	839	/** @brief Guts of the decomposition lookup functions */
	840	#define utf32__decompose_one_generic(WHICH) do { \
	841	const uint32_t *dc = utf32__decomposition_##WHICH(c); \
	842	if(dc) { \
	843	/* Found a canonical decomposition in the table */ \
	844	while(*dc) \
	845	utf32__decompose_one_##WHICH(d, *dc++); \
	846	} else if(c >= SBase && c < SBase + SCount) { \
	847	/* Mechanically decomposable Hangul syllable (UAX #15 s16) */ \
	848	const uint32_t SIndex = c - SBase; \
	849	const uint32_t L = LBase + SIndex / NCount; \
	850	const uint32_t V = VBase + (SIndex % NCount) / TCount; \
	851	const uint32_t T = TBase + SIndex % TCount; \
	852	dynstr_ucs4_append(d, L); \
	853	dynstr_ucs4_append(d, V); \
	854	if(T != TBase) \
	855	dynstr_ucs4_append(d, T); \
	856	} else \
	857	/* Equal to own canonical decomposition */ \
	858	dynstr_ucs4_append(d, c); \
	859	} while(0)
	860
	861	/** @brief Recursively compute the canonical decomposition of @p c
	862	* @param d Dynamic string to store decomposition in
	863	* @param c Code point to decompose (must be a valid!)
	864	* @return 0 on success, non-0 on error
	865	*/
	866	static void utf32__decompose_one_canon(struct dynstr_ucs4 *d, uint32_t c) {
	867	utf32__decompose_one_generic(canon);
	868	}
	869
	870	/** @brief Recursively compute the compatibility decomposition of @p c
	871	* @param d Dynamic string to store decomposition in
	872	* @param c Code point to decompose (must be a valid!)
	873	* @return 0 on success, non-0 on error
	874	*/
	875	static void utf32__decompose_one_compat(struct dynstr_ucs4 *d, uint32_t c) {
	876	utf32__decompose_one_generic(compat);
	877	}
	878
	879	/** @brief Magic utf32__compositions() return value for Hangul Choseong */
	880	static const uint32_t utf32__hangul_L[1];
	881
	882	/** @brief Return the list of compositions that @p c starts
	883	* @param c Starter code point
	884	* @return Composition list or NULL
	885	*
	886	* For Hangul leading (Choseong) jamo we return the special value
	887	* utf32__hangul_L. These code points are not listed as the targets of
	888	* canonical decompositions (make-unidata checks) so there is no confusion with
	889	* real decompositions here.
	890	*/
	891	static const uint32_t *utf32__compositions(uint32_t c) {
	892	const uint32_t *compositions = utf32__unidata(c)->composed;
	893
	894	if(compositions)
	895	return compositions;
	896	/* Special-casing for Hangul */
	897	switch(utf32__grapheme_break(c)) {
	898	default:
	899	return 0;
	900	case unicode_Grapheme_Break_L:
	901	return utf32__hangul_L;
	902	}
	903	}
	904
	905	/** @brief Composition step
	906	* @param s Start of string
	907	* @param ns Length of string
	908	* @return New length of string
	909	*
	910	* This is called from utf32__decompose_generic() to compose the result string
	911	* in place.
	912	*/
	913	static size_t utf32__compose(uint32_t *s, size_t ns) {
	914	const uint32_t *compositions;
	915	uint32_t start = s, t = s, *tt, cc;
	916
	917	while(ns > 0) {
	918	uint32_t starter = *s++;
	919	int block_starters = 0;
	920	--ns;
	921	/* We don't attempt to compose the following things:
	922	* - final characters whatever kind they are
	923	* - non-starter characters
	924	* - starters that don't take part in a canonical decomposition mapping
	925	*/
	926	if(ns == 0
	927	\|\| utf32__combining_class(starter)
	928	\|\| !(compositions = utf32__compositions(starter))) {
	929	*t++ = starter;
	930	continue;
	931	}
	932	if(compositions != utf32__hangul_L) {
	933	/* Where we'll put the eventual starter */
	934	tt = t++;
	935	do {
	936	/* See if we can find composition of starter+s /
	937	const uint32_t cchar = s, cp = compositions;
	938	while((cc = *cp++)) {
	939	const uint32_t *decomp = utf32__decomposition_canon(cc);
	940	/* We know decomp[0] == starter */
	941	if(decomp[1] == cchar)
	942	break;
	943	}
	944	if(cc) {
	945	/* Found a composition: cc decomposes to starter,s /
	946	starter = cc;
	947	compositions = utf32__compositions(starter);
	948	++s;
	949	--ns;
	950	} else {
	951	/* No composition found. */
	952	const int class = utf32__combining_class(*s);
	953	if(class) {
	954	/* Transfer the uncomposable combining character to the output */
	955	t++ = s++;
	956	--ns;
	957	/* All the combining characters of the same class of the
	958	* uncomposable character are blocked by it, but there may be
	959	* others of higher class later. We eat the uncomposable and
	960	* blocked characters and go back round the loop for that higher
	961	* class. */
	962	while(ns > 0 && utf32__combining_class(*s) == class) {
	963	t++ = s++;
	964	--ns;
	965	}
	966	/* Block any subsequent starters */
	967	block_starters = 1;
	968	} else {
	969	/* The uncombinable character is itself a starter, so we don't
	970	* transfer it to the output but instead go back round the main
	971	* loop. */
	972	break;
	973	}
	974	}
	975	/* Keep going while there are still characters and the starter takes
	976	* part in some composition */
	977	} while(ns > 0 && compositions
	978	&& (!block_starters \|\| utf32__combining_class(*s)));
	979	/* Store any remaining combining characters */
	980	while(ns > 0 && utf32__combining_class(*s)) {
	981	t++ = s++;
	982	--ns;
	983	}
	984	/* Store the resulting starter */
	985	*tt = starter;
	986	} else {
	987	/* Special-casing for Hangul
	988	*
	989	* If there are combining characters between the L and the V then they
	990	* will block the V and so no composition happens. Similarly combining
	991	* characters between V and T will block the T and so we only get as far
	992	* as LV.
	993	*/
	994	if(utf32__grapheme_break(*s) == unicode_Grapheme_Break_V) {
	995	const uint32_t V = *s++;
	996	const uint32_t LIndex = starter - LBase;
	997	const uint32_t VIndex = V - VBase;
	998	uint32_t TIndex;
	999	--ns;
	1000	if(ns > 0
	1001	&& utf32__grapheme_break(*s) == unicode_Grapheme_Break_T) {
	1002	/* We have an L V T sequence */
	1003	const uint32_t T = *s++;
	1004	TIndex = T - TBase;
	1005	--ns;
	1006	} else
	1007	/* It's just L V */
	1008	TIndex = 0;
	1009	/* Compose to LVT or LV as appropriate */
	1010	starter = (LIndex * VCount + VIndex) * TCount + TIndex + SBase;
	1011	} /* else we only have L or LV and no V or T */
	1012	*t++ = starter;
	1013	/* There could be some combining characters that belong to the V or T.
	1014	* These will be treated as non-starter characters at the top of the loop
	1015	* and thuss transferred to the output. */
	1016	}
	1017	}
	1018	return t - start;
	1019	}
	1020
	1021	/** @brief Guts of the composition and decomposition functions
	1022	* @param WHICH @c canon or @c compat to choose decomposition
	1023	* @param COMPOSE @c 0 or @c 1 to compose
	1024	*/
	1025	#define utf32__decompose_generic(WHICH, COMPOSE) do { \
	1026	struct dynstr_ucs4 d; \
	1027	uint32_t c; \
	1028	\
	1029	dynstr_ucs4_init(&d); \
	1030	while(ns) { \
	1031	c = *s++; \
	1032	if((c >= 0xD800 && c <= 0xDFFF) \|\| c > 0x10FFFF) \
	1033	goto error; \
	1034	utf32__decompose_one_##WHICH(&d, c); \
	1035	--ns; \
	1036	} \
	1037	if(utf32__canonical_ordering(d.vec, d.nvec)) \
	1038	goto error; \
	1039	if(COMPOSE) \
	1040	d.nvec = utf32__compose(d.vec, d.nvec); \
	1041	dynstr_ucs4_terminate(&d); \
	1042	if(ndp) \
	1043	*ndp = d.nvec; \
	1044	return d.vec; \
	1045	error: \
	1046	xfree(d.vec); \
	1047	return 0; \
	1048	} while(0)
	1049
	1050	/** @brief Canonically decompose @p [s,s+ns)
	1051	* @param s Pointer to string
	1052	* @param ns Length of string
	1053	* @param ndp Where to store length of result
	1054	* @return Pointer to result string, or NULL on error
	1055	*
	1056	* Computes NFD (Normalization Form D) of the string at @p s. This implies
	1057	* performing all canonical decompositions and then normalizing the order of
	1058	* combining characters.
	1059	*
	1060	* Returns NULL if the string is not valid for either of the following reasons:
	1061	* - it codes for a UTF-16 surrogate
	1062	* - it codes for a value outside the unicode code space
	1063	*
	1064	* See also:
	1065	* - utf32_decompose_compat()
	1066	* - utf32_compose_canon()
	1067	*/
	1068	uint32_t utf32_decompose_canon(const uint32_t s, size_t ns, size_t *ndp) {
	1069	utf32__decompose_generic(canon, 0);
	1070	}
	1071
	1072	/** @brief Compatibility decompose @p [s,s+ns)
	1073	* @param s Pointer to string
	1074	* @param ns Length of string
	1075	* @param ndp Where to store length of result
	1076	* @return Pointer to result string, or NULL on error
	1077	*
	1078	* Computes NFKD (Normalization Form KD) of the string at @p s. This implies
	1079	* performing all canonical and compatibility decompositions and then
	1080	* normalizing the order of combining characters.
	1081	*
	1082	* Returns NULL if the string is not valid for either of the following reasons:
	1083	* - it codes for a UTF-16 surrogate
	1084	* - it codes for a value outside the unicode code space
	1085	*
	1086	* See also:
	1087	* - utf32_decompose_canon()
	1088	* - utf32_compose_compat()
	1089	*/
	1090	uint32_t utf32_decompose_compat(const uint32_t s, size_t ns, size_t *ndp) {
	1091	utf32__decompose_generic(compat, 0);
	1092	}
	1093
	1094	/** @brief Canonically compose @p [s,s+ns)
	1095	* @param s Pointer to string
	1096	* @param ns Length of string
	1097	* @param ndp Where to store length of result
	1098	* @return Pointer to result string, or NULL on error
	1099	*
	1100	* Computes NFC (Normalization Form C) of the string at @p s. This implies
	1101	* performing all canonical decompositions, normalizing the order of combining
	1102	* characters and then composing all unblocked primary compositables.
	1103	*
	1104	* Returns NULL if the string is not valid for either of the following reasons:
	1105	* - it codes for a UTF-16 surrogate
	1106	* - it codes for a value outside the unicode code space
	1107	*
	1108	* See also:
	1109	* - utf32_compose_compat()
	1110	* - utf32_decompose_canon()
	1111	*/
	1112	uint32_t utf32_compose_canon(const uint32_t s, size_t ns, size_t *ndp) {
	1113	utf32__decompose_generic(canon, 1);
	1114	}
	1115
	1116	/** @brief Compatibility compose @p [s,s+ns)
	1117	* @param s Pointer to string
	1118	* @param ns Length of string
	1119	* @param ndp Where to store length of result
	1120	* @return Pointer to result string, or NULL on error
	1121	*
	1122	* Computes NFKC (Normalization Form KC) of the string at @p s. This implies
	1123	* performing all canonical and compatibility decompositions, normalizing the
	1124	* order of combining characters and then composing all unblocked primary
	1125	* compositables.
	1126	*
	1127	* Returns NULL if the string is not valid for either of the following reasons:
	1128	* - it codes for a UTF-16 surrogate
	1129	* - it codes for a value outside the unicode code space
	1130	*
	1131	* See also:
	1132	* - utf32_compose_canon()
	1133	* - utf32_decompose_compat()
	1134	*/
	1135	uint32_t utf32_compose_compat(const uint32_t s, size_t ns, size_t *ndp) {
	1136	utf32__decompose_generic(compat, 1);
	1137	}
	1138
	1139	/** @brief Single-character case-fold and decompose operation */
	1140	#define utf32__casefold_one(WHICH) do { \
	1141	const uint32_t *cf = utf32__unidata(c)->casefold; \
	1142	if(cf) { \
	1143	/* Found a case-fold mapping in the table */ \
	1144	while(*cf) \
	1145	utf32__decompose_one_##WHICH(&d, *cf++); \
	1146	} else \
	1147	utf32__decompose_one_##WHICH(&d, c); \
	1148	} while(0)
	1149
	1150	/** @brief Case-fold @p [s,s+ns)
	1151	* @param s Pointer to string
	1152	* @param ns Length of string
	1153	* @param ndp Where to store length of result
	1154	* @return Pointer to result string, or NULL on error
	1155	*
	1156	* Case-fold the string at @p s according to full default case-folding rules
	1157	* (s3.13) for caseless matching. The result will be in NFD.
	1158	*
	1159	* Returns NULL if the string is not valid for either of the following reasons:
	1160	* - it codes for a UTF-16 surrogate
	1161	* - it codes for a value outside the unicode code space
	1162	*/
	1163	uint32_t utf32_casefold_canon(const uint32_t s, size_t ns, size_t *ndp) {
	1164	struct dynstr_ucs4 d;
	1165	uint32_t c;
	1166	size_t n;
	1167	uint32_t *ss = 0;
	1168
	1169	/* If the canonical decomposition of the string includes any combining
	1170	* character that case-folds to a non-combining character then we must
	1171	* normalize before we fold. In Unicode 5.0.0 this means 0345 COMBINING
	1172	* GREEK YPOGEGRAMMENI in its decomposition and the various characters that
	1173	* canonically decompose to it. */
	1174	for(n = 0; n < ns; ++n)
	1175	if(utf32__unidata(s[n])->flags & unicode_normalize_before_casefold)
	1176	break;
	1177	if(n < ns) {
	1178	/* We need a preliminary decomposition */
	1179	if(!(ss = utf32_decompose_canon(s, ns, &ns)))
	1180	return 0;
	1181	s = ss;
	1182	}
	1183	dynstr_ucs4_init(&d);
	1184	while(ns) {
	1185	c = *s++;
	1186	if((c >= 0xD800 && c <= 0xDFFF) \|\| c > 0x10FFFF)
	1187	goto error;
	1188	utf32__casefold_one(canon);
	1189	--ns;
	1190	}
	1191	if(utf32__canonical_ordering(d.vec, d.nvec))
	1192	goto error;
	1193	dynstr_ucs4_terminate(&d);
	1194	if(ndp)
	1195	*ndp = d.nvec;
	1196	return d.vec;
	1197	error:
	1198	xfree(d.vec);
	1199	xfree(ss);
	1200	return 0;
	1201	}
	1202
	1203	/** @brief Compatibility case-fold @p [s,s+ns)
	1204	* @param s Pointer to string
	1205	* @param ns Length of string
	1206	* @param ndp Where to store length of result
	1207	* @return Pointer to result string, or NULL on error
	1208	*
	1209	* Case-fold the string at @p s according to full default case-folding rules
	1210	* (s3.13) for compatibility caseless matching. The result will be in NFKD.
	1211	*
	1212	* Returns NULL if the string is not valid for either of the following reasons:
	1213	* - it codes for a UTF-16 surrogate
	1214	* - it codes for a value outside the unicode code space
	1215	*/
	1216	uint32_t utf32_casefold_compat(const uint32_t s, size_t ns, size_t *ndp) {
	1217	struct dynstr_ucs4 d;
	1218	uint32_t c;
	1219	size_t n;
	1220	uint32_t *ss = 0;
	1221
	1222	for(n = 0; n < ns; ++n)
	1223	if(utf32__unidata(s[n])->flags & unicode_normalize_before_casefold)
	1224	break;
	1225	if(n < ns) {
	1226	/* We need a preliminary _canonical_ decomposition */
	1227	if(!(ss = utf32_decompose_canon(s, ns, &ns)))
	1228	return 0;
	1229	s = ss;
	1230	}
	1231	/* This computes NFKD(toCaseFold(s)) */
	1232	#define compat_casefold_middle() do { \
	1233	dynstr_ucs4_init(&d); \
	1234	while(ns) { \
	1235	c = *s++; \
	1236	if((c >= 0xD800 && c <= 0xDFFF) \|\| c > 0x10FFFF) \
	1237	goto error; \
	1238	utf32__casefold_one(compat); \
	1239	--ns; \
	1240	} \
	1241	if(utf32__canonical_ordering(d.vec, d.nvec)) \
	1242	goto error; \
	1243	} while(0)
	1244	/* Do the inner (NFKD o toCaseFold) */
	1245	compat_casefold_middle();
	1246	/* We can do away with the NFD'd copy of the input now */
	1247	xfree(ss);
	1248	s = ss = d.vec;
	1249	ns = d.nvec;
	1250	/* Do the outer (NFKD o toCaseFold) */
	1251	compat_casefold_middle();
	1252	/* That's all */
	1253	dynstr_ucs4_terminate(&d);
	1254	if(ndp)
	1255	*ndp = d.nvec;
	1256	return d.vec;
	1257	error:
	1258	xfree(d.vec);
	1259	xfree(ss);
	1260	return 0;
	1261	}
	1262
	1263	/** @brief Order a pair of UTF-32 strings
	1264	* @param a First 0-terminated string
	1265	* @param b Second 0-terminated string
	1266	* @return -1, 0 or 1 for a less than, equal to or greater than b
	1267	*
	1268	* "Comparable to strcmp() at its best."
	1269	*/
	1270	int utf32_cmp(const uint32_t a, const uint32_t b) {
	1271	while(a && b && a == b) {
	1272	++a;
	1273	++b;
	1274	}
	1275	return a < b ? -1 : (a > b ? 1 : 0);
	1276	}
	1277
	1278	/** @brief Identify a grapheme cluster boundary
	1279	* @param s Start of string (must be NFD)
	1280	* @param ns Length of string
	1281	* @param n Index within string (in [0,ns].)
	1282	* @return 1 at a grapheme cluster boundary, 0 otherwise
	1283	*
	1284	* This function identifies default grapheme cluster boundaries as described in
	1285	* UAX #29 s3. It returns non-0 if @p n points at the code point just after a
	1286	* grapheme cluster boundary (including the hypothetical code point just after
	1287	* the end of the string).
	1288	*
	1289	* This function uses utf32_iterator_set() internally; see that function for
	1290	* remarks on performance.
	1291	*/
	1292	int utf32_is_grapheme_boundary(const uint32_t *s, size_t ns, size_t n) {
	1293	struct utf32_iterator_data it[1];
	1294
	1295	utf32__iterator_init(it, s, ns, n);
	1296	return utf32_iterator_grapheme_boundary(it);
	1297	}
	1298
	1299	/** @brief Identify a word boundary
	1300	* @param s Start of string (must be NFD)
	1301	* @param ns Length of string
	1302	* @param n Index within string (in [0,ns].)
	1303	* @return 1 at a word boundary, 0 otherwise
	1304	*
	1305	* This function identifies default word boundaries as described in UAX #29 s4.
	1306	* It returns non-0 if @p n points at the code point just after a word boundary
	1307	* (including the hypothetical code point just after the end of the string).
	1308	*
	1309	* This function uses utf32_iterator_set() internally; see that function for
	1310	* remarks on performance.
	1311	*/
	1312	int utf32_is_word_boundary(const uint32_t *s, size_t ns, size_t n) {
	1313	struct utf32_iterator_data it[1];
	1314
	1315	utf32__iterator_init(it, s, ns, n);
	1316	return utf32_iterator_word_boundary(it);
	1317	}
	1318
	1319	/** @brief Split [s,ns) into multiple words
	1320	* @param s Pointer to start of string
	1321	* @param ns Length of string
	1322	* @param nwp Where to store word count, or NULL
	1323	* @param wbreak Word_Break property tailor, or NULL
	1324	* @return Pointer to array of pointers to words
	1325	*
	1326	* The returned array is terminated by a NULL pointer and individual
	1327	* strings are 0-terminated.
	1328	*/
	1329	uint32_t *utf32_word_split(const uint32_t s, size_t ns, size_t *nwp,
	1330	unicode_property_tailor *wbreak) {
	1331	struct utf32_iterator_data it[1];
	1332	size_t b1 = 0, b2 = 0 ,i;
	1333	int isword;
	1334	struct vector32 v32[1];
	1335	uint32_t *w;
	1336
	1337	vector32_init(v32);
	1338	utf32__iterator_init(it, s, ns, 0);
	1339	it->word_break = wbreak;
	1340	/* Work our way through the string stopping at each word break. */
	1341	do {
	1342	if(utf32_iterator_word_boundary(it)) {
	1343	/* We've found a new boundary */
	1344	b1 = b2;
	1345	b2 = it->n;
	1346	/fprintf(stderr, "[%zu, %zu) is a candidate word\n", b1, b2);/
	1347	/* Inspect the characters between the boundary and form an opinion as to
	1348	* whether they are a word or not */
	1349	isword = 0;
	1350	for(i = b1; i < b2; ++i) {
	1351	switch(utf32__iterator_word_break(it, it->s[i])) {
	1352	case unicode_Word_Break_ALetter:
	1353	case unicode_Word_Break_Numeric:
	1354	case unicode_Word_Break_Katakana:
	1355	isword = 1;
	1356	break;
	1357	default:
	1358	break;
	1359	}
	1360	}
	1361	/* If it's a word add it to the list of results */
	1362	if(isword) {
	1363	w = xcalloc(b2 - b1 + 1, sizeof(uint32_t));
	1364	memcpy(w, it->s + b1, (b2 - b1) * sizeof (uint32_t));
	1365	vector32_append(v32, w);
	1366	}
	1367	}
	1368	} while(!utf32_iterator_advance(it, 1));
	1369	vector32_terminate(v32);
	1370	if(nwp)
	1371	*nwp = v32->nvec;
	1372	return v32->vec;
	1373	}
	1374
/*@}*/
/** @defgroup utf8 Functions that operate on UTF-8 strings */
/*@{*/
	1378
	1379	/** @brief Wrapper to transform a UTF-8 string using the UTF-32 function */
	1380	#define utf8__transform(FN) do { \
	1381	uint32_t to32 = 0, decomp32 = 0; \
	1382	size_t nto32, ndecomp32; \
	1383	char *decomp8 = 0; \
	1384	\
	1385	if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error; \
	1386	if(!(decomp32 = FN(to32, nto32, &ndecomp32))) goto error; \
	1387	decomp8 = utf32_to_utf8(decomp32, ndecomp32, ndp); \
	1388	error: \
	1389	xfree(to32); \
	1390	xfree(decomp32); \
	1391	return decomp8; \
	1392	} while(0)
	1393
	1394	/** @brief Canonically decompose @p [s,s+ns)
	1395	* @param s Pointer to string
	1396	* @param ns Length of string
	1397	* @param ndp Where to store length of result
	1398	* @return Pointer to result string, or NULL on error
	1399	*
	1400	* Computes NFD (Normalization Form D) of the string at @p s. This implies
	1401	* performing all canonical decompositions and then normalizing the order of
	1402	* combining characters.
	1403	*
	1404	* Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
	1405	* this might be.
	1406	*
	1407	* See also:
	1408	* - utf32_decompose_canon().
	1409	* - utf8_decompose_compat()
	1410	* - utf8_compose_canon()
	1411	*/
	1412	char utf8_decompose_canon(const char s, size_t ns, size_t *ndp) {
	1413	utf8__transform(utf32_decompose_canon);
	1414	}
	1415
	1416	/** @brief Compatibility decompose @p [s,s+ns)
	1417	* @param s Pointer to string
	1418	* @param ns Length of string
	1419	* @param ndp Where to store length of result
	1420	* @return Pointer to result string, or NULL on error
	1421	*
	1422	* Computes NFKD (Normalization Form KD) of the string at @p s. This implies
	1423	* performing all canonical and compatibility decompositions and then
	1424	* normalizing the order of combining characters.
	1425	*
	1426	* Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
	1427	* this might be.
	1428	*
	1429	* See also:
	1430	* - utf32_decompose_compat().
	1431	* - utf8_decompose_canon()
	1432	* - utf8_compose_compat()
	1433	*/
	1434	char utf8_decompose_compat(const char s, size_t ns, size_t *ndp) {
	1435	utf8__transform(utf32_decompose_compat);
	1436	}
	1437
	1438	/** @brief Canonically compose @p [s,s+ns)
	1439	* @param s Pointer to string
	1440	* @param ns Length of string
	1441	* @param ndp Where to store length of result
	1442	* @return Pointer to result string, or NULL on error
	1443	*
	1444	* Computes NFC (Normalization Form C) of the string at @p s. This implies
	1445	* performing all canonical decompositions, normalizing the order of combining
	1446	* characters and then composing all unblocked primary compositables.
	1447	*
	1448	* Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
	1449	* this might be.
	1450	*
	1451	* See also:
	1452	* - utf32_compose_canon()
	1453	* - utf8_compose_compat()
	1454	* - utf8_decompose_canon()
	1455	*/
	1456	char utf8_compose_canon(const char s, size_t ns, size_t *ndp) {
	1457	utf8__transform(utf32_compose_canon);
	1458	}
	1459
	1460	/** @brief Compatibility compose @p [s,s+ns)
	1461	* @param s Pointer to string
	1462	* @param ns Length of string
	1463	* @param ndp Where to store length of result
	1464	* @return Pointer to result string, or NULL on error
	1465	*
	1466	* Computes NFKC (Normalization Form KC) of the string at @p s. This implies
	1467	* performing all canonical and compatibility decompositions, normalizing the
	1468	* order of combining characters and then composing all unblocked primary
	1469	* compositables.
	1470	*
	1471	* Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
	1472	* this might be.
	1473	*
	1474	* See also:
	1475	* - utf32_compose_compat()
	1476	* - utf8_compose_canon()
	1477	* - utf8_decompose_compat()
	1478	*/
	1479	char utf8_compose_compat(const char s, size_t ns, size_t *ndp) {
	1480	utf8__transform(utf32_compose_compat);
	1481	}
	1482
	1483	/** @brief Case-fold @p [s,s+ns)
	1484	* @param s Pointer to string
	1485	* @param ns Length of string
	1486	* @param ndp Where to store length of result
	1487	* @return Pointer to result string, or NULL on error
	1488	*
	1489	* Case-fold the string at @p s according to full default case-folding rules
	1490	* (s3.13). The result will be in NFD.
	1491	*
	1492	* Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
	1493	* this might be.
	1494	*/
	1495	char utf8_casefold_canon(const char s, size_t ns, size_t *ndp) {
	1496	utf8__transform(utf32_casefold_canon);
	1497	}
	1498
	1499	/** @brief Compatibility case-fold @p [s,s+ns)
	1500	* @param s Pointer to string
	1501	* @param ns Length of string
	1502	* @param ndp Where to store length of result
	1503	* @return Pointer to result string, or NULL on error
	1504	*
	1505	* Case-fold the string at @p s according to full default case-folding rules
	1506	* (s3.13). The result will be in NFKD.
	1507	*
	1508	* Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
	1509	* this might be.
	1510	*/
	1511	char utf8_casefold_compat(const char s, size_t ns, size_t *ndp) {
	1512	utf8__transform(utf32_casefold_compat);
	1513	}
	1514
	1515	/** @brief Split [s,ns) into multiple words
	1516	* @param s Pointer to start of string
	1517	* @param ns Length of string
	1518	* @param nwp Where to store word count, or NULL
	1519	* @param wbreak Word_Break property tailor, or NULL
	1520	* @return Pointer to array of pointers to words
	1521	*
	1522	* The returned array is terminated by a NULL pointer and individual
	1523	* strings are 0-terminated.
	1524	*/
	1525	char *utf8_word_split(const char s, size_t ns, size_t *nwp,
	1526	unicode_property_tailor *wbreak) {
	1527	uint32_t to32 = 0, *v32 = 0;
	1528	size_t nto32, nv, n;
	1529	char v8 = 0, ret = 0;
	1530
	1531	if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error;
	1532	if(!(v32 = utf32_word_split(to32, nto32, &nv, wbreak))) goto error;
	1533	v8 = xcalloc(sizeof (char *), nv + 1);
	1534	for(n = 0; n < nv; ++n)
	1535	if(!(v8[n] = utf32_to_utf8(v32[n], utf32_len(v32[n]), 0)))
	1536	goto error;
	1537	ret = v8;
	1538	*nwp = nv;
	1539	v8 = 0; /* don't free */
	1540	error:
	1541	if(v8) {
	1542	for(n = 0; n < nv; ++n)
	1543	xfree(v8[n]);
	1544	xfree(v8);
	1545	}
	1546	if(v32) {
	1547	for(n = 0; n < nv; ++n)
	1548	xfree(v32[n]);
	1549	xfree(v32);
	1550	}
	1551	xfree(to32);
	1552	return ret;
	1553	}
	1554
	1555
/*@}*/
	1557
	1558	/*
	1559	Local Variables:
	1560	c-basic-offset:2
	1561	comment-column:40
	1562	fill-column:79
	1563	indent-tabs-mode:nil
	1564	End:
	1565	*/