chiark - git - mdw - disorder/blame

Commit	Line	Data
e5a5a138 RK	1	/*
e5a5a138 RK	2	* This file is part of DisOrder
bb5c7798	3	* Copyright (C) 2007, 2009, 2013 Richard Kettlewell
e5a5a138	4	*
e7eb3a27	5	* This program is free software: you can redistribute it and/or modify
e5a5a138	6	* it under the terms of the GNU General Public License as published by
e7eb3a27	7	* the Free Software Foundation, either version 3 of the License, or
e5a5a138	8	* (at your option) any later version.
e7eb3a27 RK	9	*
	10	* This program is distributed in the hope that it will be useful,
	11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	13	* GNU General Public License for more details.
	14	*
e5a5a138	15	* You should have received a copy of the GNU General Public License
e7eb3a27	16	* along with this program. If not, see <http://www.gnu.org/licenses/>.
e5a5a138 RK	17	*/
	18	/** @file lib/unicode.c
	19	* @brief Unicode support functions
	20	*
	21	* Here by UTF-8 and UTF-32 we mean the encoding forms of those names (not the
35b651f0 RK	22	* encoding schemes). The primary encoding form is UTF-32 but convenience
35b651f0 RK	23	* wrappers using UTF-8 are provided for a number of functions.
e5a5a138 RK	24	*
	25	* The idea is that all the strings that hit the database will be in a
	26	* particular normalization form, and for the search and tags database
	27	* in case-folded form, so they can be naively compared within the
	28	* database code.
	29	*
	30	* As the code stands this guarantee is not well met!
0ae60b83 RK	31	*
	32	* Subpages:
	33	* - @ref utf32props
	34	* - @ref utftransform
	35	* - @ref utf32iterator
	36	* - @ref utf32
	37	* - @ref utf8
e5a5a138 RK	38	*/
e5a5a138 RK	39
05b75f8d	40	#include "common.h"
e5a5a138 RK	41
	42	#include "mem.h"
	43	#include "vector.h"
	44	#include "unicode.h"
	45	#include "unidata.h"
	46
092f426f RK	47	/** @defgroup utf32props Unicode Code Point Properties */
	48	/*@{*/
	49
	50	static const struct unidata *utf32__unidata_hard(uint32_t c);
	51
	52	/** @brief Find definition of code point @p c
	53	* @param c Code point
	54	* @return Pointer to @ref unidata structure for @p c
	55	*
	56	* @p c can be any 32-bit value, a sensible value will be returned regardless.
	57	* The returned pointer is NOT guaranteed to be unique to @p c.
	58	*/
	59	static inline const struct unidata *utf32__unidata(uint32_t c) {
	60	/* The bottom half of the table contains almost everything of interest
	61	* and we can just return the right thing straight away */
	62	if(c < UNICODE_BREAK_START)
	63	return &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
	64	else
	65	return utf32__unidata_hard(c);
	66	}
	67
	68	/** @brief Find definition of code point @p c
	69	* @param c Code point
	70	* @return Pointer to @ref unidata structure for @p c
	71	*
	72	* @p c can be any 32-bit value, a sensible value will be returned regardless.
	73	* The returned pointer is NOT guaranteed to be unique to @p c.
	74	*
	75	* Don't use this function (although it will work fine) - use utf32__unidata()
	76	* instead.
	77	*/
	78	static const struct unidata *utf32__unidata_hard(uint32_t c) {
	79	if(c < UNICODE_BREAK_START)
	80	return &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
	81	/* Within the break everything is unassigned */
	82	if(c < UNICODE_BREAK_END)
	83	return utf32__unidata(0xFFFF); /* guaranteed to be Cn */
	84	/* Planes 15 and 16 are (mostly) private use */
	85	if((c >= 0xF0000 && c <= 0xFFFFD)
	86	\|\| (c >= 0x100000 && c <= 0x10FFFD))
	87	return utf32__unidata(0xE000); /* first Co code point */
	88	/* Everything else above the break top is unassigned */
	89	if(c >= UNICODE_BREAK_TOP)
	90	return utf32__unidata(0xFFFF); /* guaranteed to be Cn */
	91	/* Currently the rest is language tags and variation selectors */
	92	c -= (UNICODE_BREAK_END - UNICODE_BREAK_START);
	93	return &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
	94	}
	95
	96	/** @brief Return the combining class of @p c
	97	* @param c Code point
	98	* @return Combining class of @p c
	99	*
	100	* @p c can be any 32-bit value, a sensible value will be returned regardless.
	101	*/
	102	static inline int utf32__combining_class(uint32_t c) {
	103	return utf32__unidata(c)->ccc;
	104	}
	105
/** @brief Fetch the canonical combining class of @p c
 * @param c Code point
 * @return Combining class of @p c
 *
 * Public wrapper around the inline lookup.  Any 32-bit value is accepted for
 * @p c.
 */
int utf32_combining_class(uint32_t c) {
  const int ccc = utf32__combining_class(c);

  return ccc;
}
	115
092f426f	116	/** @brief Return the General_Category value for @p c
0ae60b83	117	* @param c Code point
092f426f RK	118	* @return General_Category property value
	119	*
	120	* @p c can be any 32-bit value, a sensible value will be returned regardless.
	121	*/
	122	static inline enum unicode_General_Category utf32__general_category(uint32_t c) {
	123	return utf32__unidata(c)->general_category;
	124	}
	125
	126	/** @brief Determine Grapheme_Break property
	127	* @param c Code point
	128	* @return Grapheme_Break property value of @p c
	129	*
	130	* @p c can be any 32-bit value, a sensible value will be returned regardless.
	131	*/
	132	static inline enum unicode_Grapheme_Break utf32__grapheme_break(uint32_t c) {
	133	return utf32__unidata(c)->grapheme_break;
	134	}
	135
	136	/** @brief Determine Word_Break property
	137	* @param c Code point
	138	* @return Word_Break property value of @p c
	139	*
	140	* @p c can be any 32-bit value, a sensible value will be returned regardless.
	141	*/
	142	static inline enum unicode_Word_Break utf32__word_break(uint32_t c) {
	143	return utf32__unidata(c)->word_break;
	144	}
	145
	146	/** @brief Determine Sentence_Break property
	147	* @param c Code point
	148	* @return Word_Break property value of @p c
	149	*
	150	* @p c can be any 32-bit value, a sensible value will be returned regardless.
	151	*/
	152	static inline enum unicode_Sentence_Break utf32__sentence_break(uint32_t c) {
	153	return utf32__unidata(c)->sentence_break;
	154	}
	155
	156	/** @brief Return true if @p c is ignorable for boundary specifications
	157	* @param wb Word break property value
	158	* @return non-0 if @p wb is unicode_Word_Break_Extend or unicode_Word_Break_Format
	159	*/
	160	static inline int utf32__boundary_ignorable(enum unicode_Word_Break wb) {
	161	return (wb == unicode_Word_Break_Extend
	162	\|\| wb == unicode_Word_Break_Format);
	163	}
	164
f98fcddb RK	165	/** @brief Return the canonical decomposition of @p c
	166	* @param c Code point
	167	* @return 0-terminated canonical decomposition, or 0
	168	*/
	169	static inline const uint32_t *utf32__decomposition_canon(uint32_t c) {
	170	const struct unidata *const data = utf32__unidata(c);
	171	const uint32_t *const decomp = data->decomp;
	172
	173	if(decomp && !(data->flags & unicode_compatibility_decomposition))
	174	return decomp;
	175	else
	176	return 0;
	177	}
	178
	179	/** @brief Return the compatibility decomposition of @p c
	180	* @param c Code point
	181	* @return 0-terminated decomposition, or 0
	182	*/
	183	static inline const uint32_t *utf32__decomposition_compat(uint32_t c) {
	184	return utf32__unidata(c)->decomp;
	185	}
	186
092f426f	187	/*@}*/
e5a5a138 RK	188	/** @defgroup utftransform Functions that transform between different Unicode encoding forms */
	189	/*@{*/
	190
	191	/** @brief Convert UTF-32 to UTF-8
	192	* @param s Source string
	193	* @param ns Length of source string in code points
	194	* @param ndp Where to store length of destination string (or NULL)
	195	* @return Newly allocated destination string or NULL on error
	196	*
56fd389c RK	197	* If the UTF-32 is not valid then NULL is returned. A UTF-32 code point is
56fd389c RK	198	* invalid if:
e5a5a138 RK	199	* - it codes for a UTF-16 surrogate
	200	* - it codes for a value outside the unicode code space
	201	*
56fd389c RK	202	* The return value is always 0-terminated. The value returned via @p *ndp
56fd389c RK	203	* does not include the terminator.
e5a5a138 RK	204	*/
	205	char utf32_to_utf8(const uint32_t s, size_t ns, size_t *ndp) {
	206	struct dynstr d;
	207	uint32_t c;
	208
	209	dynstr_init(&d);
	210	while(ns > 0) {
	211	c = *s++;
	212	if(c < 0x80)
	213	dynstr_append(&d, c);
	214	else if(c < 0x0800) {
	215	dynstr_append(&d, 0xC0 \| (c >> 6));
	216	dynstr_append(&d, 0x80 \| (c & 0x3F));
	217	} else if(c < 0x10000) {
56fd389c	218	if(c >= 0xD800 && c <= 0xDFFF)
e5a5a138 RK	219	goto error;
	220	dynstr_append(&d, 0xE0 \| (c >> 12));
	221	dynstr_append(&d, 0x80 \| ((c >> 6) & 0x3F));
	222	dynstr_append(&d, 0x80 \| (c & 0x3F));
	223	} else if(c < 0x110000) {
	224	dynstr_append(&d, 0xF0 \| (c >> 18));
	225	dynstr_append(&d, 0x80 \| ((c >> 12) & 0x3F));
	226	dynstr_append(&d, 0x80 \| ((c >> 6) & 0x3F));
	227	dynstr_append(&d, 0x80 \| (c & 0x3F));
	228	} else
	229	goto error;
	230	--ns;
	231	}
	232	dynstr_terminate(&d);
	233	if(ndp)
	234	*ndp = d.nvec;
	235	return d.vec;
	236	error:
	237	xfree(d.vec);
	238	return 0;
	239	}
	240
	241	/** @brief Convert UTF-8 to UTF-32
	242	* @param s Source string
	243	* @param ns Length of source string in code points
	244	* @param ndp Where to store length of destination string (or NULL)
f98fcddb	245	* @return Newly allocated destination string or NULL on error
e5a5a138	246	*
56fd389c RK	247	* The return value is always 0-terminated. The value returned via @p *ndp
56fd389c RK	248	* does not include the terminator.
e5a5a138 RK	249	*
	250	* If the UTF-8 is not valid then NULL is returned. A UTF-8 sequence
	251	* for a code point is invalid if:
	252	* - it is not the shortest possible sequence for the code point
	253	* - it codes for a UTF-16 surrogate
	254	* - it codes for a value outside the unicode code space
	255	*/
	256	uint32_t utf8_to_utf32(const char s, size_t ns, size_t *ndp) {
	257	struct dynstr_ucs4 d;
32b158f2	258	uint32_t c32;
e5a5a138	259	const uint8_t ss = (const uint8_t )s;
32b158f2	260	int n;
e5a5a138 RK	261
	262	dynstr_ucs4_init(&d);
	263	while(ns > 0) {
32b158f2 RK	264	const struct unicode_utf8_row const r = &unicode_utf8_valid[ss];
	265	if(r->count <= ns) {
	266	switch(r->count) {
	267	case 1:
	268	c32 = *ss;
	269	break;
	270	case 2:
	271	if(ss[1] < r->min2 \|\| ss[1] > r->max2)
	272	goto error;
	273	c32 = *ss & 0x1F;
	274	break;
	275	case 3:
	276	if(ss[1] < r->min2 \|\| ss[1] > r->max2)
	277	goto error;
	278	c32 = *ss & 0x0F;
	279	break;
	280	case 4:
	281	if(ss[1] < r->min2 \|\| ss[1] > r->max2)
	282	goto error;
	283	c32 = *ss & 0x07;
	284	break;
	285	default:
	286	goto error;
	287	}
e5a5a138 RK	288	} else
e5a5a138 RK	289	goto error;
32b158f2 RK	290	for(n = 1; n < r->count; ++n) {
	291	if(ss[n] < 0x80 \|\| ss[n] > 0xBF)
	292	goto error;
	293	c32 = (c32 << 6) \| (ss[n] & 0x3F);
	294	}
e5a5a138	295	dynstr_ucs4_append(&d, c32);
32b158f2 RK	296	ss += r->count;
32b158f2 RK	297	ns -= r->count;
e5a5a138 RK	298	}
	299	dynstr_ucs4_terminate(&d);
	300	if(ndp)
	301	*ndp = d.nvec;
	302	return d.vec;
	303	error:
	304	xfree(d.vec);
	305	return 0;
	306	}
	307
bb5c7798 RK	308	/** @brief Convert UTF-16 to UTF-8
	309	* @param s Source string
	310	* @param ns Length of source string in code points
	311	* @param ndp Where to store length of destination string (or NULL)
	312	* @return Newly allocated destination string or NULL on error
	313	*
	314	* If the UTF-16 is not valid then NULL is returned. A UTF-16 sequence t is
	315	* invalid if it contains an incomplete surrogate.
	316	*
	317	* The return value is always 0-terminated. The value returned via @p *ndp
	318	* does not include the terminator.
	319	*/
	320	char utf16_to_utf8(const uint16_t s, size_t ns, size_t *ndp) {
	321	struct dynstr d;
	322	uint32_t c;
	323
	324	dynstr_init(&d);
	325	while(ns > 0) {
	326	c = *s++;
	327	--ns;
	328	if(c >= 0xD800 && c <= 0xDBFF) {
	329	if(ns && *s >= 0xDC00 && c <= 0xDFFF)
	330	c = ((c - 0xD800) << 10) + (*s++ - 0xDC00) + 0x10000;
	331	else
	332	goto error;
	333	} else if(c >= 0xDC00 && c <= 0xDFFF)
	334	goto error;
	335	if(c < 0x80)
	336	dynstr_append(&d, c);
	337	else if(c < 0x0800) {
	338	dynstr_append(&d, 0xC0 \| (c >> 6));
	339	dynstr_append(&d, 0x80 \| (c & 0x3F));
	340	} else if(c < 0x10000) {
	341	if(c >= 0xD800 && c <= 0xDFFF)
	342	goto error;
	343	dynstr_append(&d, 0xE0 \| (c >> 12));
	344	dynstr_append(&d, 0x80 \| ((c >> 6) & 0x3F));
	345	dynstr_append(&d, 0x80 \| (c & 0x3F));
	346	} else if(c < 0x110000) {
	347	dynstr_append(&d, 0xF0 \| (c >> 18));
	348	dynstr_append(&d, 0x80 \| ((c >> 12) & 0x3F));
	349	dynstr_append(&d, 0x80 \| ((c >> 6) & 0x3F));
	350	dynstr_append(&d, 0x80 \| (c & 0x3F));
	351	} else
	352	goto error;
	353	}
	354	dynstr_terminate(&d);
	355	if(ndp)
	356	*ndp = d.nvec;
	357	return d.vec;
	358	error:
	359	xfree(d.vec);
	360	return 0;
	361	}
	362
	363	/** @brief Convert UTF-8 to UTF-16
	364	* @param s Source string
	365	* @param ns Length of source string in code points
	366	* @param ndp Where to store length of destination string (or NULL)
	367	* @return Newly allocated destination string or NULL on error
	368	*
	369	* The return value is always 0-terminated. The value returned via @p *ndp
	370	* does not include the terminator.
	371	*
372	* If the UTF-8 is not valid then NULL is returned. A UTF-8 sequence
373	* for a code point is invalid if:
374	* - it is not the shortest possible sequence for the code point
375	* - it codes for a UTF-16 surrogate
376	* - it codes for a value outside the unicode code space
377	*/
378	uint16_t utf8_to_utf16(const char s, size_t ns, size_t *ndp) {
379	struct dynstr_utf16 d;
380	uint32_t c32;
381	const uint8_t ss = (const uint8_t )s;
382	int n;
383
384	dynstr_utf16_init(&d);
385	while(ns > 0) {
386	const struct unicode_utf8_row const r = &unicode_utf8_valid[ss];
387	if(r->count <= ns) {
388	switch(r->count) {
389	case 1:
390	c32 = *ss;
391	break;
392	case 2:
393	if(ss[1] < r->min2 \|\| ss[1] > r->max2)
394	goto error;
395	c32 = *ss & 0x1F;
396	break;
397	case 3:
398	if(ss[1] < r->min2 \|\| ss[1] > r->max2)
399	goto error;
400	c32 = *ss & 0x0F;
401	break;
402	case 4:
403	if(ss[1] < r->min2 \|\| ss[1] > r->max2)
404	goto error;
405	c32 = *ss & 0x07;
406	break;
407	default:
408	goto error;
409	}
410	} else
411	goto error;
412	for(n = 1; n < r->count; ++n) {
413	if(ss[n] < 0x80 \|\| ss[n] > 0xBF)
414	goto error;
415	c32 = (c32 << 6) \| (ss[n] & 0x3F);
416	}
417	if(c32 >= 0x10000) {
418	c32 -= 0x10000;
419	dynstr_utf16_append(&d, 0xD800 + (c32 >> 10));
420	dynstr_utf16_append(&d, 0xDC00 + (c32 & 0x03FF));
421	} else
422	dynstr_utf16_append(&d, c32);
423	ss += r->count;
424	ns -= r->count;
425	}
426	dynstr_utf16_terminate(&d);
427	if(ndp)
428	*ndp = d.nvec;
429	return d.vec;
430	error:
431	xfree(d.vec);
432	return 0;
433	}
434
18cda350 RK	435	/** @brief Test whether [s,s+ns) is valid UTF-8
	436	* @param s Start of string
	437	* @param ns Length of string
	438	* @return non-0 if @p s is valid UTF-8, 0 if it is not valid
	439	*
	440	* This function is intended to be much faster than calling utf8_to_utf32() and
	441	* throwing away the result.
	442	*/
	443	int utf8_valid(const char *s, size_t ns) {
	444	const uint8_t ss = (const uint8_t )s;
	445	while(ns > 0) {
	446	const struct unicode_utf8_row const r = &unicode_utf8_valid[ss];
	447	if(r->count <= ns) {
	448	switch(r->count) {
	449	case 1:
	450	break;
	451	case 2:
	452	if(ss[1] < r->min2 \|\| ss[1] > r->max2)
	453	return 0;
	454	break;
	455	case 3:
	456	if(ss[1] < r->min2 \|\| ss[1] > r->max2)
	457	return 0;
	458	if(ss[2] < 0x80 \|\| ss[2] > 0xBF)
	459	return 0;
	460	break;
	461	case 4:
	462	if(ss[1] < r->min2 \|\| ss[1] > r->max2)
	463	return 0;
	464	if(ss[2] < 0x80 \|\| ss[2] > 0xBF)
	465	return 0;
	466	if(ss[3] < 0x80 \|\| ss[3] > 0xBF)
	467	return 0;
	468	break;
	469	default:
	470	return 0;
	471	}
	472	} else
	473	return 0;
	474	ss += r->count;
	475	ns -= r->count;
	476	}
	477	return 1;
	478	}
	479
092f426f RK	480	/*@}*/
	481	/** @defgroup utf32iterator UTF-32 string iterators */
	482	/*@{*/
	483
	484	struct utf32_iterator_data {
	485	/** @brief Start of string */
	486	const uint32_t *s;
	487
	488	/** @brief Length of string */
	489	size_t ns;
	490
	491	/** @brief Current position */
	492	size_t n;
	493
	494	/** @brief Last two non-ignorable characters or (uint32_t)-1
	495	*
	496	* last[1] is the non-Extend/Format character just before position @p n;
	497	* last[0] is the one just before that.
	498	*
	499	* Exception 1: if there is no such non-Extend/Format character then an
	500	* Extend/Format character is accepted instead.
	501	*
	502	* Exception 2: if there is no such character even taking that into account
	503	* the value is (uint32_t)-1.
	504	*/
	505	uint32_t last[2];
092f426f	506
c85b7022 RK	507	/** @brief Tailoring for Word_Break */
	508	unicode_property_tailor *word_break;
	509	};
092f426f RK	510
	511	/** @brief Initialize an internal private iterator
	512	* @param it Iterator
	513	* @param s Start of string
	514	* @param ns Length of string
	515	* @param n Absolute position
	516	*/
	517	static void utf32__iterator_init(utf32_iterator it,
	518	const uint32_t *s, size_t ns, size_t n) {
	519	it->s = s;
	520	it->ns = ns;
	521	it->n = 0;
	522	it->last[0] = it->last[1] = -1;
c85b7022	523	it->word_break = 0;
b21a155c	524	utf32_iterator_set(it, n);
092f426f RK	525	}
092f426f RK	526
c85b7022 RK	527	/** @brief Create a new iterator pointing at the start of a string
	528	* @param s Start of string
	529	* @param ns Length of string
	530	* @return New iterator
	531	*/
	532	utf32_iterator utf32_iterator_new(const uint32_t *s, size_t ns) {
	533	utf32_iterator it = xmalloc(sizeof *it);
	534	utf32__iterator_init(it, s, ns, 0);
	535	return it;
	536	}
	537
	538	/** @brief Tailor this iterator's interpretation of the Word_Break property.
	539	* @param it Iterator
	540	* @param pt Property tailor function or NULL
	541	*
	542	* After calling this the iterator will call @p pt to determine the Word_Break
	543	* property of each code point. If it returns -1 the default value will be
	544	* used otherwise the returned value will be used.
	545	*
	546	* @p pt can be NULL to revert to the default value of the property.
	547	*
	548	* It is safe to call this function at any time; the iterator's internal state
	549	* will be reset to suit the new tailoring.
	550	*/
	551	void utf32_iterator_tailor_word_break(utf32_iterator it,
	552	unicode_property_tailor *pt) {
	553	it->word_break = pt;
	554	utf32_iterator_set(it, it->n);
	555	}
	556
	557	static inline enum unicode_Word_Break utf32__iterator_word_break(utf32_iterator it,
	558	uint32_t c) {
	559	if(!it->word_break)
	560	return utf32__word_break(c);
	561	else {
	562	const int t = it->word_break(c);
	563
	564	if(t < 0)
	565	return utf32__word_break(c);
	566	else
	567	return t;
	568	}
	569	}
	570
092f426f RK	571	/** @brief Destroy an iterator
	572	* @param it Iterator
	573	*/
	574	void utf32_iterator_destroy(utf32_iterator it) {
	575	xfree(it);
	576	}
	577
	578	/** @brief Find the current position of an interator
	579	* @param it Iterator
	580	*/
	581	size_t utf32_iterator_where(utf32_iterator it) {
	582	return it->n;
	583	}
	584
	585	/** @brief Set an iterator's absolute position
	586	* @param it Iterator
	587	* @param n Absolute position
	588	* @return 0 on success, non-0 on error
	589	*
	590	* It is an error to position the iterator outside the string (but acceptable
	591	* to point it at the hypothetical post-final character). If an invalid value
	592	* of @p n is specified then the iterator is not changed.
f98fcddb RK	593	*
	594	* This function works by backing up and then advancing to reconstruct the
	595	* iterator's internal state for position @p n. The worst case will be O(n)
	596	* time complexity (with a worse constant factor that utf32_iterator_advance())
	597	* but the typical case is essentially constant-time.
092f426f RK	598	*/
092f426f RK	599	int utf32_iterator_set(utf32_iterator it, size_t n) {
5617aaff RK	600	/* We can't just jump to position @p n; the @p last[] values will be wrong.
	601	* What we need is to jump a bit behind @p n and then advance forward,
	602	* updating @p last[] along the way. How far back? We need to cross two
	603	* non-ignorable code points as we advance forwards, so we'd better pass two
	604	* such characters on the way back (if such are available).
	605	*/
b21a155c	606	size_t m;
5617aaff RK	607
5617aaff RK	608	if(n > it->ns) /* range check */
092f426f	609	return -1;
b21a155c RK	610	/* Walk backwards skipping ignorable code points */
b21a155c RK	611	m = n;
c85b7022 RK	612	while(m > 0
	613	&& (utf32__boundary_ignorable(utf32__iterator_word_break(it,
	614	it->s[m-1]))))
b21a155c RK	615	--m;
	616	/* Either m=0 or s[m-1] is not ignorable */
	617	if(m > 0) {
	618	--m;
	619	/* s[m] is our first non-ignorable code; look for a second in the same
	620	way **/
c85b7022 RK	621	while(m > 0
	622	&& (utf32__boundary_ignorable(utf32__iterator_word_break(it,
	623	it->s[m-1]))))
5617aaff	624	--m;
b21a155c RK	625	/* Either m=0 or s[m-1] is not ignorable */
	626	if(m > 0)
	627	--m;
	628	}
	629	it->last[0] = it->last[1] = -1;
5617aaff RK	630	it->n = m;
5617aaff RK	631	return utf32_iterator_advance(it, n - m);
092f426f RK	632	}
	633
	634	/** @brief Advance an iterator
	635	* @param it Iterator
	636	* @param count Number of code points to advance by
	637	* @return 0 on success, non-0 on error
	638	*
	639	* It is an error to advance an iterator beyond the hypothetical post-final
	640	* character of the string. If an invalid value of @p n is specified then the
	641	* iterator is not changed.
	642	*
	643	* This function has O(n) time complexity: it works by advancing naively
	644	* forwards through the string.
	645	*/
	646	int utf32_iterator_advance(utf32_iterator it, size_t count) {
	647	if(count <= it->ns - it->n) {
	648	while(count > 0) {
	649	const uint32_t c = it->s[it->n];
c85b7022	650	const enum unicode_Word_Break wb = utf32__iterator_word_break(it, c);
092f426f RK	651	if(it->last[1] == (uint32_t)-1
	652	\|\| !utf32__boundary_ignorable(wb)) {
	653	it->last[0] = it->last[1];
	654	it->last[1] = c;
	655	}
	656	++it->n;
	657	--count;
	658	}
	659	return 0;
	660	} else
	661	return -1;
	662	}
	663
	664	/** @brief Find the current code point
	665	* @param it Iterator
	666	* @return Current code point or 0
	667	*
	668	* If the iterator points at the hypothetical post-final character of the
	669	* string then 0 is returned. NB that this doesn't mean that there aren't any
	670	* 0 code points inside the string!
	671	*/
	672	uint32_t utf32_iterator_code(utf32_iterator it) {
	673	if(it->n < it->ns)
	674	return it->s[it->n];
	675	else
	676	return 0;
	677	}
	678
	679	/** @brief Test for a grapheme boundary
	680	* @param it Iterator
	681	* @return Non-0 if pointing just after a grapheme boundary, otherwise 0
f98fcddb RK	682	*
	683	* This function identifies default grapheme cluster boundaries as described in
	684	* UAX #29 s3. It returns non-0 if @p it points at the code point just after a
	685	* grapheme cluster boundary (including the hypothetical code point just after
	686	* the end of the string).
092f426f RK	687	*/
	688	int utf32_iterator_grapheme_boundary(utf32_iterator it) {
	689	uint32_t before, after;
	690	enum unicode_Grapheme_Break gbbefore, gbafter;
	691	/* GB1 and GB2 */
	692	if(it->n == 0 \|\| it->n == it->ns)
	693	return 1;
	694	/* Now we know that s[n-1] and s[n] are safe to inspect */
	695	/* GB3 */
	696	before = it->s[it->n-1];
	697	after = it->s[it->n];
	698	if(before == 0x000D && after == 0x000A)
	699	return 0;
	700	gbbefore = utf32__grapheme_break(before);
	701	gbafter = utf32__grapheme_break(after);
	702	/* GB4 */
	703	if(gbbefore == unicode_Grapheme_Break_Control
	704	\|\| before == 0x000D
	705	\|\| before == 0x000A)
	706	return 1;
	707	/* GB5 */
	708	if(gbafter == unicode_Grapheme_Break_Control
	709	\|\| after == 0x000D
	710	\|\| after == 0x000A)
	711	return 1;
	712	/* GB6 */
	713	if(gbbefore == unicode_Grapheme_Break_L
	714	&& (gbafter == unicode_Grapheme_Break_L
	715	\|\| gbafter == unicode_Grapheme_Break_V
	716	\|\| gbafter == unicode_Grapheme_Break_LV
	717	\|\| gbafter == unicode_Grapheme_Break_LVT))
	718	return 0;
	719	/* GB7 */
	720	if((gbbefore == unicode_Grapheme_Break_LV
	721	\|\| gbbefore == unicode_Grapheme_Break_V)
	722	&& (gbafter == unicode_Grapheme_Break_V
	723	\|\| gbafter == unicode_Grapheme_Break_T))
	724	return 0;
	725	/* GB8 */
	726	if((gbbefore == unicode_Grapheme_Break_LVT
	727	\|\| gbbefore == unicode_Grapheme_Break_T)
	728	&& gbafter == unicode_Grapheme_Break_T)
	729	return 0;
	730	/* GB9 */
	731	if(gbafter == unicode_Grapheme_Break_Extend)
	732	return 0;
e2e88ad8 RK	733	/* GB9a */
	734	if(gbafter == unicode_Grapheme_Break_SpacingMark)
	735	return 0;
	736	/* GB9b */
	737	if(gbbefore == unicode_Grapheme_Break_Prepend)
	738	return 0;
092f426f RK	739	/* GB10 */
	740	return 1;
	741
	742	}
	743
	744	/** @brief Test for a word boundary
	745	* @param it Iterator
	746	* @return Non-0 if pointing just after a word boundary, otherwise 0
f98fcddb RK	747	*
	748	* This function identifies default word boundaries as described in UAX #29 s4.
	749	* It returns non-0 if @p it points at the code point just after a word
	750	* boundary (including the hypothetical code point just after the end of the
	751	* string) and 0 otherwise.
092f426f RK	752	*/
092f426f RK	753	int utf32_iterator_word_boundary(utf32_iterator it) {
36f522a4	754	uint32_t before, after;
2dc0bc24	755	enum unicode_Word_Break wbtwobefore, wbbefore, wbafter, wbtwoafter;
092f426f RK	756	size_t nn;
	757
	758	/* WB1 and WB2 */
	759	if(it->n == 0 \|\| it->n == it->ns)
	760	return 1;
36f522a4 RK	761	before = it->s[it->n-1];
36f522a4 RK	762	after = it->s[it->n];
092f426f	763	/* WB3 */
36f522a4	764	if(before == 0x000D && after == 0x000A)
092f426f	765	return 0;
fb4c61da	766	/* WB3a */
36f522a4 RK	767	if(utf32__iterator_word_break(it, before) == unicode_Word_Break_Newline
	768	\|\| before == 0x000D
	769	\|\| before == 0x000A)
fb4c61da RK	770	return 1;
fb4c61da RK	771	/* WB3b */
36f522a4 RK	772	if(utf32__iterator_word_break(it, after) == unicode_Word_Break_Newline
	773	\|\| after == 0x000D
	774	\|\| after == 0x000A)
fb4c61da	775	return 1;
092f426f RK	776	/* WB4 */
092f426f RK	777	/* (!Sep) x (Extend\|Format) as in UAX #29 s6.2 */
36f522a4 RK	778	if(utf32__sentence_break(before) != unicode_Sentence_Break_Sep
36f522a4 RK	779	&& utf32__boundary_ignorable(utf32__iterator_word_break(it, after)))
092f426f RK	780	return 0;
	781	/* Gather the property values we'll need for the rest of the test taking the
	782	* s6.2 changes into account */
	783	/* First we look at the code points after the proposed boundary */
	784	nn = it->n; /* <it->ns */
2dc0bc24 RK	785	wbafter = utf32__iterator_word_break(it, it->s[nn++]);
2dc0bc24 RK	786	if(!utf32__boundary_ignorable(wbafter)) {
092f426f RK	787	/* X (Extend\|Format)* -> X */
092f426f RK	788	while(nn < it->ns
c85b7022 RK	789	&& utf32__boundary_ignorable(utf32__iterator_word_break(it,
c85b7022 RK	790	it->s[nn])))
092f426f RK	791	++nn;
	792	}
	793	/* It's possible now that nn=ns */
	794	if(nn < it->ns)
2dc0bc24	795	wbtwoafter = utf32__iterator_word_break(it, it->s[nn]);
092f426f	796	else
2dc0bc24	797	wbtwoafter = unicode_Word_Break_Other;
092f426f RK	798
	799	/* We've already recorded the non-ignorable code points before the proposed
	800	* boundary */
2dc0bc24 RK	801	wbbefore = utf32__iterator_word_break(it, it->last[1]);
2dc0bc24 RK	802	wbtwobefore = utf32__iterator_word_break(it, it->last[0]);
092f426f RK	803
092f426f RK	804	/* WB5 */
2dc0bc24 RK	805	if(wbbefore == unicode_Word_Break_ALetter
2dc0bc24 RK	806	&& wbafter == unicode_Word_Break_ALetter)
092f426f RK	807	return 0;
092f426f RK	808	/* WB6 */
2dc0bc24 RK	809	if(wbbefore == unicode_Word_Break_ALetter
	810	&& (wbafter == unicode_Word_Break_MidLetter
	811	\|\| wbafter == unicode_Word_Break_MidNumLet)
	812	&& wbtwoafter == unicode_Word_Break_ALetter)
092f426f RK	813	return 0;
092f426f RK	814	/* WB7 */
2dc0bc24 RK	815	if(wbtwobefore == unicode_Word_Break_ALetter
	816	&& (wbbefore == unicode_Word_Break_MidLetter
	817	\|\| wbbefore == unicode_Word_Break_MidNumLet)
	818	&& wbafter == unicode_Word_Break_ALetter)
092f426f	819	return 0;
c85b7022	820	/* WB8 */
2dc0bc24 RK	821	if(wbbefore == unicode_Word_Break_Numeric
2dc0bc24 RK	822	&& wbafter == unicode_Word_Break_Numeric)
092f426f RK	823	return 0;
092f426f RK	824	/* WB9 */
2dc0bc24 RK	825	if(wbbefore == unicode_Word_Break_ALetter
2dc0bc24 RK	826	&& wbafter == unicode_Word_Break_Numeric)
092f426f RK	827	return 0;
092f426f RK	828	/* WB10 */
2dc0bc24 RK	829	if(wbbefore == unicode_Word_Break_Numeric
2dc0bc24 RK	830	&& wbafter == unicode_Word_Break_ALetter)
092f426f RK	831	return 0;
092f426f RK	832	/* WB11 */
2dc0bc24 RK	833	if(wbtwobefore == unicode_Word_Break_Numeric
	834	&& (wbbefore == unicode_Word_Break_MidNum
	835	\|\| wbbefore == unicode_Word_Break_MidNumLet)
	836	&& wbafter == unicode_Word_Break_Numeric)
092f426f RK	837	return 0;
092f426f RK	838	/* WB12 */
2dc0bc24 RK	839	if(wbbefore == unicode_Word_Break_Numeric
	840	&& (wbafter == unicode_Word_Break_MidNum
	841	\|\| wbafter == unicode_Word_Break_MidNumLet)
	842	&& wbtwoafter == unicode_Word_Break_Numeric)
092f426f RK	843	return 0;
092f426f RK	844	/* WB13 */
2dc0bc24 RK	845	if(wbbefore == unicode_Word_Break_Katakana
2dc0bc24 RK	846	&& wbafter == unicode_Word_Break_Katakana)
092f426f RK	847	return 0;
092f426f RK	848	/* WB13a */
2dc0bc24 RK	849	if((wbbefore == unicode_Word_Break_ALetter
	850	\|\| wbbefore == unicode_Word_Break_Numeric
	851	\|\| wbbefore == unicode_Word_Break_Katakana
	852	\|\| wbbefore == unicode_Word_Break_ExtendNumLet)
	853	&& wbafter == unicode_Word_Break_ExtendNumLet)
092f426f RK	854	return 0;
092f426f RK	855	/* WB13b */
2dc0bc24 RK	856	if(wbbefore == unicode_Word_Break_ExtendNumLet
	857	&& (wbafter == unicode_Word_Break_ALetter
	858	\|\| wbafter == unicode_Word_Break_Numeric
	859	\|\| wbafter == unicode_Word_Break_Katakana))
092f426f RK	860	return 0;
	861	/* WB14 */
	862	return 1;
	863	}
	864
e5a5a138 RK	865	/*@}*/
	866	/** @defgroup utf32 Functions that operate on UTF-32 strings */
	867	/*@{*/
	868
	869	/** @brief Return the length of a 0-terminated UTF-32 string
	870	* @param s Pointer to 0-terminated string
	871	* @return Length of string in code points (excluding terminator)
	872	*
56fd389c	873	* Unlike the conversion functions no validity checking is done on the string.
e5a5a138 RK	874	*/
	875	size_t utf32_len(const uint32_t *s) {
	876	const uint32_t *t = s;
	877
	878	while(*t)
	879	++t;
	880	return (size_t)(t - s);
	881	}
	882
/** @brief Stably sort [s,s+ns) into ascending order of combining class
 * @param s Start of array
 * @param ns Number of elements
 * @param buffer Buffer of at least @p ns elements
 *
 * This is a merge sort: the two halves are sorted recursively and then merged
 * via @p buffer.  Elements with equal combining class keep their relative
 * order.  An empty array is accepted and left alone.
 */
static void utf32__sort_ccc(uint32_t *s, size_t ns, uint32_t *buffer) {
  uint32_t *a, *b, *bp;
  size_t na, nb;

  switch(ns) {
  case 0:                       /* nothing to sort (and must not recurse) */
  case 1:                       /* 1-element array is always sorted */
    return;
  case 2:                       /* 2-element arrays are trivial to sort */
    if(utf32__combining_class(s[0]) > utf32__combining_class(s[1])) {
      uint32_t tmp = s[0];
      s[0] = s[1];
      s[1] = tmp;
    }
    return;
  default:
    /* Partition the array */
    na = ns / 2;
    nb = ns - na;
    a = s;
    b = s + na;
    /* Sort the two halves of the array */
    utf32__sort_ccc(a, na, buffer);
    utf32__sort_ccc(b, nb, buffer);
    /* Merge them back into one, via the buffer */
    bp = buffer;
    while(na > 0 && nb > 0) {
      /* Ascending order of combining class; on a tie take from the first
       * half (hence <=) so that the sort is stable */
      if(utf32__combining_class(*a) <= utf32__combining_class(*b)) {
        *bp++ = *a++;
        --na;
      } else {
        *bp++ = *b++;
        --nb;
      }
    }
    /* Copy whatever is left of either half */
    while(na > 0) {
      *bp++ = *a++;
      --na;
    }
    while(nb > 0) {
      *bp++ = *b++;
      --nb;
    }
    memcpy(s, buffer, ns * sizeof(uint32_t));
    return;
  }
}
	937
	938	/** @brief Put combining characters into canonical order
	939	* @param s Pointer to UTF-32 string
	940	* @param ns Length of @p s
f98fcddb	941	* @return 0 on success, non-0 on error
e5a5a138	942	*
56fd389c RK	943	* @p s is modified in-place. See Unicode 5.0 s3.11 for details of the
56fd389c RK	944	* ordering.
e5a5a138	945	*
56fd389c	946	* Currently we only support a maximum of 1024 combining characters after each
f98fcddb	947	* base character. If this limit is exceeded then a non-0 value is returned.
e5a5a138 RK	948	*/
	949	static int utf32__canonical_ordering(uint32_t *s, size_t ns) {
	950	size_t nc;
	951	uint32_t buffer[1024];
	952
	953	/* The ordering amounts to a stable sort of each contiguous group of
	954	* characters with non-0 combining class. */
	955	while(ns > 0) {
	956	/* Skip non-combining characters */
	957	if(utf32__combining_class(*s) == 0) {
	958	++s;
	959	--ns;
	960	continue;
	961	}
	962	/* We must now have at least one combining character; see how many
	963	* there are */
	964	for(nc = 1; nc < ns && utf32__combining_class(s[nc]) != 0; ++nc)
	965	;
	966	if(nc > 1024)
	967	return -1;
	968	/* Sort the array */
	969	utf32__sort_ccc(s, nc, buffer);
	970	s += nc;
	971	ns -= nc;
	972	}
	973	return 0;
	974	}
	975
	976	/* Magic numbers from UAX #15 s16 */
	977	#define SBase 0xAC00
	978	#define LBase 0x1100
	979	#define VBase 0x1161
	980	#define TBase 0x11A7
	981	#define LCount 19
	982	#define VCount 21
	983	#define TCount 28
	984	#define NCount (VCount * TCount)
	985	#define SCount (LCount * NCount)
	986
	987	/** @brief Guts of the decomposition lookup functions */
	988	#define utf32__decompose_one_generic(WHICH) do { \
f98fcddb	989	const uint32_t *dc = utf32__decomposition_##WHICH(c); \
e5a5a138 RK	990	if(dc) { \
	991	/* Found a canonical decomposition in the table */ \
	992	while(*dc) \
	993	utf32__decompose_one_##WHICH(d, *dc++); \
	994	} else if(c >= SBase && c < SBase + SCount) { \
	995	/* Mechanically decomposable Hangul syllable (UAX #15 s16) */ \
	996	const uint32_t SIndex = c - SBase; \
	997	const uint32_t L = LBase + SIndex / NCount; \
	998	const uint32_t V = VBase + (SIndex % NCount) / TCount; \
	999	const uint32_t T = TBase + SIndex % TCount; \
	1000	dynstr_ucs4_append(d, L); \
	1001	dynstr_ucs4_append(d, V); \
	1002	if(T != TBase) \
	1003	dynstr_ucs4_append(d, T); \
	1004	} else \
	1005	/* Equal to own canonical decomposition */ \
	1006	dynstr_ucs4_append(d, c); \
	1007	} while(0)
	1008
	1009	/** @brief Recursively compute the canonical decomposition of @p c
	1010	* @param d Dynamic string to store decomposition in
	1011	* @param c Code point to decompose (must be a valid!)
f98fcddb	1012	* @return 0 on success, non-0 on error
e5a5a138 RK	1013	*/
	1014	static void utf32__decompose_one_canon(struct dynstr_ucs4 *d, uint32_t c) {
	1015	utf32__decompose_one_generic(canon);
	1016	}
	1017
	1018	/** @brief Recursively compute the compatibility decomposition of @p c
	1019	* @param d Dynamic string to store decomposition in
	1020	* @param c Code point to decompose (must be a valid!)
f98fcddb	1021	* @return 0 on success, non-0 on error
e5a5a138 RK	1022	*/
	1023	static void utf32__decompose_one_compat(struct dynstr_ucs4 *d, uint32_t c) {
	1024	utf32__decompose_one_generic(compat);
	1025	}
	1026
16506c9d RK	1027	/** @brief Magic utf32__compositions() return value for Hangul Choseong */
	1028	static const uint32_t utf32__hangul_L[1];
	1029
	1030	/** @brief Return the list of compositions that @p c starts
	1031	* @param c Starter code point
	1032	* @return Composition list or NULL
	1033	*
	1034	* For Hangul leading (Choseong) jamo we return the special value
	1035	* utf32__hangul_L. These code points are not listed as the targets of
	1036	* canonical decompositions (make-unidata checks) so there is no confusion with
	1037	* real decompositions here.
	1038	*/
	1039	static const uint32_t *utf32__compositions(uint32_t c) {
	1040	const uint32_t *compositions = utf32__unidata(c)->composed;
	1041
	1042	if(compositions)
	1043	return compositions;
	1044	/* Special-casing for Hangul */
	1045	switch(utf32__grapheme_break(c)) {
	1046	default:
	1047	return 0;
	1048	case unicode_Grapheme_Break_L:
	1049	return utf32__hangul_L;
	1050	}
	1051	}
	1052
	1053	/** @brief Composition step
	1054	* @param s Start of string
	1055	* @param ns Length of string
	1056	* @return New length of string
	1057	*
	1058	* This is called from utf32__decompose_generic() to compose the result string
	1059	* in place.
	1060	*/
	1061	static size_t utf32__compose(uint32_t *s, size_t ns) {
	1062	const uint32_t *compositions;
	1063	uint32_t start = s, t = s, *tt, cc;
	1064
	1065	while(ns > 0) {
	1066	uint32_t starter = *s++;
	1067	int block_starters = 0;
	1068	--ns;
	1069	/* We don't attempt to compose the following things:
	1070	* - final characters whatever kind they are
	1071	* - non-starter characters
	1072	* - starters that don't take part in a canonical decomposition mapping
	1073	*/
	1074	if(ns == 0
	1075	\|\| utf32__combining_class(starter)
	1076	\|\| !(compositions = utf32__compositions(starter))) {
	1077	*t++ = starter;
	1078	continue;
	1079	}
	1080	if(compositions != utf32__hangul_L) {
	1081	/* Where we'll put the eventual starter */
	1082	tt = t++;
	1083	do {
	1084	/* See if we can find composition of starter+s /
	1085	const uint32_t cchar = s, cp = compositions;
	1086	while((cc = *cp++)) {
	1087	const uint32_t *decomp = utf32__decomposition_canon(cc);
	1088	/* We know decomp[0] == starter */
	1089	if(decomp[1] == cchar)
	1090	break;
1091	}
1092	if(cc) {
1093	/* Found a composition: cc decomposes to starter,s /
1094	starter = cc;
1095	compositions = utf32__compositions(starter);
1096	++s;
1097	--ns;
1098	} else {
1099	/* No composition found. */
1100	const int class = utf32__combining_class(*s);
1101	if(class) {
1102	/* Transfer the uncomposable combining character to the output */
1103	t++ = s++;
1104	--ns;
1105	/* All the combining characters of the same class of the
1106	* uncomposable character are blocked by it, but there may be
1107	* others of higher class later. We eat the uncomposable and
1108	* blocked characters and go back round the loop for that higher
1109	* class. */
1110	while(ns > 0 && utf32__combining_class(*s) == class) {
1111	t++ = s++;
1112	--ns;
1113	}
1114	/* Block any subsequent starters */
1115	block_starters = 1;
1116	} else {
1117	/* The uncombinable character is itself a starter, so we don't
1118	* transfer it to the output but instead go back round the main
1119	* loop. */
1120	break;
1121	}
1122	}
1123	/* Keep going while there are still characters and the starter takes
1124	* part in some composition */
1125	} while(ns > 0 && compositions
1126	&& (!block_starters \|\| utf32__combining_class(*s)));
1127	/* Store any remaining combining characters */
1128	while(ns > 0 && utf32__combining_class(*s)) {
1129	t++ = s++;
1130	--ns;
1131	}
1132	/* Store the resulting starter */
1133	*tt = starter;
1134	} else {
1135	/* Special-casing for Hangul
1136	*
1137	* If there are combining characters between the L and the V then they
1138	* will block the V and so no composition happens. Similarly combining
1139	* characters between V and T will block the T and so we only get as far
1140	* as LV.
1141	*/
1142	if(utf32__grapheme_break(*s) == unicode_Grapheme_Break_V) {
1143	const uint32_t V = *s++;
1144	const uint32_t LIndex = starter - LBase;
1145	const uint32_t VIndex = V - VBase;
1146	uint32_t TIndex;
1147	--ns;
1148	if(ns > 0
1149	&& utf32__grapheme_break(*s) == unicode_Grapheme_Break_T) {
1150	/* We have an L V T sequence */
1151	const uint32_t T = *s++;
1152	TIndex = T - TBase;
1153	--ns;
1154	} else
1155	/* It's just L V */
1156	TIndex = 0;
1157	/* Compose to LVT or LV as appropriate */
1158	starter = (LIndex * VCount + VIndex) * TCount + TIndex + SBase;
1159	} /* else we only have L or LV and no V or T */
1160	*t++ = starter;
1161	/* There could be some combining characters that belong to the V or T.
1162	* These will be treated as non-starter characters at the top of the loop
1163	* and thuss transferred to the output. */
1164	}
1165	}
1166	return t - start;
1167	}
1168
/** @brief Guts of the composition and decomposition functions
 * @param WHICH @c canon or @c compat to choose decomposition
 * @param COMPOSE @c 0 or @c 1 to compose
 *
 * Forms the entire body of the function it is expanded in.  It expects
 * @c s (input string), @c ns (its length) and @c ndp (where to store the
 * result length, or NULL) to be in scope, and it returns from the enclosing
 * function: the 0-terminated result on success, or 0 if the input contains
 * a surrogate or a value above 0x10FFFF, or if canonical ordering fails.
 */
#define utf32__decompose_generic(WHICH, COMPOSE) do {   \
  struct dynstr_ucs4 d;                                 \
  uint32_t c;                                           \
                                                        \
  dynstr_ucs4_init(&d);                                 \
  while(ns) {                                           \
    c = *s++;                                           \
    if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF)    \
      goto error;                                       \
    utf32__decompose_one_##WHICH(&d, c);                \
    --ns;                                               \
  }                                                     \
  if(utf32__canonical_ordering(d.vec, d.nvec))          \
    goto error;                                         \
  if(COMPOSE)                                           \
    d.nvec = utf32__compose(d.vec, d.nvec);             \
  dynstr_ucs4_terminate(&d);                            \
  if(ndp)                                               \
    *ndp = d.nvec;                                      \
  return d.vec;                                         \
error:                                                  \
  xfree(d.vec);                                         \
  return 0;                                             \
} while(0)
	1197
	1198	/** @brief Canonically decompose @p [s,s+ns)
	1199	* @param s Pointer to string
	1200	* @param ns Length of string
	1201	* @param ndp Where to store length of result
f98fcddb	1202	* @return Pointer to result string, or NULL on error
e5a5a138	1203	*
16506c9d RK	1204	* Computes NFD (Normalization Form D) of the string at @p s. This implies
	1205	* performing all canonical decompositions and then normalizing the order of
	1206	* combining characters.
e5a5a138	1207	*
56fd389c	1208	* Returns NULL if the string is not valid for either of the following reasons:
e5a5a138 RK	1209	* - it codes for a UTF-16 surrogate
e5a5a138 RK	1210	* - it codes for a value outside the unicode code space
16506c9d RK	1211	*
	1212	* See also:
	1213	* - utf32_decompose_compat()
	1214	* - utf32_compose_canon()
e5a5a138 RK	1215	*/
e5a5a138 RK	1216	uint32_t utf32_decompose_canon(const uint32_t s, size_t ns, size_t *ndp) {
16506c9d	1217	utf32__decompose_generic(canon, 0);
e5a5a138 RK	1218	}
	1219
	1220	/** @brief Compatibility decompose @p [s,s+ns)
	1221	* @param s Pointer to string
	1222	* @param ns Length of string
	1223	* @param ndp Where to store length of result
f98fcddb	1224	* @return Pointer to result string, or NULL on error
e5a5a138	1225	*
16506c9d RK	1226	* Computes NFKD (Normalization Form KD) of the string at @p s. This implies
	1227	* performing all canonical and compatibility decompositions and then
	1228	* normalizing the order of combining characters.
e5a5a138	1229	*
56fd389c	1230	* Returns NULL if the string is not valid for either of the following reasons:
e5a5a138 RK	1231	* - it codes for a UTF-16 surrogate
e5a5a138 RK	1232	* - it codes for a value outside the unicode code space
16506c9d RK	1233	*
	1234	* See also:
	1235	* - utf32_decompose_canon()
	1236	* - utf32_compose_compat()
e5a5a138 RK	1237	*/
e5a5a138 RK	1238	uint32_t utf32_decompose_compat(const uint32_t s, size_t ns, size_t *ndp) {
16506c9d RK	1239	utf32__decompose_generic(compat, 0);
	1240	}
	1241
	1242	/** @brief Canonically compose @p [s,s+ns)
	1243	* @param s Pointer to string
	1244	* @param ns Length of string
	1245	* @param ndp Where to store length of result
	1246	* @return Pointer to result string, or NULL on error
	1247	*
	1248	* Computes NFC (Normalization Form C) of the string at @p s. This implies
	1249	* performing all canonical decompositions, normalizing the order of combining
	1250	* characters and then composing all unblocked primary compositables.
	1251	*
	1252	* Returns NULL if the string is not valid for either of the following reasons:
	1253	* - it codes for a UTF-16 surrogate
	1254	* - it codes for a value outside the unicode code space
	1255	*
	1256	* See also:
	1257	* - utf32_compose_compat()
	1258	* - utf32_decompose_canon()
	1259	*/
	1260	uint32_t utf32_compose_canon(const uint32_t s, size_t ns, size_t *ndp) {
	1261	utf32__decompose_generic(canon, 1);
	1262	}
	1263
	1264	/** @brief Compatibility compose @p [s,s+ns)
	1265	* @param s Pointer to string
	1266	* @param ns Length of string
	1267	* @param ndp Where to store length of result
	1268	* @return Pointer to result string, or NULL on error
	1269	*
	1270	* Computes NFKC (Normalization Form KC) of the string at @p s. This implies
	1271	* performing all canonical and compatibility decompositions, normalizing the
	1272	* order of combining characters and then composing all unblocked primary
	1273	* compositables.
	1274	*
	1275	* Returns NULL if the string is not valid for either of the following reasons:
	1276	* - it codes for a UTF-16 surrogate
	1277	* - it codes for a value outside the unicode code space
	1278	*
	1279	* See also:
	1280	* - utf32_compose_canon()
	1281	* - utf32_decompose_compat()
	1282	*/
	1283	uint32_t utf32_compose_compat(const uint32_t s, size_t ns, size_t *ndp) {
	1284	utf32__decompose_generic(compat, 1);
e5a5a138 RK	1285	}
e5a5a138 RK	1286
/** @brief Single-character case-fold and decompose operation
 * @param WHICH @c canon or @c compat to choose decomposition
 *
 * Expects @c c (the code point to fold) and @c d (the output dynamic string,
 * not a pointer to it) to be in scope where it is expanded.
 */
#define utf32__casefold_one(WHICH) do {                                 \
  const uint32_t *folded = utf32__unidata(c)->casefold;                 \
  if(!folded)                                                           \
    /* No case-fold mapping in the table: decompose c itself */         \
    utf32__decompose_one_##WHICH(&d, c);                                \
  else                                                                  \
    /* Decompose each code point of the case-fold mapping in turn */    \
    for(; *folded; ++folded)                                            \
      utf32__decompose_one_##WHICH(&d, *folded);                        \
} while(0)
e5a5a138 RK	1297
	1298	/** @brief Case-fold @p [s,s+ns)
	1299	* @param s Pointer to string
	1300	* @param ns Length of string
	1301	* @param ndp Where to store length of result
f98fcddb	1302	* @return Pointer to result string, or NULL on error
e5a5a138 RK	1303	*
e5a5a138 RK	1304	* Case-fold the string at @p s according to full default case-folding rules
56fd389c	1305	* (s3.13) for caseless matching. The result will be in NFD.
e5a5a138	1306	*
56fd389c	1307	* Returns NULL if the string is not valid for either of the following reasons:
e5a5a138 RK	1308	* - it codes for a UTF-16 surrogate
	1309	* - it codes for a value outside the unicode code space
	1310	*/
	1311	uint32_t utf32_casefold_canon(const uint32_t s, size_t ns, size_t *ndp) {
	1312	struct dynstr_ucs4 d;
	1313	uint32_t c;
	1314	size_t n;
	1315	uint32_t *ss = 0;
	1316
	1317	/* If the canonical decomposition of the string includes any combining
	1318	* character that case-folds to a non-combining character then we must
	1319	* normalize before we fold. In Unicode 5.0.0 this means 0345 COMBINING
	1320	* GREEK YPOGEGRAMMENI in its decomposition and the various characters that
	1321	* canonically decompose to it. */
bcf9ed7f RK	1322	for(n = 0; n < ns; ++n)
bcf9ed7f RK	1323	if(utf32__unidata(s[n])->flags & unicode_normalize_before_casefold)
e5a5a138	1324	break;
e5a5a138 RK	1325	if(n < ns) {
	1326	/* We need a preliminary decomposition */
	1327	if(!(ss = utf32_decompose_canon(s, ns, &ns)))
	1328	return 0;
	1329	s = ss;
	1330	}
	1331	dynstr_ucs4_init(&d);
	1332	while(ns) {
	1333	c = *s++;
56fd389c	1334	if((c >= 0xD800 && c <= 0xDFFF) \|\| c > 0x10FFFF)
e5a5a138	1335	goto error;
56fd389c	1336	utf32__casefold_one(canon);
e5a5a138 RK	1337	--ns;
	1338	}
	1339	if(utf32__canonical_ordering(d.vec, d.nvec))
	1340	goto error;
	1341	dynstr_ucs4_terminate(&d);
	1342	if(ndp)
	1343	*ndp = d.nvec;
	1344	return d.vec;
	1345	error:
	1346	xfree(d.vec);
	1347	xfree(ss);
	1348	return 0;
	1349	}
	1350
f98fcddb	1351	/** @brief Compatibility case-fold @p [s,s+ns)
56fd389c RK	1352	* @param s Pointer to string
	1353	* @param ns Length of string
	1354	* @param ndp Where to store length of result
f98fcddb	1355	* @return Pointer to result string, or NULL on error
56fd389c RK	1356	*
	1357	* Case-fold the string at @p s according to full default case-folding rules
	1358	* (s3.13) for compatibility caseless matching. The result will be in NFKD.
	1359	*
	1360	* Returns NULL if the string is not valid for either of the following reasons:
	1361	* - it codes for a UTF-16 surrogate
	1362	* - it codes for a value outside the unicode code space
	1363	*/
	1364	uint32_t utf32_casefold_compat(const uint32_t s, size_t ns, size_t *ndp) {
	1365	struct dynstr_ucs4 d;
	1366	uint32_t c;
	1367	size_t n;
	1368	uint32_t *ss = 0;
	1369
bcf9ed7f RK	1370	for(n = 0; n < ns; ++n)
bcf9ed7f RK	1371	if(utf32__unidata(s[n])->flags & unicode_normalize_before_casefold)
56fd389c	1372	break;
56fd389c RK	1373	if(n < ns) {
	1374	/* We need a preliminary _canonical_ decomposition */
	1375	if(!(ss = utf32_decompose_canon(s, ns, &ns)))
	1376	return 0;
	1377	s = ss;
	1378	}
	1379	/* This computes NFKD(toCaseFold(s)) */
	1380	#define compat_casefold_middle() do { \
	1381	dynstr_ucs4_init(&d); \
	1382	while(ns) { \
	1383	c = *s++; \
	1384	if((c >= 0xD800 && c <= 0xDFFF) \|\| c > 0x10FFFF) \
	1385	goto error; \
	1386	utf32__casefold_one(compat); \
	1387	--ns; \
	1388	} \
	1389	if(utf32__canonical_ordering(d.vec, d.nvec)) \
	1390	goto error; \
	1391	} while(0)
	1392	/* Do the inner (NFKD o toCaseFold) */
	1393	compat_casefold_middle();
	1394	/* We can do away with the NFD'd copy of the input now */
	1395	xfree(ss);
	1396	s = ss = d.vec;
	1397	ns = d.nvec;
	1398	/* Do the outer (NFKD o toCaseFold) */
	1399	compat_casefold_middle();
	1400	/* That's all */
	1401	dynstr_ucs4_terminate(&d);
	1402	if(ndp)
	1403	*ndp = d.nvec;
	1404	return d.vec;
	1405	error:
	1406	xfree(d.vec);
	1407	xfree(ss);
	1408	return 0;
	1409	}
	1410
/** @brief Order a pair of UTF-32 strings
 * @param a First 0-terminated string
 * @param b Second 0-terminated string
 * @return -1, 0 or 1 for a less than, equal to or greater than b
 *
 * Strings are compared code point by code point, as unsigned values; a
 * string that is a proper prefix of another orders before it.
 *
 * "Comparable to strcmp() at its best."
 */
int utf32_cmp(const uint32_t *a, const uint32_t *b) {
  /* Skip the common prefix, stopping at either terminator */
  while(*a && *b && *a == *b) {
    ++a;
    ++b;
  }
  /* *a and *b are now the first differing elements, or both terminators */
  return *a < *b ? -1 : (*a > *b ? 1 : 0);
}
	1425
35b651f0 RK	1426	/** @brief Identify a grapheme cluster boundary
	1427	* @param s Start of string (must be NFD)
	1428	* @param ns Length of string
	1429	* @param n Index within string (in [0,ns].)
	1430	* @return 1 at a grapheme cluster boundary, 0 otherwise
	1431	*
	1432	* This function identifies default grapheme cluster boundaries as described in
f98fcddb	1433	* UAX #29 s3. It returns non-0 if @p n points at the code point just after a
35b651f0 RK	1434	* grapheme cluster boundary (including the hypothetical code point just after
35b651f0 RK	1435	* the end of the string).
f98fcddb RK	1436	*
	1437	* This function uses utf32_iterator_set() internally; see that function for
	1438	* remarks on performance.
35b651f0	1439	*/
1625e11a	1440	int utf32_is_grapheme_boundary(const uint32_t *s, size_t ns, size_t n) {
092f426f	1441	struct utf32_iterator_data it[1];
35b651f0	1442
092f426f RK	1443	utf32__iterator_init(it, s, ns, n);
092f426f RK	1444	return utf32_iterator_grapheme_boundary(it);
0b7052da RK	1445	}
	1446
	1447	/** @brief Identify a word boundary
	1448	* @param s Start of string (must be NFD)
	1449	* @param ns Length of string
	1450	* @param n Index within string (in [0,ns].)
	1451	* @return 1 at a word boundary, 0 otherwise
	1452	*
	1453	* This function identifies default word boundaries as described in UAX #29 s4.
f98fcddb	1454	* It returns non-0 if @p n points at the code point just after a word boundary
0b7052da	1455	* (including the hypothetical code point just after the end of the string).
f98fcddb RK	1456	*
	1457	* This function uses utf32_iterator_set() internally; see that function for
	1458	* remarks on performance.
0b7052da RK	1459	*/
0b7052da RK	1460	int utf32_is_word_boundary(const uint32_t *s, size_t ns, size_t n) {
092f426f	1461	struct utf32_iterator_data it[1];
0b7052da	1462
092f426f RK	1463	utf32__iterator_init(it, s, ns, n);
092f426f RK	1464	return utf32_iterator_word_boundary(it);
0b7052da RK	1465	}
0b7052da RK	1466
8818b7fc RK	1467	/** @brief Split [s,ns) into multiple words
	1468	* @param s Pointer to start of string
	1469	* @param ns Length of string
	1470	* @param nwp Where to store word count, or NULL
c85b7022	1471	* @param wbreak Word_Break property tailor, or NULL
8818b7fc RK	1472	* @return Pointer to array of pointers to words
	1473	*
	1474	* The returned array is terminated by a NULL pointer and individual
	1475	* strings are 0-terminated.
	1476	*/
c85b7022 RK	1477	uint32_t *utf32_word_split(const uint32_t s, size_t ns, size_t *nwp,
c85b7022 RK	1478	unicode_property_tailor *wbreak) {
8818b7fc RK	1479	struct utf32_iterator_data it[1];
	1480	size_t b1 = 0, b2 = 0 ,i;
	1481	int isword;
	1482	struct vector32 v32[1];
	1483	uint32_t *w;
	1484
	1485	vector32_init(v32);
	1486	utf32__iterator_init(it, s, ns, 0);
c85b7022	1487	it->word_break = wbreak;
8818b7fc RK	1488	/* Work our way through the string stopping at each word break. */
	1489	do {
	1490	if(utf32_iterator_word_boundary(it)) {
	1491	/* We've found a new boundary */
	1492	b1 = b2;
	1493	b2 = it->n;
	1494	/fprintf(stderr, "[%zu, %zu) is a candidate word\n", b1, b2);/
	1495	/* Inspect the characters between the boundary and form an opinion as to
	1496	* whether they are a word or not */
	1497	isword = 0;
	1498	for(i = b1; i < b2; ++i) {
c85b7022	1499	switch(utf32__iterator_word_break(it, it->s[i])) {
8818b7fc RK	1500	case unicode_Word_Break_ALetter:
	1501	case unicode_Word_Break_Numeric:
	1502	case unicode_Word_Break_Katakana:
	1503	isword = 1;
	1504	break;
	1505	default:
	1506	break;
	1507	}
	1508	}
	1509	/* If it's a word add it to the list of results */
	1510	if(isword) {
8e93ddd1 RK	1511	const size_t len = b2 - b1;
	1512	w = xcalloc_noptr(len + 1, sizeof(uint32_t));
	1513	memcpy(w, it->s + b1, len * sizeof (uint32_t));
	1514	w[len] = 0;
8818b7fc RK	1515	vector32_append(v32, w);
	1516	}
	1517	}
	1518	} while(!utf32_iterator_advance(it, 1));
	1519	vector32_terminate(v32);
	1520	if(nwp)
	1521	*nwp = v32->nvec;
	1522	return v32->vec;
	1523	}
	1524
/*@}*/
/** @defgroup utf8 Functions that operate on UTF-8 strings */
/*@{*/
	1528
	1529	/** @brief Wrapper to transform a UTF-8 string using the UTF-32 function */
	1530	#define utf8__transform(FN) do { \
	1531	uint32_t to32 = 0, decomp32 = 0; \
	1532	size_t nto32, ndecomp32; \
	1533	char *decomp8 = 0; \
	1534	\
	1535	if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error; \
	1536	if(!(decomp32 = FN(to32, nto32, &ndecomp32))) goto error; \
	1537	decomp8 = utf32_to_utf8(decomp32, ndecomp32, ndp); \
	1538	error: \
	1539	xfree(to32); \
	1540	xfree(decomp32); \
	1541	return decomp8; \
	1542	} while(0)
	1543
	1544	/** @brief Canonically decompose @p [s,s+ns)
	1545	* @param s Pointer to string
	1546	* @param ns Length of string
	1547	* @param ndp Where to store length of result
f98fcddb	1548	* @return Pointer to result string, or NULL on error
e5a5a138	1549	*
0ae60b83 RK	1550	* Computes NFD (Normalization Form D) of the string at @p s. This implies
	1551	* performing all canonical decompositions and then normalizing the order of
	1552	* combining characters.
e5a5a138 RK	1553	*
	1554	* Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
	1555	* this might be.
	1556	*
0ae60b83 RK	1557	* See also:
	1558	* - utf32_decompose_canon().
	1559	* - utf8_decompose_compat()
	1560	* - utf8_compose_canon()
e5a5a138 RK	1561	*/
	1562	char utf8_decompose_canon(const char s, size_t ns, size_t *ndp) {
	1563	utf8__transform(utf32_decompose_canon);
	1564	}
	1565
	1566	/** @brief Compatibility decompose @p [s,s+ns)
	1567	* @param s Pointer to string
	1568	* @param ns Length of string
	1569	* @param ndp Where to store length of result
f98fcddb	1570	* @return Pointer to result string, or NULL on error
e5a5a138	1571	*
0ae60b83 RK	1572	* Computes NFKD (Normalization Form KD) of the string at @p s. This implies
	1573	* performing all canonical and compatibility decompositions and then
	1574	* normalizing the order of combining characters.
e5a5a138 RK	1575	*
	1576	* Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
	1577	* this might be.
	1578	*
0ae60b83 RK	1579	* See also:
	1580	* - utf32_decompose_compat().
	1581	* - utf8_decompose_canon()
	1582	* - utf8_compose_compat()
e5a5a138 RK	1583	*/
	1584	char utf8_decompose_compat(const char s, size_t ns, size_t *ndp) {
	1585	utf8__transform(utf32_decompose_compat);
	1586	}
	1587
0ae60b83 RK	1588	/** @brief Canonically compose @p [s,s+ns)
	1589	* @param s Pointer to string
	1590	* @param ns Length of string
	1591	* @param ndp Where to store length of result
	1592	* @return Pointer to result string, or NULL on error
	1593	*
	1594	* Computes NFC (Normalization Form C) of the string at @p s. This implies
	1595	* performing all canonical decompositions, normalizing the order of combining
	1596	* characters and then composing all unblocked primary compositables.
	1597	*
	1598	* Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
	1599	* this might be.
	1600	*
	1601	* See also:
	1602	* - utf32_compose_canon()
	1603	* - utf8_compose_compat()
	1604	* - utf8_decompose_canon()
	1605	*/
	1606	char utf8_compose_canon(const char s, size_t ns, size_t *ndp) {
	1607	utf8__transform(utf32_compose_canon);
	1608	}
	1609
	1610	/** @brief Compatibility compose @p [s,s+ns)
	1611	* @param s Pointer to string
	1612	* @param ns Length of string
	1613	* @param ndp Where to store length of result
	1614	* @return Pointer to result string, or NULL on error
	1615	*
	1616	* Computes NFKC (Normalization Form KC) of the string at @p s. This implies
	1617	* performing all canonical and compatibility decompositions, normalizing the
	1618	* order of combining characters and then composing all unblocked primary
	1619	* compositables.
	1620	*
	1621	* Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
	1622	* this might be.
	1623	*
	1624	* See also:
	1625	* - utf32_compose_compat()
	1626	* - utf8_compose_canon()
	1627	* - utf8_decompose_compat()
	1628	*/
	1629	char utf8_compose_compat(const char s, size_t ns, size_t *ndp) {
	1630	utf8__transform(utf32_compose_compat);
	1631	}
	1632
e5a5a138 RK	1633	/** @brief Case-fold @p [s,s+ns)
	1634	* @param s Pointer to string
	1635	* @param ns Length of string
	1636	* @param ndp Where to store length of result
f98fcddb	1637	* @return Pointer to result string, or NULL on error
e5a5a138 RK	1638	*
	1639	* Case-fold the string at @p s according to full default case-folding rules
	1640	* (s3.13). The result will be in NFD.
	1641	*
	1642	* Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
	1643	* this might be.
	1644	*/
	1645	char utf8_casefold_canon(const char s, size_t ns, size_t *ndp) {
	1646	utf8__transform(utf32_casefold_canon);
	1647	}
	1648
	1649	/** @brief Compatibility case-fold @p [s,s+ns)
	1650	* @param s Pointer to string
	1651	* @param ns Length of string
	1652	* @param ndp Where to store length of result
f98fcddb	1653	* @return Pointer to result string, or NULL on error
e5a5a138 RK	1654	*
	1655	* Case-fold the string at @p s according to full default case-folding rules
	1656	* (s3.13). The result will be in NFKD.
	1657	*
	1658	* Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
	1659	* this might be.
	1660	*/
e5a5a138 RK	1661	char utf8_casefold_compat(const char s, size_t ns, size_t *ndp) {
	1662	utf8__transform(utf32_casefold_compat);
	1663	}
e5a5a138	1664
8818b7fc RK	1665	/** @brief Split [s,ns) into multiple words
	1666	* @param s Pointer to start of string
	1667	* @param ns Length of string
	1668	* @param nwp Where to store word count, or NULL
c85b7022	1669	* @param wbreak Word_Break property tailor, or NULL
8818b7fc RK	1670	* @return Pointer to array of pointers to words
	1671	*
	1672	* The returned array is terminated by a NULL pointer and individual
	1673	* strings are 0-terminated.
	1674	*/
c85b7022 RK	1675	char *utf8_word_split(const char s, size_t ns, size_t *nwp,
c85b7022 RK	1676	unicode_property_tailor *wbreak) {
8818b7fc RK	1677	uint32_t to32 = 0, *v32 = 0;
	1678	size_t nto32, nv, n;
	1679	char v8 = 0, ret = 0;
c85b7022	1680
8818b7fc	1681	if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error;
c85b7022	1682	if(!(v32 = utf32_word_split(to32, nto32, &nv, wbreak))) goto error;
8818b7fc RK	1683	v8 = xcalloc(sizeof (char *), nv + 1);
	1684	for(n = 0; n < nv; ++n)
	1685	if(!(v8[n] = utf32_to_utf8(v32[n], utf32_len(v32[n]), 0)))
	1686	goto error;
	1687	ret = v8;
	1688	*nwp = nv;
	1689	v8 = 0; /* don't free */
c85b7022	1690	error:
8818b7fc RK	1691	if(v8) {
	1692	for(n = 0; n < nv; ++n)
	1693	xfree(v8[n]);
	1694	xfree(v8);
	1695	}
	1696	if(v32) {
	1697	for(n = 0; n < nv; ++n)
	1698	xfree(v32[n]);
	1699	xfree(v32);
	1700	}
	1701	xfree(to32);
	1702	return ret;
	1703	}
	1704
	1705
/*@}*/
e5a5a138 RK	1707
/** @brief Count the code units in a 0-terminated UTF-16 string
 * @param s Pointer to 0-terminated string
 * @return Number of 16-bit elements preceding the terminator
 *
 * The contents of the string are not validated in any way; only the
 * terminating 0 is looked for.  Every 16-bit element counts as one, so a
 * surrogate pair contributes 2 to the result.
 */
size_t utf16_len(const uint16_t *s) {
  size_t n = 0;

  while(s[n] != 0)
    ++n;
  return n;
}
	1721
e5a5a138 RK	1722	/*
	1723	Local Variables:
	1724	c-basic-offset:2
	1725	comment-column:40
	1726	fill-column:79
	1727	indent-tabs-mode:nil
	1728	End:
	1729	*/