+/** @brief Test whether [b1,b2) contains a word-forming code point
+ * @param it Iterator whose string (it->s) is inspected; also passed to
+ *           utf32__iterator_word_break() for each code point
+ * @param b1 Index of the first code point to inspect
+ * @param b2 Index one past the last code point to inspect
+ * @return 1 if any code point is ALetter, Numeric or Katakana, else 0
+ *
+ * Returns as soon as the first qualifying code point is seen, so the
+ * Word_Break lookup is not necessarily performed for the whole range.
+ */
+static int utf32__span_has_word_char(struct utf32_iterator_data *it,
+                                     size_t b1, size_t b2) {
+  size_t i;
+
+  for(i = b1; i < b2; ++i) {
+    switch(utf32__iterator_word_break(it, it->s[i])) {
+    case unicode_Word_Break_ALetter:
+    case unicode_Word_Break_Numeric:
+    case unicode_Word_Break_Katakana:
+      return 1;
+    default:
+      break;
+    }
+  }
+  return 0;
+}
+
+/** @brief Split [s,ns) into multiple words
+ * @param s Pointer to start of string
+ * @param ns Length of string
+ * @param nwp Where to store word count, or NULL
+ * @param wbreak Word_Break property tailor, or NULL
+ * @return Pointer to array of pointers to words
+ *
+ * The string is cut at every position reported by
+ * utf32_iterator_word_boundary().  A segment between two consecutive
+ * boundaries is kept as a word only if it contains at least one code point
+ * whose Word_Break property is ALetter, Numeric or Katakana; all other
+ * segments are discarded.
+ *
+ * The returned array is terminated by a NULL pointer and individual
+ * strings are 0-terminated.  Each word is a fresh copy obtained from
+ * xcalloc_noptr(); the input string is not modified.
+ */
+uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp,
+                            unicode_property_tailor *wbreak) {
+  struct utf32_iterator_data it[1];
+  struct vector32 v32[1];
+  size_t b2 = 0;                        /* most recent boundary seen */
+
+  vector32_init(v32);
+  utf32__iterator_init(it, s, ns, 0);
+  it->word_break = wbreak;
+  /* Work our way through the string stopping at each word break.  The loop
+   * keeps going for as long as utf32_iterator_advance() returns 0. */
+  do {
+    if(utf32_iterator_word_boundary(it)) {
+      /* We've found a new boundary: [b1,b2) is the candidate segment lying
+       * between the previous boundary and this one */
+      const size_t b1 = b2;
+
+      b2 = it->n;
+      /* If it's a word add a 0-terminated copy to the list of results */
+      if(utf32__span_has_word_char(it, b1, b2)) {
+        const size_t len = b2 - b1;
+        uint32_t *w = xcalloc_noptr(len + 1, sizeof *w);
+
+        memcpy(w, it->s + b1, len * sizeof *w);
+        w[len] = 0;
+        vector32_append(v32, w);
+      }
+    }
+  } while(!utf32_iterator_advance(it, 1));
+  vector32_terminate(v32);
+  if(nwp)
+    *nwp = v32->nvec;
+  return v32->vec;
+}
+