utf32_word_split() and utf8_word_split() split a string into words

[disorder] / lib / unicode.c
diff --git a/lib/unicode.c b/lib/unicode.c

index 45d0f0b3f70e664a07e029dcc3b8b0144a18e786..b5b520cf07991a60f04a15d1ebb60ac966170676 100644 (file)
--- a/lib/unicode.c
+++ b/lib/unicode.c
@@ -30,6 +30,13 @@
   * database code.
   *
   * As the code stands this guarantee is not well met!
+ *
+ * Subpages:
+ * - @ref utf32props
+ * - @ref utftransform
+ * - @ref utf32iterator
+ * - @ref utf32
+ * - @ref utf8
   */
  
  #include <config.h>
@@ -103,7 +110,7 @@ static inline int utf32__combining_class(uint32_t c) {
  }
  
  /** @brief Return the General_Category value for @p c
- * @param Code point
+ * @param c Code point
   * @return General_Category property value
   *
   * @p c can be any 32-bit value, a sensible value will be returned regardless.
@@ -1264,6 +1271,59 @@ int utf32_is_word_boundary(const uint32_t *s, size_t ns, size_t n) {
    return utf32_iterator_word_boundary(it);
  }
  
+/** @brief Split [s,s+ns) into multiple words
+ * @param s Pointer to start of string
+ * @param ns Length of string
+ * @param nwp Where to store word count, or NULL
+ * @return Pointer to array of pointers to words
+ *
+ * The returned array is terminated by a NULL pointer and individual
+ * strings are 0-terminated.
+ *
+ * The string is cut at every position that utf32_iterator_word_boundary()
+ * reports as a boundary.  A segment between two consecutive boundaries is only
+ * returned as a word if it contains at least one character whose Word_Break
+ * property is ALetter, Numeric or Katakana; all other segments are dropped.
+ *
+ * Each returned word is a separately allocated copy of the corresponding part
+ * of @p s.
+ */
+uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp) {
+  struct utf32_iterator_data it[1];
+  size_t b1 = 0, b2 = 0 ,i;
+  int isword;
+  struct vector32 v32[1];
+  uint32_t *w;
+
+  vector32_init(v32);
+  utf32__iterator_init(it, s, ns, 0);
+  /* Work our way through the string stopping at each word break.  The loop
+   * ends once utf32_iterator_advance() returns nonzero. */
+  do {
+    if(utf32_iterator_word_boundary(it)) {
+      /* We've found a new boundary.  The candidate word is [b1,b2): it runs
+       * from the previous boundary to the iterator's current position. */
+      b1 = b2;
+      b2 = it->n;
+      /*fprintf(stderr, "[%zu, %zu) is a candidate word\n", b1, b2);*/
+      /* Inspect the characters between the boundaries and form an opinion as
+       * to whether they are a word or not */
+      isword = 0;
+      for(i = b1; i < b2; ++i) {
+        switch(utf32__word_break(it->s[i])) {
+        case unicode_Word_Break_ALetter:
+        case unicode_Word_Break_Numeric:
+        case unicode_Word_Break_Katakana:
+          isword = 1;
+          break;
+        default:
+          break;
+        }
+      }
+      /* If it's a word add it to the list of results.  The copy is allocated
+       * one element longer than the segment; that extra element serves as the
+       * 0 terminator (assuming xcalloc() zero-fills as calloc() does). */
+      if(isword) {
+        w = xcalloc(b2 - b1 + 1, sizeof(uint32_t));
+        memcpy(w, it->s + b1, (b2 - b1) * sizeof (uint32_t));
+        vector32_append(v32, w);
+      }
+    }
+  } while(!utf32_iterator_advance(it, 1));
+  vector32_terminate(v32);
+  if(nwp)
+    *nwp = v32->nvec;
+  return v32->vec;
+}
+
  /*@}*/
  /** @defgroup utf8 Functions that operate on UTF-8 strings */
  /*@{*/
@@ -1289,15 +1349,17 @@ error:                                                          \
   * @param ndp Where to store length of result
   * @return Pointer to result string, or NULL on error
   *
- * Computes the canonical decomposition of a string and stably sorts combining
- * characters into canonical order.  The result is in Normalization Form D and
- * (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
- * NormalizationTest.txt.
+ * Computes NFD (Normalization Form D) of the string at @p s.  This implies
+ * performing all canonical decompositions and then normalizing the order of
+ * combining characters.
   *
   * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
   * this might be.
   *
- * See also utf32_decompose_canon().
+ * See also:
+ * - utf32_decompose_canon().
+ * - utf8_decompose_compat()
+ * - utf8_compose_canon()
   */
  char *utf8_decompose_canon(const char *s, size_t ns, size_t *ndp) {
    utf8__transform(utf32_decompose_canon);
@@ -1309,20 +1371,67 @@ char *utf8_decompose_canon(const char *s, size_t ns, size_t *ndp) {
   * @param ndp Where to store length of result
   * @return Pointer to result string, or NULL on error
   *
- * Computes the compatibility decomposition of a string and stably sorts
- * combining characters into canonical order.  The result is in Normalization
- * Form KD and (at the time of writing!) passes the NFKD tests defined in
- * Unicode 5.0's NormalizationTest.txt.
+ * Computes NFKD (Normalization Form KD) of the string at @p s.  This implies
+ * performing all canonical and compatibility decompositions and then
+ * normalizing the order of combining characters.
   *
   * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
   * this might be.
   *
- * See also utf32_decompose_compat().
+ * See also:
+ * - utf32_decompose_compat().
+ * - utf8_decompose_canon()
+ * - utf8_compose_compat()
   */
  char *utf8_decompose_compat(const char *s, size_t ns, size_t *ndp) {
    utf8__transform(utf32_decompose_compat);
  }
  
+/** @brief Canonically compose @p [s,s+ns)
+ * @param s Pointer to string
+ * @param ns Length of string
+ * @param ndp Where to store length of result
+ * @return Pointer to result string, or NULL on error
+ *
+ * Computes NFC (Normalization Form C) of the string at @p s.  This implies
+ * performing all canonical decompositions, normalizing the order of combining
+ * characters and then composing all unblocked primary compositables.
+ *
+ * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
+ * this might be.
+ *
+ * Implemented by applying utf32_compose_canon() through the utf8__transform()
+ * macro, which supplies the whole function body.
+ *
+ * See also:
+ * - utf32_compose_canon()
+ * - utf8_compose_compat()
+ * - utf8_decompose_canon()
+ */
+char *utf8_compose_canon(const char *s, size_t ns, size_t *ndp) {
+  utf8__transform(utf32_compose_canon);
+}
+
+/** @brief Compatibility compose @p [s,s+ns)
+ * @param s Pointer to string
+ * @param ns Length of string
+ * @param ndp Where to store length of result
+ * @return Pointer to result string, or NULL on error
+ *
+ * Computes NFKC (Normalization Form KC) of the string at @p s.  This implies
+ * performing all canonical and compatibility decompositions, normalizing the
+ * order of combining characters and then composing all unblocked primary
+ * compositables.
+ *
+ * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
+ * this might be.
+ *
+ * Implemented by applying utf32_compose_compat() through the utf8__transform()
+ * macro, which supplies the whole function body.
+ *
+ * See also:
+ * - utf32_compose_compat()
+ * - utf8_compose_canon()
+ * - utf8_decompose_compat()
+ */
+char *utf8_compose_compat(const char *s, size_t ns, size_t *ndp) {
+  utf8__transform(utf32_compose_compat);
+}
+
  /** @brief Case-fold @p [s,s+ns)
   * @param s Pointer to string
   * @param ns Length of string
@@ -1355,6 +1464,45 @@ char *utf8_casefold_compat(const char *s, size_t ns, size_t *ndp) {
    utf8__transform(utf32_casefold_compat);
  }
  
+/** @brief Split [s,s+ns) into multiple words
+ * @param s Pointer to start of string
+ * @param ns Length of string
+ * @param nwp Where to store word count, or NULL
+ * @return Pointer to array of pointers to words, or NULL on error
+ *
+ * The returned array is terminated by a NULL pointer and individual
+ * strings are 0-terminated.
+ *
+ * The string is converted with utf8_to_utf32(), split with
+ * utf32_word_split() and each resulting word is converted back with
+ * utf32_to_utf8().  If any of those steps fails then NULL is returned and
+ * @p *nwp is left unmodified.
+ *
+ * See also utf32_word_split().
+ */
+char **utf8_word_split(const char *s, size_t ns, size_t *nwp) {
+  uint32_t *to32 = 0, **v32 = 0;
+  size_t nto32, nv, n;
+  char **v8 = 0, **ret = 0;
+
+  if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error;
+  if(!(v32 = utf32_word_split(to32, nto32, &nv))) goto error;
+  /* One slot more than the word count: the spare slot is the NULL terminator.
+   * The cleanup below also relies on unconverted slots being null (assuming
+   * xcalloc() zero-fills as calloc() does). */
+  v8 = xcalloc(nv + 1, sizeof (char *));
+  for(n = 0; n < nv; ++n)
+    if(!(v8[n] = utf32_to_utf8(v32[n], utf32_len(v32[n]), 0)))
+      goto error;
+  ret = v8;
+  /* nwp is documented as optional, so only store through it when non-NULL */
+  if(nwp)
+    *nwp = nv;
+  v8 = 0;                               /* don't free */
+error:
+  /* Shared exit path.  v8 is only still set here if a conversion failed, in
+   * which case the partial result is discarded. */
+  if(v8) {
+    for(n = 0; n < nv; ++n)
+      xfree(v8[n]);
+    xfree(v8);
+  }
+  /* The UTF-32 intermediates are released on success and failure alike */
+  if(v32) {
+    for(n = 0; n < nv; ++n)
+      xfree(v32[n]);
+    xfree(v32);
+  }
+  xfree(to32);
+  return ret;
+}
+
+
  /*@}*/
  
  /*