test and fix utf32_iterator_set()

[disorder] / lib / words.c
diff --git a/lib/words.c b/lib/words.c

index e6e4087b444ad4e0d7b5db8526212b1584920da6..2638ea645c991974696fef808afaf29850f6cbf9 100644 (file)
--- a/lib/words.c
+++ b/lib/words.c
@@ -33,36 +33,18 @@
  #include "charset.h"
  
  #include "unidata.h"
+#include "unicode.h"
  
  const char *casefold(const char *ptr) {
-  struct dynstr d;
-  uint32_t c;
-  const char *s = ptr;
-
-  dynstr_init(&d);
-  while(*s) {
-    /* Convert UTF-8 to UCS-32 */
-    PARSE_UTF8(s, c, return ptr);
-    /* Normalize */
-    if(c < UNICODE_NCHARS) {
-      /* If this a known character, convert it to lower case */
-      const struct unidata *const ud = &unidata[c / 256][c % 256];
-      c += ud->lower_offset;
-    }
-    /* Convert UCS-4 back to UTF-8 */
-    one_ucs42utf8(c, &d);
-  }
-  dynstr_terminate(&d);
-  return d.vec;
+  return utf8_casefold_canon(ptr, strlen(ptr), 0);
  }
  
-static enum unicode_gc_cat cat(uint32_t c) {
+static enum unicode_General_Category cat(uint32_t c) {
    if(c < UNICODE_NCHARS) {
-    /* If this a known character, convert it to lower case */
-    const struct unidata *const ud = &unidata[c / 256][c % 256];
-    return ud->gc;
+    const struct unidata *const ud = &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
+    return ud->general_category;
    } else
-    return unicode_gc_Cn;
+    return unicode_General_Category_Cn;
  }
  
  /* XXX this is a bit kludgy */
@@ -91,18 +73,18 @@ char **words(const char *s, int *nvecp) {
      }
      /* do the rest on category */
      switch(cat(c)) {
-    case unicode_gc_Ll:
-    case unicode_gc_Lm:
-    case unicode_gc_Lo:
-    case unicode_gc_Lt:
-    case unicode_gc_Lu:
-    case unicode_gc_Nd:
-    case unicode_gc_Nl:
-    case unicode_gc_No:
-    case unicode_gc_Sc:
-    case unicode_gc_Sk:
-    case unicode_gc_Sm:
-    case unicode_gc_So:
+    case unicode_General_Category_Ll:
+    case unicode_General_Category_Lm:
+    case unicode_General_Category_Lo:
+    case unicode_General_Category_Lt:
+    case unicode_General_Category_Lu:
+    case unicode_General_Category_Nd:
+    case unicode_General_Category_Nl:
+    case unicode_General_Category_No:
+    case unicode_General_Category_Sc:
+    case unicode_General_Category_Sk:
+    case unicode_General_Category_Sm:
+    case unicode_General_Category_So:
        /* letters, digits and symbols are considered to be part of
         * words */
        if(!in_word) {
@@ -112,15 +94,15 @@ char **words(const char *s, int *nvecp) {
        dynstr_append_bytes(&d, start, s - start);
        break;
  
-    case unicode_gc_Cc:
-    case unicode_gc_Cf:
-    case unicode_gc_Co:
-    case unicode_gc_Cs:
-    case unicode_gc_Zl:
-    case unicode_gc_Zp:
-    case unicode_gc_Zs:
-    case unicode_gc_Pe:
-    case unicode_gc_Ps:
+    case unicode_General_Category_Cc:
+    case unicode_General_Category_Cf:
+    case unicode_General_Category_Co:
+    case unicode_General_Category_Cs:
+    case unicode_General_Category_Zl:
+    case unicode_General_Category_Zp:
+    case unicode_General_Category_Zs:
+    case unicode_General_Category_Pe:
+    case unicode_General_Category_Ps:
      separator:
        if(in_word) {
         dynstr_terminate(&d);
@@ -129,15 +111,15 @@ char **words(const char *s, int *nvecp) {
        }
        break;
  
-    case unicode_gc_Mc:
-    case unicode_gc_Me:
-    case unicode_gc_Mn:
-    case unicode_gc_Pc:
-    case unicode_gc_Pd:
-    case unicode_gc_Pf:
-    case unicode_gc_Pi:
-    case unicode_gc_Po:
-    case unicode_gc_Cn:
+    case unicode_General_Category_Mc:
+    case unicode_General_Category_Me:
+    case unicode_General_Category_Mn:
+    case unicode_General_Category_Pc:
+    case unicode_General_Category_Pd:
+    case unicode_General_Category_Pf:
+    case unicode_General_Category_Pi:
+    case unicode_General_Category_Po:
+    case unicode_General_Category_Cn:
        /* control and punctuation is completely ignored */
        break;