From c85b702265244a34b1f977c9bf22f913689b906f Mon Sep 17 00:00:00 2001
Message-Id: <c85b702265244a34b1f977c9bf22f913689b906f.1715140704.git.mdw@distorted.org.uk>
From: Mark Wooding <mdw@chiark.greenend.org.uk>
Date: Tue, 20 Nov 2007 20:32:06 +0000
Subject: [PATCH] unicode.c iterators can now have their notion of Word_Break
 tailored.
Organization: Straylight/Edgeware

From: Richard Kettlewell <rjk@greenend.org.uk>

We use this to arrange that underscores are treated as spaces, since a
common track naming convention replaces spaces with underscores.  We
only apply this to raw filenames, not to the overrides provided by
preferences, on the assumption that if you typed an underscore there
you really meant it.

words.c/h are now gone; all references to words() and casefold() use
unicode.c functions directly.
---
 lib/Makefile.am  |   1 -
 lib/test.c       |   5 +--
 lib/trackname.c  |  11 +++--
 lib/unicode.c    | 104 +++++++++++++++++++++++++++++++++--------------
 lib/unicode.h    |  16 +++++++-
 lib/words.c      |  56 -------------------------
 lib/words.h      |  39 ------------------
 server/dcgi.c    |   1 -
 server/trackdb.c |  45 ++++++++++++++++----
 9 files changed, 134 insertions(+), 144 deletions(-)
 delete mode 100644 lib/words.c
 delete mode 100644 lib/words.h

diff --git a/lib/Makefile.am b/lib/Makefile.am
index 0eeccd7..b642c89 100644
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -69,7 +69,6 @@ libdisorder_a_SOURCES=charset.c charset.h		\
 	vacopy.h					\
 	vector.c vector.h				\
 	wav.h wav.c					\
-	words.c words.h 				\
 	wstat.c wstat.h					\
 	disorder.h
 
diff --git a/lib/test.c b/lib/test.c
index 0790efc..f4043b9 100644
--- a/lib/test.c
+++ b/lib/test.c
@@ -38,7 +38,6 @@
 #include "charset.h"
 #include "mime.h"
 #include "hex.h"
-#include "words.h"
 #include "heap.h"
 #include "unicode.h"
 #include "inputline.h"
@@ -411,7 +410,7 @@ static void test_casefold(void) {
       ++tests;
     }
   }
-  check_string(casefold(""), "");
+  check_string(utf8_casefold_canon("", 0, 0), "");
 }
 
 struct {
@@ -450,7 +449,7 @@ static void test_words(void) {
   
   fprintf(stderr, "test_words\n");
   for(t = 0; t < NWTEST; ++t) {
-    char **got = utf8_word_split(wtest[t].in, strlen(wtest[t].in), &ngot);
+    char **got = utf8_word_split(wtest[t].in, strlen(wtest[t].in), &ngot, 0);
 
     for(nexpect = 0; wtest[t].expect[nexpect]; ++nexpect)
       ;
diff --git a/lib/trackname.c b/lib/trackname.c
index 53824d7..bb07e64 100644
--- a/lib/trackname.c
+++ b/lib/trackname.c
@@ -31,7 +31,7 @@
 #include "regsub.h"
 #include "log.h"
 #include "filepart.h"
-#include "words.h"
+#include "unicode.h"
 
 const struct collection *find_track_collection(const char *track) {
   int n;
@@ -114,15 +114,20 @@ int compare_tracks(const char *sa, const char *sb,
 		   const char *ta, const char *tb) {
   int c;
 
-  if((c = strcmp(casefold(sa), casefold(sb)))) return c;
+  if((c = strcmp(utf8_casefold_canon(sa, strlen(sa), 0),
+		 utf8_casefold_canon(sb, strlen(sb), 0))))
+    return c;
   if((c = strcmp(sa, sb))) return c;
-  if((c = strcmp(casefold(da), casefold(db)))) return c;
+  if((c = strcmp(utf8_casefold_canon(da, strlen(da), 0),
+		 utf8_casefold_canon(db, strlen(db), 0))))
+    return c;
   if((c = strcmp(da, db))) return c;
   return compare_path(ta, tb);
 }
 
 int compare_path_raw(const unsigned char *ap, size_t an,
 		     const unsigned char *bp, size_t bn) {
+  /* Don't change this function!  The database sort order depends on it */
   while(an > 0 && bn > 0) {
     if(*ap == *bp) {
       ap++;
diff --git a/lib/unicode.c b/lib/unicode.c
index b5b520c..40b9854 100644
--- a/lib/unicode.c
+++ b/lib/unicode.c
@@ -372,21 +372,10 @@ struct utf32_iterator_data {
    * the value is (uint32_t)-1.
    */
   uint32_t last[2];
-};
 
-/** @brief Create a new iterator pointing at the start of a string
- * @param s Start of string
- * @param ns Length of string
- * @return New iterator
- */
-utf32_iterator utf32_iterator_new(const uint32_t *s, size_t ns) {
-  utf32_iterator it = xmalloc(sizeof *it);
-  it->s = s;
-  it->ns = ns;
-  it->n = 0;
-  it->last[0] = it->last[1] = -1;
-  return it;
-}
+  /** @brief Tailoring for Word_Break */
+  unicode_property_tailor *word_break;
+};
 
 /** @brief Initialize an internal private iterator
  * @param it Iterator
@@ -400,9 +389,54 @@ static void utf32__iterator_init(utf32_iterator it,
   it->ns = ns;
   it->n = 0;
   it->last[0] = it->last[1] = -1;
+  it->word_break = 0;
   utf32_iterator_set(it, n);
 }
 
+/** @brief Create a new heap-allocated iterator at the start of a string
+ * @param s Start of string (UTF-32 code points)
+ * @param ns Length of string, in code points
+ * @return New iterator, positioned at 0 with no Word_Break tailoring
+ */
+utf32_iterator utf32_iterator_new(const uint32_t *s, size_t ns) {
+  utf32_iterator it = xmalloc(sizeof *it);
+  utf32__iterator_init(it, s, ns, 0);   /* shared init also clears tailoring */
+  return it;
+}
+
+/** @brief Tailor this iterator's interpretation of the Word_Break property.
+ * @param it Iterator
+ * @param pt Property tailor function or NULL
+ *
+ * After calling this the iterator will call @p pt to determine the Word_Break
+ * property of each code point.  If it returns a negative value (normally -1)
+ * the default value will be used; otherwise the returned value will be used.
+ *
+ * @p pt can be NULL to revert to the default value of the property.
+ *
+ * It is safe to call this function at any time; the iterator's internal state
+ * will be reset to suit the new tailoring.
+ */
+void utf32_iterator_tailor_word_break(utf32_iterator it,
+                                      unicode_property_tailor *pt) {
+  it->word_break = pt;
+  utf32_iterator_set(it, it->n);        /* re-derive state at same position */
+}
+
+static inline enum unicode_Word_Break utf32__iterator_word_break(utf32_iterator it,
+                                                                 uint32_t c) {
+  if(!it->word_break)                   /* no tailor: standard property */
+    return utf32__word_break(c);
+  else {
+    const int t = it->word_break(c);    /* ask the tailor first */
+
+    if(t < 0)                           /* tailor declined this code point */
+      return utf32__word_break(c);
+    else
+      return t;                         /* tailored Word_Break value */
+  }
+}
+
 /** @brief Destroy an iterator
  * @param it Iterator
  */
@@ -444,14 +478,18 @@ int utf32_iterator_set(utf32_iterator it, size_t n) {
     return -1;
   /* Walk backwards skipping ignorable code points */
   m = n;
-  while(m > 0 && (utf32__boundary_ignorable(utf32__word_break(it->s[m-1]))))
+  while(m > 0
+        && (utf32__boundary_ignorable(utf32__iterator_word_break(it,
+                                                                 it->s[m-1]))))
     --m;
   /* Either m=0 or s[m-1] is not ignorable */
   if(m > 0) {
     --m;
     /* s[m] is our first non-ignorable code; look for a second in the same
        way **/
-    while(m > 0 && (utf32__boundary_ignorable(utf32__word_break(it->s[m-1]))))
+    while(m > 0
+          && (utf32__boundary_ignorable(utf32__iterator_word_break(it,
+                                                                   it->s[m-1]))))
       --m;
     /* Either m=0 or s[m-1] is not ignorable */
     if(m > 0)
@@ -478,7 +516,7 @@ int utf32_iterator_advance(utf32_iterator it, size_t count) {
   if(count <= it->ns - it->n) {
     while(count > 0) {
       const uint32_t c = it->s[it->n];
-      const enum unicode_Word_Break wb = utf32__word_break(c);
+      const enum unicode_Word_Break wb = utf32__iterator_word_break(it, c);
       if(it->last[1] == (uint32_t)-1
          || !utf32__boundary_ignorable(wb)) {
         it->last[0] = it->last[1];
@@ -588,29 +626,30 @@ int utf32_iterator_word_boundary(utf32_iterator it) {
   /* WB4 */
   /* (!Sep) x (Extend|Format) as in UAX #29 s6.2 */
   if(utf32__sentence_break(it->s[it->n-1]) != unicode_Sentence_Break_Sep
-     && utf32__boundary_ignorable(utf32__word_break(it->s[it->n])))
+     && utf32__boundary_ignorable(utf32__iterator_word_break(it, it->s[it->n])))
     return 0;
   /* Gather the property values we'll need for the rest of the test taking the
    * s6.2 changes into account */
   /* First we look at the code points after the proposed boundary */
   nn = it->n;                           /* <it->ns */
-  after = utf32__word_break(it->s[nn++]);
+  after = utf32__iterator_word_break(it, it->s[nn++]);
   if(!utf32__boundary_ignorable(after)) {
     /* X (Extend|Format)* -> X */
     while(nn < it->ns
-          && utf32__boundary_ignorable(utf32__word_break(it->s[nn])))
+          && utf32__boundary_ignorable(utf32__iterator_word_break(it,
+                                                                  it->s[nn])))
       ++nn;
   }
   /* It's possible now that nn=ns */
   if(nn < it->ns)
-    twoafter = utf32__word_break(it->s[nn]);
+    twoafter = utf32__iterator_word_break(it, it->s[nn]);
   else
     twoafter = unicode_Word_Break_Other;
 
   /* We've already recorded the non-ignorable code points before the proposed
    * boundary */
-  before = utf32__word_break(it->last[1]);
-  twobefore = utf32__word_break(it->last[0]);
+  before = utf32__iterator_word_break(it, it->last[1]);
+  twobefore = utf32__iterator_word_break(it, it->last[0]);
 
   /* WB5 */
   if(before == unicode_Word_Break_ALetter
@@ -626,7 +665,7 @@ int utf32_iterator_word_boundary(utf32_iterator it) {
      && before == unicode_Word_Break_MidLetter
      && after == unicode_Word_Break_ALetter)
     return 0;
-  /* WB8 */  
+  /* WB8 */
   if(before == unicode_Word_Break_Numeric
      && after == unicode_Word_Break_Numeric)
     return 0;
@@ -1275,12 +1314,14 @@ int utf32_is_word_boundary(const uint32_t *s, size_t ns, size_t n) {
  * @param s Pointer to start of string
  * @param ns Length of string
  * @param nwp Where to store word count, or NULL
+ * @param wbreak Word_Break property tailor, or NULL
  * @return Pointer to array of pointers to words
  *
  * The returned array is terminated by a NULL pointer and individual
  * strings are 0-terminated.
  */
-uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp) {
+uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp,
+                            unicode_property_tailor *wbreak) {
   struct utf32_iterator_data it[1];
   size_t b1 = 0, b2 = 0 ,i;
   int isword;
@@ -1289,6 +1330,7 @@ uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp) {
 
   vector32_init(v32);
   utf32__iterator_init(it, s, ns, 0);
+  it->word_break = wbreak;
   /* Work our way through the string stopping at each word break. */
   do {
     if(utf32_iterator_word_boundary(it)) {
@@ -1300,7 +1342,7 @@ uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp) {
        * whether they are a word or not */
       isword = 0;
       for(i = b1; i < b2; ++i) {
-        switch(utf32__word_break(it->s[i])) {
+        switch(utf32__iterator_word_break(it, it->s[i])) {
         case unicode_Word_Break_ALetter:
         case unicode_Word_Break_Numeric:
         case unicode_Word_Break_Katakana:
@@ -1468,18 +1510,20 @@ char *utf8_casefold_compat(const char *s, size_t ns, size_t *ndp) {
  * @param s Pointer to start of string
  * @param ns Length of string
  * @param nwp Where to store word count, or NULL
+ * @param wbreak Word_Break property tailor, or NULL
  * @return Pointer to array of pointers to words
  *
  * The returned array is terminated by a NULL pointer and individual
  * strings are 0-terminated.
  */
-char **utf8_word_split(const char *s, size_t ns, size_t *nwp) {
+char **utf8_word_split(const char *s, size_t ns, size_t *nwp,
+                       unicode_property_tailor *wbreak) {
   uint32_t *to32 = 0, **v32 = 0;
   size_t nto32, nv, n;
   char **v8 = 0, **ret = 0;
-                                                                
+
   if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error;
-  if(!(v32 = utf32_word_split(to32, nto32, &nv))) goto error;
+  if(!(v32 = utf32_word_split(to32, nto32, &nv, wbreak))) goto error;
   v8 = xcalloc(sizeof (char *), nv + 1);
   for(n = 0; n < nv; ++n)
     if(!(v8[n] = utf32_to_utf8(v32[n], utf32_len(v32[n]), 0)))
@@ -1487,7 +1531,7 @@ char **utf8_word_split(const char *s, size_t ns, size_t *nwp) {
   ret = v8;
   *nwp = nv;
   v8 = 0;                               /* don't free */
-error:                                                          
+error:
   if(v8) {
     for(n = 0; n < nv; ++n)
       xfree(v8[n]);
diff --git a/lib/unicode.h b/lib/unicode.h
index 7f32207..e9e58ca 100644
--- a/lib/unicode.h
+++ b/lib/unicode.h
@@ -36,6 +36,14 @@
  */
 typedef struct utf32_iterator_data *utf32_iterator;
 
+/** @brief Property tailor function
+ * @param c Code point
+ * @return Tailored property or -1 to use standard value
+ *
+ * See also utf32_iterator_tailor_word_break().
+ */
+typedef int unicode_property_tailor(uint32_t c);
+
 char *utf32_to_utf8(const uint32_t *s, size_t ns, size_t *nd);
 uint32_t *utf8_to_utf32(const char *s, size_t ns, size_t *nd);
 int utf8_valid(const char *s, size_t ns);
@@ -73,9 +81,13 @@ int utf32_iterator_advance(utf32_iterator it, size_t n);
 uint32_t utf32_iterator_code(utf32_iterator it);
 int utf32_iterator_grapheme_boundary(utf32_iterator it);
 int utf32_iterator_word_boundary(utf32_iterator it);
+void utf32_iterator_tailor_word_break(utf32_iterator it,
+                                      unicode_property_tailor *pt);
 
-uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp);
-char **utf8_word_split(const char *s, size_t ns, size_t *nwp);
+uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp,
+                            unicode_property_tailor *wbreak);
+char **utf8_word_split(const char *s, size_t ns, size_t *nwp,
+                            unicode_property_tailor *wbreak);
 
 /** @brief Convert 0-terminated UTF-32 to UTF-8
  * @param s 0-terminated UTF-32 string
diff --git a/lib/words.c b/lib/words.c
deleted file mode 100644
index 89174cd..0000000
--- a/lib/words.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * This file is part of DisOrder
- * Copyright (C) 2004, 2007 Richard Kettlewell
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- */
-
-#include <config.h>
-#include "types.h"
-
-#include <string.h>
-#include <stddef.h>
-
-#include "mem.h"
-#include "vector.h"
-#include "table.h"
-#include "words.h"
-#include "utf8.h"
-#include "log.h"
-#include "charset.h"
-
-#include "unidata.h"
-#include "unicode.h"
-
-const char *casefold(const char *ptr) {
-  return utf8_casefold_compat(ptr, strlen(ptr), 0);
-}
-
-char **words(const char *s, int *nvecp) {
-  size_t nv;
-  char **v;
-
-  v = utf8_word_split(s, strlen(s), &nv);
-  *nvecp = nv;
-  return v;
-}
-
-/*
-Local Variables:
-c-basic-offset:2
-comment-column:40
-End:
-*/
diff --git a/lib/words.h b/lib/words.h
deleted file mode 100644
index 9fb7448..0000000
--- a/lib/words.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * This file is part of DisOrder
- * Copyright (C) 2004 Richard Kettlewell
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- */
-
-#ifndef WORDS_H
-#define WORDS_H
-
-const char *casefold(const char *s);
-/* return a case-folded version of UTF-8 string @s@, or the original
- * string if malformed. */
-
-char **words(const char *s, int *nvecp);
-/* return the words found in UTF-8 string @s@, with punctuation
- * stripped out.  (Doesn't casefold.) */
-
-#endif /* WORDS_H */
-
-/*
-Local Variables:
-c-basic-offset:2
-comment-column:40
-End:
-*/
diff --git a/server/dcgi.c b/server/dcgi.c
index 8eff93a..07e1825 100644
--- a/server/dcgi.c
+++ b/server/dcgi.c
@@ -46,7 +46,6 @@
 #include "queue.h"
 #include "plugin.h"
 #include "split.h"
-#include "words.h"
 #include "wstat.h"
 #include "kvp.h"
 #include "syscalls.h"
diff --git a/server/trackdb.c b/server/trackdb.c
index 4be5f25..b4e0de9 100644
--- a/server/trackdb.c
+++ b/server/trackdb.c
@@ -44,7 +44,6 @@
 #include "configuration.h"
 #include "syscalls.h"
 #include "wstat.h"
-#include "words.h"
 #include "printf.h"
 #include "filepart.h"
 #include "trackname.h"
@@ -53,6 +52,8 @@
 #include "cache.h"
 #include "eventlog.h"
 #include "hash.h"
+#include "unicode.h"
+#include "unidata.h"
 
 #define RESCAN "disorder-rescan"
 #define DEADLOCK "disorder-deadlock"
@@ -573,24 +574,50 @@ static int is_display_pref(const char *name) {
   return !strncmp(name, prefix, (sizeof prefix) - 1);
 }
 
+/** @brief Word_Break property tailor that treats underscores as spaces */
+static int tailor_underscore_Word_Break_Other(uint32_t c) {
+  switch(c) {
+  default:
+    return -1;                          /* keep the standard property */
+  case 0x005F: /* LOW LINE (SPACING UNDERSCORE) */
+    return unicode_Word_Break_Other;    /* override: report Other for '_' */
+  }
+}
+
+/** @brief Casefold @p s, split with tailor @p pt, append words to @p v */
+static void word_split(struct vector *v, const char *s,
+                       unicode_property_tailor *pt) {
+  size_t nw, nt32, i;
+  uint32_t *t32, **w32;
+  char *w8;
+
+  /* Convert to UTF-32, then erase case distinctions; give up on failure */
+  if(!(t32 = utf8_to_utf32(s, strlen(s), &nt32))
+     || !(t32 = utf32_casefold_compat(t32, nt32, &nt32)))
+    return;
+  /* Split into words; on failure nw cannot be trusted, so stop here */
+  if(!(w32 = utf32_word_split(t32, nt32, &nw, pt)))
+    return;
+  /* Convert words back to UTF-8 and append, skipping failed conversions */
+  for(i = 0; i < nw; ++i)
+    if((w8 = utf32_to_utf8(w32[i], utf32_len(w32[i]), 0)))
+      vector_append(v, w8);
+}
+
 /* compute the words of a track name */
 static char **track_to_words(const char *track,
                              const struct kvp *p) {
   struct vector v;
-  char **w;
-  int nw;
   const char *rootless = track_rootless(track);
 
   if(!rootless)
     rootless = track;                   /* bodge */
   vector_init(&v);
-  if((w = words(casefold(strip_extension(rootless)), &nw)))
-    vector_append_many(&v, w, nw);
-
+  rootless = strip_extension(rootless); /* strip exactly once */
+  word_split(&v, rootless, tailor_underscore_Word_Break_Other);
   for(; p; p = p->next)
     if(is_display_pref(p->name))
-      if((w = words(casefold(p->value), &nw)))
-        vector_append_many(&v, w, nw);
+      word_split(&v, p->value, 0);      /* typed overrides: no tailoring */
   vector_terminate(&v);
   return dedupe(v.vec, v.nvec);
 }
@@ -1739,7 +1766,7 @@ char **trackdb_search(char **wordlist, int nwordlist, int *ntracks) {
   /* casefold all the words */
   w = xmalloc(nwordlist * sizeof (char *));
   for(n = 0; n < nwordlist; ++n) {
-    w[n] = casefold(wordlist[n]);
+    w[n] = utf8_casefold_compat(wordlist[n], strlen(wordlist[n]), 0);
     if(checktag(w[n])) ++ntags;         /* count up tags */
   }
   /* find the longest non-stopword */
-- 
[mdw]