From 8818b7fca12456e62410ef914a7bef250a0633c9 Mon Sep 17 00:00:00 2001
Message-Id: <8818b7fca12456e62410ef914a7bef250a0633c9.1714161298.git.mdw@distorted.org.uk>
From: Mark Wooding <mdw@chiark.greenend.org.uk>
Date: Tue, 20 Nov 2007 18:13:56 +0000
Subject: [PATCH] utf32_word_split() and utf8_word_split() splits a string into
 words using the UAX #29 word boundary algorithm.  words() is therefore now a
 wrapper around this.  There is scope for improvement in the use of this
 function as currently we do some needless converting back and forth between
 encoding forms.
Organization: Straylight/Edgeware

From: Richard Kettlewell <rjk@greenend.org.uk>

casefold() now uses the compatibility case-folding algorithm, which
seems more appropriate for searching.

dbversions are now integers not strings.  Some dbversion=2
functionality can be selectively disabled for testing purposes.

README.dbversions documents the differences between the dbversions.
---
 lib/configuration.c      |   2 +
 lib/configuration.h      |   3 ++
 lib/test.c               |  64 +++++++++++++++++++++++++
 lib/unicode.c            |  92 +++++++++++++++++++++++++++++++++++
 lib/unicode.h            |   3 ++
 lib/vector.h             |   2 +
 lib/words.c              | 100 +++------------------------------------
 server/Makefile.am       |   2 +-
 server/README.dbversions |  42 ++++++++++++++++
 server/rescan.c          |  10 ++--
 server/trackdb.c         |  21 ++++----
 server/trackdb.h         |   3 --
 12 files changed, 234 insertions(+), 110 deletions(-)
 create mode 100644 server/README.dbversions

diff --git a/lib/configuration.c b/lib/configuration.c
index 35ed090..221be9c 100644
--- a/lib/configuration.c
+++ b/lib/configuration.c
@@ -884,6 +884,7 @@ static const struct conf conf[] = {
   { C(checkpoint_min),   &type_integer,          validate_non_negative },
   { C(collection),       &type_collections,      validate_any },
   { C(connect),          &type_stringlist,       validate_addrport },
+  { C(dbversion),        &type_integer,          validate_positive },
   { C(device),           &type_string,           validate_any },
   { C(gap),              &type_integer,          validate_non_negative },
   { C(history),          &type_integer,          validate_positive },
@@ -1039,6 +1040,7 @@ static struct config *config_default(void) {
   c->short_display = 32;
   c->mixer = xstrdup("/dev/mixer");
   c->channel = xstrdup("pcm");
+  c->dbversion = 2;
   return c;
 }
 
diff --git a/lib/configuration.h b/lib/configuration.h
index 4ef7862..a4ffa63 100644
--- a/lib/configuration.h
+++ b/lib/configuration.h
@@ -246,6 +246,9 @@ struct config {
   /* derived values: */
   int nparts;				/* number of distinct name parts */
   char **parts;				/* name part list  */
+
+  /* undocumented, for testing only */
+  long dbversion;
 };
 
 extern struct config *config;
diff --git a/lib/test.c b/lib/test.c
index a5e3295..0790efc 100644
--- a/lib/test.c
+++ b/lib/test.c
@@ -414,6 +414,69 @@ static void test_casefold(void) {
   check_string(casefold(""), "");
 }
 
+static const struct {                  /* test vectors for test_words() */
+  const char *in;                      /* string handed to utf8_word_split() */
+  const char *expect[10];              /* expected words, ended by a 0 entry */
+} wtest[] = {
+  /* Empty string */
+  { "", { 0 } },
+  /* Only whitespace and punctuation */
+  { "    ", { 0 } },
+  { " '   ", { 0 } },
+  { " !  ", { 0 } },
+  { " \"\"  ", { 0 } },
+  { " @  ", { 0 } },
+  /* Basics */
+  { "wibble", { "wibble", 0 } },
+  { " wibble", { "wibble", 0 } },
+  { " wibble ", { "wibble", 0 } },
+  { "wibble ", { "wibble", 0 } },
+  { "wibble spong", { "wibble", "spong", 0 } },
+  { " wibble  spong", { "wibble", "spong", 0 } },
+  { " wibble  spong   ", { "wibble", "spong", 0 } },
+  { "wibble   spong  ", { "wibble", "spong", 0 } },
+  { "wibble   spong splat foo zot  ", { "wibble", "spong", "splat", "foo", "zot", 0 } },
+  /* Apostrophes */
+  { "wibble 'spong", { "wibble", "spong", 0 } },
+  { " wibble's", { "wibble's", 0 } },
+  { " wibblespong'   ", { "wibblespong", 0 } },
+  { "wibble   sp''ong  ", { "wibble", "sp", "ong", 0 } },
+};
+#define NWTEST (sizeof wtest / sizeof *wtest)
+
+/** @brief Check utf8_word_split() against every entry of wtest[] */
+static void test_words(void) {
+  size_t t, nexpect, ngot, i;
+  int right;
+  fprintf(stderr, "test_words\n");
+  for(t = 0; t < NWTEST; ++t) {
+    char **got = utf8_word_split(wtest[t].in, strlen(wtest[t].in), &ngot);
+    if(!got) ngot = 0;                  /* split failed, count was not stored */
+    for(nexpect = 0; wtest[t].expect[nexpect]; ++nexpect)
+      ;
+    if(got && nexpect == ngot) {
+      for(i = 0; i < ngot; ++i)
+        if(strcmp(wtest[t].expect[i], got[i]))
+          break;
+      right = i == ngot;
+    } else
+      right = 0;
+    if(!right) {
+      fprintf(stderr, "word split %zu failed\n", t);
+      fprintf(stderr, "input: %s\n", wtest[t].in);
+      fprintf(stderr, "    | %-30s | %-30s\n",
+              "expected", "got");
+      for(i = 0; i < nexpect || i < ngot; ++i) {
+        const char *e = i < nexpect ? wtest[t].expect[i] : "<none>";
+        const char *g = i < ngot ? got[i] : "<none>";
+        fprintf(stderr, " %2zu | %-30s | %-30s\n", i, e, g);
+      }
+      count_error();
+    }
+    ++tests;
+  }
+}
+
 /** @brief Less-than comparison function for integer heap */
 static inline int int_lt(int a, int b) { return a < b; }
 
@@ -657,6 +720,7 @@ int main(void) {
   /* vector.c */
   /* words.c */
   test_casefold();
+  test_words();
   /* XXX words() */
   /* wstat.c */
   fprintf(stderr,  "%d errors out of %d tests\n", errors, tests);
diff --git a/lib/unicode.c b/lib/unicode.c
index 4f4f2ca..b5b520c 100644
--- a/lib/unicode.c
+++ b/lib/unicode.c
@@ -1271,6 +1271,59 @@ int utf32_is_word_boundary(const uint32_t *s, size_t ns, size_t n) {
   return utf32_iterator_word_boundary(it);
 }
 
+/** @brief Split [s,ns) into multiple words
+ * @param s Pointer to start of string
+ * @param ns Length of string
+ * @param nwp Where to store word count, or NULL
+ * @return Pointer to NULL-terminated array of 0-terminated words
+ *
+ * Segments between word boundaries are kept only if they contain at least
+ * one ALetter, Numeric or Katakana character; all others are discarded.
+ */
+uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp) {
+  struct utf32_iterator_data it[1];
+  size_t b1 = 0, b2 = 0 ,i;
+  int isword;
+  struct vector32 v32[1];
+  uint32_t *w;
+
+  vector32_init(v32);
+  utf32__iterator_init(it, s, ns, 0);
+  /* Work our way through the string stopping at each word break. */
+  do {
+    if(utf32_iterator_word_boundary(it)) {
+      /* New boundary: [b1,b2) is the segment since the previous boundary */
+      b1 = b2;
+      b2 = it->n;
+      /*fprintf(stderr, "[%zu, %zu) is a candidate word\n", b1, b2);*/
+      /* Inspect the characters between the boundaries and form an opinion as
+       * to whether they are a word or not */
+      isword = 0;
+      for(i = b1; i < b2; ++i) {
+        switch(utf32__word_break(it->s[i])) {
+        case unicode_Word_Break_ALetter:
+        case unicode_Word_Break_Numeric:
+        case unicode_Word_Break_Katakana:
+          isword = 1;
+          break;                        /* leaves the switch, not the scan */
+        default:
+          break;
+        }
+      }
+      /* If it's a word add a copy (with room for a terminator) to the results */
+      if(isword) {
+        w = xcalloc(b2 - b1 + 1, sizeof(uint32_t));
+        memcpy(w, it->s + b1, (b2 - b1) * sizeof (uint32_t));
+        vector32_append(v32, w);
+      }
+    }
+  } while(!utf32_iterator_advance(it, 1));
+  vector32_terminate(v32);
+  if(nwp)
+    *nwp = v32->nvec;
+  return v32->vec;
+}
+
 /*@}*/
 /** @defgroup utf8 Functions that operate on UTF-8 strings */
 /*@{*/
@@ -1411,6 +1464,45 @@ char *utf8_casefold_compat(const char *s, size_t ns, size_t *ndp) {
   utf8__transform(utf32_casefold_compat);
 }
 
+/** @brief Split [s,ns) into multiple words
+ * @param s Pointer to start of UTF-8 string
+ * @param ns Length of string in bytes
+ * @param nwp Where to store word count, or NULL
+ * @return Pointer to array of pointers to words, or 0 on error
+ *
+ * The returned array is terminated by a NULL pointer and individual
+ * strings are 0-terminated.  On error nothing is stored via @p nwp.
+ */
+char **utf8_word_split(const char *s, size_t ns, size_t *nwp) {
+  uint32_t *to32 = 0, **v32 = 0;
+  size_t nto32, nv = 0, n;
+  char **v8 = 0, **ret = 0;
+
+  if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error;
+  if(!(v32 = utf32_word_split(to32, nto32, &nv))) goto error;
+  v8 = xcalloc(nv + 1, sizeof (char *));
+  for(n = 0; n < nv; ++n)
+    if(!(v8[n] = utf32_to_utf8(v32[n], utf32_len(v32[n]), 0)))
+      goto error;
+  ret = v8;
+  if(nwp) *nwp = nv;                    /* nwp is documented as optional */
+  v8 = 0;                               /* don't free */
+error:
+  if(v8) {
+    for(n = 0; n < nv; ++n)
+      xfree(v8[n]);
+    xfree(v8);
+  }
+  if(v32) {
+    for(n = 0; n < nv; ++n)
+      xfree(v32[n]);
+    xfree(v32);
+  }
+  xfree(to32);
+  return ret;
+}
+
+
 /*@}*/
 
 /*
diff --git a/lib/unicode.h b/lib/unicode.h
index a996844..7f32207 100644
--- a/lib/unicode.h
+++ b/lib/unicode.h
@@ -74,6 +74,9 @@ uint32_t utf32_iterator_code(utf32_iterator it);
 int utf32_iterator_grapheme_boundary(utf32_iterator it);
 int utf32_iterator_word_boundary(utf32_iterator it);
 
+uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp);
+char **utf8_word_split(const char *s, size_t ns, size_t *nwp);
+
 /** @brief Convert 0-terminated UTF-32 to UTF-8
  * @param s 0-terminated UTF-32 string
  * @return 0-terminated UTF-8 string or 0 on error
diff --git a/lib/vector.h b/lib/vector.h
index 081a71d..bb944a2 100644
--- a/lib/vector.h
+++ b/lib/vector.h
@@ -80,6 +80,8 @@ VECTOR_TYPE(vector, char *, xrealloc);
 VECTOR_TYPE(dynstr, char, xrealloc_noptr);
 /** @brief A dynamic unicode string */
 VECTOR_TYPE(dynstr_ucs4, uint32_t, xrealloc_noptr);
+/** @brief A dynamic array of pointers to unicode strings */
+VECTOR_TYPE(vector32, uint32_t *, xrealloc);
 
 /** @brief Append many strings to a @ref vector */
 void vector_append_many(struct vector *v, char **vec, int nvec);
diff --git a/lib/words.c b/lib/words.c
index 2638ea6..89174cd 100644
--- a/lib/words.c
+++ b/lib/words.c
@@ -36,104 +36,16 @@
 #include "unicode.h"
 
 const char *casefold(const char *ptr) {
-  return utf8_casefold_canon(ptr, strlen(ptr), 0);
+  return utf8_casefold_compat(ptr, strlen(ptr), 0);
 }
 
-static enum unicode_General_Category cat(uint32_t c) {
-  if(c < UNICODE_NCHARS) {
-    const struct unidata *const ud = &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
-    return ud->general_category;
-  } else
-    return unicode_General_Category_Cn;
-}
-
-/* XXX this is a bit kludgy */
-
 char **words(const char *s, int *nvecp) {
-  struct vector v;
-  struct dynstr d;
-  const char *start;
-  uint32_t c;
-  int in_word = 0;
-
-  vector_init(&v);
-  while(*s) {
-    start = s;
-    PARSE_UTF8(s, c, return 0);
-    /* special cases first */
-    switch(c) {
-    case '/':
-    case '.':
-    case '+':
-    case '&':
-    case ':':
-    case '_':
-    case '-':
-      goto separator;
-    }
-    /* do the rest on category */
-    switch(cat(c)) {
-    case unicode_General_Category_Ll:
-    case unicode_General_Category_Lm:
-    case unicode_General_Category_Lo:
-    case unicode_General_Category_Lt:
-    case unicode_General_Category_Lu:
-    case unicode_General_Category_Nd:
-    case unicode_General_Category_Nl:
-    case unicode_General_Category_No:
-    case unicode_General_Category_Sc:
-    case unicode_General_Category_Sk:
-    case unicode_General_Category_Sm:
-    case unicode_General_Category_So:
-      /* letters, digits and symbols are considered to be part of
-       * words */
-      if(!in_word) {
-	dynstr_init(&d);
-	in_word = 1;
-      }
-      dynstr_append_bytes(&d, start, s - start);
-      break;
-
-    case unicode_General_Category_Cc:
-    case unicode_General_Category_Cf:
-    case unicode_General_Category_Co:
-    case unicode_General_Category_Cs:
-    case unicode_General_Category_Zl:
-    case unicode_General_Category_Zp:
-    case unicode_General_Category_Zs:
-    case unicode_General_Category_Pe:
-    case unicode_General_Category_Ps:
-    separator:
-      if(in_word) {
-	dynstr_terminate(&d);
-	vector_append(&v, d.vec);
-	in_word = 0;
-      }
-      break;
-
-    case unicode_General_Category_Mc:
-    case unicode_General_Category_Me:
-    case unicode_General_Category_Mn:
-    case unicode_General_Category_Pc:
-    case unicode_General_Category_Pd:
-    case unicode_General_Category_Pf:
-    case unicode_General_Category_Pi:
-    case unicode_General_Category_Po:
-    case unicode_General_Category_Cn:
-      /* control and punctuation is completely ignored */
-      break;
+  size_t nv;
+  char **v;
 
-    }
-  }
-  if(in_word) {
-    /* pick up the final word */
-    dynstr_terminate(&d);
-    vector_append(&v, d.vec);
-  }
-  vector_terminate(&v);
-  if(nvecp)
-    *nvecp = v.nvec;
-  return v.vec;
+  v = utf8_word_split(s, strlen(s), &nv);
+  if(v && nvecp) *nvecp = nv;           /* nv is only set on success */
+  return v;
 }
 
 /*
diff --git a/server/Makefile.am b/server/Makefile.am
index e3a1a0a..bdb5071 100644
--- a/server/Makefile.am
+++ b/server/Makefile.am
@@ -118,7 +118,7 @@ cgi.o: ../lib/definitions.h
 # for Mac OS X >=10.4
 SEDFILES=uk.org.greenend.rjk.disorder.plist
 include ${top_srcdir}/scripts/sedfiles.make
-EXTRA_DIST=uk.org.greenend.rjk.disorder.plist.in
+EXTRA_DIST=uk.org.greenend.rjk.disorder.plist.in README.dbversions
 LAUNCHD=/Library/LaunchDaemons
 
 #install-data-hook:
diff --git a/server/README.dbversions b/server/README.dbversions
new file mode 100644
index 0000000..4c77db6
--- /dev/null
+++ b/server/README.dbversions
@@ -0,0 +1,42 @@
+DisOrder Database Versions
+==========================
+
+If no _dbversion global preference is found then database version 1 is
+assumed.  Database versions 2 and above always have a _dbversion
+global preference.
+
+Old database versions can be PARTIALLY emulated for testing purposes
+by setting the undocumented dbversion configuration item.  Setting it on
+a production system would be a terrible idea.
+
+Database Version 1
+------------------
+
+Path names are in UTF-8, but with no normalization applied: you get
+whatever the filesystem gives you.
+
+Search terms are split according to the old words() function.
+  - "/", ".", "+", "&", ":", "_" and "-" are considered to be separators
+  - anything in General_Category Cc, Cf, Co, Cs, Zl, Zp, Zs, Pe or Ps
+    is considered to be a separator
+  - anything else in General_Category Ll, Lm, Lo, Lt, Lu, Nd, Nl, No,
+    Sc, Sk, Sm or So is considered to be part of a word
+  - everything else is ignored
+
+Search terms are case-folded by applying the CaseFolding.txt mapping,
+without any attempt at normalization.
+
+Database Version 2
+------------------
+
+Path names are in UTF-8, normalized to NFC.
+
+Search terms are split according to the default Unicode word boundary
+detection algorithm.
+
+Search terms are case-folded using the Unicode case-folding algorithm,
+normalizing to NFKD.
+
+Things that haven't been done yet:
+  - undump support for new dbversion
+  - automatic upgrade from dbversion 1
diff --git a/server/rescan.c b/server/rescan.c
index b9be72d..cc31888 100644
--- a/server/rescan.c
+++ b/server/rescan.c
@@ -152,10 +152,12 @@ static void rescan_collection(const struct collection *c) {
       error(0, "cannot convert track path to UTF-8: %s", path);
       continue;
     }
-    /* We use NFC track names */
-    if(!(track = utf8_compose_canon(track, strlen(track), 0))) {
-      error(0, "cannot convert track path to NFC: %s", path);
-      continue;
+    if(config->dbversion > 1) {
+      /* We use NFC track names */
+      if(!(track = utf8_compose_canon(track, strlen(track), 0))) {
+        error(0, "cannot convert track path to NFC: %s", path);
+        continue;
+      }
     }
     D(("track %s", track));
     /* only tracks with a known player are admitted */
diff --git a/server/trackdb.c b/server/trackdb.c
index e1848c4..4be5f25 100644
--- a/server/trackdb.c
+++ b/server/trackdb.c
@@ -295,15 +295,16 @@ void trackdb_open(void) {
   trackdb_globaldb = open_db("global.db", 0, DB_HASH, 0, 0666);
   if(trackdb_globaldb) {
     /* This is an existing database */
-    const char *oldversion;
+    const char *s;
+    long oldversion;
 
-    oldversion = trackdb_get_global("_dbversion");
-    if(!oldversion)
-      oldversion = "1.x";
-    if(strcmp(oldversion, DBVERSION)) {
+    s = trackdb_get_global("_dbversion");
+    oldversion = s ? atol(s) : 1;
+    if(oldversion != config->dbversion) {
       /* This database needs upgrading.  This isn't implemented yet so we just
        * fail. */
-      fatal(0, "database needs upgrading from %s to %s", oldversion, DBVERSION);
+      fatal(0, "database needs upgrading from %ld to %ld",
+            oldversion, config->dbversion);
     }
     newdb = 0;
     /* Close the database again,  we'll open it property below */
@@ -326,8 +327,12 @@ void trackdb_open(void) {
   trackdb_noticeddb = open_db("noticed.db",
                              DB_DUPSORT, DB_BTREE, DB_CREATE, 0666);
   /* Stash the database version */
-  if(newdb)
-    trackdb_set_global("_dbversion", DBVERSION, 0);
+  if(newdb) {
+    char buf[32];
+
+    snprintf(buf, sizeof buf, "%ld", config->dbversion);
+    trackdb_set_global("_dbversion", buf, 0);
+  }
   D(("opened databases"));
 }
 
diff --git a/server/trackdb.h b/server/trackdb.h
index 854f63c..fe43474 100644
--- a/server/trackdb.h
+++ b/server/trackdb.h
@@ -23,9 +23,6 @@
 
 struct ev_source;
 
-/* Database version string */
-#define DBVERSION "2.0"
-
 extern const struct cache_type cache_files_type;
 extern unsigned long cache_files_hits, cache_files_misses;
 /* Cache entry type and tracking for regexp-based lookups */
-- 
[mdw]