utf32_word_split() and utf8_word_split() split a string into words

author Richard Kettlewell <rjk@greenend.org.uk>

Tue, 20 Nov 2007 18:13:56 +0000 (18:13 +0000)

committer Richard Kettlewell <rjk@greenend.org.uk>

Tue, 20 Nov 2007 18:13:56 +0000 (18:13 +0000)
author Richard Kettlewell <rjk@greenend.org.uk>
Tue, 20 Nov 2007 18:13:56 +0000 (18:13 +0000)
committer Richard Kettlewell <rjk@greenend.org.uk>
Tue, 20 Nov 2007 18:13:56 +0000 (18:13 +0000)
diff --git a/lib/configuration.c b/lib/configuration.c

index 35ed090a012e6bd97e2153fe529d037d70e28422..221be9c44b80291f3ddde18663f088699f442ab1 100644 (file)
--- a/lib/configuration.c
+++ b/lib/configuration.c
@@ -884,6 +884,7 @@ static const struct conf conf[] = {
    { C(checkpoint_min),   &type_integer,          validate_non_negative },
    { C(collection),       &type_collections,      validate_any },
    { C(connect),          &type_stringlist,       validate_addrport },
+  { C(dbversion),        &type_integer,          validate_positive },
    { C(device),           &type_string,           validate_any },
    { C(gap),              &type_integer,          validate_non_negative },
    { C(history),          &type_integer,          validate_positive },
@@ -1039,6 +1040,7 @@ static struct config *config_default(void) {
    c->short_display = 32;
    c->mixer = xstrdup("/dev/mixer");
    c->channel = xstrdup("pcm");
+  c->dbversion = 2;
    return c;
  }
  
diff --git a/lib/configuration.h b/lib/configuration.h

index 4ef7862071aa0479222899ab2be7313bc5ce6fd9..a4ffa6337bfa047c828d8e6c111bd34465cd2228 100644 (file)
--- a/lib/configuration.h
+++ b/lib/configuration.h
@@ -246,6 +246,9 @@ struct config {
    /* derived values: */
    int nparts;                          /* number of distinct name parts */
    char **parts;                                /* name part list  */
+
+  /* undocumented, for testing only */
+  long dbversion;
  };
  
  extern struct config *config;
diff --git a/lib/test.c b/lib/test.c

index a5e32954fe0c2a84bd1d9f6e7b774063cbb046f2..0790efccc01c35865e6388298ccf6b7eccb28dc2 100644 (file)
--- a/lib/test.c
+++ b/lib/test.c
@@ -414,6 +414,69 @@ static void test_casefold(void) {
    check_string(casefold(""), "");
  }
  
+struct {                        /* one word-splitting test case */
+  const char *in;               /* UTF-8 text handed to utf8_word_split() */
+  const char *expect[10];       /* expected words, ended by a 0 entry (so at most 9) */
+} wtest[] = {
+  /* Empty string */
+  { "", { 0 } },
+  /* Only whitespace and punctuation */
+  { "    ", { 0 } },
+  { " '   ", { 0 } },
+  { " !  ", { 0 } },
+  { " \"\"  ", { 0 } },
+  { " @  ", { 0 } },
+  /* Basics */
+  { "wibble", { "wibble", 0 } },
+  { " wibble", { "wibble", 0 } },
+  { " wibble ", { "wibble", 0 } },
+  { "wibble ", { "wibble", 0 } },
+  { "wibble spong", { "wibble", "spong", 0 } },
+  { " wibble  spong", { "wibble", "spong", 0 } },
+  { " wibble  spong   ", { "wibble", "spong", 0 } },
+  { "wibble   spong  ", { "wibble", "spong", 0 } },
+  { "wibble   spong splat foo zot  ", { "wibble", "spong", "splat", "foo", "zot", 0 } },
+  /* Apostrophes */
+  { "wibble 'spong", { "wibble", "spong", 0 } },
+  { " wibble's", { "wibble's", 0 } },
+  { " wibblespong'   ", { "wibblespong", 0 } },
+  { "wibble   sp''ong  ", { "wibble", "sp", "ong", 0 } },
+};
+#define NWTEST (sizeof wtest / sizeof *wtest) /* number of entries in wtest[] */
+
+/** @brief Feed each wtest[] entry to utf8_word_split() and compare the result
+ * against the expected word list, reporting any mismatch on stderr */
+static void test_words(void) {
+  size_t c;
+
+  fprintf(stderr, "test_words\n");
+  for(c = 0; c < NWTEST; ++c) {
+    const char *const *want = wtest[c].expect;
+    size_t nwant = 0, nfound, k = 0;
+    char **found = utf8_word_split(wtest[c].in, strlen(wtest[c].in), &nfound);
+
+    while(want[nwant])
+      ++nwant;
+    /* Only compare element-wise when the word counts agree */
+    if(nwant == nfound)
+      while(k < nfound && !strcmp(want[k], found[k]))
+        ++k;
+    if(nwant != nfound || k != nfound) {
+      /* Mismatch: dump both lists side by side */
+      fprintf(stderr, "word split %zu failed\n", c);
+      fprintf(stderr, "input: %s\n", wtest[c].in);
+      fprintf(stderr, "    | %-30s | %-30s\n",
+              "expected", "got");
+      for(k = 0; k < nwant || k < nfound; ++k)
+        fprintf(stderr, " %2zu | %-30s | %-30s\n", k,
+                k < nwant ? want[k] : "<none>",
+                k < nfound ? found[k] : "<none>");
+      count_error();
+    }
+    ++tests;
+  }
+}
+
  /** @brief Less-than comparison function for integer heap */
  static inline int int_lt(int a, int b) { return a < b; }
  
@@ -657,6 +720,7 @@ int main(void) {
    /* vector.c */
    /* words.c */
    test_casefold();
+  test_words();
    /* XXX words() */
    /* wstat.c */
    fprintf(stderr,  "%d errors out of %d tests\n", errors, tests);
diff --git a/lib/unicode.c b/lib/unicode.c

index 4f4f2ca80bdee6649634706e06fba21a7476b952..b5b520cf07991a60f04a15d1ebb60ac966170676 100644 (file)
--- a/lib/unicode.c
+++ b/lib/unicode.c
@@ -1271,6 +1271,59 @@ int utf32_is_word_boundary(const uint32_t *s, size_t ns, size_t n) {
    return utf32_iterator_word_boundary(it);
  }
  
+/** @brief Break [s,ns) up into its constituent words
+ * @param s Pointer to start of string
+ * @param ns Length of string
+ * @param nwp Where to store word count, or NULL
+ * @return Pointer to array of pointers to words
+ *
+ * Each span between consecutive word boundaries is kept only if it contains
+ * at least one code point classed as ALetter, Numeric or Katakana.  The
+ * returned array is terminated by a NULL pointer and each word is 0-terminated.
+ */
+uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp) {
+  struct utf32_iterator_data iter[1];
+  struct vector32 words[1];
+  size_t start = 0, end = 0, pos;
+
+  vector32_init(words);
+  utf32__iterator_init(iter, s, ns, 0);
+  /* Visit every position in turn, acting only at word boundaries */
+  for(;;) {
+    if(utf32_iterator_word_boundary(iter)) {
+      int wordlike = 0;
+
+      /* The candidate runs from the previous boundary to this one */
+      start = end;
+      end = iter->n;
+      /* Classify every code point of the candidate */
+      for(pos = start; pos < end; ++pos) {
+        const int wb = utf32__word_break(iter->s[pos]);
+
+        if(wb == unicode_Word_Break_ALetter
+           || wb == unicode_Word_Break_Numeric
+           || wb == unicode_Word_Break_Katakana)
+          wordlike = 1;
+      }
+      if(wordlike) {
+        /* Copy into a buffer one element longer, leaving room for the terminator */
+        const size_t len = end - start;
+        uint32_t *copy = xcalloc(len + 1, sizeof(uint32_t));
+
+        memcpy(copy, iter->s + start, len * sizeof (uint32_t));
+        vector32_append(words, copy);
+      }
+    }
+    /* A non-zero return from the iterator ends the scan */
+    if(utf32_iterator_advance(iter, 1))
+      break;
+  }
+  vector32_terminate(words);
+  if(nwp)
+    *nwp = words->nvec;
+  return words->vec;
+}
+
  /*@}*/
  /** @defgroup utf8 Functions that operate on UTF-8 strings */
  /*@{*/
@@ -1411,6 +1464,45 @@ char *utf8_casefold_compat(const char *s, size_t ns, size_t *ndp) {
    utf8__transform(utf32_casefold_compat);
  }
  
+/** @brief Split [s,ns) into multiple words
+ * @param s Pointer to start of string
+ * @param ns Length of string
+ * @param nwp Where to store word count, or NULL
+ * @return Pointer to array of pointers to words, or NULL on error
+ *
+ * The returned array is terminated by a NULL pointer and individual
+ * strings are 0-terminated.  On error @p nwp is left untouched.
+ */
+char **utf8_word_split(const char *s, size_t ns, size_t *nwp) {
+  uint32_t *to32 = 0, **v32 = 0;
+  size_t nto32, nv = 0, n;
+  char **v8 = 0, **ret = 0;
+
+  if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error;
+  if(!(v32 = utf32_word_split(to32, nto32, &nv))) goto error;
+  v8 = xcalloc(nv + 1, sizeof (char *));
+  for(n = 0; n < nv; ++n)
+    if(!(v8[n] = utf32_to_utf8(v32[n], utf32_len(v32[n]), 0)))
+      goto error;
+  ret = v8;
+  if(nwp) *nwp = nv;                    /* nwp is documented as optional */
+  v8 = 0;                               /* don't free */
+error:                                  /* shared cleanup, success included */
+  if(v8) {
+    for(n = 0; n < nv; ++n)
+      xfree(v8[n]);
+    xfree(v8);
+  }
+  if(v32) {
+    for(n = 0; n < nv; ++n)
+      xfree(v32[n]);
+    xfree(v32);
+  }
+  xfree(to32);
+  return ret;
+}
+
+
  /*@}*/
  
  /*
diff --git a/lib/unicode.h b/lib/unicode.h

index a996844ae5ebcd22a3851764f19b9cde72a4026c..7f3220773537d0447c2fbc810a1e517b61e1dcbf 100644 (file)
--- a/lib/unicode.h
+++ b/lib/unicode.h
@@ -74,6 +74,9 @@ uint32_t utf32_iterator_code(utf32_iterator it);
  int utf32_iterator_grapheme_boundary(utf32_iterator it);
  int utf32_iterator_word_boundary(utf32_iterator it);
  
+uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp);
+char **utf8_word_split(const char *s, size_t ns, size_t *nwp);
+
  /** @brief Convert 0-terminated UTF-32 to UTF-8
   * @param s 0-terminated UTF-32 string
   * @return 0-terminated UTF-8 string or 0 on error
diff --git a/lib/vector.h b/lib/vector.h

index 081a71db73493732ea9e97914f1d1de5e88caae4..bb944a29c72a008c15ec191af81563ce0b492004 100644 (file)
--- a/lib/vector.h
+++ b/lib/vector.h
@@ -80,6 +80,8 @@ VECTOR_TYPE(vector, char *, xrealloc);
  VECTOR_TYPE(dynstr, char, xrealloc_noptr);
  /** @brief A dynamic unicode string */
  VECTOR_TYPE(dynstr_ucs4, uint32_t, xrealloc_noptr);
+/** @brief A dynamic array of pointers to unicode string */
+VECTOR_TYPE(vector32, uint32_t *, xrealloc);
  
  /** @brief Append many strings to a @ref vector */
  void vector_append_many(struct vector *v, char **vec, int nvec);
diff --git a/lib/words.c b/lib/words.c

index 2638ea645c991974696fef808afaf29850f6cbf9..89174cd8e5048ff9047dd1c652fcc8de760d1810 100644 (file)
--- a/lib/words.c
+++ b/lib/words.c
@@ -36,104 +36,16 @@
  #include "unicode.h"
  
  const char *casefold(const char *ptr) {
-  return utf8_casefold_canon(ptr, strlen(ptr), 0);
+  return utf8_casefold_compat(ptr, strlen(ptr), 0); /* whole 0-terminated string; compat, not canon, folding */
  }
  
-static enum unicode_General_Category cat(uint32_t c) {
-  if(c < UNICODE_NCHARS) {
-    const struct unidata *const ud = &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
-    return ud->general_category;
-  } else
-    return unicode_General_Category_Cn;
-}
-
-/* XXX this is a bit kludgy */
-
  char **words(const char *s, int *nvecp) {
-  struct vector v;
-  struct dynstr d;
-  const char *start;
-  uint32_t c;
-  int in_word = 0;
-
-  vector_init(&v);
-  while(*s) {
-    start = s;
-    PARSE_UTF8(s, c, return 0);
-    /* special cases first */
-    switch(c) {
-    case '/':
-    case '.':
-    case '+':
-    case '&':
-    case ':':
-    case '_':
-    case '-':
-      goto separator;
-    }
-    /* do the rest on category */
-    switch(cat(c)) {
-    case unicode_General_Category_Ll:
-    case unicode_General_Category_Lm:
-    case unicode_General_Category_Lo:
-    case unicode_General_Category_Lt:
-    case unicode_General_Category_Lu:
-    case unicode_General_Category_Nd:
-    case unicode_General_Category_Nl:
-    case unicode_General_Category_No:
-    case unicode_General_Category_Sc:
-    case unicode_General_Category_Sk:
-    case unicode_General_Category_Sm:
-    case unicode_General_Category_So:
-      /* letters, digits and symbols are considered to be part of
-       * words */
-      if(!in_word) {
-       dynstr_init(&d);
-       in_word = 1;
-      }
-      dynstr_append_bytes(&d, start, s - start);
-      break;
-
-    case unicode_General_Category_Cc:
-    case unicode_General_Category_Cf:
-    case unicode_General_Category_Co:
-    case unicode_General_Category_Cs:
-    case unicode_General_Category_Zl:
-    case unicode_General_Category_Zp:
-    case unicode_General_Category_Zs:
-    case unicode_General_Category_Pe:
-    case unicode_General_Category_Ps:
-    separator:
-      if(in_word) {
-       dynstr_terminate(&d);
-       vector_append(&v, d.vec);
-       in_word = 0;
-      }
-      break;
-
-    case unicode_General_Category_Mc:
-    case unicode_General_Category_Me:
-    case unicode_General_Category_Mn:
-    case unicode_General_Category_Pc:
-    case unicode_General_Category_Pd:
-    case unicode_General_Category_Pf:
-    case unicode_General_Category_Pi:
-    case unicode_General_Category_Po:
-    case unicode_General_Category_Cn:
-      /* control and punctuation is completely ignored */
-      break;
+  size_t nv;
+  char **v;
  
-    }
-  }
-  if(in_word) {
-    /* pick up the final word */
-    dynstr_terminate(&d);
-    vector_append(&v, d.vec);
-  }
-  vector_terminate(&v);
-  if(nvecp)
-    *nvecp = v.nvec;
-  return v.vec;
+  v = utf8_word_split(s, strlen(s), &nv);
+  *nvecp = nv;
+  return v;
  }
  
  /*
diff --git a/server/Makefile.am b/server/Makefile.am

index e3a1a0a85165865406616b30e3d57402fac18725..bdb5071146387cc90f97f4a0d1d3198239c96b21 100644 (file)
--- a/server/Makefile.am
+++ b/server/Makefile.am
@@ -118,7 +118,7 @@ cgi.o: ../lib/definitions.h
  # for Mac OS X >=10.4
  SEDFILES=uk.org.greenend.rjk.disorder.plist
  include ${top_srcdir}/scripts/sedfiles.make
-EXTRA_DIST=uk.org.greenend.rjk.disorder.plist.in
+EXTRA_DIST=uk.org.greenend.rjk.disorder.plist.in README.dbversions
  LAUNCHD=/Library/LaunchDaemons
  
  #install-data-hook:
diff --git a/server/README.dbversions b/server/README.dbversions

new file mode 100644 (file)

index 0000000..4c77db6
--- /dev/null
+++ b/server/README.dbversions
@@ -0,0 +1,42 @@
+DisOrder Database Versions
+==========================
+
+If no _dbversion global preference is found then database version 1 is
+assumed.  Database versions 2 and above always have a _dbversion
+global preference.
+
+Old database versions can be PARTIALLY emulated for testing purposes
+by setting the undocumented dbversion configuration item.  Setting it on
+a production system would be a terrible idea.
+
+Database Version 1
+------------------
+
+Path names are in UTF-8, but with no normalization applied: you get
+whatever the filesystem gives you.
+
+Search terms are split according to the old words() function.
+  - "/", ".", "+", "&", ":", "_" and "-" are considered to be separators
+  - anything in General_Category Cc, Cf, Co, Cs, Zl, Zp, Zs, Pe or Ps
+    is considered to be a separator
+  - anything else in General_Category Ll, Lm, Lo, Lt, Lu, Nd, Nl, No,
+    Sc, Sk, Sm or So is considered to be part of a word
+  - everything else is ignored
+
+Search terms are case-folded by applying the CaseFolding.txt mapping,
+without any attempt at normalization.
+
+Database Version 2
+------------------
+
+Path names are in UTF-8, normalized to NFC.
+
+Search terms are split according to the default Unicode word boundary
+detection algorithm.
+
+Search terms are case-folded using the Unicode case-folding algorithm,
+normalizing to NFKD.
+
+Things that haven't been done yet:
+  - undump support for new dbversion
+  - automatic upgrade from dbversion 1
diff --git a/server/rescan.c b/server/rescan.c

index b9be72d9a482da953ef0cffb1955a470ccec4f0f..cc31888ab91be9334aed228fba4322a34e68ab88 100644 (file)
--- a/server/rescan.c
+++ b/server/rescan.c
@@ -152,10 +152,12 @@ static void rescan_collection(const struct collection *c) {
        error(0, "cannot convert track path to UTF-8: %s", path);
        continue;
      }
-    /* We use NFC track names */
-    if(!(track = utf8_compose_canon(track, strlen(track), 0))) {
-      error(0, "cannot convert track path to NFC: %s", path);
-      continue;
+    if(config->dbversion > 1) {
+      /* We use NFC track names */
+      if(!(track = utf8_compose_canon(track, strlen(track), 0))) {
+        error(0, "cannot convert track path to NFC: %s", path);
+        continue;
+      }
      }
      D(("track %s", track));
      /* only tracks with a known player are admitted */
diff --git a/server/trackdb.c b/server/trackdb.c

index e1848c4eca1fa583dd3da01e193ea0f5884124fc..4be5f25802187cc422dc4ae8fa38f03af895bb4f 100644 (file)
--- a/server/trackdb.c
+++ b/server/trackdb.c
@@ -295,15 +295,16 @@ void trackdb_open(void) {
    trackdb_globaldb = open_db("global.db", 0, DB_HASH, 0, 0666);
    if(trackdb_globaldb) {
      /* This is an existing database */
-    const char *oldversion;
+    const char *s;
+    long oldversion;
  
-    oldversion = trackdb_get_global("_dbversion");
-    if(!oldversion)
-      oldversion = "1.x";
-    if(strcmp(oldversion, DBVERSION)) {
+    s = trackdb_get_global("_dbversion");
+    oldversion = s ? atol(s) : 1;
+    if(oldversion != config->dbversion) {
        /* This database needs upgrading.  This isn't implemented yet so we just
         * fail. */
-      fatal(0, "database needs upgrading from %s to %s", oldversion, DBVERSION);
+      fatal(0, "database needs upgrading from %ld to %ld",
+            oldversion, config->dbversion);
      }
      newdb = 0;
      /* Close the database again,  we'll open it property below */
@@ -326,8 +327,12 @@ void trackdb_open(void) {
    trackdb_noticeddb = open_db("noticed.db",
                               DB_DUPSORT, DB_BTREE, DB_CREATE, 0666);
    /* Stash the database version */
-  if(newdb)
-    trackdb_set_global("_dbversion", DBVERSION, 0);
+  if(newdb) {
+    char buf[32];
+
+    snprintf(buf, sizeof buf, "%ld", config->dbversion);
+    trackdb_set_global("_dbversion", buf, 0);
+  }
    D(("opened databases"));
  }
  
diff --git a/server/trackdb.h b/server/trackdb.h

index 854f63c7daa1bc7c8dff297040e497dbae41725b..fe43474ca0b1652b07145672c9eb6f767fb32630 100644 (file)
--- a/server/trackdb.h
+++ b/server/trackdb.h
@@ -23,9 +23,6 @@
  
  struct ev_source;
  
-/* Database version string */
-#define DBVERSION "2.0"
-
  extern const struct cache_type cache_files_type;
  extern unsigned long cache_files_hits, cache_files_misses;
  /* Cache entry type and tracking for regexp-based lookups */
author	Richard Kettlewell <rjk@greenend.org.uk>
	Tue, 20 Nov 2007 18:13:56 +0000 (18:13 +0000)
committer	Richard Kettlewell <rjk@greenend.org.uk>
	Tue, 20 Nov 2007 18:13:56 +0000 (18:13 +0000)
lib/configuration.c		patch \| blob \| blame \| history
lib/configuration.h		patch \| blob \| blame \| history
lib/test.c		patch \| blob \| blame \| history
lib/unicode.c		patch \| blob \| blame \| history
lib/unicode.h		patch \| blob \| blame \| history
lib/vector.h		patch \| blob \| blame \| history
lib/words.c		patch \| blob \| blame \| history
server/Makefile.am		patch \| blob \| blame \| history
server/README.dbversions	[new file with mode: 0644]	patch \| blob
server/rescan.c		patch \| blob \| blame \| history
server/trackdb.c		patch \| blob \| blame \| history
server/trackdb.h		patch \| blob \| blame \| history