normalize tags and exercise this

[disorder] / server / trackdb.c
diff --git a/server/trackdb.c b/server/trackdb.c

index 57ebb68267d3c8217c97fb7e9b303b8933bfb19e..5ffa19e4844bde5007a0b677362c1ab24ccf1201 100644 (file)
--- a/server/trackdb.c
+++ b/server/trackdb.c
@@ -17,6 +17,8 @@
   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
   * USA
   */
+/** @file server/trackdb.c
+ * @brief Track database */
  
  #include <config.h>
  #include "types.h"
@@ -45,7 +47,6 @@
  #include "configuration.h"
  #include "syscalls.h"
  #include "wstat.h"
-#include "words.h"
  #include "printf.h"
  #include "filepart.h"
  #include "trackname.h"
@@ -54,6 +55,8 @@
  #include "cache.h"
  #include "eventlog.h"
  #include "hash.h"
+#include "unicode.h"
+#include "unidata.h"
  
  #define RESCAN "disorder-rescan"
  #define DEADLOCK "disorder-deadlock"
@@ -64,26 +67,75 @@ static const char *getpart(const char *track,
                             const struct kvp *p,
                             int *used_db);
  static int trackdb_alltags_tid(DB_TXN *tid, char ***taglistp);
-static int trackdb_get_global_tid(const char *name,
-                                  DB_TXN *tid,
-                                  const char **rp);
  static char **trackdb_new_tid(int *ntracksp,
                                int maxtracks,
                                DB_TXN *tid);
  static int trackdb_expire_noticed_tid(time_t earliest, DB_TXN *tid);
+static char *normalize_tag(const char *s, size_t ns);
  
  const struct cache_type cache_files_type = { 86400 };
  unsigned long cache_files_hits, cache_files_misses;
  
+/** @brief Set by trackdb_open() */
+int trackdb_existing_database;
+
  /* setup and teardown ********************************************************/
  
  static const char *home;                /* home had better not change */
  DB_ENV *trackdb_env;                   /* db environment */
-DB *trackdb_tracksdb;                  /* the db itself */
-DB *trackdb_prefsdb;                   /* preferences */
-DB *trackdb_searchdb;                  /* the search database */
+
+/** @brief The tracks database
+ * - Keys are UTF-8(NFC(unicode(path name)))
+ * - Values are encoded key-value pairs
+ * - Data is reconstructable data about tracks that currently exist
+ */
+DB *trackdb_tracksdb;
+
+/** @brief The preferences database
+ *
+ * - Keys are UTF-8(NFC(unicode(path name)))
+ * - Values are encoded key-value pairs
+ * - Data is user data about tracks (that might not exist any more)
+ * and cannot be reconstructed
+ */
+DB *trackdb_prefsdb;
+
+/** @brief The search database
+ *
+ * - Keys are UTF-8(NFKC(casefold(search term)))
+ * - Values are UTF-8(NFC(unicode(path name)))
+ * - There can be more than one value per key
+ * - Presence of key,value means that path matches the search terms
+ * - Only tracks found in @ref trackdb_tracksdb are represented here
+ * - This database can be reconstructed, it contains no user data
+ */
+DB *trackdb_searchdb;
+
+/** @brief The tags database
+ *
+ * - Keys are UTF-8(NFKC(casefold(tag)))
+ * - Values are UTF-8(NFC(unicode(path name)))
+ * - There can be more than one value per key
+ * - Presence of key,value means that path matches the tag
+ * - This is always in sync with the tags preference
+ * - This database can be reconstructed, it contains no user data
+ */
  DB *trackdb_tagsdb;                    /* the tags database */
+
+/** @brief The global preferences database
+ * - Keys are UTF-8(NFC(preference))
+ * - Values are global preference values
+ * - Data is user data and cannot be reconstructed
+ */
  DB *trackdb_globaldb;                   /* global preferences */
+
+/** @brief The noticed database
+ * - Keys are 64-bit big-endian timestamps
+ * - Values are UTF-8(NFC(unicode(path name)))
+ * - There can be more than one value per key
+ * - Presence of key,value means that path was added at the given time
+ * - Data cannot be reconstructed (but isn't THAT important)
+ */
  DB *trackdb_noticeddb;                   /* when track noticed */
  static pid_t db_deadlock_pid = -1;      /* deadlock manager PID */
  static pid_t rescan_pid = -1;           /* rescanner PID */
@@ -99,9 +151,17 @@ static int compare(DB attribute((unused)) *db_,
    return compare_path_raw(a->data, a->size, b->data, b->size);
  }
  
-/* open environment */
-void trackdb_init(int recover) {
+/** @brief Open database environment
+ * @param flags Flags word
+ *
+ * Flags should be one of:
+ * - @ref TRACKDB_NO_RECOVER
+ * - @ref TRACKDB_NORMAL_RECOVER
+ * - @ref TRACKDB_FATAL_RECOVER
+ */
+void trackdb_init(int flags) {
    int err;
+  const int recover = flags & TRACKDB_RECOVER_MASK;
    static int recover_type[] = { 0, DB_RECOVER, DB_RECOVER_FATAL };
  
    /* sanity checks */
@@ -162,7 +222,8 @@ static pid_t subprogram(ev_source *ev, const char *prog,
    /* If we're in the background then trap subprocess stdout/stderr */
    if(!(pid = xfork())) {
      exitfn = _exit;
-    ev_signal_atfork(ev);
+    if(ev)
+      ev_signal_atfork(ev);
      signal(SIGPIPE, SIG_DFL);
      if(outputfd != -1) {
        xdup2(outputfd, 1);
@@ -231,16 +292,86 @@ static DB *open_db(const char *path,
      if((err = db->set_bt_compare(db, compare)))
        fatal(0, "db->set_bt_compare %s: %s", path, db_strerror(err));
    if((err = db->open(db, 0, path, 0, dbtype,
-                     openflags | DB_AUTO_COMMIT, mode)))
-    fatal(0, "db->open %s: %s", path, db_strerror(err));
+                     openflags | DB_AUTO_COMMIT, mode))) {
+    if((openflags & DB_CREATE) || errno != ENOENT)
+      fatal(0, "db->open %s: %s", path, db_strerror(err));
+    db->close(db, 0);
+    db = 0;
+  }
    return db;
  }
  
-/* open track databases */
-void trackdb_open(void) {
+/** @brief Open track databases
+ * @param flags Flags word
+ *
+ * @p flags should be one of:
+ * - @p TRACKDB_NO_UPGRADE, if no upgrade should be attempted
+ * - @p TRACKDB_CAN_UPGRADE, if an upgrade may be attempted
+ * - @p TRACKDB_OPEN_FOR_UPGRADE, if this is disorder-dbupgrade
+ */
+void trackdb_open(int flags) {
+  int err;
+  pid_t pid;
+
    /* sanity checks */
    assert(opened == 0);
    ++opened;
+  /* check the database version first */
+  trackdb_globaldb = open_db("global.db", 0, DB_HASH, 0, 0666);
+  if(trackdb_globaldb) {
+    /* This is an existing database */
+    const char *s;
+    long oldversion;
+
+    s = trackdb_get_global("_dbversion");
+    /* Close the database again, we'll open it properly below */
+    if((err = trackdb_globaldb->close(trackdb_globaldb, 0)))
+      fatal(0, "error closing global.db: %s", db_strerror(err));
+    trackdb_globaldb = 0;
+    /* Convert version string to an integer */
+    oldversion = s ? atol(s) : 1;
+    if(oldversion > config->dbversion) {
+      /* Database is from the future; we never allow this. */
+      fatal(0, "this version of DisOrder is too old for database version %ld",
+            oldversion);
+    }
+    if(oldversion < config->dbversion) {
+      /* Database version is out of date */
+      switch(flags & TRACKDB_UPGRADE_MASK) {
+      case TRACKDB_NO_UPGRADE:
+        /* This database needs upgrading but this is not permitted */
+        fatal(0, "database needs upgrading from %ld to %ld",
+              oldversion, config->dbversion);
+      case TRACKDB_CAN_UPGRADE:
+        /* This database needs upgrading */
+        info("invoking disorder-dbupgrade to upgrade from %ld to %ld",
+             oldversion, config->dbversion);
+        pid = subprogram(0, "disorder-dbupgrade", -1);
+        while(waitpid(pid, &err, 0) == -1 && errno == EINTR)
+          ;
+        if(err)
+          fatal(0, "disorder-dbupgrade %s", wstat(err));
+        info("disorder-dbupgrade succeeded");
+        break;
+      case TRACKDB_OPEN_FOR_UPGRADE:
+        break;
+      default:
+        abort();
+      }
+    }
+    if(oldversion == config->dbversion && (flags & TRACKDB_OPEN_FOR_UPGRADE)) {
+      /* This doesn't make any sense */
+      fatal(0, "database is already at current version");
+    }
+    trackdb_existing_database = 1;
+  } else {
+    if(flags & TRACKDB_OPEN_FOR_UPGRADE) {
+      /* Cannot upgrade a new database */
+      fatal(0, "cannot upgrade a database that does not exist");
+    }
+    /* This is a brand new database */
+    trackdb_existing_database = 0;
+  }
    /* open the databases */
    trackdb_tracksdb = open_db("tracks.db",
                               DB_RECNUM, DB_BTREE, DB_CREATE, 0666);
@@ -252,6 +383,14 @@ void trackdb_open(void) {
    trackdb_globaldb = open_db("global.db", 0, DB_HASH, DB_CREATE, 0666);
    trackdb_noticeddb = open_db("noticed.db",
                               DB_DUPSORT, DB_BTREE, DB_CREATE, 0666);
+  if(!trackdb_existing_database) {
+    /* Stash the database version */
+    char buf[32];
+
+    assert(!(flags & TRACKDB_OPEN_FOR_UPGRADE));
+    snprintf(buf, sizeof buf, "%ld", config->dbversion);
+    trackdb_set_global("_dbversion", buf, 0);
+  }
    D(("opened databases"));
  }
  
@@ -492,24 +631,102 @@ static int is_display_pref(const char *name) {
    return !strncmp(name, prefix, (sizeof prefix) - 1);
  }
  
+/** @brief Word_Break tailoring under which an underscore acts as a space */
+static int tailor_underscore_Word_Break_Other(uint32_t c) {
+  /* U+005F LOW LINE (SPACING UNDERSCORE) is forced to Word_Break=Other */
+  if(c == 0x005F)
+    return unicode_Word_Break_Other;
+  /* -1 for every other code point; NOTE(review): presumably this means "not
+   * tailored" to the word splitter - confirm against utf32_word_split() */
+  return -1;
+}
+
+/** @brief Delete every combining character from a UTF-32 string, in place
+ * @param s Pointer to start of string
+ * @param ns Length of string
+ * @return New, possibly reduced, length
+ */
+static size_t remove_combining_chars(uint32_t *s, size_t ns) {
+  size_t in, out = 0;
+
+  for(in = 0; in < ns; ++in) {
+    /* Only code points whose combining class is zero are kept */
+    if(utf32_combining_class(s[in]) == 0)
+      s[out++] = s[in];
+  }
+  return out;
+}
+
+/** @brief Case-fold @p s, drop combining characters and split it into words */
+static void word_split(struct vector *v,
+                       const char *s,
+                       unicode_property_tailor *pt) {
+  size_t nwords, len32, n;
+  uint32_t *folded, **w;
+
+  /* UTF-8 -> UTF-32; nothing is appended if the conversion fails */
+  folded = utf8_to_utf32(s, strlen(s), &len32);
+  if(!folded) return;
+  /* Compatibility case-fold, so case distinctions disappear */
+  folded = utf32_casefold_compat(folded, len32, &len32);
+  if(!folded) return;
+  /* Discard combining characters */
+  len32 = remove_combining_chars(folded, len32);
+  /* Break into words; @p pt (which callers may pass as 0) tailors Word_Break */
+  w = utf32_word_split(folded, len32, &nwords, pt);
+  /* Each word is converted back to UTF-8 and appended to @p v */
+  for(n = 0; n < nwords; ++n)
+    vector_append(v, utf32_to_utf8(w[n], utf32_len(w[n]), 0));
+}
+
+/** @brief Reduce a tag to its canonical form
+ * @param s Tag
+ * @param ns Length of tag
+ * @return Normalized string or NULL on error
+ *
+ * The normalized string is built by:
+ * - compatibility case-folding the tag
+ * - deleting every combining character
+ * - splitting the result into words (no Word_Break tailoring)
+ * - joining those words with a single U+0020 SPACE between each pair
+ */
+static char *normalize_tag(const char *s, size_t ns) {
+  struct dynstr out[1];
+  uint32_t *t32, **w;
+  size_t nt32, nw, n;
+
+  t32 = utf8_to_utf32(s, ns, &nt32);
+  if(t32)
+    t32 = utf32_casefold_compat(t32, nt32, &nt32); /* ->NFKD */
+  if(!t32) return 0;                    /* conversion or case-folding failed */
+  nt32 = remove_combining_chars(t32, nt32);
+  /* Word boundaries come from the untailored Word_Break rules */
+  w = utf32_word_split(t32, nt32, &nw, 0);
+  /* Join the words again, one U+0020 between neighbours */
+  dynstr_init(out);
+  for(n = 0; n < nw; ++n) {
+    if(n > 0)
+      dynstr_append(out, ' ');
+    dynstr_append_string(out, utf32_to_utf8(w[n], utf32_len(w[n]), 0));
+  }
+  dynstr_terminate(out);
+  return out->vec;
+}
+
  /* compute the words of a track name */
  static char **track_to_words(const char *track,
                               const struct kvp *p) {
    struct vector v;
-  char **w;
-  int nw;
    const char *rootless = track_rootless(track);
  
    if(!rootless)
      rootless = track;                   /* bodge */
    vector_init(&v);
-  if((w = words(casefold(strip_extension(rootless)), &nw)))
-    vector_append_many(&v, w, nw);
-
+  rootless = strip_extension(rootless);
+  word_split(&v, strip_extension(rootless), tailor_underscore_Word_Break_Other);
    for(; p; p = p->next)
      if(is_display_pref(p->name))
-      if((w = words(casefold(p->value), &nw)))
-        vector_append_many(&v, w, nw);
+      word_split(&v, p->value, 0);
    vector_terminate(&v);
    return dedupe(v.vec, v.nvec);
  }
@@ -561,7 +778,8 @@ static char **parsetags(const char *s) {
        /* strip trailing spaces */
        while(s > t && s[-1] == ' ')
          --s;
-      vector_append(&v, xstrndup(t, s - t));
+      /* add tag to list */
+      vector_append(&v, normalize_tag(t, (size_t)(s - t)));
        /* skip intermediate and trailing separators */
        while(*s && (!tagchar(*s) || *s == ' '))
          ++s;
@@ -701,6 +919,9 @@ int trackdb_notice(const char *track,
  }
  
  /** @brief notice a possibly new track
+ * @param track NFC UTF-8 track name
+ * @param path Raw path name
+ * @param tid Transaction ID
   * @return @c DB_NOTFOUND if new, 0 if already known, @c DB_LOCK_DEADLOCK also
   */
  int trackdb_notice_tid(const char *track,
@@ -880,11 +1101,43 @@ static int get_stats(struct vector *v,
    return 0;
  }
  
+/** @brief One entry in the search league */
  struct search_entry {
    char *word;
    int n;
  };
  
+/** @brief Add a word to the search league
+ * @param se Pointer to search league (kept in descending order of count)
+ * @param count Maximum size for search league; nothing is added unless > 0
+ * @param nse Current size of search league
+ * @param word New word, or NULL (in which case nothing is added)
+ * @param n How often @p word appears
+ * @return New size of search league
+ */
+static int register_search_entry(struct search_entry *se,
+                                 int count,
+                                 int nse,
+                                 char *word,
+                                 int n) {
+  int i;
+
+  if(word && count > 0 && (nse < count || n > se[nse - 1].n)) {
+    /* Full league: reuse the last (lowest) slot; otherwise grow by one */
+    if(nse == count)
+      i = nse - 1;
+    else
+      i = nse++;
+    /* Move up past every entry with a strictly smaller count */
+    while(i > 0 && n > se[i - 1].n)
+      --i;
+    memmove(&se[i + 1], &se[i], (nse - i - 1) * sizeof *se);
+    se[i].word = word;
+    se[i].n = n;
+  }
+  return nse;
+}
+
  /* find the top COUNT words in the search database */
  static int search_league(struct vector *v, int count, DB_TXN *tid) {
    struct search_entry *se;
@@ -897,25 +1150,14 @@ static int search_league(struct vector *v, int count, DB_TXN *tid) {
  
    cursor = trackdb_opencursor(trackdb_searchdb, tid);
    se = xmalloc(count * sizeof *se);
+  /* Walk across the whole database counting up the number of times each
+   * word appears. */
    while(!(err = cursor->c_get(cursor, prepare_data(&k), prepare_data(&d),
                                DB_NEXT))) {
      if(word && wl == k.size && !strncmp(word, k.data, wl))
-      ++n;
+      ++n;                              /* same word again */
      else {
-#define FINALIZE() do {                                                \
-  if(word && (nse < count || n > se[nse - 1].n)) {             \
-    if(nse == count)                                           \
-      i = nse - 1;                                             \
-    else                                                       \
-      i = nse++;                                               \
-    while(i > 0 && n > se[i - 1].n)                            \
-      --i;                                                     \
-    memmove(&se[i + 1], &se[i], (nse - i) * sizeof *se);       \
-    se[i].word = word;                                         \
-    se[i].n = n;                                               \
-  }                                                            \
-} while(0)
-      FINALIZE();
+      nse = register_search_entry(se, count, nse, word, n);
        word = xstrndup(k.data, wl = k.size);
        n = 1;
      }
@@ -932,7 +1174,7 @@ static int search_league(struct vector *v, int count, DB_TXN *tid) {
    }
    if(trackdb_closecursor(cursor)) err = DB_LOCK_DEADLOCK;
    if(err) return err;
-  FINALIZE();
+  nse = register_search_entry(se, count, nse, word, n);
    byte_xasprintf(&str, "Top %d search words:", nse);
    vector_append(v, str);
    for(i = 0; i < nse; ++i) {
@@ -985,7 +1227,7 @@ struct stats_details {
  
  static void stats_complete(struct stats_details *d) {
    char *s;
-  
+
    if(!(d->exited && d->closed))
      return;
    byte_xasprintf(&s, "\n"
@@ -1652,11 +1894,25 @@ char **trackdb_search(char **wordlist, int nwordlist, int *ntracks) {
    const char *dbname;
  
    *ntracks = 0;                                /* for early returns */
-  /* casefold all the words */
+  /* normalize all the words */
    w = xmalloc(nwordlist * sizeof (char *));
    for(n = 0; n < nwordlist; ++n) {
-    w[n] = casefold(wordlist[n]);
-    if(checktag(w[n])) ++ntags;         /* count up tags */
+    uint32_t *w32;
+    size_t nw32;
+    
+    w[n] = utf8_casefold_compat(wordlist[n], strlen(wordlist[n]), 0);
+    if(checktag(w[n])) {
+      ++ntags;         /* count up tags */
+      /* Normalize the tag */
+      w[n] = normalize_tag(w[n], strlen(w[n]));
+    } else {
+      /* Normalize the search term by removing combining characters */
+      if(!(w32 = utf8_to_utf32(w[n], strlen(w[n]), &nw32)))
+        return 0;
+      nw32 = remove_combining_chars(w32, nw32);
+      if(!(w[n] = utf32_to_utf8(w32, nw32, 0)))
+        return 0;
+    }
    }
    /* find the longest non-stopword */
    for(n = 0; n < nwordlist; ++n)
@@ -1817,9 +2073,9 @@ static int reap_rescan(ev_source attribute((unused)) *ev,
                         void attribute((unused)) *u) {
    if(pid == rescan_pid) rescan_pid = -1;
    if(status)
-    error(0, "disorderd-rescan: %s", wstat(status));
+    error(0, RESCAN": %s", wstat(status));
    else
-    D(("disorderd-rescan terminate: %s", wstat(status)));
+    D((RESCAN" terminated: %s", wstat(status)));
    /* Our cache of file lookups is out of date now */
    cache_clean(&cache_files_type);
    eventlog("rescanned", (char *)0);
@@ -1828,6 +2084,7 @@ static int reap_rescan(ev_source attribute((unused)) *ev,
  
  void trackdb_rescan(ev_source *ev) {
    int w;
+
    if(rescan_pid != -1) {
      error(0, "rescan already underway");
      return;
@@ -1858,27 +2115,13 @@ void trackdb_set_global(const char *name,
                          const char *value,
                          const char *who) {
    DB_TXN *tid;
-  DBT k, d;
    int err;
    int state;
  
-  memset(&k, 0, sizeof k);
-  memset(&d, 0, sizeof d);
-  k.data = (void *)name;
-  k.size = strlen(name);
-  if(value) {
-    d.data = (void *)value;
-    d.size = strlen(value);
-  }
    for(;;) {
      tid = trackdb_begin_transaction();
-    if(value)
-      err = trackdb_globaldb->put(trackdb_globaldb, tid, &k, &d, 0);
-    else
-      err = trackdb_globaldb->del(trackdb_globaldb, tid, &k, 0);
-    if(!err || err == DB_NOTFOUND) break;
-    if(err != DB_LOCK_DEADLOCK)
-      fatal(0, "error updating database: %s", db_strerror(err));
+    if(!(err = trackdb_set_global_tid(name, value, tid)))
+      break;
      trackdb_abort_transaction(tid);
    }
    trackdb_commit_transaction(tid);
@@ -1901,6 +2144,30 @@ void trackdb_set_global(const char *name,
      reqtracks = 0;
  }
  
+/** @brief Set (or, if @p value is null, delete) a global preference in @p tid
+ * @return 0 on success, DB_LOCK_DEADLOCK on deadlock; other errors are fatal */
+int trackdb_set_global_tid(const char *name,
+                           const char *value,
+                           DB_TXN *tid) {
+  DBT k, d;
+  int err;
+
+  memset(&k, 0, sizeof k);
+  memset(&d, 0, sizeof d);
+  k.data = (void *)name;
+  k.size = strlen(name);
+  if(value) {
+    d.data = (void *)value;
+    d.size = strlen(value);
+    err = trackdb_globaldb->put(trackdb_globaldb, tid, &k, &d, 0);
+  } else
+    err = trackdb_globaldb->del(trackdb_globaldb, tid, &k, 0);
+  if(err == DB_LOCK_DEADLOCK) return err;
+  if(err && err != DB_NOTFOUND)         /* deleting an absent key is harmless */
+    fatal(0, "error updating database: %s", db_strerror(err));
+  return 0;
+}
+
  const char *trackdb_get_global(const char *name) {
    DB_TXN *tid;
    int err;
@@ -1916,9 +2183,9 @@ const char *trackdb_get_global(const char *name) {
    return r;
  }
  
-static int trackdb_get_global_tid(const char *name,
-                                  DB_TXN *tid,
-                                  const char **rp) {
+int trackdb_get_global_tid(const char *name,
+                           DB_TXN *tid,
+                           const char **rp) {
    DBT k, d;
    int err;
  
@@ -1936,7 +2203,7 @@ static int trackdb_get_global_tid(const char *name,
    case DB_LOCK_DEADLOCK:
      return err;
    default:
-    fatal(0, "error updating database: %s", db_strerror(err));
+    fatal(0, "error reading database: %s", db_strerror(err));
    }
  }