{ C(checkpoint_min), &type_integer, validate_non_negative },
{ C(collection), &type_collections, validate_any },
{ C(connect), &type_stringlist, validate_addrport },
+ { C(dbversion), &type_integer, validate_positive },
{ C(device), &type_string, validate_any },
{ C(gap), &type_integer, validate_non_negative },
{ C(history), &type_integer, validate_positive },
c->short_display = 32;
c->mixer = xstrdup("/dev/mixer");
c->channel = xstrdup("pcm");
+ c->dbversion = 2;
return c;
}
/* derived values: */
int nparts; /* number of distinct name parts */
char **parts; /* name part list */
+
+ /* undocumented, for testing only */
+ long dbversion;
};
extern struct config *config;
check_string(casefold(""), "");
}
+/* Table of word-splitting test cases consumed by test_words().  Each entry
+ * pairs an input string with the words the splitter is expected to produce
+ * for it. */
+struct {
+ const char *in; /* string handed to utf8_word_split() */
+ const char *expect[10]; /* expected words, terminated by a null pointer */
+} wtest[] = {
+ /* Empty string */
+ { "", { 0 } },
+ /* Only whitespace and punctuation */
+ { " ", { 0 } },
+ { " ' ", { 0 } },
+ { " ! ", { 0 } },
+ { " \"\" ", { 0 } },
+ { " @ ", { 0 } },
+ /* Basics */
+ { "wibble", { "wibble", 0 } },
+ { " wibble", { "wibble", 0 } },
+ { " wibble ", { "wibble", 0 } },
+ { "wibble ", { "wibble", 0 } },
+ { "wibble spong", { "wibble", "spong", 0 } },
+ { " wibble spong", { "wibble", "spong", 0 } },
+ { " wibble spong ", { "wibble", "spong", 0 } },
+ { "wibble spong ", { "wibble", "spong", 0 } },
+ { "wibble spong splat foo zot ", { "wibble", "spong", "splat", "foo", "zot", 0 } },
+ /* Apostrophes */
+ { "wibble 'spong", { "wibble", "spong", 0 } },
+ { " wibble's", { "wibble's", 0 } },
+ { " wibblespong' ", { "wibblespong", 0 } },
+ { "wibble sp''ong ", { "wibble", "sp", "ong", 0 } },
+};
+/* Number of entries in wtest[] */
+#define NWTEST (sizeof wtest / sizeof *wtest)
+
+/** @brief Check utf8_word_split() against every entry of wtest[]
+ *
+ * Each input is split and the result compared word by word with the
+ * expected list.  On a mismatch, or if the split itself fails, a
+ * side-by-side report is written to stderr and count_error() is called.
+ * @c tests is incremented once per table entry.
+ */
+static void test_words(void) {
+  size_t t, nexpect, ngot, i;
+  int right;
+
+  fprintf(stderr, "test_words\n");
+  for(t = 0; t < NWTEST; ++t) {
+    char **got;
+
+    /* utf8_word_split() only stores the count when it succeeds, so give
+     * ngot a defined value in case it fails and returns a null pointer */
+    ngot = 0;
+    got = utf8_word_split(wtest[t].in, strlen(wtest[t].in), &ngot);
+
+    for(nexpect = 0; wtest[t].expect[nexpect]; ++nexpect)
+      ;
+    /* A null result is always a failure, even when no words are expected */
+    if(got && nexpect == ngot) {
+      for(i = 0; i < ngot; ++i)
+        if(strcmp(wtest[t].expect[i], got[i]))
+          break;
+      right = i == ngot;
+    } else
+      right = 0;
+    if(!right) {
+      fprintf(stderr, "word split %zu failed\n", t);
+      fprintf(stderr, "input: %s\n", wtest[t].in);
+      fprintf(stderr, " | %-30s | %-30s\n",
+              "expected", "got");
+      /* got[] is only indexed below ngot, which is 0 if the split failed */
+      for(i = 0; i < nexpect || i < ngot; ++i) {
+        const char *e = i < nexpect ? wtest[t].expect[i] : "<none>";
+        const char *g = i < ngot ? got[i] : "<none>";
+        fprintf(stderr, " %2zu | %-30s | %-30s\n", i, e, g);
+      }
+      count_error();
+    }
+    ++tests;
+  }
+}
+
/** @brief Ordering predicate for the integer heap
 * @param a Left operand
 * @param b Right operand
 * @return 1 if @p a is strictly less than @p b, otherwise 0
 */
static inline int int_lt(int a, int b) {
  return b > a;
}
/* vector.c */
/* words.c */
test_casefold();
+ test_words();
/* XXX words() */
/* wstat.c */
fprintf(stderr, "%d errors out of %d tests\n", errors, tests);
return utf32_iterator_word_boundary(it);
}
+/** @brief Break the UTF-32 string [s,ns) into its constituent words
+ * @param s Pointer to start of string
+ * @param ns Length of string
+ * @param nwp Where to store word count, or NULL
+ * @return Pointer to array of pointers to words
+ *
+ * The stretch of text between two successive word boundaries is kept as a
+ * word only if it contains at least one character whose Word_Break property
+ * is ALetter, Numeric or Katakana.
+ *
+ * The returned array is terminated by a NULL pointer and individual
+ * strings are 0-terminated.
+ */
+uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp) {
+  struct utf32_iterator_data iter[1];
+  struct vector32 found[1];
+  size_t start = 0, end = 0;
+
+  vector32_init(found);
+  utf32__iterator_init(iter, s, ns, 0);
+  /* Step through the string one position at a time; only positions that
+   * are word boundaries need any work. */
+  do {
+    size_t pos, len;
+    int keep = 0;
+    uint32_t *word;
+
+    if(!utf32_iterator_word_boundary(iter))
+      continue;
+    /* The candidate runs from the previous boundary to this one */
+    start = end;
+    end = iter->n;
+    /* Decide whether anything in the candidate makes it a real word */
+    for(pos = start; pos < end; ++pos) {
+      const int wb = utf32__word_break(iter->s[pos]);
+
+      if(wb == unicode_Word_Break_ALetter
+         || wb == unicode_Word_Break_Numeric
+         || wb == unicode_Word_Break_Katakana)
+        keep = 1;
+    }
+    if(!keep)
+      continue;
+    /* Copy it out; the extra zero-filled slot terminates the word */
+    len = end - start;
+    word = xcalloc(len + 1, sizeof(uint32_t));
+    memcpy(word, iter->s + start, len * sizeof (uint32_t));
+    vector32_append(found, word);
+  } while(!utf32_iterator_advance(iter, 1));
+  vector32_terminate(found);
+  if(nwp)
+    *nwp = found->nvec;
+  return found->vec;
+}
+
+
/*@}*/
/** @defgroup utf8 Functions that operate on UTF-8 strings */
/*@{*/
utf8__transform(utf32_casefold_compat);
}
+/** @brief Split [s,ns) into multiple words
+ * @param s Pointer to start of string
+ * @param ns Length of string
+ * @param nwp Where to store word count, or NULL
+ * @return Pointer to array of pointers to words, or a null pointer on error
+ *
+ * The string is converted to UTF-32, split with utf32_word_split(), and
+ * each resulting word is converted back to UTF-8.
+ *
+ * The returned array is terminated by a NULL pointer and individual
+ * strings are 0-terminated.  @p nwp is only written on success.
+ */
+char **utf8_word_split(const char *s, size_t ns, size_t *nwp) {
+  uint32_t *to32 = 0, **v32 = 0;
+  size_t nto32, nv = 0, n;
+  char **v8 = 0, **ret = 0;
+
+  if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error;
+  if(!(v32 = utf32_word_split(to32, nto32, &nv))) goto error;
+  /* One extra zero-filled slot acts as the terminating null pointer */
+  v8 = xcalloc(nv + 1, sizeof (char *));
+  for(n = 0; n < nv; ++n)
+    if(!(v8[n] = utf32_to_utf8(v32[n], utf32_len(v32[n]), 0)))
+      goto error;
+  ret = v8;
+  /* The count is optional for callers that only want the words */
+  if(nwp)
+    *nwp = nv;
+  v8 = 0;                               /* don't free */
+error:
+  /* v8 is only still set here if a conversion failed part way through */
+  if(v8) {
+    for(n = 0; n < nv; ++n)
+      xfree(v8[n]);
+    xfree(v8);
+  }
+  /* The UTF-32 intermediates are never returned, so always release them */
+  if(v32) {
+    for(n = 0; n < nv; ++n)
+      xfree(v32[n]);
+    xfree(v32);
+  }
+  xfree(to32);
+  return ret;
+}
+
+
+
/*@}*/
/*
int utf32_iterator_grapheme_boundary(utf32_iterator it);
int utf32_iterator_word_boundary(utf32_iterator it);
+uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp);
+char **utf8_word_split(const char *s, size_t ns, size_t *nwp);
+
/** @brief Convert 0-terminated UTF-32 to UTF-8
* @param s 0-terminated UTF-32 string
* @return 0-terminated UTF-8 string or 0 on error
VECTOR_TYPE(dynstr, char, xrealloc_noptr);
/** @brief A dynamic unicode string */
VECTOR_TYPE(dynstr_ucs4, uint32_t, xrealloc_noptr);
+/** @brief A dynamic array of pointers to unicode strings */
+VECTOR_TYPE(vector32, uint32_t *, xrealloc);
/** @brief Append many strings to a @ref vector */
void vector_append_many(struct vector *v, char **vec, int nvec);
#include "unicode.h"
+/* Case-fold the 0-terminated string ptr.  The whole string, measured with
+ * strlen(), is handed to the UTF-8 case-folding routine and that routine's
+ * result is returned unchanged. */
const char *casefold(const char *ptr) {
- return utf8_casefold_canon(ptr, strlen(ptr), 0);
+ return utf8_casefold_compat(ptr, strlen(ptr), 0);
}
-static enum unicode_General_Category cat(uint32_t c) {
- if(c < UNICODE_NCHARS) {
- const struct unidata *const ud = &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
- return ud->general_category;
- } else
- return unicode_General_Category_Cn;
-}
-
-/* XXX this is a bit kludgy */
-
char **words(const char *s, int *nvecp) {
- struct vector v;
- struct dynstr d;
- const char *start;
- uint32_t c;
- int in_word = 0;
-
- vector_init(&v);
- while(*s) {
- start = s;
- PARSE_UTF8(s, c, return 0);
- /* special cases first */
- switch(c) {
- case '/':
- case '.':
- case '+':
- case '&':
- case ':':
- case '_':
- case '-':
- goto separator;
- }
- /* do the rest on category */
- switch(cat(c)) {
- case unicode_General_Category_Ll:
- case unicode_General_Category_Lm:
- case unicode_General_Category_Lo:
- case unicode_General_Category_Lt:
- case unicode_General_Category_Lu:
- case unicode_General_Category_Nd:
- case unicode_General_Category_Nl:
- case unicode_General_Category_No:
- case unicode_General_Category_Sc:
- case unicode_General_Category_Sk:
- case unicode_General_Category_Sm:
- case unicode_General_Category_So:
- /* letters, digits and symbols are considered to be part of
- * words */
- if(!in_word) {
- dynstr_init(&d);
- in_word = 1;
- }
- dynstr_append_bytes(&d, start, s - start);
- break;
-
- case unicode_General_Category_Cc:
- case unicode_General_Category_Cf:
- case unicode_General_Category_Co:
- case unicode_General_Category_Cs:
- case unicode_General_Category_Zl:
- case unicode_General_Category_Zp:
- case unicode_General_Category_Zs:
- case unicode_General_Category_Pe:
- case unicode_General_Category_Ps:
- separator:
- if(in_word) {
- dynstr_terminate(&d);
- vector_append(&v, d.vec);
- in_word = 0;
- }
- break;
-
- case unicode_General_Category_Mc:
- case unicode_General_Category_Me:
- case unicode_General_Category_Mn:
- case unicode_General_Category_Pc:
- case unicode_General_Category_Pd:
- case unicode_General_Category_Pf:
- case unicode_General_Category_Pi:
- case unicode_General_Category_Po:
- case unicode_General_Category_Cn:
- /* control and punctuation is completely ignored */
- break;
+ size_t nv;
+ char **v;
- }
- }
- if(in_word) {
- /* pick up the final word */
- dynstr_terminate(&d);
- vector_append(&v, d.vec);
- }
- vector_terminate(&v);
- if(nvecp)
- *nvecp = v.nvec;
- return v.vec;
+ v = utf8_word_split(s, strlen(s), &nv);
+ *nvecp = nv;
+ return v;
}
/*
# for Mac OS X >=10.4
SEDFILES=uk.org.greenend.rjk.disorder.plist
include ${top_srcdir}/scripts/sedfiles.make
-EXTRA_DIST=uk.org.greenend.rjk.disorder.plist.in
+EXTRA_DIST=uk.org.greenend.rjk.disorder.plist.in README.dbversions
LAUNCHD=/Library/LaunchDaemons
#install-data-hook:
--- /dev/null
+DisOrder Database Versions
+==========================
+
+If no _dbversion global preference is found then database version 1 is
+assumed. Database versions 2 and above always have a _dbversion
+global preference.
+
+Old database versions can be PARTIALLY emulated for testing purposes
+by setting the undocumented dbversion configuration item. Setting it on
+a production system would be a terrible idea.
+
+Database Version 1
+------------------
+
+Path names are in UTF-8, but with no normalization applied: you get
+whatever the filesystem gives you.
+
+Search terms are split according to the old words() function.
+ - "/", ".", "+", "&", ":", "_" and "-" are considered to be separators
+ - anything in General_Category Cc, Cf, Co, Cs, Zl, Zp, Zs, Pe or Ps
+ is considered to be a separator
+ - anything else in General_Category Ll, Lm, Lo, Lt, Lu, Nd, Nl, No,
+ Sc, Sk, Sm or So is considered to be part of a word
+ - everything else is ignored
+
+Search terms are case-folded by applying the CaseFolding.txt mapping,
+without any attempt at normalization.
+
+Database Version 2
+------------------
+
+Path names are in UTF-8, normalized to NFC.
+
+Search terms are split according to the default Unicode word boundary
+detection algorithm.
+
+Search terms are case-folded using the Unicode case-folding algorithm,
+normalizing to NFKD.
+
+Things that haven't been done yet:
+ - undump support for new dbversion
+ - automatic upgrade from dbversion 1
error(0, "cannot convert track path to UTF-8: %s", path);
continue;
}
- /* We use NFC track names */
- if(!(track = utf8_compose_canon(track, strlen(track), 0))) {
- error(0, "cannot convert track path to NFC: %s", path);
- continue;
+ if(config->dbversion > 1) {
+ /* We use NFC track names */
+ if(!(track = utf8_compose_canon(track, strlen(track), 0))) {
+ error(0, "cannot convert track path to NFC: %s", path);
+ continue;
+ }
}
D(("track %s", track));
/* only tracks with a known player are admitted */
trackdb_globaldb = open_db("global.db", 0, DB_HASH, 0, 0666);
if(trackdb_globaldb) {
/* This is an existing database */
- const char *oldversion;
+ const char *s;
+ long oldversion;
- oldversion = trackdb_get_global("_dbversion");
- if(!oldversion)
- oldversion = "1.x";
- if(strcmp(oldversion, DBVERSION)) {
+ s = trackdb_get_global("_dbversion");
+ oldversion = s ? atol(s) : 1;
+ if(oldversion != config->dbversion) {
/* This database needs upgrading. This isn't implemented yet so we just
* fail. */
- fatal(0, "database needs upgrading from %s to %s", oldversion, DBVERSION);
+ fatal(0, "database needs upgrading from %ld to %ld",
+ oldversion, config->dbversion);
}
newdb = 0;
/* Close the database again, we'll open it properly below */
trackdb_noticeddb = open_db("noticed.db",
DB_DUPSORT, DB_BTREE, DB_CREATE, 0666);
/* Stash the database version */
- if(newdb)
- trackdb_set_global("_dbversion", DBVERSION, 0);
+ if(newdb) {
+ char buf[32];
+
+ snprintf(buf, sizeof buf, "%ld", config->dbversion);
+ trackdb_set_global("_dbversion", buf, 0);
+ }
D(("opened databases"));
}
struct ev_source;
-/* Database version string */
-#define DBVERSION "2.0"
-
extern const struct cache_type cache_files_type;
extern unsigned long cache_files_hits, cache_files_misses;
/* Cache entry type and tracking for regexp-based lookups */