From 8818b7fca12456e62410ef914a7bef250a0633c9 Mon Sep 17 00:00:00 2001 Message-Id: <8818b7fca12456e62410ef914a7bef250a0633c9.1714161298.git.mdw@distorted.org.uk> From: Mark Wooding Date: Tue, 20 Nov 2007 18:13:56 +0000 Subject: [PATCH] utf32_word_split() and utf8_word_split() splits a string into words using the UAX #29 word boundary algorithm. words() is therefore now a wrapper around this. There is scope for improvement in the use of this function as currently we do some needless converting back and forth between encoding forms. Organization: Straylight/Edgeware From: Richard Kettlewell casefold() now uses the compatibility case-folding algorithm, which seems more appropriate for searching. dbversions are now integers not strings. Some dbversion=2 functionality can be selectively disabled for testing purposes. README.dbversions documents the differences between the dbversions. --- lib/configuration.c | 2 + lib/configuration.h | 3 ++ lib/test.c | 64 +++++++++++++++++++++++++ lib/unicode.c | 92 +++++++++++++++++++++++++++++++++++ lib/unicode.h | 3 ++ lib/vector.h | 2 + lib/words.c | 100 +++------------------------------------ server/Makefile.am | 2 +- server/README.dbversions | 42 ++++++++++++++++ server/rescan.c | 10 ++-- server/trackdb.c | 21 ++++---- server/trackdb.h | 3 -- 12 files changed, 234 insertions(+), 110 deletions(-) create mode 100644 server/README.dbversions diff --git a/lib/configuration.c b/lib/configuration.c index 35ed090..221be9c 100644 --- a/lib/configuration.c +++ b/lib/configuration.c @@ -884,6 +884,7 @@ static const struct conf conf[] = { { C(checkpoint_min), &type_integer, validate_non_negative }, { C(collection), &type_collections, validate_any }, { C(connect), &type_stringlist, validate_addrport }, + { C(dbversion), &type_integer, validate_positive }, { C(device), &type_string, validate_any }, { C(gap), &type_integer, validate_non_negative }, { C(history), &type_integer, validate_positive }, @@ -1039,6 +1040,7 @@ static struct config 
*config_default(void) { c->short_display = 32; c->mixer = xstrdup("/dev/mixer"); c->channel = xstrdup("pcm"); + c->dbversion = 2; return c; } diff --git a/lib/configuration.h b/lib/configuration.h index 4ef7862..a4ffa63 100644 --- a/lib/configuration.h +++ b/lib/configuration.h @@ -246,6 +246,9 @@ struct config { /* derived values: */ int nparts; /* number of distinct name parts */ char **parts; /* name part list */ + + /* undocumented, for testing only */ + long dbversion; }; extern struct config *config; diff --git a/lib/test.c b/lib/test.c index a5e3295..0790efc 100644 --- a/lib/test.c +++ b/lib/test.c @@ -414,6 +414,69 @@ static void test_casefold(void) { check_string(casefold(""), ""); } +struct { + const char *in; + const char *expect[10]; +} wtest[] = { + /* Empty string */ + { "", { 0 } }, + /* Only whitespace and punctuation */ + { " ", { 0 } }, + { " ' ", { 0 } }, + { " ! ", { 0 } }, + { " \"\" ", { 0 } }, + { " @ ", { 0 } }, + /* Basics */ + { "wibble", { "wibble", 0 } }, + { " wibble", { "wibble", 0 } }, + { " wibble ", { "wibble", 0 } }, + { "wibble ", { "wibble", 0 } }, + { "wibble spong", { "wibble", "spong", 0 } }, + { " wibble spong", { "wibble", "spong", 0 } }, + { " wibble spong ", { "wibble", "spong", 0 } }, + { "wibble spong ", { "wibble", "spong", 0 } }, + { "wibble spong splat foo zot ", { "wibble", "spong", "splat", "foo", "zot", 0 } }, + /* Apostrophes */ + { "wibble 'spong", { "wibble", "spong", 0 } }, + { " wibble's", { "wibble's", 0 } }, + { " wibblespong' ", { "wibblespong", 0 } }, + { "wibble sp''ong ", { "wibble", "sp", "ong", 0 } }, +}; +#define NWTEST (sizeof wtest / sizeof *wtest) + +static void test_words(void) { + size_t t, nexpect, ngot, i; + int right; + + fprintf(stderr, "test_words\n"); + for(t = 0; t < NWTEST; ++t) { + char **got = utf8_word_split(wtest[t].in, strlen(wtest[t].in), &ngot); + + for(nexpect = 0; wtest[t].expect[nexpect]; ++nexpect) + ; + if(nexpect == ngot) { + for(i = 0; i < ngot; ++i) + 
if(strcmp(wtest[t].expect[i], got[i])) + break; + right = i == ngot; + } else + right = 0; + if(!right) { + fprintf(stderr, "word split %zu failed\n", t); + fprintf(stderr, "input: %s\n", wtest[t].in); + fprintf(stderr, " | %-30s | %-30s\n", + "expected", "got"); + for(i = 0; i < nexpect || i < ngot; ++i) { + const char *e = i < nexpect ? wtest[t].expect[i] : ""; + const char *g = i < ngot ? got[i] : ""; + fprintf(stderr, " %2zu | %-30s | %-30s\n", i, e, g); + } + count_error(); + } + ++tests; + } +} + /** @brief Less-than comparison function for integer heap */ static inline int int_lt(int a, int b) { return a < b; } @@ -657,6 +720,7 @@ int main(void) { /* vector.c */ /* words.c */ test_casefold(); + test_words(); /* XXX words() */ /* wstat.c */ fprintf(stderr, "%d errors out of %d tests\n", errors, tests); diff --git a/lib/unicode.c b/lib/unicode.c index 4f4f2ca..b5b520c 100644 --- a/lib/unicode.c +++ b/lib/unicode.c @@ -1271,6 +1271,59 @@ int utf32_is_word_boundary(const uint32_t *s, size_t ns, size_t n) { return utf32_iterator_word_boundary(it); } +/** @brief Split [s,ns) into multiple words + * @param s Pointer to start of string + * @param ns Length of string + * @param nwp Where to store word count, or NULL + * @return Pointer to array of pointers to words + * + * The returned array is terminated by a NULL pointer and individual + * strings are 0-terminated. + */ +uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp) { + struct utf32_iterator_data it[1]; + size_t b1 = 0, b2 = 0 ,i; + int isword; + struct vector32 v32[1]; + uint32_t *w; + + vector32_init(v32); + utf32__iterator_init(it, s, ns, 0); + /* Work our way through the string stopping at each word break. 
*/ + do { + if(utf32_iterator_word_boundary(it)) { + /* We've found a new boundary */ + b1 = b2; + b2 = it->n; + /*fprintf(stderr, "[%zu, %zu) is a candidate word\n", b1, b2);*/ + /* Inspect the characters between the boundary and form an opinion as to + * whether they are a word or not */ + isword = 0; + for(i = b1; i < b2; ++i) { + switch(utf32__word_break(it->s[i])) { + case unicode_Word_Break_ALetter: + case unicode_Word_Break_Numeric: + case unicode_Word_Break_Katakana: + isword = 1; + break; + default: + break; + } + } + /* If it's a word add it to the list of results */ + if(isword) { + w = xcalloc(b2 - b1 + 1, sizeof(uint32_t)); + memcpy(w, it->s + b1, (b2 - b1) * sizeof (uint32_t)); + vector32_append(v32, w); + } + } + } while(!utf32_iterator_advance(it, 1)); + vector32_terminate(v32); + if(nwp) + *nwp = v32->nvec; + return v32->vec; +} + /*@}*/ /** @defgroup utf8 Functions that operate on UTF-8 strings */ /*@{*/ @@ -1411,6 +1464,45 @@ char *utf8_casefold_compat(const char *s, size_t ns, size_t *ndp) { utf8__transform(utf32_casefold_compat); } +/** @brief Split [s,ns) into multiple words + * @param s Pointer to start of string + * @param ns Length of string + * @param nwp Where to store word count, or NULL + * @return Pointer to array of pointers to words + * + * The returned array is terminated by a NULL pointer and individual + * strings are 0-terminated. 
+ */ +char **utf8_word_split(const char *s, size_t ns, size_t *nwp) { + uint32_t *to32 = 0, **v32 = 0; + size_t nto32, nv, n; + char **v8 = 0, **ret = 0; + + if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error; + if(!(v32 = utf32_word_split(to32, nto32, &nv))) goto error; + v8 = xcalloc(sizeof (char *), nv + 1); + for(n = 0; n < nv; ++n) + if(!(v8[n] = utf32_to_utf8(v32[n], utf32_len(v32[n]), 0))) + goto error; + ret = v8; + *nwp = nv; + v8 = 0; /* don't free */ +error: + if(v8) { + for(n = 0; n < nv; ++n) + xfree(v8[n]); + xfree(v8); + } + if(v32) { + for(n = 0; n < nv; ++n) + xfree(v32[n]); + xfree(v32); + } + xfree(to32); + return ret; +} + + /*@}*/ /* diff --git a/lib/unicode.h b/lib/unicode.h index a996844..7f32207 100644 --- a/lib/unicode.h +++ b/lib/unicode.h @@ -74,6 +74,9 @@ uint32_t utf32_iterator_code(utf32_iterator it); int utf32_iterator_grapheme_boundary(utf32_iterator it); int utf32_iterator_word_boundary(utf32_iterator it); +uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp); +char **utf8_word_split(const char *s, size_t ns, size_t *nwp); + /** @brief Convert 0-terminated UTF-32 to UTF-8 * @param s 0-terminated UTF-32 string * @return 0-terminated UTF-8 string or 0 on error diff --git a/lib/vector.h b/lib/vector.h index 081a71d..bb944a2 100644 --- a/lib/vector.h +++ b/lib/vector.h @@ -80,6 +80,8 @@ VECTOR_TYPE(vector, char *, xrealloc); VECTOR_TYPE(dynstr, char, xrealloc_noptr); /** @brief A dynamic unicode string */ VECTOR_TYPE(dynstr_ucs4, uint32_t, xrealloc_noptr); +/** @brief A dynamic array of pointers to unicode string */ +VECTOR_TYPE(vector32, uint32_t *, xrealloc); /** @brief Append many strings to a @ref vector */ void vector_append_many(struct vector *v, char **vec, int nvec); diff --git a/lib/words.c b/lib/words.c index 2638ea6..89174cd 100644 --- a/lib/words.c +++ b/lib/words.c @@ -36,104 +36,16 @@ #include "unicode.h" const char *casefold(const char *ptr) { - return utf8_casefold_canon(ptr, strlen(ptr), 0); + 
return utf8_casefold_compat(ptr, strlen(ptr), 0); } -static enum unicode_General_Category cat(uint32_t c) { - if(c < UNICODE_NCHARS) { - const struct unidata *const ud = &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS]; - return ud->general_category; - } else - return unicode_General_Category_Cn; -} - -/* XXX this is a bit kludgy */ - char **words(const char *s, int *nvecp) { - struct vector v; - struct dynstr d; - const char *start; - uint32_t c; - int in_word = 0; - - vector_init(&v); - while(*s) { - start = s; - PARSE_UTF8(s, c, return 0); - /* special cases first */ - switch(c) { - case '/': - case '.': - case '+': - case '&': - case ':': - case '_': - case '-': - goto separator; - } - /* do the rest on category */ - switch(cat(c)) { - case unicode_General_Category_Ll: - case unicode_General_Category_Lm: - case unicode_General_Category_Lo: - case unicode_General_Category_Lt: - case unicode_General_Category_Lu: - case unicode_General_Category_Nd: - case unicode_General_Category_Nl: - case unicode_General_Category_No: - case unicode_General_Category_Sc: - case unicode_General_Category_Sk: - case unicode_General_Category_Sm: - case unicode_General_Category_So: - /* letters, digits and symbols are considered to be part of - * words */ - if(!in_word) { - dynstr_init(&d); - in_word = 1; - } - dynstr_append_bytes(&d, start, s - start); - break; - - case unicode_General_Category_Cc: - case unicode_General_Category_Cf: - case unicode_General_Category_Co: - case unicode_General_Category_Cs: - case unicode_General_Category_Zl: - case unicode_General_Category_Zp: - case unicode_General_Category_Zs: - case unicode_General_Category_Pe: - case unicode_General_Category_Ps: - separator: - if(in_word) { - dynstr_terminate(&d); - vector_append(&v, d.vec); - in_word = 0; - } - break; - - case unicode_General_Category_Mc: - case unicode_General_Category_Me: - case unicode_General_Category_Mn: - case unicode_General_Category_Pc: - case unicode_General_Category_Pd: - case 
unicode_General_Category_Pf: - case unicode_General_Category_Pi: - case unicode_General_Category_Po: - case unicode_General_Category_Cn: - /* control and punctuation is completely ignored */ - break; + size_t nv; + char **v; - } - } - if(in_word) { - /* pick up the final word */ - dynstr_terminate(&d); - vector_append(&v, d.vec); - } - vector_terminate(&v); - if(nvecp) - *nvecp = v.nvec; - return v.vec; + v = utf8_word_split(s, strlen(s), &nv); + *nvecp = nv; + return v; } /* diff --git a/server/Makefile.am b/server/Makefile.am index e3a1a0a..bdb5071 100644 --- a/server/Makefile.am +++ b/server/Makefile.am @@ -118,7 +118,7 @@ cgi.o: ../lib/definitions.h # for Mac OS X >=10.4 SEDFILES=uk.org.greenend.rjk.disorder.plist include ${top_srcdir}/scripts/sedfiles.make -EXTRA_DIST=uk.org.greenend.rjk.disorder.plist.in +EXTRA_DIST=uk.org.greenend.rjk.disorder.plist.in README.dbversions LAUNCHD=/Library/LaunchDaemons #install-data-hook: diff --git a/server/README.dbversions b/server/README.dbversions new file mode 100644 index 0000000..4c77db6 --- /dev/null +++ b/server/README.dbversions @@ -0,0 +1,42 @@ +DisOrder Database Versions +========================== + +If no _dbversion global preference is found then database version 1 is +assumed. Database versions 2 and above always have a _dbversion +global preference. + +Old database versions can be PARTIALLY emulated for testing purposes +by setting the undocumented dbversion configuration item. Setting it on +a production system would be a terrible idea. + +Database Version 1 +------------------ + +Path names are in UTF-8, but with no normalization applied: you get +whatever the filesystem gives you. + +Search terms are split according to the old words() function. 
+ - "/", ".", "+", "&", ":", "_" and "-" are considered to be separators + - anything in General_Category Cc, Cf, Co, Cs, Zl, Zp, Zs, Pe or Ps + is considered to be a separator + - anything else in General_Category Ll, Lm, Lo, Lt, Lu, Nd, Nl, No, + Sc, Sk, Sm or So is considered to be part of a word + - everything else is ignored + +Search terms are case-folded by applying the CaseFolding.txt mapping, +without any attempt at normalization. + +Database Version 2 +------------------ + +Path names are in UTF-8, normalized to NFC. + +Search terms are split according to the default Unicode word boundary +detection algorithm. + +Search terms are case-folded using the Unicode case-folding algorithm, +normalizing to NFKD. + +Things that haven't been done yet: + - undump support for new dbversion + - automatic upgrade from dbversion 1 diff --git a/server/rescan.c b/server/rescan.c index b9be72d..cc31888 100644 --- a/server/rescan.c +++ b/server/rescan.c @@ -152,10 +152,12 @@ static void rescan_collection(const struct collection *c) { error(0, "cannot convert track path to UTF-8: %s", path); continue; } - /* We use NFC track names */ - if(!(track = utf8_compose_canon(track, strlen(track), 0))) { - error(0, "cannot convert track path to NFC: %s", path); - continue; + if(config->dbversion > 1) { + /* We use NFC track names */ + if(!(track = utf8_compose_canon(track, strlen(track), 0))) { + error(0, "cannot convert track path to NFC: %s", path); + continue; + } } D(("track %s", track)); /* only tracks with a known player are admitted */ diff --git a/server/trackdb.c b/server/trackdb.c index e1848c4..4be5f25 100644 --- a/server/trackdb.c +++ b/server/trackdb.c @@ -295,15 +295,16 @@ void trackdb_open(void) { trackdb_globaldb = open_db("global.db", 0, DB_HASH, 0, 0666); if(trackdb_globaldb) { /* This is an existing database */ - const char *oldversion; + const char *s; + long oldversion; - oldversion = trackdb_get_global("_dbversion"); - if(!oldversion) - oldversion = "1.x"; - 
if(strcmp(oldversion, DBVERSION)) { + s = trackdb_get_global("_dbversion"); + oldversion = s ? atol(s) : 1; + if(oldversion != config->dbversion) { /* This database needs upgrading. This isn't implemented yet so we just * fail. */ - fatal(0, "database needs upgrading from %s to %s", oldversion, DBVERSION); + fatal(0, "database needs upgrading from %ld to %ld", + oldversion, config->dbversion); } newdb = 0; /* Close the database again, we'll open it property below */ @@ -326,8 +327,12 @@ void trackdb_open(void) { trackdb_noticeddb = open_db("noticed.db", DB_DUPSORT, DB_BTREE, DB_CREATE, 0666); /* Stash the database version */ - if(newdb) - trackdb_set_global("_dbversion", DBVERSION, 0); + if(newdb) { + char buf[32]; + + snprintf(buf, sizeof buf, "%ld", config->dbversion); + trackdb_set_global("_dbversion", buf, 0); + } D(("opened databases")); } diff --git a/server/trackdb.h b/server/trackdb.h index 854f63c..fe43474 100644 --- a/server/trackdb.h +++ b/server/trackdb.h @@ -23,9 +23,6 @@ struct ev_source; -/* Database version string */ -#define DBVERSION "2.0" - extern const struct cache_type cache_files_type; extern unsigned long cache_files_hits, cache_files_misses; /* Cache entry type and tracking for regexp-based lookups */ -- [mdw]