From: Richard Kettlewell Date: Tue, 20 Nov 2007 18:13:56 +0000 (+0000) Subject: utf32_word_split() and utf8_word_split() splits a string into words X-Git-Tag: debian-1_5_99dev9~1^2~19 X-Git-Url: https://www.chiark.greenend.org.uk/ucgi/~mdw/git/disorder/commitdiff_plain/8818b7fca12456e62410ef914a7bef250a0633c9?hp=7bbe944b70a8a904dd15905fbf351b5e906224ff utf32_word_split() and utf8_word_split() splits a string into words using the UAX #29 word boundary algorithm. words() is therefore now a wrapper around this. There is scope for improvement in the use of this function as currently we do some needless converting back and forth between encoding forms. casefold() now uses the compatibility case-folding algorithm, which seems more appropriate for searching. dbversions are now integers not strings. Some dbversion=2 functionality can be selectively disabled for testing purposes. README.dbversions documents the differences between the dbversions. --- diff --git a/lib/configuration.c b/lib/configuration.c index 35ed090..221be9c 100644 --- a/lib/configuration.c +++ b/lib/configuration.c @@ -884,6 +884,7 @@ static const struct conf conf[] = { { C(checkpoint_min), &type_integer, validate_non_negative }, { C(collection), &type_collections, validate_any }, { C(connect), &type_stringlist, validate_addrport }, + { C(dbversion), &type_integer, validate_positive }, { C(device), &type_string, validate_any }, { C(gap), &type_integer, validate_non_negative }, { C(history), &type_integer, validate_positive }, @@ -1039,6 +1040,7 @@ static struct config *config_default(void) { c->short_display = 32; c->mixer = xstrdup("/dev/mixer"); c->channel = xstrdup("pcm"); + c->dbversion = 2; return c; } diff --git a/lib/configuration.h b/lib/configuration.h index 4ef7862..a4ffa63 100644 --- a/lib/configuration.h +++ b/lib/configuration.h @@ -246,6 +246,9 @@ struct config { /* derived values: */ int nparts; /* number of distinct name parts */ char **parts; /* name part list */ + + /* undocumented, for 
testing only */ + long dbversion; }; extern struct config *config; diff --git a/lib/test.c b/lib/test.c index a5e3295..0790efc 100644 --- a/lib/test.c +++ b/lib/test.c @@ -414,6 +414,69 @@ static void test_casefold(void) { check_string(casefold(""), ""); } +struct { + const char *in; + const char *expect[10]; +} wtest[] = { + /* Empty string */ + { "", { 0 } }, + /* Only whitespace and punctuation */ + { " ", { 0 } }, + { " ' ", { 0 } }, + { " ! ", { 0 } }, + { " \"\" ", { 0 } }, + { " @ ", { 0 } }, + /* Basics */ + { "wibble", { "wibble", 0 } }, + { " wibble", { "wibble", 0 } }, + { " wibble ", { "wibble", 0 } }, + { "wibble ", { "wibble", 0 } }, + { "wibble spong", { "wibble", "spong", 0 } }, + { " wibble spong", { "wibble", "spong", 0 } }, + { " wibble spong ", { "wibble", "spong", 0 } }, + { "wibble spong ", { "wibble", "spong", 0 } }, + { "wibble spong splat foo zot ", { "wibble", "spong", "splat", "foo", "zot", 0 } }, + /* Apostrophes */ + { "wibble 'spong", { "wibble", "spong", 0 } }, + { " wibble's", { "wibble's", 0 } }, + { " wibblespong' ", { "wibblespong", 0 } }, + { "wibble sp''ong ", { "wibble", "sp", "ong", 0 } }, +}; +#define NWTEST (sizeof wtest / sizeof *wtest) + +static void test_words(void) { + size_t t, nexpect, ngot, i; + int right; + + fprintf(stderr, "test_words\n"); + for(t = 0; t < NWTEST; ++t) { + char **got = utf8_word_split(wtest[t].in, strlen(wtest[t].in), &ngot); + + for(nexpect = 0; wtest[t].expect[nexpect]; ++nexpect) + ; + if(nexpect == ngot) { + for(i = 0; i < ngot; ++i) + if(strcmp(wtest[t].expect[i], got[i])) + break; + right = i == ngot; + } else + right = 0; + if(!right) { + fprintf(stderr, "word split %zu failed\n", t); + fprintf(stderr, "input: %s\n", wtest[t].in); + fprintf(stderr, " | %-30s | %-30s\n", + "expected", "got"); + for(i = 0; i < nexpect || i < ngot; ++i) { + const char *e = i < nexpect ? wtest[t].expect[i] : ""; + const char *g = i < ngot ? 
got[i] : ""; + fprintf(stderr, " %2zu | %-30s | %-30s\n", i, e, g); + } + count_error(); + } + ++tests; + } +} + /** @brief Less-than comparison function for integer heap */ static inline int int_lt(int a, int b) { return a < b; } @@ -657,6 +720,7 @@ int main(void) { /* vector.c */ /* words.c */ test_casefold(); + test_words(); /* XXX words() */ /* wstat.c */ fprintf(stderr, "%d errors out of %d tests\n", errors, tests); diff --git a/lib/unicode.c b/lib/unicode.c index 4f4f2ca..b5b520c 100644 --- a/lib/unicode.c +++ b/lib/unicode.c @@ -1271,6 +1271,59 @@ int utf32_is_word_boundary(const uint32_t *s, size_t ns, size_t n) { return utf32_iterator_word_boundary(it); } +/** @brief Split [s,ns) into multiple words + * @param s Pointer to start of string + * @param ns Length of string + * @param nwp Where to store word count, or NULL + * @return Pointer to array of pointers to words + * + * The returned array is terminated by a NULL pointer and individual + * strings are 0-terminated. + */ +uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp) { + struct utf32_iterator_data it[1]; + size_t b1 = 0, b2 = 0 ,i; + int isword; + struct vector32 v32[1]; + uint32_t *w; + + vector32_init(v32); + utf32__iterator_init(it, s, ns, 0); + /* Work our way through the string stopping at each word break. 
*/ + do { + if(utf32_iterator_word_boundary(it)) { + /* We've found a new boundary */ + b1 = b2; + b2 = it->n; + /*fprintf(stderr, "[%zu, %zu) is a candidate word\n", b1, b2);*/ + /* Inspect the characters between the boundary and form an opinion as to + * whether they are a word or not */ + isword = 0; + for(i = b1; i < b2; ++i) { + switch(utf32__word_break(it->s[i])) { + case unicode_Word_Break_ALetter: + case unicode_Word_Break_Numeric: + case unicode_Word_Break_Katakana: + isword = 1; + break; + default: + break; + } + } + /* If it's a word add it to the list of results */ + if(isword) { + w = xcalloc(b2 - b1 + 1, sizeof(uint32_t)); + memcpy(w, it->s + b1, (b2 - b1) * sizeof (uint32_t)); + vector32_append(v32, w); + } + } + } while(!utf32_iterator_advance(it, 1)); + vector32_terminate(v32); + if(nwp) + *nwp = v32->nvec; + return v32->vec; +} + /*@}*/ /** @defgroup utf8 Functions that operate on UTF-8 strings */ /*@{*/ @@ -1411,6 +1464,45 @@ char *utf8_casefold_compat(const char *s, size_t ns, size_t *ndp) { utf8__transform(utf32_casefold_compat); } +/** @brief Split [s,ns) into multiple words + * @param s Pointer to start of string + * @param ns Length of string + * @param nwp Where to store word count, or NULL + * @return Pointer to array of pointers to words + * + * The returned array is terminated by a NULL pointer and individual + * strings are 0-terminated. 
+ */ +char **utf8_word_split(const char *s, size_t ns, size_t *nwp) { + uint32_t *to32 = 0, **v32 = 0; + size_t nto32, nv, n; + char **v8 = 0, **ret = 0; + + if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error; + if(!(v32 = utf32_word_split(to32, nto32, &nv))) goto error; + v8 = xcalloc(sizeof (char *), nv + 1); + for(n = 0; n < nv; ++n) + if(!(v8[n] = utf32_to_utf8(v32[n], utf32_len(v32[n]), 0))) + goto error; + ret = v8; + *nwp = nv; + v8 = 0; /* don't free */ +error: + if(v8) { + for(n = 0; n < nv; ++n) + xfree(v8[n]); + xfree(v8); + } + if(v32) { + for(n = 0; n < nv; ++n) + xfree(v32[n]); + xfree(v32); + } + xfree(to32); + return ret; +} + + /*@}*/ /* diff --git a/lib/unicode.h b/lib/unicode.h index a996844..7f32207 100644 --- a/lib/unicode.h +++ b/lib/unicode.h @@ -74,6 +74,9 @@ uint32_t utf32_iterator_code(utf32_iterator it); int utf32_iterator_grapheme_boundary(utf32_iterator it); int utf32_iterator_word_boundary(utf32_iterator it); +uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp); +char **utf8_word_split(const char *s, size_t ns, size_t *nwp); + /** @brief Convert 0-terminated UTF-32 to UTF-8 * @param s 0-terminated UTF-32 string * @return 0-terminated UTF-8 string or 0 on error diff --git a/lib/vector.h b/lib/vector.h index 081a71d..bb944a2 100644 --- a/lib/vector.h +++ b/lib/vector.h @@ -80,6 +80,8 @@ VECTOR_TYPE(vector, char *, xrealloc); VECTOR_TYPE(dynstr, char, xrealloc_noptr); /** @brief A dynamic unicode string */ VECTOR_TYPE(dynstr_ucs4, uint32_t, xrealloc_noptr); +/** @brief A dynamic array of pointers to unicode string */ +VECTOR_TYPE(vector32, uint32_t *, xrealloc); /** @brief Append many strings to a @ref vector */ void vector_append_many(struct vector *v, char **vec, int nvec); diff --git a/lib/words.c b/lib/words.c index 2638ea6..89174cd 100644 --- a/lib/words.c +++ b/lib/words.c @@ -36,104 +36,16 @@ #include "unicode.h" const char *casefold(const char *ptr) { - return utf8_casefold_canon(ptr, strlen(ptr), 0); + 
return utf8_casefold_compat(ptr, strlen(ptr), 0); } -static enum unicode_General_Category cat(uint32_t c) { - if(c < UNICODE_NCHARS) { - const struct unidata *const ud = &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS]; - return ud->general_category; - } else - return unicode_General_Category_Cn; -} - -/* XXX this is a bit kludgy */ - char **words(const char *s, int *nvecp) { - struct vector v; - struct dynstr d; - const char *start; - uint32_t c; - int in_word = 0; - - vector_init(&v); - while(*s) { - start = s; - PARSE_UTF8(s, c, return 0); - /* special cases first */ - switch(c) { - case '/': - case '.': - case '+': - case '&': - case ':': - case '_': - case '-': - goto separator; - } - /* do the rest on category */ - switch(cat(c)) { - case unicode_General_Category_Ll: - case unicode_General_Category_Lm: - case unicode_General_Category_Lo: - case unicode_General_Category_Lt: - case unicode_General_Category_Lu: - case unicode_General_Category_Nd: - case unicode_General_Category_Nl: - case unicode_General_Category_No: - case unicode_General_Category_Sc: - case unicode_General_Category_Sk: - case unicode_General_Category_Sm: - case unicode_General_Category_So: - /* letters, digits and symbols are considered to be part of - * words */ - if(!in_word) { - dynstr_init(&d); - in_word = 1; - } - dynstr_append_bytes(&d, start, s - start); - break; - - case unicode_General_Category_Cc: - case unicode_General_Category_Cf: - case unicode_General_Category_Co: - case unicode_General_Category_Cs: - case unicode_General_Category_Zl: - case unicode_General_Category_Zp: - case unicode_General_Category_Zs: - case unicode_General_Category_Pe: - case unicode_General_Category_Ps: - separator: - if(in_word) { - dynstr_terminate(&d); - vector_append(&v, d.vec); - in_word = 0; - } - break; - - case unicode_General_Category_Mc: - case unicode_General_Category_Me: - case unicode_General_Category_Mn: - case unicode_General_Category_Pc: - case unicode_General_Category_Pd: - case 
unicode_General_Category_Pf: - case unicode_General_Category_Pi: - case unicode_General_Category_Po: - case unicode_General_Category_Cn: - /* control and punctuation is completely ignored */ - break; + size_t nv; + char **v; - } - } - if(in_word) { - /* pick up the final word */ - dynstr_terminate(&d); - vector_append(&v, d.vec); - } - vector_terminate(&v); - if(nvecp) - *nvecp = v.nvec; - return v.vec; + v = utf8_word_split(s, strlen(s), &nv); + *nvecp = nv; + return v; } /* diff --git a/server/Makefile.am b/server/Makefile.am index e3a1a0a..bdb5071 100644 --- a/server/Makefile.am +++ b/server/Makefile.am @@ -118,7 +118,7 @@ cgi.o: ../lib/definitions.h # for Mac OS X >=10.4 SEDFILES=uk.org.greenend.rjk.disorder.plist include ${top_srcdir}/scripts/sedfiles.make -EXTRA_DIST=uk.org.greenend.rjk.disorder.plist.in +EXTRA_DIST=uk.org.greenend.rjk.disorder.plist.in README.dbversions LAUNCHD=/Library/LaunchDaemons #install-data-hook: diff --git a/server/README.dbversions b/server/README.dbversions new file mode 100644 index 0000000..4c77db6 --- /dev/null +++ b/server/README.dbversions @@ -0,0 +1,42 @@ +DisOrder Database Versions +========================== + +If no _dbversion global preference is found then database version 1 is +assumed. Database versions 2 and above always have a _dbversion +global preference. + +Old database versions can be PARTIALLY emulated for testing purposes +by setting the undocumented dbversion configuration item. Setting it on +a production system would be a terrible idea. + +Database Version 1 +------------------ + +Path names are in UTF-8, but with no normalization applied: you get +whatever the filesystem gives you. + +Search terms are split according to the old words() function. 
+ - "/", ".", "+", "&", ":", "_" and "-" are considered to be separators + - anything in General_Category Cc, Cf, Co, Cs, Zl, Zp, Zs, Pe or Ps + is considered to be a separator + - anything else in General_Category Ll, Lm, Lo, Lt, Lu, Nd, Nl, No, + Sc, Sk, Sm or So is considered to be part of a word + - everything else is ignored + +Search terms are case-folded by applying the CaseFolding.txt mapping, +without any attempt at normalization. + +Database Version 2 +------------------ + +Path names are in UTF-8, normalized to NFC. + +Search terms are split according to the default Unicode word boundary +detection algorithm. + +Search terms are case-folded using the Unicode case-folding algorithm, +normalizing to NFKD. + +Things that haven't been done yet: + - undump support for new dbversion + - automatic upgrade from dbversion 1 diff --git a/server/rescan.c b/server/rescan.c index b9be72d..cc31888 100644 --- a/server/rescan.c +++ b/server/rescan.c @@ -152,10 +152,12 @@ static void rescan_collection(const struct collection *c) { error(0, "cannot convert track path to UTF-8: %s", path); continue; } - /* We use NFC track names */ - if(!(track = utf8_compose_canon(track, strlen(track), 0))) { - error(0, "cannot convert track path to NFC: %s", path); - continue; + if(config->dbversion > 1) { + /* We use NFC track names */ + if(!(track = utf8_compose_canon(track, strlen(track), 0))) { + error(0, "cannot convert track path to NFC: %s", path); + continue; + } } D(("track %s", track)); /* only tracks with a known player are admitted */ diff --git a/server/trackdb.c b/server/trackdb.c index e1848c4..4be5f25 100644 --- a/server/trackdb.c +++ b/server/trackdb.c @@ -295,15 +295,16 @@ void trackdb_open(void) { trackdb_globaldb = open_db("global.db", 0, DB_HASH, 0, 0666); if(trackdb_globaldb) { /* This is an existing database */ - const char *oldversion; + const char *s; + long oldversion; - oldversion = trackdb_get_global("_dbversion"); - if(!oldversion) - oldversion = "1.x"; - 
if(strcmp(oldversion, DBVERSION)) { + s = trackdb_get_global("_dbversion"); + oldversion = s ? atol(s) : 1; + if(oldversion != config->dbversion) { /* This database needs upgrading. This isn't implemented yet so we just * fail. */ - fatal(0, "database needs upgrading from %s to %s", oldversion, DBVERSION); + fatal(0, "database needs upgrading from %ld to %ld", + oldversion, config->dbversion); } newdb = 0; /* Close the database again, we'll open it property below */ @@ -326,8 +327,12 @@ void trackdb_open(void) { trackdb_noticeddb = open_db("noticed.db", DB_DUPSORT, DB_BTREE, DB_CREATE, 0666); /* Stash the database version */ - if(newdb) - trackdb_set_global("_dbversion", DBVERSION, 0); + if(newdb) { + char buf[32]; + + snprintf(buf, sizeof buf, "%ld", config->dbversion); + trackdb_set_global("_dbversion", buf, 0); + } D(("opened databases")); } diff --git a/server/trackdb.h b/server/trackdb.h index 854f63c..fe43474 100644 --- a/server/trackdb.h +++ b/server/trackdb.h @@ -23,9 +23,6 @@ struct ev_source; -/* Database version string */ -#define DBVERSION "2.0" - extern const struct cache_type cache_files_type; extern unsigned long cache_files_hits, cache_files_misses; /* Cache entry type and tracking for regexp-based lookups */