From: Richard Kettlewell Date: Sat, 15 Dec 2007 16:40:19 +0000 (+0000) Subject: normalize tags and exercise this X-Git-Tag: 2.0~6 X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~mdw/git/disorder/commitdiff_plain/a37771233e9710ccf5127e6aff8b9c287fd0816a?hp=462a0ee104929be94101de13d6e5ea98cd858235 normalize tags and exercise this --- diff --git a/server/trackdb.c b/server/trackdb.c index a46223a..5ffa19e 100644 --- a/server/trackdb.c +++ b/server/trackdb.c @@ -71,6 +71,7 @@ static char **trackdb_new_tid(int *ntracksp, int maxtracks, DB_TXN *tid); static int trackdb_expire_noticed_tid(time_t earliest, DB_TXN *tid); +static char *normalize_tag(const char *s, size_t ns); const struct cache_type cache_files_type = { 86400 }; unsigned long cache_files_hits, cache_files_misses; @@ -678,6 +679,40 @@ static void word_split(struct vector *v, vector_append(v, utf32_to_utf8(w32[i], utf32_len(w32[i]), 0)); } +/** @brief Normalize a tag + * @param s Tag + * @param ns Length of tag + * @return Normalized string or NULL on error + * + * The return value will be: + * - case-folded + * - have no leading or trailing space + * - have no combining characters + * - all spacing between words will be a single U+0020 SPACE + */ +static char *normalize_tag(const char *s, size_t ns) { + uint32_t *s32, **w32; + size_t ns32, nw32, i; + struct dynstr d[1]; + + if(!(s32 = utf8_to_utf32(s, ns, &ns32))) + return 0; + if(!(s32 = utf32_casefold_compat(s32, ns32, &ns32))) /* ->NFKD */ + return 0; + ns32 = remove_combining_chars(s32, ns32); + /* Split into words, no Word_Break tailoring */ + w32 = utf32_word_split(s32, ns32, &nw32, 0); + /* Compose back into a string */ + dynstr_init(d); + for(i = 0; i < nw32; ++i) { + if(i) + dynstr_append(d, ' '); + dynstr_append_string(d, utf32_to_utf8(w32[i], utf32_len(w32[i]), 0)); + } + dynstr_terminate(d); + return d->vec; +} + /* compute the words of a track name */ static char **track_to_words(const char *track, const struct kvp *p) { @@ -743,7 +778,8 @@ static char **parsetags(const char *s) { /* strip trailing spaces */ while(s > t && s[-1] == ' ') --s; - vector_append(&v, xstrndup(t, s - t)); + /* add tag to list */ + vector_append(&v, normalize_tag(t, (size_t)(s - t))); /* skip intermediate and trailing separators */ while(*s && (!tagchar(*s) || *s == ' ')) ++s; @@ -1865,13 +1901,18 @@ char **trackdb_search(char **wordlist, int nwordlist, int *ntracks) { size_t nw32; w[n] = utf8_casefold_compat(wordlist[n], strlen(wordlist[n]), 0); - if(checktag(w[n])) ++ntags; /* count up tags */ - /* Strip out combining characters (AFTER checking whether it's a tag) */ - if(!(w32 = utf8_to_utf32(w[n], strlen(w[n]), &nw32))) - return 0; - nw32 = remove_combining_chars(w32, nw32); - if(!(w[n] = utf32_to_utf8(w32, nw32, 0))) - return 0; + if(checktag(w[n])) { + ++ntags; /* count up tags */ + /* Normalize the tag */ + w[n] = normalize_tag(w[n], strlen(w[n])); + } else { + /* Normalize the search term by removing combining characters */ + if(!(w32 = utf8_to_utf32(w[n], strlen(w[n]), &nw32))) + return 0; + nw32 = remove_combining_chars(w32, nw32); + if(!(w[n] = utf32_to_utf8(w32, nw32, 0))) + return 0; + } } /* find the longest non-stopword */ for(n = 0; n < nwordlist; ++n) diff --git a/tests/dump.py b/tests/dump.py index 2db1505..f8f2074 100755 --- a/tests/dump.py +++ b/tests/dump.py @@ -34,11 +34,11 @@ def test(): assert c.getglobal("foo") == "before", "checking global foo=before" print "adding a tag" # Exercise the tags-changed code - c.set(track, "tags", "first tag, another tag") + c.set(track, "tags", " first tag, Another Tag") assert dtest.lists_have_same_contents(c.tags(), [u"another tag", u"first tag"]),\ "checking tag list(1)" - c.set(track, "tags", "wibble, another tag") + c.set(track, "tags", "wibble, another tag ") assert dtest.lists_have_same_contents(c.tags(), [u"another tag", u"wibble"]),\ "checking tag list(2)"