From: Richard Kettlewell Date: Sat, 15 Dec 2007 17:06:06 +0000 (+0000) Subject: more tag normalization work X-Git-Tag: 2.0~5 X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~mdw/git/disorder/commitdiff_plain/d1d4a182d95ffeb0fc607c90842256d9b4ab7c43 more tag normalization work --- diff --git a/server/trackdb.c b/server/trackdb.c index 5ffa19e..d1878c0 100644 --- a/server/trackdb.c +++ b/server/trackdb.c @@ -1883,6 +1883,7 @@ static const char *checktag(const char *s) { char **trackdb_search(char **wordlist, int nwordlist, int *ntracks) { const char **w, *best = 0, *tag; char **twords, **tags; + char *istag; int i, j, n, err, what; DBC *cursor = 0; DBT k, d; @@ -1896,6 +1897,7 @@ char **trackdb_search(char **wordlist, int nwordlist, int *ntracks) { *ntracks = 0; /* for early returns */ /* normalize all the words */ w = xmalloc(nwordlist * sizeof (char *)); + istag = xmalloc_noptr(nwordlist); for(n = 0; n < nwordlist; ++n) { uint32_t *w32; size_t nw32; @@ -1904,7 +1906,8 @@ char **trackdb_search(char **wordlist, int nwordlist, int *ntracks) { if(checktag(w[n])) { ++ntags; /* count up tags */ /* Normalize the tag */ - w[n] = normalize_tag(w[n], strlen(w[n])); + w[n] = normalize_tag(w[n] + 4, strlen(w[n] + 4)); + istag[n] = 1; } else { /* Normalize the search term by removing combining characters */ if(!(w32 = utf8_to_utf32(w[n], strlen(w[n]), &nw32))) @@ -1912,11 +1915,12 @@ char **trackdb_search(char **wordlist, int nwordlist, int *ntracks) { nw32 = remove_combining_chars(w32, nw32); if(!(w[n] = utf32_to_utf8(w32, nw32, 0))) return 0; + istag[n] = 0; } } /* find the longest non-stopword */ for(n = 0; n < nwordlist; ++n) - if(!stopword(w[n]) && !checktag(w[n])) + if(!istag[n] && !stopword(w[n])) if(!best || strlen(w[n]) > strlen(best)) best = w[n]; /* TODO: we should at least in principal be able to identify the word or tag @@ -1925,7 +1929,7 @@ char **trackdb_search(char **wordlist, int nwordlist, int *ntracks) { if(ntags && !best) { /* Only tags are listed. We limit to the first and narrow down with the * rest. */ - best = checktag(w[0]); + best = istag[0] ? w[0] : 0; db = trackdb_tagsdb; dbname = "tags"; } else if(best) { @@ -1974,7 +1978,8 @@ char **trackdb_search(char **wordlist, int nwordlist, int *ntracks) { twords = track_to_words(v.vec[n], p); tags = parsetags(kvp_get(p, "tags")); for(i = 0; i < nwordlist; ++i) { - if((tag = checktag(w[i]))) { + if(istag[i]) { + tag = w[i]; /* Track must have this tag */ for(j = 0; tags[j]; ++j) if(!strcmp(tag, tags[j])) break; /* tag found */ diff --git a/tests/dump.py b/tests/dump.py index f8f2074..ff28c63 100755 --- a/tests/dump.py +++ b/tests/dump.py @@ -44,8 +44,11 @@ def test(): "checking tag list(2)" print "checking track appears in tag search" tracks = c.search(["tag:wibble"]) - assert len(tracks) == 1, "checking there is exactly one search result" - assert tracks[0] == track, "checking for right search result" + assert len(tracks) == 1, "checking there is exactly one search result(1)" + assert tracks[0] == track, "checking for right search result(1)" + tracks = c.search(["tag: another tAg "]) + assert len(tracks) == 1, "checking there is exactly one search result(2)" + assert tracks[0] == track, "checking for right search result(2)" print "dumping database" print dtest.command(["disorder-dump", "--config", disorder._configfile, "--dump", dump]) @@ -76,7 +79,7 @@ def test(): print "checking tag search still works" tracks = c.search(["tag:wibble"]) assert len(tracks) == 1, "checking there is exactly one search result" - assert tracks[0] == track, "checking for right search result" + assert tracks[0] == track, "checking for right search result(3)" assert dtest.lists_have_same_contents(c.tags(), [u"another tag", u"wibble"]),\ "checking tag list(3)"