From: Richard Kettlewell Date: Fri, 23 Nov 2007 11:05:56 +0000 (+0000) Subject: Remove combining characters from search strings. The effect is that X-Git-Tag: debian-1_5_99dev9~1^2~4 X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~mdw/git/disorder/commitdiff_plain/3c82b5043cf2524037308299164c36215ec3b2f7?ds=sidebyside Remove combining characters from search strings. The effect is that accents are ignored in searching. This is what English speakers usually expect; if DisOrder becomes popular among a wider audience then only trackdb_search() needs to be updated to filter out false positives, the database can keep the accent-free keys. --- diff --git a/lib/unicode.c b/lib/unicode.c index 40b9854..5803926 100644 --- a/lib/unicode.c +++ b/lib/unicode.c @@ -109,6 +109,16 @@ static inline int utf32__combining_class(uint32_t c) { return utf32__unidata(c)->ccc; } +/** @brief Return the combining class of @p c + * @param c Code point + * @return Combining class of @p c + * + * @p c can be any 32-bit value, a sensible value will be returned regardless. 
+ */ +int utf32_combining_class(uint32_t c) { + return utf32__combining_class(c); +} + /** @brief Return the General_Category value for @p c * @param c Code point * @return General_Category property value diff --git a/lib/unicode.h b/lib/unicode.h index e9e58ca..982921b 100644 --- a/lib/unicode.h +++ b/lib/unicode.h @@ -48,6 +48,8 @@ char *utf32_to_utf8(const uint32_t *s, size_t ns, size_t *nd); uint32_t *utf8_to_utf32(const char *s, size_t ns, size_t *nd); int utf8_valid(const char *s, size_t ns); +int utf32_combining_class(uint32_t c); + size_t utf32_len(const uint32_t *s); int utf32_cmp(const uint32_t *a, const uint32_t *b); diff --git a/server/trackdb.c b/server/trackdb.c index cb6b45a..bc526ca 100644 --- a/server/trackdb.c +++ b/server/trackdb.c @@ -637,6 +637,22 @@ static int tailor_underscore_Word_Break_Other(uint32_t c) { } } +/** @brief Remove all combining characters in-place + * @param s Pointer to start of string + * @param ns Length of string + * @return New, possibly reduced, length + */ +static size_t remove_combining_chars(uint32_t *s, size_t ns) { + uint32_t *start = s, *t = s, *end = s + ns; + + while(s < end) { + const uint32_t c = *s++; + if(!utf32_combining_class(c)) + *t++ = c; + } + return t - start; +} + /** @brief Normalize and split a string using a given tailoring */ static void word_split(struct vector *v, const char *s, @@ -650,6 +666,8 @@ static void word_split(struct vector *v, /* Erase case distinctions */ if(!(t32 = utf32_casefold_compat(t32, nt32, &nt32))) return; + /* Drop combining characters */ + nt32 = remove_combining_chars(t32, nt32); /* Split into words, treating _ as a space */ w32 = utf32_word_split(t32, nt32, &nw, pt); /* Convert words back to UTF-8 and append to result */ @@ -1816,11 +1834,20 @@ char **trackdb_search(char **wordlist, int nwordlist, int *ntracks) { const char *dbname; *ntracks = 0; /* for early returns */ - /* casefold all the words */ + /* normalize all the words */ w = xmalloc(nwordlist * sizeof (char 
*)); for(n = 0; n < nwordlist; ++n) { + uint32_t *w32; + size_t nw32; + w[n] = utf8_casefold_compat(wordlist[n], strlen(wordlist[n]), 0); if(checktag(w[n])) ++ntags; /* count up tags */ + /* Strip out combining characters (AFTER checking whether it's a tag) */ + if(!(w32 = utf8_to_utf32(w[n], strlen(w[n]), &nw32))) + return 0; + nw32 = remove_combining_chars(w32, nw32); + if(!(w[n] = utf32_to_utf8(w32, nw32, 0))) + return 0; } /* find the longest non-stopword */ for(n = 0; n < nwordlist; ++n) diff --git a/tests/search.py b/tests/search.py index 6e9afb7..01ab893 100755 --- a/tests/search.py +++ b/tests/search.py @@ -44,51 +44,50 @@ def test(): time.sleep(2) # give rescan a chance global client client = disorder.client() + first = ["Joe Bloggs/First Album/01:F\xC3\x8Crst track.ogg", + "Joe Bloggs/First Album/02:Second track.ogg", + "Joe Bloggs/First Album/03:ThI\xCC\x81rd track.ogg", + "Joe Bloggs/First Album/04:Fourth track.ogg", + "Joe Bloggs/First Album/05:Fifth track.ogg", + "Joe Bloggs/Second Album/01:First track.ogg", + "Joe Bloggs/Third Album/01:First_track.ogg"] + second = ["Joe Bloggs/First Album/02:Second track.ogg", + "Joe Bloggs/Second Album/01:First track.ogg", + "Joe Bloggs/Second Album/02:Second track.ogg", + "Joe Bloggs/Second Album/03:Third track.ogg", + "Joe Bloggs/Second Album/04:Fourth track.ogg", + "Joe Bloggs/Second Album/05:Fifth track.ogg", + "Joe Bloggs/Third Album/02:Second_track.ogg"] + third = ["Joe Bloggs/First Album/03:ThI\xCC\x81rd track.ogg", + "Joe Bloggs/Second Album/03:Third track.ogg", + "Joe Bloggs/Third Album/01:First_track.ogg", + "Joe Bloggs/Third Album/02:Second_track.ogg", + "Joe Bloggs/Third Album/03:Third_track.ogg", + "Joe Bloggs/Third Album/04:Fourth_track.ogg", + "Joe Bloggs/Third Album/05:Fifth_track.ogg"] + first_and_second = filter(lambda s: s in second, first) # ASCII matches - check_search_results(["first"], - ["Joe Bloggs/First Album/01:F\xC3\x8Crst track.ogg", - "Joe Bloggs/First Album/02:Second track.ogg", - "Joe 
Bloggs/First Album/03:ThI\xCC\x81rd track.ogg", - "Joe Bloggs/First Album/04:Fourth track.ogg", - "Joe Bloggs/First Album/05:Fifth track.ogg", - "Joe Bloggs/Second Album/01:First track.ogg", - "Joe Bloggs/Third Album/01:First_track.ogg"]) - check_search_results(["Second"], - ["Joe Bloggs/First Album/02:Second track.ogg", - "Joe Bloggs/Second Album/01:First track.ogg", - "Joe Bloggs/Second Album/02:Second track.ogg", - "Joe Bloggs/Second Album/03:Third track.ogg", - "Joe Bloggs/Second Album/04:Fourth track.ogg", - "Joe Bloggs/Second Album/05:Fifth track.ogg", - "Joe Bloggs/Third Album/02:Second_track.ogg"]) + check_search_results(["first"], first) + check_search_results(["Second"], second) + check_search_results(["THIRD"], third) # ASCII Conjunctions - check_search_results(["FIRST", "SECOND"], - ["Joe Bloggs/First Album/02:Second track.ogg", - "Joe Bloggs/Second Album/01:First track.ogg"]) + check_search_results(["FIRST", "SECOND"], first_and_second) # Non-ASCII Characters # 00CC is LATIN CAPITAL LETTER I WITH GRAVE # 00EC is LATIN SMALL LETTER I WITH GRAVE - check_search_results([u"F\u00CCRST"], - ["Joe Bloggs/First Album/01:F\xC3\x8Crst track.ogg"]) - check_search_results([u"f\u00ECrst"], - ["Joe Bloggs/First Album/01:F\xC3\x8Crst track.ogg"]) + check_search_results([u"F\u00CCRST"], first) + check_search_results([u"f\u00ECrst"], first) # 00CD is LATIN CAPITAL LETTER I WITH ACUTE # 00ED is LATIN SMALL LETTER I WITH ACUTE - check_search_results([u"TH\u00CDRD"], - ["Joe Bloggs/First Album/03:ThI\xCC\x81rd track.ogg"]) - check_search_results([u"th\u00EDrd"], - ["Joe Bloggs/First Album/03:ThI\xCC\x81rd track.ogg"]) + check_search_results([u"TH\u00CDRD"], third) + check_search_results([u"th\u00EDrd"], third) # ...and again in denormalized form # 0300 is COMBINING GRAVE ACCENT # 0301 is COMBINING ACUTE ACCENT - check_search_results([u"FI\u0300RST"], - ["Joe Bloggs/First Album/01:F\xC3\x8Crst track.ogg"]) - check_search_results([u"fi\u0300rst"], - ["Joe Bloggs/First 
Album/01:F\xC3\x8Crst track.ogg"]) - check_search_results([u"THI\u0301RD"], - ["Joe Bloggs/First Album/03:ThI\xCC\x81rd track.ogg"]) - check_search_results([u"thI\u0301rd"], - ["Joe Bloggs/First Album/03:ThI\xCC\x81rd track.ogg"]) + check_search_results([u"FI\u0300RST"], first) + check_search_results([u"fi\u0300rst"], first) + check_search_results([u"THI\u0301RD"], third) + check_search_results([u"thI\u0301rd"], third) if failures > 0: sys.exit(1)