Remove combining characters from search strings. The effect is that accented and unaccented spellings of a word (e.g. "fìrst" and "first") now match the same tracks.

author Richard Kettlewell <rjk@greenend.org.uk>

Fri, 23 Nov 2007 11:05:56 +0000 (11:05 +0000)

committer Richard Kettlewell <rjk@greenend.org.uk>

Fri, 23 Nov 2007 11:05:56 +0000 (11:05 +0000)
author Richard Kettlewell <rjk@greenend.org.uk>
Fri, 23 Nov 2007 11:05:56 +0000 (11:05 +0000)
committer Richard Kettlewell <rjk@greenend.org.uk>
Fri, 23 Nov 2007 11:05:56 +0000 (11:05 +0000)
diff --git a/lib/unicode.c b/lib/unicode.c

index 40b98549238f6ee45b06f3ae8a6eb9ef43ba2588..5803926cdce6f124b111f7b37fdf88c32ff0fb14 100644 (file)
--- a/lib/unicode.c
+++ b/lib/unicode.c
@@ -109,6 +109,16 @@ static inline int utf32__combining_class(uint32_t c) {
    return utf32__unidata(c)->ccc;
  }
  
+/** @brief Return the combining class of @p c (public wrapper)
+ * @param c Code point
+ * @return Combining class of @p c, as reported by utf32__combining_class()
+ *
+ * @p c can be any 32-bit value; a sensible value will be returned regardless.
+ */
+int utf32_combining_class(uint32_t c) {
+  return utf32__combining_class(c);
+}
+
  /** @brief Return the General_Category value for @p c
   * @param c Code point
   * @return General_Category property value
diff --git a/lib/unicode.h b/lib/unicode.h

index e9e58ca672fe23e385c53465b88be0d908e94a2e..982921b33bdef432293c80cad21a2b3136f0540c 100644 (file)
--- a/lib/unicode.h
+++ b/lib/unicode.h
@@ -48,6 +48,8 @@ char *utf32_to_utf8(const uint32_t *s, size_t ns, size_t *nd);
  uint32_t *utf8_to_utf32(const char *s, size_t ns, size_t *nd);
  int utf8_valid(const char *s, size_t ns);
  
+int utf32_combining_class(uint32_t c);
+
  size_t utf32_len(const uint32_t *s);
  int utf32_cmp(const uint32_t *a, const uint32_t *b);
  
diff --git a/server/trackdb.c b/server/trackdb.c

index cb6b45a8191d33b4542c20f3445f71ba6a8f905c..bc526caad6acff31d0e952c028df74d0f75f0035 100644 (file)
--- a/server/trackdb.c
+++ b/server/trackdb.c
@@ -637,6 +637,22 @@ static int tailor_underscore_Word_Break_Other(uint32_t c) {
    }
  }
  
+/** @brief Remove all combining characters in-place
+ * @param s Pointer to start of string (overwritten with the filtered result)
+ * @param ns Length of string, counted in uint32_t elements
+ * @return New, possibly reduced, length (no terminator is written)
+ */
+static size_t remove_combining_chars(uint32_t *s, size_t ns) {
+  uint32_t *start = s, *t = s, *end = s + ns; /* s reads ahead, t writes behind */
+
+  while(s < end) {
+    const uint32_t c = *s++;
+    if(!utf32_combining_class(c))       /* keep only combining class 0 */
+      *t++ = c;
+  }
+  return t - start;                     /* count of elements kept */
+}
+
  /** @brief Normalize and split a string using a given tailoring */
  static void word_split(struct vector *v,
                         const char *s,
@@ -650,6 +666,8 @@ static void word_split(struct vector *v,
    /* Erase case distinctions */
    if(!(t32 = utf32_casefold_compat(t32, nt32, &nt32)))
      return;
+  /* Drop combining characters */
+  nt32 = remove_combining_chars(t32, nt32);
    /* Split into words, treating _ as a space */
    w32 = utf32_word_split(t32, nt32, &nw, pt);
    /* Convert words back to UTF-8 and append to result */
@@ -1816,11 +1834,20 @@ char **trackdb_search(char **wordlist, int nwordlist, int *ntracks) {
    const char *dbname;
  
    *ntracks = 0;                                /* for early returns */
-  /* casefold all the words */
+  /* normalize all the words */
    w = xmalloc(nwordlist * sizeof (char *));
    for(n = 0; n < nwordlist; ++n) {
+    uint32_t *w32;
+    size_t nw32;
+    
      w[n] = utf8_casefold_compat(wordlist[n], strlen(wordlist[n]), 0);
      if(checktag(w[n])) ++ntags;         /* count up tags */
+    /* Strip out combining characters (AFTER checking whether it's a tag) */
+    if(!(w32 = utf8_to_utf32(w[n], strlen(w[n]), &nw32)))
+      return 0;
+    nw32 = remove_combining_chars(w32, nw32);
+    if(!(w[n] = utf32_to_utf8(w32, nw32, 0)))
+      return 0;
    }
    /* find the longest non-stopword */
    for(n = 0; n < nwordlist; ++n)
diff --git a/tests/search.py b/tests/search.py

index 6e9afb779bcf4b972aa0f21011e3d4b21d9e74f8..01ab893efa5360f742a2bb738ec6be95fdfd78c0 100755 (executable)
--- a/tests/search.py
+++ b/tests/search.py
@@ -44,51 +44,50 @@ def test():
      time.sleep(2)                       # give rescan a chance
      global client
      client = disorder.client()
+    first = ["Joe Bloggs/First Album/01:F\xC3\x8Crst track.ogg",
+             "Joe Bloggs/First Album/02:Second track.ogg",
+             "Joe Bloggs/First Album/03:ThI\xCC\x81rd track.ogg",
+             "Joe Bloggs/First Album/04:Fourth track.ogg",
+             "Joe Bloggs/First Album/05:Fifth track.ogg",
+             "Joe Bloggs/Second Album/01:First track.ogg",
+             "Joe Bloggs/Third Album/01:First_track.ogg"]
+    second = ["Joe Bloggs/First Album/02:Second track.ogg",
+              "Joe Bloggs/Second Album/01:First track.ogg",
+              "Joe Bloggs/Second Album/02:Second track.ogg",
+              "Joe Bloggs/Second Album/03:Third track.ogg",
+              "Joe Bloggs/Second Album/04:Fourth track.ogg",
+              "Joe Bloggs/Second Album/05:Fifth track.ogg",
+              "Joe Bloggs/Third Album/02:Second_track.ogg"]
+    third = ["Joe Bloggs/First Album/03:ThI\xCC\x81rd track.ogg",
+             "Joe Bloggs/Second Album/03:Third track.ogg",
+             "Joe Bloggs/Third Album/01:First_track.ogg",
+             "Joe Bloggs/Third Album/02:Second_track.ogg",
+             "Joe Bloggs/Third Album/03:Third_track.ogg",
+             "Joe Bloggs/Third Album/04:Fourth_track.ogg",
+             "Joe Bloggs/Third Album/05:Fifth_track.ogg"]
+    first_and_second = filter(lambda s: s in second, first)
      # ASCII matches
-    check_search_results(["first"],
-                         ["Joe Bloggs/First Album/01:F\xC3\x8Crst track.ogg",
-                          "Joe Bloggs/First Album/02:Second track.ogg",
-                          "Joe Bloggs/First Album/03:ThI\xCC\x81rd track.ogg",
-                          "Joe Bloggs/First Album/04:Fourth track.ogg",
-                          "Joe Bloggs/First Album/05:Fifth track.ogg",
-                          "Joe Bloggs/Second Album/01:First track.ogg",
-                          "Joe Bloggs/Third Album/01:First_track.ogg"])
-    check_search_results(["Second"],
-                         ["Joe Bloggs/First Album/02:Second track.ogg",
-                          "Joe Bloggs/Second Album/01:First track.ogg",
-                          "Joe Bloggs/Second Album/02:Second track.ogg",
-                          "Joe Bloggs/Second Album/03:Third track.ogg",
-                          "Joe Bloggs/Second Album/04:Fourth track.ogg",
-                          "Joe Bloggs/Second Album/05:Fifth track.ogg",
-                          "Joe Bloggs/Third Album/02:Second_track.ogg"])
+    check_search_results(["first"], first)
+    check_search_results(["Second"], second)
+    check_search_results(["THIRD"], third)
      # ASCII Conjunctions
-    check_search_results(["FIRST", "SECOND"],
-                         ["Joe Bloggs/First Album/02:Second track.ogg",
-                          "Joe Bloggs/Second Album/01:First track.ogg"])
+    check_search_results(["FIRST", "SECOND"], first_and_second)
      # Non-ASCII Characters
      # 00CC is LATIN CAPITAL LETTER I WITH GRAVE
      # 00EC is LATIN SMALL LETTER I WITH GRAVE
-    check_search_results([u"F\u00CCRST"],
-                         ["Joe Bloggs/First Album/01:F\xC3\x8Crst track.ogg"])
-    check_search_results([u"f\u00ECrst"],
-                         ["Joe Bloggs/First Album/01:F\xC3\x8Crst track.ogg"])
+    check_search_results([u"F\u00CCRST"], first)
+    check_search_results([u"f\u00ECrst"], first)
      # 00CD is LATIN CAPITAL LETTER I WITH ACUTE
      # 00ED is LATIN SMALL LETTER I WITH ACUTE
-    check_search_results([u"TH\u00CDRD"],
-                          ["Joe Bloggs/First Album/03:ThI\xCC\x81rd track.ogg"])
-    check_search_results([u"th\u00EDrd"],
-                          ["Joe Bloggs/First Album/03:ThI\xCC\x81rd track.ogg"])
+    check_search_results([u"TH\u00CDRD"], third)
+    check_search_results([u"th\u00EDrd"], third)
      # ...and again in denormalized form
      # 0300 is COMBINING GRAVE ACCENT
      # 0301 is COMBINING ACUTE ACCENT
-    check_search_results([u"FI\u0300RST"],
-                         ["Joe Bloggs/First Album/01:F\xC3\x8Crst track.ogg"])
-    check_search_results([u"fi\u0300rst"],
-                         ["Joe Bloggs/First Album/01:F\xC3\x8Crst track.ogg"])
-    check_search_results([u"THI\u0301RD"],
-                          ["Joe Bloggs/First Album/03:ThI\xCC\x81rd track.ogg"])
-    check_search_results([u"thI\u0301rd"],
-                          ["Joe Bloggs/First Album/03:ThI\xCC\x81rd track.ogg"])
+    check_search_results([u"FI\u0300RST"], first)
+    check_search_results([u"fi\u0300rst"], first)
+    check_search_results([u"THI\u0301RD"], third)
+    check_search_results([u"thI\u0301rd"], third)
      
      if failures > 0:
          sys.exit(1)
author	Richard Kettlewell <rjk@greenend.org.uk>
	Fri, 23 Nov 2007 11:05:56 +0000 (11:05 +0000)
committer	Richard Kettlewell <rjk@greenend.org.uk>
	Fri, 23 Nov 2007 11:05:56 +0000 (11:05 +0000)
lib/unicode.c		patch \| blob \| blame \| history
lib/unicode.h		patch \| blob \| blame \| history
server/trackdb.c		patch \| blob \| blame \| history
tests/search.py		patch \| blob \| blame \| history