}
}
+/** @brief Remove all combining characters in-place
+ *
+ * Compacts the string by copying forward every code point for which
+ * utf32_combining_class() returns zero; code points with a nonzero
+ * combining class are dropped.  The relative order of the retained code
+ * points is preserved.  Nothing is written at or beyond the new end, so
+ * no terminator is added and the old tail is left as it was.
+ *
+ * @param s Pointer to start of string
+ * @param ns Length of string, in code points (uint32_t elements)
+ * @return New, possibly reduced, length
+ */
+static size_t remove_combining_chars(uint32_t *s, size_t ns) {
+ uint32_t *start = s, *t = s, *end = s + ns; /* s reads, t writes; t never passes s */
+
+ while(s < end) {
+ const uint32_t c = *s++;
+ if(!utf32_combining_class(c)) /* class 0: not a combining character */
+ *t++ = c;
+ }
+ return t - start; /* number of code points retained */
+}
+
/** @brief Normalize and split a string using a given tailoring */
static void word_split(struct vector *v,
const char *s,
/* Erase case distinctions */
if(!(t32 = utf32_casefold_compat(t32, nt32, &nt32)))
return;
+ /* Drop combining characters */
+ nt32 = remove_combining_chars(t32, nt32);
/* Split into words, treating _ as a space */
w32 = utf32_word_split(t32, nt32, &nw, pt);
/* Convert words back to UTF-8 and append to result */
const char *dbname;
*ntracks = 0; /* for early returns */
- /* casefold all the words */
+ /* normalize all the words */
w = xmalloc(nwordlist * sizeof (char *));
for(n = 0; n < nwordlist; ++n) {
+ uint32_t *w32;
+ size_t nw32;
+
w[n] = utf8_casefold_compat(wordlist[n], strlen(wordlist[n]), 0);
if(checktag(w[n])) ++ntags; /* count up tags */
+ /* Strip out combining characters (AFTER checking whether it's a tag) */
+ if(!(w32 = utf8_to_utf32(w[n], strlen(w[n]), &nw32)))
+ return 0;
+ nw32 = remove_combining_chars(w32, nw32);
+ if(!(w[n] = utf32_to_utf8(w32, nw32, 0)))
+ return 0;
}
/* find the longest non-stopword */
for(n = 0; n < nwordlist; ++n)
time.sleep(2) # give rescan a chance
global client
client = disorder.client()
+ first = ["Joe Bloggs/First Album/01:F\xC3\x8Crst track.ogg",
+ "Joe Bloggs/First Album/02:Second track.ogg",
+ "Joe Bloggs/First Album/03:ThI\xCC\x81rd track.ogg",
+ "Joe Bloggs/First Album/04:Fourth track.ogg",
+ "Joe Bloggs/First Album/05:Fifth track.ogg",
+ "Joe Bloggs/Second Album/01:First track.ogg",
+ "Joe Bloggs/Third Album/01:First_track.ogg"]
+ second = ["Joe Bloggs/First Album/02:Second track.ogg",
+ "Joe Bloggs/Second Album/01:First track.ogg",
+ "Joe Bloggs/Second Album/02:Second track.ogg",
+ "Joe Bloggs/Second Album/03:Third track.ogg",
+ "Joe Bloggs/Second Album/04:Fourth track.ogg",
+ "Joe Bloggs/Second Album/05:Fifth track.ogg",
+ "Joe Bloggs/Third Album/02:Second_track.ogg"]
+ third = ["Joe Bloggs/First Album/03:ThI\xCC\x81rd track.ogg",
+ "Joe Bloggs/Second Album/03:Third track.ogg",
+ "Joe Bloggs/Third Album/01:First_track.ogg",
+ "Joe Bloggs/Third Album/02:Second_track.ogg",
+ "Joe Bloggs/Third Album/03:Third_track.ogg",
+ "Joe Bloggs/Third Album/04:Fourth_track.ogg",
+ "Joe Bloggs/Third Album/05:Fifth_track.ogg"]
+ first_and_second = filter(lambda s: s in second, first)
# ASCII matches
- check_search_results(["first"],
- ["Joe Bloggs/First Album/01:F\xC3\x8Crst track.ogg",
- "Joe Bloggs/First Album/02:Second track.ogg",
- "Joe Bloggs/First Album/03:ThI\xCC\x81rd track.ogg",
- "Joe Bloggs/First Album/04:Fourth track.ogg",
- "Joe Bloggs/First Album/05:Fifth track.ogg",
- "Joe Bloggs/Second Album/01:First track.ogg",
- "Joe Bloggs/Third Album/01:First_track.ogg"])
- check_search_results(["Second"],
- ["Joe Bloggs/First Album/02:Second track.ogg",
- "Joe Bloggs/Second Album/01:First track.ogg",
- "Joe Bloggs/Second Album/02:Second track.ogg",
- "Joe Bloggs/Second Album/03:Third track.ogg",
- "Joe Bloggs/Second Album/04:Fourth track.ogg",
- "Joe Bloggs/Second Album/05:Fifth track.ogg",
- "Joe Bloggs/Third Album/02:Second_track.ogg"])
+ check_search_results(["first"], first)
+ check_search_results(["Second"], second)
+ check_search_results(["THIRD"], third)
# ASCII Conjunctions
- check_search_results(["FIRST", "SECOND"],
- ["Joe Bloggs/First Album/02:Second track.ogg",
- "Joe Bloggs/Second Album/01:First track.ogg"])
+ check_search_results(["FIRST", "SECOND"], first_and_second)
# Non-ASCII Characters
# 00CC is LATIN CAPITAL LETTER I WITH GRAVE
# 00EC is LATIN SMALL LETTER I WITH GRAVE
- check_search_results([u"F\u00CCRST"],
- ["Joe Bloggs/First Album/01:F\xC3\x8Crst track.ogg"])
- check_search_results([u"f\u00ECrst"],
- ["Joe Bloggs/First Album/01:F\xC3\x8Crst track.ogg"])
+ check_search_results([u"F\u00CCRST"], first)
+ check_search_results([u"f\u00ECrst"], first)
# 00CD is LATIN CAPITAL LETTER I WITH ACUTE
# 00ED is LATIN SMALL LETTER I WITH ACUTE
- check_search_results([u"TH\u00CDRD"],
- ["Joe Bloggs/First Album/03:ThI\xCC\x81rd track.ogg"])
- check_search_results([u"th\u00EDrd"],
- ["Joe Bloggs/First Album/03:ThI\xCC\x81rd track.ogg"])
+ check_search_results([u"TH\u00CDRD"], third)
+ check_search_results([u"th\u00EDrd"], third)
# ...and again in denormalized form
# 0300 is COMBINING GRAVE ACCENT
# 0301 is COMBINING ACUTE ACCENT
- check_search_results([u"FI\u0300RST"],
- ["Joe Bloggs/First Album/01:F\xC3\x8Crst track.ogg"])
- check_search_results([u"fi\u0300rst"],
- ["Joe Bloggs/First Album/01:F\xC3\x8Crst track.ogg"])
- check_search_results([u"THI\u0301RD"],
- ["Joe Bloggs/First Album/03:ThI\xCC\x81rd track.ogg"])
- check_search_results([u"thI\u0301rd"],
- ["Joe Bloggs/First Album/03:ThI\xCC\x81rd track.ogg"])
+ check_search_results([u"FI\u0300RST"], first)
+ check_search_results([u"fi\u0300rst"], first)
+ check_search_results([u"THI\u0301RD"], third)
+ check_search_results([u"thI\u0301rd"], third)
if failures > 0:
sys.exit(1)