vacopy.h \
vector.c vector.h \
wav.h wav.c \
- words.c words.h \
wstat.c wstat.h \
disorder.h
#include "charset.h"
#include "mime.h"
#include "hex.h"
-#include "words.h"
#include "heap.h"
#include "unicode.h"
#include "inputline.h"
++tests;
}
}
- check_string(casefold(""), "");
+ check_string(utf8_casefold_canon("", 0, 0), "");
}
struct {
fprintf(stderr, "test_words\n");
for(t = 0; t < NWTEST; ++t) {
- char **got = utf8_word_split(wtest[t].in, strlen(wtest[t].in), &ngot);
+ char **got = utf8_word_split(wtest[t].in, strlen(wtest[t].in), &ngot, 0);
for(nexpect = 0; wtest[t].expect[nexpect]; ++nexpect)
;
#include "regsub.h"
#include "log.h"
#include "filepart.h"
-#include "words.h"
+#include "unicode.h"
const struct collection *find_track_collection(const char *track) {
int n;
const char *ta, const char *tb) {
int c;
- if((c = strcmp(casefold(sa), casefold(sb)))) return c;
+ if((c = strcmp(utf8_casefold_canon(sa, strlen(sa), 0),
+ utf8_casefold_canon(sb, strlen(sb), 0))))
+ return c;
if((c = strcmp(sa, sb))) return c;
- if((c = strcmp(casefold(da), casefold(db)))) return c;
+ if((c = strcmp(utf8_casefold_canon(da, strlen(da), 0),
+ utf8_casefold_canon(db, strlen(db), 0))))
+ return c;
if((c = strcmp(da, db))) return c;
return compare_path(ta, tb);
}
int compare_path_raw(const unsigned char *ap, size_t an,
const unsigned char *bp, size_t bn) {
+ /* Don't change this function! The database sort order depends on it */
while(an > 0 && bn > 0) {
if(*ap == *bp) {
ap++;
* the value is (uint32_t)-1.
*/
uint32_t last[2];
-};
-/** @brief Create a new iterator pointing at the start of a string
- * @param s Start of string
- * @param ns Length of string
- * @return New iterator
- */
-utf32_iterator utf32_iterator_new(const uint32_t *s, size_t ns) {
- utf32_iterator it = xmalloc(sizeof *it);
- it->s = s;
- it->ns = ns;
- it->n = 0;
- it->last[0] = it->last[1] = -1;
- return it;
-}
+ /** @brief Tailoring for Word_Break */
+ unicode_property_tailor *word_break;
+};
/** @brief Initialize an internal private iterator
* @param it Iterator
it->ns = ns;
it->n = 0;
it->last[0] = it->last[1] = -1;
+ it->word_break = 0;
utf32_iterator_set(it, n);
}
+/** @brief Allocate an iterator positioned at the start of a string
+ * @param s Start of string
+ * @param ns Length of string
+ * @return New iterator
+ *
+ * The iterator is obtained from xmalloc() and set up by
+ * utf32__iterator_init(), which is passed 0 as the starting position.
+ */
+utf32_iterator utf32_iterator_new(const uint32_t *s, size_t ns) {
+  utf32_iterator fresh = xmalloc(sizeof *fresh);
+
+  utf32__iterator_init(fresh, s, ns, 0);
+  return fresh;
+}
+
+/** @brief Tailor this iterator's interpretation of the Word_Break property.
+ * @param it Iterator
+ * @param pt Property tailor function or NULL
+ *
+ * After calling this the iterator will call @p pt to determine the Word_Break
+ * property of each code point.  If it returns a negative value
+ * (conventionally -1) the default value will be used; otherwise the returned
+ * value will be used.
+ *
+ * @p pt can be NULL to revert to the default value of the property.
+ *
+ * It is safe to call this function at any time; the iterator is re-set to its
+ * current position so that its internal state suits the new tailoring.
+ */
+void utf32_iterator_tailor_word_break(utf32_iterator it,
+ unicode_property_tailor *pt) {
+ it->word_break = pt;
+ /* Re-seat the iterator at its current offset under the new tailoring */
+ utf32_iterator_set(it, it->n);
+}
+
+/** @brief Look up the (possibly tailored) Word_Break property of a code point
+ * @param it Iterator whose tailoring, if any, should be applied
+ * @param c Code point
+ * @return Word_Break property value
+ *
+ * If the iterator has a tailor function and it returns a non-negative value,
+ * that value is used; otherwise the result of utf32__word_break() is used.
+ */
+static inline enum unicode_Word_Break utf32__iterator_word_break(utf32_iterator it,
+                                                                 uint32_t c) {
+  if(it->word_break) {
+    const int tailored = it->word_break(c);
+
+    if(tailored >= 0)
+      return tailored;
+  }
+  /* No tailoring, or the tailor declined: fall back to the standard value */
+  return utf32__word_break(c);
+}
+
/** @brief Destroy an iterator
* @param it Iterator
*/
return -1;
/* Walk backwards skipping ignorable code points */
m = n;
- while(m > 0 && (utf32__boundary_ignorable(utf32__word_break(it->s[m-1]))))
+ while(m > 0
+ && (utf32__boundary_ignorable(utf32__iterator_word_break(it,
+ it->s[m-1]))))
--m;
/* Either m=0 or s[m-1] is not ignorable */
if(m > 0) {
--m;
/* s[m] is our first non-ignorable code; look for a second in the same
way **/
- while(m > 0 && (utf32__boundary_ignorable(utf32__word_break(it->s[m-1]))))
+ while(m > 0
+ && (utf32__boundary_ignorable(utf32__iterator_word_break(it,
+ it->s[m-1]))))
--m;
/* Either m=0 or s[m-1] is not ignorable */
if(m > 0)
if(count <= it->ns - it->n) {
while(count > 0) {
const uint32_t c = it->s[it->n];
- const enum unicode_Word_Break wb = utf32__word_break(c);
+ const enum unicode_Word_Break wb = utf32__iterator_word_break(it, c);
if(it->last[1] == (uint32_t)-1
|| !utf32__boundary_ignorable(wb)) {
it->last[0] = it->last[1];
/* WB4 */
/* (!Sep) x (Extend|Format) as in UAX #29 s6.2 */
if(utf32__sentence_break(it->s[it->n-1]) != unicode_Sentence_Break_Sep
- && utf32__boundary_ignorable(utf32__word_break(it->s[it->n])))
+ && utf32__boundary_ignorable(utf32__iterator_word_break(it, it->s[it->n])))
return 0;
/* Gather the property values we'll need for the rest of the test taking the
* s6.2 changes into account */
/* First we look at the code points after the proposed boundary */
nn = it->n; /* <it->ns */
- after = utf32__word_break(it->s[nn++]);
+ after = utf32__iterator_word_break(it, it->s[nn++]);
if(!utf32__boundary_ignorable(after)) {
/* X (Extend|Format)* -> X */
while(nn < it->ns
- && utf32__boundary_ignorable(utf32__word_break(it->s[nn])))
+ && utf32__boundary_ignorable(utf32__iterator_word_break(it,
+ it->s[nn])))
++nn;
}
/* It's possible now that nn=ns */
if(nn < it->ns)
- twoafter = utf32__word_break(it->s[nn]);
+ twoafter = utf32__iterator_word_break(it, it->s[nn]);
else
twoafter = unicode_Word_Break_Other;
/* We've already recorded the non-ignorable code points before the proposed
* boundary */
- before = utf32__word_break(it->last[1]);
- twobefore = utf32__word_break(it->last[0]);
+ before = utf32__iterator_word_break(it, it->last[1]);
+ twobefore = utf32__iterator_word_break(it, it->last[0]);
/* WB5 */
if(before == unicode_Word_Break_ALetter
&& before == unicode_Word_Break_MidLetter
&& after == unicode_Word_Break_ALetter)
return 0;
- /* WB8 */
+ /* WB8 */
if(before == unicode_Word_Break_Numeric
&& after == unicode_Word_Break_Numeric)
return 0;
* @param s Pointer to start of string
* @param ns Length of string
* @param nwp Where to store word count, or NULL
+ * @param wbreak Word_Break property tailor, or NULL
* @return Pointer to array of pointers to words
*
* The returned array is terminated by a NULL pointer and individual
* strings are 0-terminated.
*/
-uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp) {
+uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp,
+ unicode_property_tailor *wbreak) {
struct utf32_iterator_data it[1];
size_t b1 = 0, b2 = 0 ,i;
int isword;
vector32_init(v32);
utf32__iterator_init(it, s, ns, 0);
+ it->word_break = wbreak;
/* Work our way through the string stopping at each word break. */
do {
if(utf32_iterator_word_boundary(it)) {
* whether they are a word or not */
isword = 0;
for(i = b1; i < b2; ++i) {
- switch(utf32__word_break(it->s[i])) {
+ switch(utf32__iterator_word_break(it, it->s[i])) {
case unicode_Word_Break_ALetter:
case unicode_Word_Break_Numeric:
case unicode_Word_Break_Katakana:
* @param s Pointer to start of string
* @param ns Length of string
* @param nwp Where to store word count, or NULL
+ * @param wbreak Word_Break property tailor, or NULL
* @return Pointer to array of pointers to words
*
* The returned array is terminated by a NULL pointer and individual
* strings are 0-terminated.
*/
-char **utf8_word_split(const char *s, size_t ns, size_t *nwp) {
+char **utf8_word_split(const char *s, size_t ns, size_t *nwp,
+ unicode_property_tailor *wbreak) {
uint32_t *to32 = 0, **v32 = 0;
size_t nto32, nv, n;
char **v8 = 0, **ret = 0;
-
+
if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error;
- if(!(v32 = utf32_word_split(to32, nto32, &nv))) goto error;
+ if(!(v32 = utf32_word_split(to32, nto32, &nv, wbreak))) goto error;
v8 = xcalloc(sizeof (char *), nv + 1);
for(n = 0; n < nv; ++n)
if(!(v8[n] = utf32_to_utf8(v32[n], utf32_len(v32[n]), 0)))
ret = v8;
*nwp = nv;
v8 = 0; /* don't free */
-error:
+error:
if(v8) {
for(n = 0; n < nv; ++n)
xfree(v8[n]);
*/
typedef struct utf32_iterator_data *utf32_iterator;
+/** @brief Property tailor function
+ * @param c Code point
+ * @return Tailored property or -1 to use standard value
+ *
+ * See also utf32_iterator_tailor_word_break().
+ */
+typedef int unicode_property_tailor(uint32_t c);
+
char *utf32_to_utf8(const uint32_t *s, size_t ns, size_t *nd);
uint32_t *utf8_to_utf32(const char *s, size_t ns, size_t *nd);
int utf8_valid(const char *s, size_t ns);
uint32_t utf32_iterator_code(utf32_iterator it);
int utf32_iterator_grapheme_boundary(utf32_iterator it);
int utf32_iterator_word_boundary(utf32_iterator it);
+void utf32_iterator_tailor_word_break(utf32_iterator it,
+ unicode_property_tailor *pt);
-uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp);
-char **utf8_word_split(const char *s, size_t ns, size_t *nwp);
+uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp,
+ unicode_property_tailor *wbreak);
+char **utf8_word_split(const char *s, size_t ns, size_t *nwp,
+ unicode_property_tailor *wbreak);
/** @brief Convert 0-terminated UTF-32 to UTF-8
* @param s 0-terminated UTF-32 string
+++ /dev/null
-/*
- * This file is part of DisOrder
- * Copyright (C) 2004, 2007 Richard Kettlewell
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- */
-
-#include <config.h>
-#include "types.h"
-
-#include <string.h>
-#include <stddef.h>
-
-#include "mem.h"
-#include "vector.h"
-#include "table.h"
-#include "words.h"
-#include "utf8.h"
-#include "log.h"
-#include "charset.h"
-
-#include "unidata.h"
-#include "unicode.h"
-
-const char *casefold(const char *ptr) {
- return utf8_casefold_compat(ptr, strlen(ptr), 0);
-}
-
-char **words(const char *s, int *nvecp) {
- size_t nv;
- char **v;
-
- v = utf8_word_split(s, strlen(s), &nv);
- *nvecp = nv;
- return v;
-}
-
-/*
-Local Variables:
-c-basic-offset:2
-comment-column:40
-End:
-*/
+++ /dev/null
-/*
- * This file is part of DisOrder
- * Copyright (C) 2004 Richard Kettlewell
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- */
-
-#ifndef WORDS_H
-#define WORDS_H
-
-const char *casefold(const char *s);
-/* return a case-folded version of UTF-8 string @s@, or the original
- * string if malformed. */
-
-char **words(const char *s, int *nvecp);
-/* return the words found in UTF-8 string @s@, with punctuation
- * stripped out. (Doesn't casefold.) */
-
-#endif /* WORDS_H */
-
-/*
-Local Variables:
-c-basic-offset:2
-comment-column:40
-End:
-*/
#include "queue.h"
#include "plugin.h"
#include "split.h"
-#include "words.h"
#include "wstat.h"
#include "kvp.h"
#include "syscalls.h"
#include "configuration.h"
#include "syscalls.h"
#include "wstat.h"
-#include "words.h"
#include "printf.h"
#include "filepart.h"
#include "trackname.h"
#include "cache.h"
#include "eventlog.h"
#include "hash.h"
+#include "unicode.h"
+#include "unidata.h"
#define RESCAN "disorder-rescan"
#define DEADLOCK "disorder-deadlock"
return !strncmp(name, prefix, (sizeof prefix) - 1);
}
+/** @brief Word_Break property tailor that reclassifies underscores
+ * @param c Code point
+ * @return unicode_Word_Break_Other for U+005F, otherwise -1
+ *
+ * Every code point other than the underscore yields -1.
+ */
+static int tailor_underscore_Word_Break_Other(uint32_t c) {
+  if(c == 0x005F)                       /* LOW LINE (SPACING UNDERSCORE) */
+    return unicode_Word_Break_Other;
+  return -1;
+}
+
+/** @brief Normalize and split a string using a given tailoring
+ * @param v Vector to append the resulting words to
+ * @param s 0-terminated UTF-8 string
+ * @param pt Word_Break property tailor, or NULL for the default properties
+ *
+ * The string is converted to UTF-32, case-folded and split into words; each
+ * word is converted back to UTF-8 and appended to @p v.  If the conversion,
+ * case-folding or splitting step fails nothing is appended; individual words
+ * that cannot be converted back to UTF-8 are skipped.
+ */
+static void word_split(struct vector *v,
+                       const char *s,
+                       unicode_property_tailor *pt) {
+  size_t nw = 0, nt32, i;
+  uint32_t *t32, **w32;
+  char *w8;
+
+  /* Convert to UTF-32 */
+  if(!(t32 = utf8_to_utf32(s, strlen(s), &nt32)))
+    return;
+  /* Erase case distinctions */
+  if(!(t32 = utf32_casefold_compat(t32, nt32, &nt32)))
+    return;
+  /* Split into words using the caller's tailoring.  If splitting fails the
+   * word count is not meaningful, so stop rather than walk a null array. */
+  if(!(w32 = utf32_word_split(t32, nt32, &nw, pt)))
+    return;
+  /* Convert words back to UTF-8 and append to result */
+  for(i = 0; i < nw; ++i)
+    if((w8 = utf32_to_utf8(w32[i], utf32_len(w32[i]), 0)))
+      vector_append(v, w8);
+}
+
/* compute the words of a track name */
static char **track_to_words(const char *track,
const struct kvp *p) {
struct vector v;
- char **w;
- int nw;
const char *rootless = track_rootless(track);
if(!rootless)
rootless = track; /* bodge */
vector_init(&v);
- if((w = words(casefold(strip_extension(rootless)), &nw)))
- vector_append_many(&v, w, nw);
-
+ /* Strip the extension exactly once, as the replaced code did; a second
+  * strip_extension() call could also remove a '.'-delimited part of the
+  * track name itself.  Underscores in the track name are tailored to
+  * unicode_Word_Break_Other. */
+ rootless = strip_extension(rootless);
+ word_split(&v, rootless, tailor_underscore_Word_Break_Other);
for(; p; p = p->next)
if(is_display_pref(p->name))
- if((w = words(casefold(p->value), &nw)))
- vector_append_many(&v, w, nw);
+ word_split(&v, p->value, 0);
vector_terminate(&v);
return dedupe(v.vec, v.nvec);
}
/* casefold all the words */
w = xmalloc(nwordlist * sizeof (char *));
for(n = 0; n < nwordlist; ++n) {
- w[n] = casefold(wordlist[n]);
+ w[n] = utf8_casefold_compat(wordlist[n], strlen(wordlist[n]), 0);
if(checktag(w[n])) ++ntags; /* count up tags */
}
/* find the longest non-stopword */