unicode.c iterators can now have their notion of Word_Break tailored.

author Richard Kettlewell <rjk@greenend.org.uk>

Tue, 20 Nov 2007 20:32:06 +0000 (20:32 +0000)

committer Richard Kettlewell <rjk@greenend.org.uk>

Tue, 20 Nov 2007 20:32:06 +0000 (20:32 +0000)
author Richard Kettlewell <rjk@greenend.org.uk>
Tue, 20 Nov 2007 20:32:06 +0000 (20:32 +0000)
committer Richard Kettlewell <rjk@greenend.org.uk>
Tue, 20 Nov 2007 20:32:06 +0000 (20:32 +0000)
diff --git a/lib/Makefile.am b/lib/Makefile.am

index 0eeccd7ce674480e28b8bf04ef9b0678b9f97c61..b642c895db345eb65b1ee181089a6645e1562a53 100644 (file)
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -69,7 +69,6 @@ libdisorder_a_SOURCES=charset.c charset.h             \
         vacopy.h                                        \
         vector.c vector.h                               \
         wav.h wav.c                                     \
-       words.c words.h                                 \
         wstat.c wstat.h                                 \
         disorder.h
  
diff --git a/lib/test.c b/lib/test.c

index 0790efccc01c35865e6388298ccf6b7eccb28dc2..f4043b92f587078ceef690ecf0b1540d2898a845 100644 (file)
--- a/lib/test.c
+++ b/lib/test.c
@@ -38,7 +38,6 @@
  #include "charset.h"
  #include "mime.h"
  #include "hex.h"
-#include "words.h"
  #include "heap.h"
  #include "unicode.h"
  #include "inputline.h"
@@ -411,7 +410,7 @@ static void test_casefold(void) {
        ++tests;
      }
    }
-  check_string(casefold(""), "");
+  check_string(utf8_casefold_canon("", 0, 0), "");
  }
  
  struct {
@@ -450,7 +449,7 @@ static void test_words(void) {
    
    fprintf(stderr, "test_words\n");
    for(t = 0; t < NWTEST; ++t) {
-    char **got = utf8_word_split(wtest[t].in, strlen(wtest[t].in), &ngot);
+    char **got = utf8_word_split(wtest[t].in, strlen(wtest[t].in), &ngot, 0);
  
      for(nexpect = 0; wtest[t].expect[nexpect]; ++nexpect)
        ;
diff --git a/lib/trackname.c b/lib/trackname.c

index 53824d73df8b593fd8275a2583ca64e32fe60d2d..bb07e6468cae3e963ab24fa75ed60efd45b5718c 100644 (file)
--- a/lib/trackname.c
+++ b/lib/trackname.c
@@ -31,7 +31,7 @@
  #include "regsub.h"
  #include "log.h"
  #include "filepart.h"
-#include "words.h"
+#include "unicode.h"
  
  const struct collection *find_track_collection(const char *track) {
    int n;
@@ -114,15 +114,20 @@ int compare_tracks(const char *sa, const char *sb,
                    const char *ta, const char *tb) {
    int c;
  
-  if((c = strcmp(casefold(sa), casefold(sb)))) return c;
+  if((c = strcmp(utf8_casefold_canon(sa, strlen(sa), 0),
+                utf8_casefold_canon(sb, strlen(sb), 0))))
+    return c;
    if((c = strcmp(sa, sb))) return c;
-  if((c = strcmp(casefold(da), casefold(db)))) return c;
+  if((c = strcmp(utf8_casefold_canon(da, strlen(da), 0),
+                utf8_casefold_canon(db, strlen(db), 0))))
+    return c;
    if((c = strcmp(da, db))) return c;
    return compare_path(ta, tb);
  }
  
  int compare_path_raw(const unsigned char *ap, size_t an,
                      const unsigned char *bp, size_t bn) {
+  /* Don't change this function!  The database sort order depends on it */
    while(an > 0 && bn > 0) {
      if(*ap == *bp) {
        ap++;
diff --git a/lib/unicode.c b/lib/unicode.c

index b5b520cf07991a60f04a15d1ebb60ac966170676..40b98549238f6ee45b06f3ae8a6eb9ef43ba2588 100644 (file)
--- a/lib/unicode.c
+++ b/lib/unicode.c
@@ -372,21 +372,10 @@ struct utf32_iterator_data {
     * the value is (uint32_t)-1.
     */
    uint32_t last[2];
-};
  
-/** @brief Create a new iterator pointing at the start of a string
- * @param s Start of string
- * @param ns Length of string
- * @return New iterator
- */
-utf32_iterator utf32_iterator_new(const uint32_t *s, size_t ns) {
-  utf32_iterator it = xmalloc(sizeof *it);
-  it->s = s;
-  it->ns = ns;
-  it->n = 0;
-  it->last[0] = it->last[1] = -1;
-  return it;
-}
+  /** @brief Tailoring for Word_Break */
+  unicode_property_tailor *word_break;
+};
  
  /** @brief Initialize an internal private iterator
   * @param it Iterator
@@ -400,9 +389,54 @@ static void utf32__iterator_init(utf32_iterator it,
    it->ns = ns;
    it->n = 0;
    it->last[0] = it->last[1] = -1;
+  it->word_break = 0;
    utf32_iterator_set(it, n);
  }
  
+/** @brief Create a new iterator pointing at the start of a string
+ * @param s Start of string
+ * @param ns Length of string (number of elements of @p s)
+ * @return New iterator (from xmalloc()) at position 0, with no tailoring set
+ */
+utf32_iterator utf32_iterator_new(const uint32_t *s, size_t ns) {
+  utf32_iterator it = xmalloc(sizeof *it);
+  utf32__iterator_init(it, s, ns, 0);
+  return it;
+}
+
+/** @brief Tailor this iterator's interpretation of the Word_Break property.
+ * @param it Iterator
+ * @param pt Property tailor function or NULL
+ *
+ * After calling this the iterator will call @p pt to determine the Word_Break
+ * property of each code point.  If it returns -1 (or any negative value) the
+ * default value will be used; otherwise the returned value will be used.
+ *
+ * @p pt can be NULL to revert to the default value of the property.
+ *
+ * It is safe to call this function at any time; the iterator's internal state
+ * will be reset to suit the new tailoring.
+ */
+void utf32_iterator_tailor_word_break(utf32_iterator it,
+                                      unicode_property_tailor *pt) {
+  it->word_break = pt;
+  utf32_iterator_set(it, it->n);
+}
+
+/** @brief Word_Break property of @p c as seen by @p it
+ *
+ * The iterator's tailor function, if set, is asked first; a negative answer
+ * (or no tailor at all) falls back to the default property value.
+ */
+static inline enum unicode_Word_Break utf32__iterator_word_break(utf32_iterator it,
+                                                                 uint32_t c) {
+  const int tailored = it->word_break ? it->word_break(c) : -1;
+
+  if(tailored >= 0)
+    return tailored;
+  return utf32__word_break(c);
+}
+
  /** @brief Destroy an iterator
   * @param it Iterator
   */
@@ -444,14 +478,18 @@ int utf32_iterator_set(utf32_iterator it, size_t n) {
      return -1;
    /* Walk backwards skipping ignorable code points */
    m = n;
-  while(m > 0 && (utf32__boundary_ignorable(utf32__word_break(it->s[m-1]))))
+  while(m > 0
+        && (utf32__boundary_ignorable(utf32__iterator_word_break(it,
+                                                                 it->s[m-1]))))
      --m;
    /* Either m=0 or s[m-1] is not ignorable */
    if(m > 0) {
      --m;
      /* s[m] is our first non-ignorable code; look for a second in the same
         way **/
-    while(m > 0 && (utf32__boundary_ignorable(utf32__word_break(it->s[m-1]))))
+    while(m > 0
+          && (utf32__boundary_ignorable(utf32__iterator_word_break(it,
+                                                                   it->s[m-1]))))
        --m;
      /* Either m=0 or s[m-1] is not ignorable */
      if(m > 0)
@@ -478,7 +516,7 @@ int utf32_iterator_advance(utf32_iterator it, size_t count) {
    if(count <= it->ns - it->n) {
      while(count > 0) {
        const uint32_t c = it->s[it->n];
-      const enum unicode_Word_Break wb = utf32__word_break(c);
+      const enum unicode_Word_Break wb = utf32__iterator_word_break(it, c);
        if(it->last[1] == (uint32_t)-1
           || !utf32__boundary_ignorable(wb)) {
          it->last[0] = it->last[1];
@@ -588,29 +626,30 @@ int utf32_iterator_word_boundary(utf32_iterator it) {
    /* WB4 */
    /* (!Sep) x (Extend|Format) as in UAX #29 s6.2 */
    if(utf32__sentence_break(it->s[it->n-1]) != unicode_Sentence_Break_Sep
-     && utf32__boundary_ignorable(utf32__word_break(it->s[it->n])))
+     && utf32__boundary_ignorable(utf32__iterator_word_break(it, it->s[it->n])))
      return 0;
    /* Gather the property values we'll need for the rest of the test taking the
     * s6.2 changes into account */
    /* First we look at the code points after the proposed boundary */
    nn = it->n;                           /* <it->ns */
-  after = utf32__word_break(it->s[nn++]);
+  after = utf32__iterator_word_break(it, it->s[nn++]);
    if(!utf32__boundary_ignorable(after)) {
      /* X (Extend|Format)* -> X */
      while(nn < it->ns
-          && utf32__boundary_ignorable(utf32__word_break(it->s[nn])))
+          && utf32__boundary_ignorable(utf32__iterator_word_break(it,
+                                                                  it->s[nn])))
        ++nn;
    }
    /* It's possible now that nn=ns */
    if(nn < it->ns)
-    twoafter = utf32__word_break(it->s[nn]);
+    twoafter = utf32__iterator_word_break(it, it->s[nn]);
    else
      twoafter = unicode_Word_Break_Other;
  
    /* We've already recorded the non-ignorable code points before the proposed
     * boundary */
-  before = utf32__word_break(it->last[1]);
-  twobefore = utf32__word_break(it->last[0]);
+  before = utf32__iterator_word_break(it, it->last[1]);
+  twobefore = utf32__iterator_word_break(it, it->last[0]);
  
    /* WB5 */
    if(before == unicode_Word_Break_ALetter
@@ -626,7 +665,7 @@ int utf32_iterator_word_boundary(utf32_iterator it) {
       && before == unicode_Word_Break_MidLetter
       && after == unicode_Word_Break_ALetter)
      return 0;
-  /* WB8 */  
+  /* WB8 */
    if(before == unicode_Word_Break_Numeric
       && after == unicode_Word_Break_Numeric)
      return 0;
@@ -1275,12 +1314,14 @@ int utf32_is_word_boundary(const uint32_t *s, size_t ns, size_t n) {
   * @param s Pointer to start of string
   * @param ns Length of string
   * @param nwp Where to store word count, or NULL
+ * @param wbreak Word_Break property tailor, or NULL
   * @return Pointer to array of pointers to words
   *
   * The returned array is terminated by a NULL pointer and individual
   * strings are 0-terminated.
   */
-uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp) {
+uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp,
+                            unicode_property_tailor *wbreak) {
    struct utf32_iterator_data it[1];
    size_t b1 = 0, b2 = 0 ,i;
    int isword;
@@ -1289,6 +1330,7 @@ uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp) {
  
    vector32_init(v32);
    utf32__iterator_init(it, s, ns, 0);
+  it->word_break = wbreak;
    /* Work our way through the string stopping at each word break. */
    do {
      if(utf32_iterator_word_boundary(it)) {
@@ -1300,7 +1342,7 @@ uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp) {
         * whether they are a word or not */
        isword = 0;
        for(i = b1; i < b2; ++i) {
-        switch(utf32__word_break(it->s[i])) {
+        switch(utf32__iterator_word_break(it, it->s[i])) {
          case unicode_Word_Break_ALetter:
          case unicode_Word_Break_Numeric:
          case unicode_Word_Break_Katakana:
@@ -1468,18 +1510,20 @@ char *utf8_casefold_compat(const char *s, size_t ns, size_t *ndp) {
   * @param s Pointer to start of string
   * @param ns Length of string
   * @param nwp Where to store word count, or NULL
+ * @param wbreak Word_Break property tailor, or NULL
   * @return Pointer to array of pointers to words
   *
   * The returned array is terminated by a NULL pointer and individual
   * strings are 0-terminated.
   */
-char **utf8_word_split(const char *s, size_t ns, size_t *nwp) {
+char **utf8_word_split(const char *s, size_t ns, size_t *nwp,
+                       unicode_property_tailor *wbreak) {
    uint32_t *to32 = 0, **v32 = 0;
    size_t nto32, nv, n;
    char **v8 = 0, **ret = 0;
-                                                                
+
    if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error;
-  if(!(v32 = utf32_word_split(to32, nto32, &nv))) goto error;
+  if(!(v32 = utf32_word_split(to32, nto32, &nv, wbreak))) goto error;
    v8 = xcalloc(sizeof (char *), nv + 1);
    for(n = 0; n < nv; ++n)
      if(!(v8[n] = utf32_to_utf8(v32[n], utf32_len(v32[n]), 0)))
@@ -1487,7 +1531,7 @@ char **utf8_word_split(const char *s, size_t ns, size_t *nwp) {
    ret = v8;
    *nwp = nv;
    v8 = 0;                               /* don't free */
-error:                                                          
+error:
    if(v8) {
      for(n = 0; n < nv; ++n)
        xfree(v8[n]);
diff --git a/lib/unicode.h b/lib/unicode.h

index 7f3220773537d0447c2fbc810a1e517b61e1dcbf..e9e58ca672fe23e385c53465b88be0d908e94a2e 100644 (file)
--- a/lib/unicode.h
+++ b/lib/unicode.h
@@ -36,6 +36,14 @@
   */
  typedef struct utf32_iterator_data *utf32_iterator;
  
+/** @brief Property tailor function
+ * @param c Code point
+ * @return Tailored property or -1 to use standard value
+ *
+ * See also utf32_iterator_tailor_word_break().
+ */
+typedef int unicode_property_tailor(uint32_t c);
+
  char *utf32_to_utf8(const uint32_t *s, size_t ns, size_t *nd);
  uint32_t *utf8_to_utf32(const char *s, size_t ns, size_t *nd);
  int utf8_valid(const char *s, size_t ns);
@@ -73,9 +81,13 @@ int utf32_iterator_advance(utf32_iterator it, size_t n);
  uint32_t utf32_iterator_code(utf32_iterator it);
  int utf32_iterator_grapheme_boundary(utf32_iterator it);
  int utf32_iterator_word_boundary(utf32_iterator it);
+void utf32_iterator_tailor_word_break(utf32_iterator it,
+                                      unicode_property_tailor *pt);
  
-uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp);
-char **utf8_word_split(const char *s, size_t ns, size_t *nwp);
+uint32_t **utf32_word_split(const uint32_t *s, size_t ns, size_t *nwp,
+                            unicode_property_tailor *wbreak);
+char **utf8_word_split(const char *s, size_t ns, size_t *nwp,
+                            unicode_property_tailor *wbreak);
  
  /** @brief Convert 0-terminated UTF-32 to UTF-8
   * @param s 0-terminated UTF-32 string
diff --git a/lib/words.c b/lib/words.c

deleted file mode 100644 (file)

index 89174cd..0000000
--- a/lib/words.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * This file is part of DisOrder
- * Copyright (C) 2004, 2007 Richard Kettlewell
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- */
-
-#include <config.h>
-#include "types.h"
-
-#include <string.h>
-#include <stddef.h>
-
-#include "mem.h"
-#include "vector.h"
-#include "table.h"
-#include "words.h"
-#include "utf8.h"
-#include "log.h"
-#include "charset.h"
-
-#include "unidata.h"
-#include "unicode.h"
-
-const char *casefold(const char *ptr) {
-  return utf8_casefold_compat(ptr, strlen(ptr), 0);
-}
-
-char **words(const char *s, int *nvecp) {
-  size_t nv;
-  char **v;
-
-  v = utf8_word_split(s, strlen(s), &nv);
-  *nvecp = nv;
-  return v;
-}
-
-/*
-Local Variables:
-c-basic-offset:2
-comment-column:40
-End:
-*/
diff --git a/lib/words.h b/lib/words.h

deleted file mode 100644 (file)

index 9fb7448..0000000
--- a/lib/words.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * This file is part of DisOrder
- * Copyright (C) 2004 Richard Kettlewell
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- */
-
-#ifndef WORDS_H
-#define WORDS_H
-
-const char *casefold(const char *s);
-/* return a case-folded version of UTF-8 string @s@, or the original
- * string if malformed. */
-
-char **words(const char *s, int *nvecp);
-/* return the words found in UTF-8 string @s@, with punctuation
- * stripped out.  (Doesn't casefold.) */
-
-#endif /* WORDS_H */
-
-/*
-Local Variables:
-c-basic-offset:2
-comment-column:40
-End:
-*/
diff --git a/server/dcgi.c b/server/dcgi.c

index 8eff93a936c52d6a26fd95f843606c7ee285f640..07e1825e705f409e9cb1f211c856a010c9725219 100644 (file)
--- a/server/dcgi.c
+++ b/server/dcgi.c
@@ -46,7 +46,6 @@
  #include "queue.h"
  #include "plugin.h"
  #include "split.h"
-#include "words.h"
  #include "wstat.h"
  #include "kvp.h"
  #include "syscalls.h"
diff --git a/server/trackdb.c b/server/trackdb.c

index 4be5f25802187cc422dc4ae8fa38f03af895bb4f..b4e0de927e37ec21c04574017d2b39b3335a5ad0 100644 (file)
--- a/server/trackdb.c
+++ b/server/trackdb.c
@@ -44,7 +44,6 @@
  #include "configuration.h"
  #include "syscalls.h"
  #include "wstat.h"
-#include "words.h"
  #include "printf.h"
  #include "filepart.h"
  #include "trackname.h"
@@ -53,6 +52,8 @@
  #include "cache.h"
  #include "eventlog.h"
  #include "hash.h"
+#include "unicode.h"
+#include "unidata.h"
  
  #define RESCAN "disorder-rescan"
  #define DEADLOCK "disorder-deadlock"
@@ -573,24 +574,50 @@ static int is_display_pref(const char *name) {
    return !strncmp(name, prefix, (sizeof prefix) - 1);
  }
  
+/** @brief Word_Break property tailor that treats underscores as spaces
+ * @param c Code point
+ * @return unicode_Word_Break_Other for U+005F, -1 (no tailoring) otherwise
+ */
+static int tailor_underscore_Word_Break_Other(uint32_t c) {
+  if(c == 0x005F)                       /* LOW LINE (SPACING UNDERSCORE) */
+    return unicode_Word_Break_Other;
+  return -1;
+}
+
+/** @brief Casefold @p s, split it into words using Word_Break tailor @p pt
+ * (may be NULL) and append them to @p v; bad input appends nothing */
+static void word_split(struct vector *v,
+                       const char *s,
+                       unicode_property_tailor *pt) {
+  size_t nw, nt32, i;
+  uint32_t *t32, **w32;
+  char *w8;
+
+  /* To UTF-32, casefold, split: each step is checked for a NULL return */
+  if(!(t32 = utf8_to_utf32(s, strlen(s), &nt32))
+     || !(t32 = utf32_casefold_compat(t32, nt32, &nt32))
+     || !(w32 = utf32_word_split(t32, nt32, &nw, pt)))
+    return;
+  /* Convert words back to UTF-8, appending only successful conversions */
+  for(i = 0; i < nw; ++i)
+    if((w8 = utf32_to_utf8(w32[i], utf32_len(w32[i]), 0)))
+      vector_append(v, w8);
+}
+
  /* compute the words of a track name */
  static char **track_to_words(const char *track,
                               const struct kvp *p) {
    struct vector v;
-  char **w;
-  int nw;
    const char *rootless = track_rootless(track);
  
    if(!rootless)
      rootless = track;                   /* bodge */
    vector_init(&v);
-  if((w = words(casefold(strip_extension(rootless)), &nw)))
-    vector_append_many(&v, w, nw);
-
+  rootless = strip_extension(rootless);
+  word_split(&v, strip_extension(rootless), tailor_underscore_Word_Break_Other);
    for(; p; p = p->next)
      if(is_display_pref(p->name))
-      if((w = words(casefold(p->value), &nw)))
-        vector_append_many(&v, w, nw);
+      word_split(&v, p->value, 0);
    vector_terminate(&v);
    return dedupe(v.vec, v.nvec);
  }
@@ -1739,7 +1766,7 @@ char **trackdb_search(char **wordlist, int nwordlist, int *ntracks) {
    /* casefold all the words */
    w = xmalloc(nwordlist * sizeof (char *));
    for(n = 0; n < nwordlist; ++n) {
-    w[n] = casefold(wordlist[n]);
+    w[n] = utf8_casefold_compat(wordlist[n], strlen(wordlist[n]), 0);
      if(checktag(w[n])) ++ntags;         /* count up tags */
    }
    /* find the longest non-stopword */
author	Richard Kettlewell <rjk@greenend.org.uk>
	Tue, 20 Nov 2007 20:32:06 +0000 (20:32 +0000)
committer	Richard Kettlewell <rjk@greenend.org.uk>
	Tue, 20 Nov 2007 20:32:06 +0000 (20:32 +0000)
lib/Makefile.am		patch \| blob \| blame \| history
lib/test.c		patch \| blob \| blame \| history
lib/trackname.c		patch \| blob \| blame \| history
lib/unicode.c		patch \| blob \| blame \| history
lib/unicode.h		patch \| blob \| blame \| history
lib/words.c	[deleted file]	patch \| blob \| blame \| history
lib/words.h	[deleted file]	patch \| blob \| blame \| history
server/dcgi.c		patch \| blob \| blame \| history
server/trackdb.c		patch \| blob \| blame \| history