chiark - git - mdw - disorder/blob - lib/words.c

   1 /*
   2  * This file is part of DisOrder
   3  * Copyright (C) 2004, 2007 Richard Kettlewell
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  18  * USA
  19  */
  20
  21 #include <config.h>
  22 #include "types.h"
  23
  24 #include <string.h>
  25 #include <stddef.h>
  26
  27 #include "mem.h"
  28 #include "vector.h"
  29 #include "table.h"
  30 #include "words.h"
  31 #include "utf8.h"
  32 #include "log.h"
  33 #include "charset.h"
  34
  35 #include "unidata.h"
  36 #include "unicode.h"
  37
  38 const char *casefold(const char *ptr) {
  39   return utf8_casefold_canon(ptr, strlen(ptr), 0);
  40 }
  41
  42 static enum unicode_gc_cat cat(uint32_t c) {
  43   if(c < UNICODE_NCHARS) {
  44     /* If this a known character, convert it to lower case */
  45     const struct unidata *const ud = &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
  46     return ud->gc;
  47   } else
  48     return unicode_gc_Cn;
  49 }
  50
  51 /* XXX this is a bit kludgy */
  52
  53 char **words(const char *s, int *nvecp) {
  54   struct vector v;
  55   struct dynstr d;
  56   const char *start;
  57   uint32_t c;
  58   int in_word = 0;
  59
  60   vector_init(&v);
  61   while(*s) {
  62     start = s;
  63     PARSE_UTF8(s, c, return 0);
  64     /* special cases first */
  65     switch(c) {
  66     case '/':
  67     case '.':
  68     case '+':
  69     case '&':
  70     case ':':
  71     case '_':
  72     case '-':
  73       goto separator;
  74     }
  75     /* do the rest on category */
  76     switch(cat(c)) {
  77     case unicode_gc_Ll:
  78     case unicode_gc_Lm:
  79     case unicode_gc_Lo:
  80     case unicode_gc_Lt:
  81     case unicode_gc_Lu:
  82     case unicode_gc_Nd:
  83     case unicode_gc_Nl:
  84     case unicode_gc_No:
  85     case unicode_gc_Sc:
  86     case unicode_gc_Sk:
  87     case unicode_gc_Sm:
  88     case unicode_gc_So:
  89       /* letters, digits and symbols are considered to be part of
  90        * words */
  91       if(!in_word) {
  92         dynstr_init(&d);
  93         in_word = 1;
  94       }
  95       dynstr_append_bytes(&d, start, s - start);
  96       break;
  97
  98     case unicode_gc_Cc:
  99     case unicode_gc_Cf:
 100     case unicode_gc_Co:
 101     case unicode_gc_Cs:
 102     case unicode_gc_Zl:
 103     case unicode_gc_Zp:
 104     case unicode_gc_Zs:
 105     case unicode_gc_Pe:
 106     case unicode_gc_Ps:
 107     separator:
 108       if(in_word) {
 109         dynstr_terminate(&d);
 110         vector_append(&v, d.vec);
 111         in_word = 0;
 112       }
 113       break;
 114
 115     case unicode_gc_Mc:
 116     case unicode_gc_Me:
 117     case unicode_gc_Mn:
 118     case unicode_gc_Pc:
 119     case unicode_gc_Pd:
 120     case unicode_gc_Pf:
 121     case unicode_gc_Pi:
 122     case unicode_gc_Po:
 123     case unicode_gc_Cn:
 124       /* control and punctuation is completely ignored */
 125       break;
 126
 127     }
 128   }
 129   if(in_word) {
 130     /* pick up the final word */
 131     dynstr_terminate(&d);
 132     vector_append(&v, d.vec);
 133   }
 134   vector_terminate(&v);
 135   if(nvecp)
 136     *nvecp = v.nvec;
 137   return v.vec;
 138 }
 139
 140 /*
 141 Local Variables:
 142 c-basic-offset:2
 143 comment-column:40
 144 End:
 145 */