chiark - git - mdw - disorder/blob - lib/words.c

   1 /*
   2  * This file is part of DisOrder
   3  * Copyright (C) 2004, 2007 Richard Kettlewell
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  18  * USA
  19  */
  20
  21 #include <config.h>
  22 #include "types.h"
  23
  24 #include <string.h>
  25 #include <stddef.h>
  26
  27 #include "mem.h"
  28 #include "vector.h"
  29 #include "table.h"
  30 #include "words.h"
  31 #include "utf8.h"
  32 #include "log.h"
  33 #include "charset.h"
  34
  35 #include "unidata.h"
  36 #include "unicode.h"
  37
  38 const char *casefold(const char *ptr) {
  39   return utf8_casefold_canon(ptr, strlen(ptr), 0);
  40 }
  41
  42 static enum unicode_gc_cat cat(uint32_t c) {
  43   if(c < UNICODE_NCHARS) {
  44     const struct unidata *const ud = &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
  45     return ud->gc;
  46   } else
  47     return unicode_gc_Cn;
  48 }
  49
  50 /* XXX this is a bit kludgy */
  51
  52 char **words(const char *s, int *nvecp) {
  53   struct vector v;
  54   struct dynstr d;
  55   const char *start;
  56   uint32_t c;
  57   int in_word = 0;
  58
  59   vector_init(&v);
  60   while(*s) {
  61     start = s;
  62     PARSE_UTF8(s, c, return 0);
  63     /* special cases first */
  64     switch(c) {
  65     case '/':
  66     case '.':
  67     case '+':
  68     case '&':
  69     case ':':
  70     case '_':
  71     case '-':
  72       goto separator;
  73     }
  74     /* do the rest on category */
  75     switch(cat(c)) {
  76     case unicode_gc_Ll:
  77     case unicode_gc_Lm:
  78     case unicode_gc_Lo:
  79     case unicode_gc_Lt:
  80     case unicode_gc_Lu:
  81     case unicode_gc_Nd:
  82     case unicode_gc_Nl:
  83     case unicode_gc_No:
  84     case unicode_gc_Sc:
  85     case unicode_gc_Sk:
  86     case unicode_gc_Sm:
  87     case unicode_gc_So:
  88       /* letters, digits and symbols are considered to be part of
  89        * words */
  90       if(!in_word) {
  91         dynstr_init(&d);
  92         in_word = 1;
  93       }
  94       dynstr_append_bytes(&d, start, s - start);
  95       break;
  96
  97     case unicode_gc_Cc:
  98     case unicode_gc_Cf:
  99     case unicode_gc_Co:
 100     case unicode_gc_Cs:
 101     case unicode_gc_Zl:
 102     case unicode_gc_Zp:
 103     case unicode_gc_Zs:
 104     case unicode_gc_Pe:
 105     case unicode_gc_Ps:
 106     separator:
 107       if(in_word) {
 108         dynstr_terminate(&d);
 109         vector_append(&v, d.vec);
 110         in_word = 0;
 111       }
 112       break;
 113
 114     case unicode_gc_Mc:
 115     case unicode_gc_Me:
 116     case unicode_gc_Mn:
 117     case unicode_gc_Pc:
 118     case unicode_gc_Pd:
 119     case unicode_gc_Pf:
 120     case unicode_gc_Pi:
 121     case unicode_gc_Po:
 122     case unicode_gc_Cn:
 123       /* control and punctuation is completely ignored */
 124       break;
 125
 126     }
 127   }
 128   if(in_word) {
 129     /* pick up the final word */
 130     dynstr_terminate(&d);
 131     vector_append(&v, d.vec);
 132   }
 133   vector_terminate(&v);
 134   if(nvecp)
 135     *nvecp = v.nvec;
 136   return v.vec;
 137 }
 138
 139 /*
 140 Local Variables:
 141 c-basic-offset:2
 142 comment-column:40
 143 End:
 144 */