chiark / gitweb /
more efficient utf32_iterator_set()
[disorder] / lib / words.c
CommitLineData
460b9539 1/*
2 * This file is part of DisOrder
61507e3c 3 * Copyright (C) 2004, 2007 Richard Kettlewell
460b9539 4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18 * USA
19 */
20
21#include <config.h>
22#include "types.h"
23
24#include <string.h>
25#include <stddef.h>
26
27#include "mem.h"
28#include "vector.h"
29#include "table.h"
30#include "words.h"
31#include "utf8.h"
61507e3c
RK
32#include "log.h"
33#include "charset.h"
460b9539 34
61507e3c 35#include "unidata.h"
e5a5a138 36#include "unicode.h"
460b9539 37
/* Case-fold the NUL-terminated string PTR.
 *
 * This is a convenience wrapper: it measures PTR with strlen() and hands the
 * string and its byte length to utf8_casefold_canon(), passing 0 as the third
 * argument.  Whatever utf8_casefold_canon() returns is returned unchanged.
 */
const char *casefold(const char *ptr) {
  const size_t nbytes = strlen(ptr);

  return utf8_casefold_canon(ptr, nbytes, 0);
}
41
14523635 42static enum unicode_General_Category cat(uint32_t c) {
61507e3c 43 if(c < UNICODE_NCHARS) {
e5a5a138 44 const struct unidata *const ud = &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
14523635 45 return ud->general_category;
61507e3c 46 } else
14523635 47 return unicode_General_Category_Cn;
460b9539 48}
49
50/* XXX this is a bit kludgy */
51
52char **words(const char *s, int *nvecp) {
53 struct vector v;
54 struct dynstr d;
55 const char *start;
56 uint32_t c;
57 int in_word = 0;
58
59 vector_init(&v);
60 while(*s) {
61 start = s;
62 PARSE_UTF8(s, c, return 0);
63 /* special cases first */
64 switch(c) {
65 case '/':
66 case '.':
67 case '+':
68 case '&':
69 case ':':
70 case '_':
71 case '-':
72 goto separator;
73 }
74 /* do the rest on category */
75 switch(cat(c)) {
14523635
RK
76 case unicode_General_Category_Ll:
77 case unicode_General_Category_Lm:
78 case unicode_General_Category_Lo:
79 case unicode_General_Category_Lt:
80 case unicode_General_Category_Lu:
81 case unicode_General_Category_Nd:
82 case unicode_General_Category_Nl:
83 case unicode_General_Category_No:
84 case unicode_General_Category_Sc:
85 case unicode_General_Category_Sk:
86 case unicode_General_Category_Sm:
87 case unicode_General_Category_So:
460b9539 88 /* letters, digits and symbols are considered to be part of
89 * words */
90 if(!in_word) {
91 dynstr_init(&d);
92 in_word = 1;
93 }
94 dynstr_append_bytes(&d, start, s - start);
95 break;
96
14523635
RK
97 case unicode_General_Category_Cc:
98 case unicode_General_Category_Cf:
99 case unicode_General_Category_Co:
100 case unicode_General_Category_Cs:
101 case unicode_General_Category_Zl:
102 case unicode_General_Category_Zp:
103 case unicode_General_Category_Zs:
104 case unicode_General_Category_Pe:
105 case unicode_General_Category_Ps:
460b9539 106 separator:
107 if(in_word) {
108 dynstr_terminate(&d);
109 vector_append(&v, d.vec);
110 in_word = 0;
111 }
112 break;
113
14523635
RK
114 case unicode_General_Category_Mc:
115 case unicode_General_Category_Me:
116 case unicode_General_Category_Mn:
117 case unicode_General_Category_Pc:
118 case unicode_General_Category_Pd:
119 case unicode_General_Category_Pf:
120 case unicode_General_Category_Pi:
121 case unicode_General_Category_Po:
122 case unicode_General_Category_Cn:
460b9539 123 /* control and punctuation is completely ignored */
124 break;
125
126 }
127 }
128 if(in_word) {
129 /* pick up the final word */
130 dynstr_terminate(&d);
131 vector_append(&v, d.vec);
132 }
133 vector_terminate(&v);
134 if(nvecp)
135 *nvecp = v.nvec;
136 return v.vec;
137}
138
139/*
140Local Variables:
141c-basic-offset:2
142comment-column:40
143End:
144*/