chiark / gitweb /
typo
[disorder] / lib / words.c
CommitLineData
460b9539 1/*
2 * This file is part of DisOrder
3 * Copyright (C) 2004 Richard Kettlewell
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18 * USA
19 */
20
21#include <config.h>
22#include "types.h"
23
24#include <string.h>
25#include <stddef.h>
26
27#include "mem.h"
28#include "vector.h"
29#include "table.h"
30#include "words.h"
31#include "utf8.h"
32
33#include "casefold.h"
34#include "unicodegc.h"
35
36const char *casefold(const char *ptr) {
37 struct dynstr d;
38 int l, r, m;
39 uint32_t c;
40 const struct cm *t;
41 const char *start, *s = ptr;
42
43 dynstr_init(&d);
44 while(*s) {
45 start = s;
46 PARSE_UTF8(s, c, return ptr);
47 /* seek the folded equivalent */
48 t = cm[c & CM_MASK];
49 l = 0;
50 r = cmn[c & CM_MASK] - 1;
51 while(l <= r && c != t[m = (l + r) / 2].ch)
52 if(c < t[m].ch)
53 r = m - 1;
54 else
55 l = m + 1;
56 if(l <= r)
57 dynstr_append_string(&d, t[m].tr);
58 else
59 dynstr_append_bytes(&d, start, s - start);
60 }
61 dynstr_terminate(&d);
62 return d.vec;
63}
64
65static enum unicode_gc_cat cat(uint32_t c) {
66 int l, r, m;
67
68 l = 0;
69 r = sizeof gcs / sizeof *gcs;
70 while(l <= r) {
71 m = (l + r) / 2;
72 if(c < gcs[m].l)
73 r = m - 1;
74 else if(c > gcs[m].h)
75 l = m + 1;
76 else
77 return gcs[m].cat;
78 }
79 return unicode_gc_none;
80}
81
82/* XXX this is a bit kludgy */
83
84char **words(const char *s, int *nvecp) {
85 struct vector v;
86 struct dynstr d;
87 const char *start;
88 uint32_t c;
89 int in_word = 0;
90
91 vector_init(&v);
92 while(*s) {
93 start = s;
94 PARSE_UTF8(s, c, return 0);
95 /* special cases first */
96 switch(c) {
97 case '/':
98 case '.':
99 case '+':
100 case '&':
101 case ':':
102 case '_':
103 case '-':
104 goto separator;
105 }
106 /* do the rest on category */
107 switch(cat(c)) {
108 case unicode_gc_Ll:
109 case unicode_gc_Lm:
110 case unicode_gc_Lo:
111 case unicode_gc_Lt:
112 case unicode_gc_Lu:
113 case unicode_gc_Nd:
114 case unicode_gc_Nl:
115 case unicode_gc_No:
116 case unicode_gc_Sc:
117 case unicode_gc_Sk:
118 case unicode_gc_Sm:
119 case unicode_gc_So:
120 /* letters, digits and symbols are considered to be part of
121 * words */
122 if(!in_word) {
123 dynstr_init(&d);
124 in_word = 1;
125 }
126 dynstr_append_bytes(&d, start, s - start);
127 break;
128
129 case unicode_gc_Cc:
130 case unicode_gc_Cf:
131 case unicode_gc_Co:
132 case unicode_gc_Cs:
133 case unicode_gc_Zl:
134 case unicode_gc_Zp:
135 case unicode_gc_Zs:
136 case unicode_gc_Pe:
137 case unicode_gc_Ps:
138 separator:
139 if(in_word) {
140 dynstr_terminate(&d);
141 vector_append(&v, d.vec);
142 in_word = 0;
143 }
144 break;
145
146 case unicode_gc_Mc:
147 case unicode_gc_Me:
148 case unicode_gc_Mn:
149 case unicode_gc_Pc:
150 case unicode_gc_Pd:
151 case unicode_gc_Pf:
152 case unicode_gc_Pi:
153 case unicode_gc_Po:
154 case unicode_gc_none:
155 /* control and punctuation is completely ignored */
156 break;
157
158 }
159 }
160 if(in_word) {
161 /* pick up the final word */
162 dynstr_terminate(&d);
163 vector_append(&v, d.vec);
164 }
165 vector_terminate(&v);
166 if(nvecp)
167 *nvecp = v.nvec;
168 return v.vec;
169}
170
171/*
172Local Variables:
173c-basic-offset:2
174comment-column:40
175End:
176*/