chiark / gitweb /
build fix
[disorder] / lib / words.c
CommitLineData
460b9539 1/*
2 * This file is part of DisOrder
61507e3c 3 * Copyright (C) 2004, 2007 Richard Kettlewell
460b9539 4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18 * USA
19 */
20
21#include <config.h>
22#include "types.h"
23
24#include <string.h>
25#include <stddef.h>
26
27#include "mem.h"
28#include "vector.h"
29#include "table.h"
30#include "words.h"
31#include "utf8.h"
61507e3c
RK
32#include "log.h"
33#include "charset.h"
460b9539 34
61507e3c 35#include "unidata.h"
460b9539 36
37const char *casefold(const char *ptr) {
38 struct dynstr d;
460b9539 39 uint32_t c;
61507e3c 40 const char *s = ptr;
460b9539 41
42 dynstr_init(&d);
43 while(*s) {
61507e3c 44 /* Convert UTF-8 to UCS-32 */
460b9539 45 PARSE_UTF8(s, c, return ptr);
61507e3c
RK
46 /* Normalize */
47 if(c < UNICODE_NCHARS) {
48 /* If this a known character, convert it to lower case */
49 const struct unidata *const ud = &unidata[c / 256][c % 256];
50 c += ud->lower_offset;
51 }
52 /* Convert UCS-4 back to UTF-8 */
53 one_ucs42utf8(c, &d);
460b9539 54 }
55 dynstr_terminate(&d);
56 return d.vec;
57}
58
59static enum unicode_gc_cat cat(uint32_t c) {
61507e3c
RK
60 if(c < UNICODE_NCHARS) {
61 /* If this a known character, convert it to lower case */
62 const struct unidata *const ud = &unidata[c / 256][c % 256];
63 return ud->gc;
64 } else
65 return unicode_gc_Cn;
460b9539 66}
67
68/* XXX this is a bit kludgy */
69
70char **words(const char *s, int *nvecp) {
71 struct vector v;
72 struct dynstr d;
73 const char *start;
74 uint32_t c;
75 int in_word = 0;
76
77 vector_init(&v);
78 while(*s) {
79 start = s;
80 PARSE_UTF8(s, c, return 0);
81 /* special cases first */
82 switch(c) {
83 case '/':
84 case '.':
85 case '+':
86 case '&':
87 case ':':
88 case '_':
89 case '-':
90 goto separator;
91 }
92 /* do the rest on category */
93 switch(cat(c)) {
94 case unicode_gc_Ll:
95 case unicode_gc_Lm:
96 case unicode_gc_Lo:
97 case unicode_gc_Lt:
98 case unicode_gc_Lu:
99 case unicode_gc_Nd:
100 case unicode_gc_Nl:
101 case unicode_gc_No:
102 case unicode_gc_Sc:
103 case unicode_gc_Sk:
104 case unicode_gc_Sm:
105 case unicode_gc_So:
106 /* letters, digits and symbols are considered to be part of
107 * words */
108 if(!in_word) {
109 dynstr_init(&d);
110 in_word = 1;
111 }
112 dynstr_append_bytes(&d, start, s - start);
113 break;
114
115 case unicode_gc_Cc:
116 case unicode_gc_Cf:
117 case unicode_gc_Co:
118 case unicode_gc_Cs:
119 case unicode_gc_Zl:
120 case unicode_gc_Zp:
121 case unicode_gc_Zs:
122 case unicode_gc_Pe:
123 case unicode_gc_Ps:
124 separator:
125 if(in_word) {
126 dynstr_terminate(&d);
127 vector_append(&v, d.vec);
128 in_word = 0;
129 }
130 break;
131
132 case unicode_gc_Mc:
133 case unicode_gc_Me:
134 case unicode_gc_Mn:
135 case unicode_gc_Pc:
136 case unicode_gc_Pd:
137 case unicode_gc_Pf:
138 case unicode_gc_Pi:
139 case unicode_gc_Po:
61507e3c 140 case unicode_gc_Cn:
460b9539 141 /* control and punctuation is completely ignored */
142 break;
143
144 }
145 }
146 if(in_word) {
147 /* pick up the final word */
148 dynstr_terminate(&d);
149 vector_append(&v, d.vec);
150 }
151 vector_terminate(&v);
152 if(nvecp)
153 *nvecp = v.nvec;
154 return v.vec;
155}
156
157/*
158Local Variables:
159c-basic-offset:2
160comment-column:40
161End:
162*/