Commit | Line | Data |
---|---|---|
460b9539 | 1 | /* |
2 | * This file is part of DisOrder | |
61507e3c | 3 | * Copyright (C) 2004, 2007 Richard Kettlewell |
460b9539 | 4 | * |
5 | * This program is free software; you can redistribute it and/or modify | |
6 | * it under the terms of the GNU General Public License as published by | |
7 | * the Free Software Foundation; either version 2 of the License, or | |
8 | * (at your option) any later version. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License for more details. | |
14 | * | |
15 | * You should have received a copy of the GNU General Public License | |
16 | * along with this program; if not, write to the Free Software | |
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 | |
18 | * USA | |
19 | */ | |
20 | ||
21 | #include <config.h> | |
22 | #include "types.h" | |
23 | ||
24 | #include <string.h> | |
25 | #include <stddef.h> | |
26 | ||
27 | #include "mem.h" | |
28 | #include "vector.h" | |
29 | #include "table.h" | |
30 | #include "words.h" | |
31 | #include "utf8.h" | |
61507e3c RK |
32 | #include "log.h" |
33 | #include "charset.h" | |
460b9539 | 34 | |
61507e3c | 35 | #include "unidata.h" |
460b9539 | 36 | |
37 | const char *casefold(const char *ptr) { | |
38 | struct dynstr d; | |
460b9539 | 39 | uint32_t c; |
61507e3c | 40 | const char *s = ptr; |
460b9539 | 41 | |
42 | dynstr_init(&d); | |
43 | while(*s) { | |
61507e3c | 44 | /* Convert UTF-8 to UCS-32 */ |
460b9539 | 45 | PARSE_UTF8(s, c, return ptr); |
61507e3c RK |
46 | /* Normalize */ |
47 | if(c < UNICODE_NCHARS) { | |
48 | /* If this a known character, convert it to lower case */ | |
49 | const struct unidata *const ud = &unidata[c / 256][c % 256]; | |
50 | c += ud->lower_offset; | |
51 | } | |
52 | /* Convert UCS-4 back to UTF-8 */ | |
53 | one_ucs42utf8(c, &d); | |
460b9539 | 54 | } |
55 | dynstr_terminate(&d); | |
56 | return d.vec; | |
57 | } | |
58 | ||
59 | static enum unicode_gc_cat cat(uint32_t c) { | |
61507e3c RK |
60 | if(c < UNICODE_NCHARS) { |
61 | /* If this a known character, convert it to lower case */ | |
62 | const struct unidata *const ud = &unidata[c / 256][c % 256]; | |
63 | return ud->gc; | |
64 | } else | |
65 | return unicode_gc_Cn; | |
460b9539 | 66 | } |
67 | ||
68 | /* XXX this is a bit kludgy */ | |
69 | ||
70 | char **words(const char *s, int *nvecp) { | |
71 | struct vector v; | |
72 | struct dynstr d; | |
73 | const char *start; | |
74 | uint32_t c; | |
75 | int in_word = 0; | |
76 | ||
77 | vector_init(&v); | |
78 | while(*s) { | |
79 | start = s; | |
80 | PARSE_UTF8(s, c, return 0); | |
81 | /* special cases first */ | |
82 | switch(c) { | |
83 | case '/': | |
84 | case '.': | |
85 | case '+': | |
86 | case '&': | |
87 | case ':': | |
88 | case '_': | |
89 | case '-': | |
90 | goto separator; | |
91 | } | |
92 | /* do the rest on category */ | |
93 | switch(cat(c)) { | |
94 | case unicode_gc_Ll: | |
95 | case unicode_gc_Lm: | |
96 | case unicode_gc_Lo: | |
97 | case unicode_gc_Lt: | |
98 | case unicode_gc_Lu: | |
99 | case unicode_gc_Nd: | |
100 | case unicode_gc_Nl: | |
101 | case unicode_gc_No: | |
102 | case unicode_gc_Sc: | |
103 | case unicode_gc_Sk: | |
104 | case unicode_gc_Sm: | |
105 | case unicode_gc_So: | |
106 | /* letters, digits and symbols are considered to be part of | |
107 | * words */ | |
108 | if(!in_word) { | |
109 | dynstr_init(&d); | |
110 | in_word = 1; | |
111 | } | |
112 | dynstr_append_bytes(&d, start, s - start); | |
113 | break; | |
114 | ||
115 | case unicode_gc_Cc: | |
116 | case unicode_gc_Cf: | |
117 | case unicode_gc_Co: | |
118 | case unicode_gc_Cs: | |
119 | case unicode_gc_Zl: | |
120 | case unicode_gc_Zp: | |
121 | case unicode_gc_Zs: | |
122 | case unicode_gc_Pe: | |
123 | case unicode_gc_Ps: | |
124 | separator: | |
125 | if(in_word) { | |
126 | dynstr_terminate(&d); | |
127 | vector_append(&v, d.vec); | |
128 | in_word = 0; | |
129 | } | |
130 | break; | |
131 | ||
132 | case unicode_gc_Mc: | |
133 | case unicode_gc_Me: | |
134 | case unicode_gc_Mn: | |
135 | case unicode_gc_Pc: | |
136 | case unicode_gc_Pd: | |
137 | case unicode_gc_Pf: | |
138 | case unicode_gc_Pi: | |
139 | case unicode_gc_Po: | |
61507e3c | 140 | case unicode_gc_Cn: |
460b9539 | 141 | /* control and punctuation is completely ignored */ |
142 | break; | |
143 | ||
144 | } | |
145 | } | |
146 | if(in_word) { | |
147 | /* pick up the final word */ | |
148 | dynstr_terminate(&d); | |
149 | vector_append(&v, d.vec); | |
150 | } | |
151 | vector_terminate(&v); | |
152 | if(nvecp) | |
153 | *nvecp = v.nvec; | |
154 | return v.vec; | |
155 | } | |
156 | ||
157 | /* | |
158 | Local Variables: | |
159 | c-basic-offset:2 | |
160 | comment-column:40 | |
161 | End: | |
162 | */ |