chiark / gitweb /
Start of Unicode support rewrite
[disorder] / lib / words.c
CommitLineData
460b9539 1/*
2 * This file is part of DisOrder
61507e3c 3 * Copyright (C) 2004, 2007 Richard Kettlewell
460b9539 4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18 * USA
19 */
20
21#include <config.h>
22#include "types.h"
23
24#include <string.h>
25#include <stddef.h>
26
27#include "mem.h"
28#include "vector.h"
29#include "table.h"
30#include "words.h"
31#include "utf8.h"
61507e3c
RK
32#include "log.h"
33#include "charset.h"
460b9539 34
61507e3c 35#include "unidata.h"
e5a5a138 36#include "unicode.h"
460b9539 37
/* Case-fold a NUL-terminated string.
 *
 * Measures PTR with strlen() and hands it to utf8_casefold_canon(), passing 0
 * as the final argument; whatever that function returns is returned
 * unchanged.  PTR must not be a null pointer.
 *
 * NOTE(review): the exact folding/normalization performed, and who owns the
 * returned string, are determined by utf8_casefold_canon() - confirm against
 * unicode.h.
 */
const char *casefold(const char *ptr) {
  const size_t nbytes = strlen(ptr);

  return utf8_casefold_canon(ptr, nbytes, 0);
}
41
42static enum unicode_gc_cat cat(uint32_t c) {
61507e3c
RK
43 if(c < UNICODE_NCHARS) {
44 /* If this a known character, convert it to lower case */
e5a5a138 45 const struct unidata *const ud = &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
61507e3c
RK
46 return ud->gc;
47 } else
48 return unicode_gc_Cn;
460b9539 49}
50
51/* XXX this is a bit kludgy */
52
53char **words(const char *s, int *nvecp) {
54 struct vector v;
55 struct dynstr d;
56 const char *start;
57 uint32_t c;
58 int in_word = 0;
59
60 vector_init(&v);
61 while(*s) {
62 start = s;
63 PARSE_UTF8(s, c, return 0);
64 /* special cases first */
65 switch(c) {
66 case '/':
67 case '.':
68 case '+':
69 case '&':
70 case ':':
71 case '_':
72 case '-':
73 goto separator;
74 }
75 /* do the rest on category */
76 switch(cat(c)) {
77 case unicode_gc_Ll:
78 case unicode_gc_Lm:
79 case unicode_gc_Lo:
80 case unicode_gc_Lt:
81 case unicode_gc_Lu:
82 case unicode_gc_Nd:
83 case unicode_gc_Nl:
84 case unicode_gc_No:
85 case unicode_gc_Sc:
86 case unicode_gc_Sk:
87 case unicode_gc_Sm:
88 case unicode_gc_So:
89 /* letters, digits and symbols are considered to be part of
90 * words */
91 if(!in_word) {
92 dynstr_init(&d);
93 in_word = 1;
94 }
95 dynstr_append_bytes(&d, start, s - start);
96 break;
97
98 case unicode_gc_Cc:
99 case unicode_gc_Cf:
100 case unicode_gc_Co:
101 case unicode_gc_Cs:
102 case unicode_gc_Zl:
103 case unicode_gc_Zp:
104 case unicode_gc_Zs:
105 case unicode_gc_Pe:
106 case unicode_gc_Ps:
107 separator:
108 if(in_word) {
109 dynstr_terminate(&d);
110 vector_append(&v, d.vec);
111 in_word = 0;
112 }
113 break;
114
115 case unicode_gc_Mc:
116 case unicode_gc_Me:
117 case unicode_gc_Mn:
118 case unicode_gc_Pc:
119 case unicode_gc_Pd:
120 case unicode_gc_Pf:
121 case unicode_gc_Pi:
122 case unicode_gc_Po:
61507e3c 123 case unicode_gc_Cn:
460b9539 124 /* control and punctuation is completely ignored */
125 break;
126
127 }
128 }
129 if(in_word) {
130 /* pick up the final word */
131 dynstr_terminate(&d);
132 vector_append(&v, d.vec);
133 }
134 vector_terminate(&v);
135 if(nvecp)
136 *nvecp = v.nvec;
137 return v.vec;
138}
139
140/*
141Local Variables:
142c-basic-offset:2
143comment-column:40
144End:
145*/