1 /* Copyright (C) 1995, 1996, 1997, 2002, 2004, 2005, 2006
2 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Written by Ulrich Drepper <drepper@gnu.org>, 1995.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
28 #include <sys/param.h>
29 #include <gnu/option-groups.h>
32 # define STRING_TYPE char
33 # define USTRING_TYPE unsigned char
34 # define STRXFRM __strxfrm_l
35 # define STRCMP strcmp
36 # define STRLEN strlen
37 # define STPNCPY __stpncpy
38 # define WEIGHT_H "../locale/weight.h"
43 #define CONCAT(a,b) CONCAT1(a,b)
44 #define CONCAT1(a,b) a##b
46 #include "../locale/localeinfo.h"
49 #ifndef WIDE_CHAR_VERSION
51 /* We need UTF-8 encoding of numbers. */
53 utf8_encode (char *buf, int val)
66 for (step = 2; step < 6; ++step)
67 if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
71 *buf = (unsigned char) (~0xff >> step);
75 buf[step] = 0x80 | (val & 0x3f);
88 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
90 struct locale_data *current = l->__locales[LC_COLLATE];
91 #if __OPTION_EGLIBC_LOCALE_CODE
92 uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
94 const uint_fast32_t nrules = 0;
96 /* We don't assign the following values right away since it might be
97 unnecessary in case there are no rules. */
98 const unsigned char *rulesets;
100 const USTRING_TYPE *weights;
101 const USTRING_TYPE *extra;
102 const int32_t *indirect;
106 const USTRING_TYPE *usrc;
107 size_t srclen = STRLEN (src);
109 unsigned char *rulearr;
119 STPNCPY (dest, src, MIN (srclen + 1, n));
124 rulesets = (const unsigned char *)
125 current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
126 table = (const int32_t *)
127 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
128 weights = (const USTRING_TYPE *)
129 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
130 extra = (const USTRING_TYPE *)
131 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
132 indirect = (const int32_t *)
133 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
136 assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
137 assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
138 assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
139 assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
141 /* Handle an empty string as a special case. */
149 /* We need the elements of the string as unsigned values since they
150 are used as indeces. */
151 usrc = (const USTRING_TYPE *) src;
153 /* Perform the first pass over the string and while doing this find
154 and store the weights for each character. Since we want this to
155 be as fast as possible we are using `alloca' to store the temporary
156 values. But since there is no limit on the length of the string
157 we have to use `malloc' if the string is too long. We should be
158 very conservative here. */
159 if (! __libc_use_alloca (srclen))
161 idxarr = (int32_t *) malloc ((srclen + 1) * (sizeof (int32_t) + 1));
162 rulearr = (unsigned char *) &idxarr[srclen];
165 /* No memory. Well, go with the stack then.
167 XXX Once this implementation is stable we will handle this
168 differently. Instead of precomputing the indeces we will
169 do this in time. This means, though, that this happens for
177 idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
178 rulearr = (unsigned char *) alloca (srclen + 1);
184 int32_t tmp = findidx (&usrc);
185 rulearr[idxmax] = tmp >> 24;
186 idxarr[idxmax] = tmp & 0xffffff;
190 while (*usrc != L('\0'));
192 /* This element is only read, the value never used but to determine
193 another value which then is ignored. */
194 rulearr[idxmax] = '\0';
196 /* Now the passes over the weights. We now use the indeces we found
199 for (pass = 0; pass < nrules; ++pass)
201 size_t backw_stop = ~0ul;
202 int rule = rulesets[rulearr[0] * nrules + pass];
203 /* We assume that if a rule has defined `position' in one section
204 this is true for all of them. */
205 int position = rule & sort_position;
207 last_needed = needed;
210 for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
212 if ((rule & sort_forward) != 0)
216 if (backw_stop != ~0ul)
218 /* Handle the pushed elements now. */
221 for (backw = idxcnt; backw > backw_stop; )
224 len = weights[idxarr[backw]++];
226 if (needed + len < n)
228 dest[needed++] = weights[idxarr[backw]++];
231 /* No more characters fit into the buffer. */
233 idxarr[backw] += len;
240 /* Now handle the forward element. */
241 len = weights[idxarr[idxcnt]++];
242 if (needed + len < n)
244 dest[needed++] = weights[idxarr[idxcnt]++];
247 /* No more characters fit into the buffer. */
249 idxarr[idxcnt] += len;
254 /* Remember where the backwards series started. */
255 if (backw_stop == ~0ul)
259 rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
263 if (backw_stop != ~0ul)
265 /* Handle the pushed elements now. */
269 while (backw > backw_stop)
271 size_t len = weights[idxarr[--backw]++];
273 if (needed + len < n)
275 dest[needed++] = weights[idxarr[backw]++];
278 /* No more characters fit into the buffer. */
280 idxarr[backw] += len;
288 #ifndef WIDE_CHAR_VERSION
294 for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
296 if ((rule & sort_forward) != 0)
300 if (backw_stop != ~0ul)
302 /* Handle the pushed elements now. */
305 for (backw = idxcnt; backw > backw_stop; )
308 len = weights[idxarr[backw]++];
311 #ifdef WIDE_CHAR_VERSION
312 if (needed + 1 + len < n)
315 for (i = 0; i < len; ++i)
316 dest[needed + 1 + i] =
317 weights[idxarr[backw] + i];
321 buflen = utf8_encode (buf, val);
322 if (needed + buflen + len < n)
324 for (i = 0; i < buflen; ++i)
325 dest[needed + i] = buf[i];
326 for (i = 0; i < len; ++i)
327 dest[needed + buflen + i] =
328 weights[idxarr[backw] + i];
330 needed += buflen + len;
332 idxarr[backw] += len;
342 /* Now handle the forward element. */
343 len = weights[idxarr[idxcnt]++];
346 #ifdef WIDE_CHAR_VERSION
347 if (needed + 1+ len < n)
350 for (i = 0; i < len; ++i)
351 dest[needed + 1 + i] =
352 weights[idxarr[idxcnt] + i];
356 buflen = utf8_encode (buf, val);
357 if (needed + buflen + len < n)
359 for (i = 0; i < buflen; ++i)
360 dest[needed + i] = buf[i];
361 for (i = 0; i < len; ++i)
362 dest[needed + buflen + i] =
363 weights[idxarr[idxcnt] + i];
365 needed += buflen + len;
367 idxarr[idxcnt] += len;
371 /* Note that we don't have to increment `idxarr[idxcnt]'
372 since the length is zero. */
377 /* Remember where the backwards series started. */
378 if (backw_stop == ~0ul)
382 rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
385 if (backw_stop != ~0ul)
387 /* Handle the pushed elements now. */
391 while (backw > backw_stop)
393 size_t len = weights[idxarr[--backw]++];
396 #ifdef WIDE_CHAR_VERSION
397 if (needed + 1 + len < n)
400 for (i = 0; i < len; ++i)
401 dest[needed + 1 + i] =
402 weights[idxarr[backw] + i];
406 buflen = utf8_encode (buf, val);
407 if (needed + buflen + len < n)
409 for (i = 0; i < buflen; ++i)
410 dest[needed + i] = buf[i];
411 for (i = 0; i < len; ++i)
412 dest[needed + buflen + i] =
413 weights[idxarr[backw] + i];
415 needed += buflen + len;
417 idxarr[backw] += len;
426 /* Finally store the byte to separate the passes or terminate
429 dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
433 /* This is a little optimization: many collation specifications have
434 a `position' rule at the end and if no non-ignored character
435 is found the last \1 byte is immediately followed by a \0 byte
436 signalling this. We can avoid the \1 byte(s). */
437 if (needed > 2 && needed == last_needed + 1)
439 /* Remove the \1 byte. */
441 dest[needed - 1] = L('\0');
444 /* Free the memory if needed. */
448 /* Return the number of bytes/words we need, but don't count the NUL
449 byte/word at the end. */
452 libc_hidden_def (STRXFRM)
454 #ifndef WIDE_CHAR_VERSION
455 weak_alias (__strxfrm_l, strxfrm_l)