string/strxfrm_l.c

   1 /* Copyright (C) 1995, 1996, 1997, 2002, 2004, 2005, 2006
   2    Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Written by Ulrich Drepper <drepper@gnu.org>, 1995.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 #include <assert.h>
  22 #include <langinfo.h>
  23 #include <locale.h>
  24 #include <stddef.h>
  25 #include <stdint.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <sys/param.h>
  29 #include <gnu/option-groups.h>
  30
  31 #ifndef STRING_TYPE
  32 # define STRING_TYPE char
  33 # define USTRING_TYPE unsigned char
  34 # define STRXFRM __strxfrm_l
  35 # define STRCMP strcmp
  36 # define STRLEN strlen
  37 # define STPNCPY __stpncpy
  38 # define WEIGHT_H "../locale/weight.h"
  39 # define SUFFIX MB
  40 # define L(arg) arg
  41 #endif
  42
  43 #define CONCAT(a,b) CONCAT1(a,b)
  44 #define CONCAT1(a,b) a##b
  45
  46 #include "../locale/localeinfo.h"
  47
  48
  49 #ifndef WIDE_CHAR_VERSION
  50
  51 /* We need UTF-8 encoding of numbers.  */
  52 static int
  53 utf8_encode (char *buf, int val)
  54 {
  55   int retval;
  56
  57   if (val < 0x80)
  58     {
  59       *buf++ = (char) val;
  60       retval = 1;
  61     }
  62   else
  63     {
  64       int step;
  65
  66       for (step = 2; step < 6; ++step)
  67         if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
  68           break;
  69       retval = step;
  70
  71       *buf = (unsigned char) (~0xff >> step);
  72       --step;
  73       do
  74         {
  75           buf[step] = 0x80 | (val & 0x3f);
  76           val >>= 6;
  77         }
  78       while (--step > 0);
  79       *buf |= val;
  80     }
  81
  82   return retval;
  83 }
  84 #endif
  85
  86
  87 size_t
  88 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
  89 {
  90   struct locale_data *current = l->__locales[LC_COLLATE];
  91 #if __OPTION_EGLIBC_LOCALE_CODE
  92   uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
  93 #else
  94   const uint_fast32_t nrules = 0;
  95 #endif
  96   /* We don't assign the following values right away since it might be
  97      unnecessary in case there are no rules.  */
  98   const unsigned char *rulesets;
  99   const int32_t *table;
 100   const USTRING_TYPE *weights;
 101   const USTRING_TYPE *extra;
 102   const int32_t *indirect;
 103   uint_fast32_t pass;
 104   size_t needed;
 105   size_t last_needed;
 106   const USTRING_TYPE *usrc;
 107   size_t srclen = STRLEN (src);
 108   int32_t *idxarr;
 109   unsigned char *rulearr;
 110   size_t idxmax;
 111   size_t idxcnt;
 112   int use_malloc;
 113
 114 #include WEIGHT_H
 115
 116   if (nrules == 0)
 117     {
 118       if (n != 0)
 119         STPNCPY (dest, src, MIN (srclen + 1, n));
 120
 121       return srclen;
 122     }
 123
 124   rulesets = (const unsigned char *)
 125     current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
 126   table = (const int32_t *)
 127     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
 128   weights = (const USTRING_TYPE *)
 129     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
 130   extra = (const USTRING_TYPE *)
 131     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
 132   indirect = (const int32_t *)
 133     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
 134   use_malloc = 0;
 135
 136   assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
 137   assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
 138   assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
 139   assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
 140
 141   /* Handle an empty string as a special case.  */
 142   if (srclen == 0)
 143     {
 144       if (n != 0)
 145         *dest = L('\0');
 146       return 0;
 147     }
 148
 149   /* We need the elements of the string as unsigned values since they
 150      are used as indeces.  */
 151   usrc = (const USTRING_TYPE *) src;
 152
 153   /* Perform the first pass over the string and while doing this find
 154      and store the weights for each character.  Since we want this to
 155      be as fast as possible we are using `alloca' to store the temporary
 156      values.  But since there is no limit on the length of the string
 157      we have to use `malloc' if the string is too long.  We should be
 158      very conservative here.  */
 159   if (! __libc_use_alloca (srclen))
 160     {
 161       idxarr = (int32_t *) malloc ((srclen + 1) * (sizeof (int32_t) + 1));
 162       rulearr = (unsigned char *) &idxarr[srclen];
 163
 164       if (idxarr == NULL)
 165         /* No memory.  Well, go with the stack then.
 166
 167            XXX Once this implementation is stable we will handle this
 168            differently.  Instead of precomputing the indeces we will
 169            do this in time.  This means, though, that this happens for
 170            every pass again.  */
 171         goto try_stack;
 172       use_malloc = 1;
 173     }
 174   else
 175     {
 176     try_stack:
 177       idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
 178       rulearr = (unsigned char *) alloca (srclen + 1);
 179     }
 180
 181   idxmax = 0;
 182   do
 183     {
 184       int32_t tmp = findidx (&usrc);
 185       rulearr[idxmax] = tmp >> 24;
 186       idxarr[idxmax] = tmp & 0xffffff;
 187
 188       ++idxmax;
 189     }
 190   while (*usrc != L('\0'));
 191
 192   /* This element is only read, the value never used but to determine
 193      another value which then is ignored.  */
 194   rulearr[idxmax] = '\0';
 195
 196   /* Now the passes over the weights.  We now use the indeces we found
 197      before.  */
 198   needed = 0;
 199   for (pass = 0; pass < nrules; ++pass)
 200     {
 201       size_t backw_stop = ~0ul;
 202       int rule = rulesets[rulearr[0] * nrules + pass];
 203       /* We assume that if a rule has defined `position' in one section
 204          this is true for all of them.  */
 205       int position = rule & sort_position;
 206
 207       last_needed = needed;
 208       if (position == 0)
 209         {
 210           for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
 211             {
 212               if ((rule & sort_forward) != 0)
 213                 {
 214                   size_t len;
 215
 216                   if (backw_stop != ~0ul)
 217                     {
 218                       /* Handle the pushed elements now.  */
 219                       size_t backw;
 220
 221                       for (backw = idxcnt; backw > backw_stop; )
 222                         {
 223                           --backw;
 224                           len = weights[idxarr[backw]++];
 225
 226                           if (needed + len < n)
 227                             while (len-- > 0)
 228                               dest[needed++] = weights[idxarr[backw]++];
 229                           else
 230                             {
 231                                 /* No more characters fit into the buffer.  */
 232                               needed += len;
 233                               idxarr[backw] += len;
 234                             }
 235                         }
 236
 237                       backw_stop = ~0ul;
 238                     }
 239
 240                   /* Now handle the forward element.  */
 241                   len = weights[idxarr[idxcnt]++];
 242                   if (needed + len < n)
 243                     while (len-- > 0)
 244                       dest[needed++] = weights[idxarr[idxcnt]++];
 245                   else
 246                     {
 247                       /* No more characters fit into the buffer.  */
 248                       needed += len;
 249                       idxarr[idxcnt] += len;
 250                     }
 251                 }
 252               else
 253                 {
 254                   /* Remember where the backwards series started.  */
 255                   if (backw_stop == ~0ul)
 256                     backw_stop = idxcnt;
 257                 }
 258
 259               rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
 260             }
 261
 262
 263           if (backw_stop != ~0ul)
 264             {
 265               /* Handle the pushed elements now.  */
 266               size_t backw;
 267
 268               backw = idxcnt;
 269               while (backw > backw_stop)
 270                 {
 271                   size_t len = weights[idxarr[--backw]++];
 272
 273                   if (needed + len < n)
 274                     while (len-- > 0)
 275                       dest[needed++] = weights[idxarr[backw]++];
 276                   else
 277                     {
 278                       /* No more characters fit into the buffer.  */
 279                       needed += len;
 280                       idxarr[backw] += len;
 281                     }
 282                 }
 283             }
 284         }
 285       else
 286         {
 287           int val = 1;
 288 #ifndef WIDE_CHAR_VERSION
 289           char buf[7];
 290           size_t buflen;
 291 #endif
 292           size_t i;
 293
 294           for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
 295             {
 296               if ((rule & sort_forward) != 0)
 297                 {
 298                   size_t len;
 299
 300                   if (backw_stop != ~0ul)
 301                     {
 302                      /* Handle the pushed elements now.  */
 303                       size_t backw;
 304
 305                       for (backw = idxcnt; backw > backw_stop; )
 306                         {
 307                           --backw;
 308                           len = weights[idxarr[backw]++];
 309                           if (len != 0)
 310                             {
 311 #ifdef WIDE_CHAR_VERSION
 312                               if (needed + 1 + len < n)
 313                                 {
 314                                   dest[needed] = val;
 315                                   for (i = 0; i < len; ++i)
 316                                     dest[needed + 1 + i] =
 317                                       weights[idxarr[backw] + i];
 318                                 }
 319                               needed += 1 + len;
 320 #else
 321                               buflen = utf8_encode (buf, val);
 322                               if (needed + buflen + len < n)
 323                                 {
 324                                   for (i = 0; i < buflen; ++i)
 325                                     dest[needed + i] = buf[i];
 326                                   for (i = 0; i < len; ++i)
 327                                     dest[needed + buflen + i] =
 328                                       weights[idxarr[backw] + i];
 329                                 }
 330                               needed += buflen + len;
 331 #endif
 332                               idxarr[backw] += len;
 333                               val = 1;
 334                             }
 335                           else
 336                             ++val;
 337                         }
 338
 339                       backw_stop = ~0ul;
 340                     }
 341
 342                   /* Now handle the forward element.  */
 343                   len = weights[idxarr[idxcnt]++];
 344                   if (len != 0)
 345                     {
 346 #ifdef WIDE_CHAR_VERSION
 347                       if (needed + 1+ len < n)
 348                         {
 349                           dest[needed] = val;
 350                           for (i = 0; i < len; ++i)
 351                             dest[needed + 1 + i] =
 352                               weights[idxarr[idxcnt] + i];
 353                         }
 354                       needed += 1 + len;
 355 #else
 356                       buflen = utf8_encode (buf, val);
 357                       if (needed + buflen + len < n)
 358                         {
 359                           for (i = 0; i < buflen; ++i)
 360                             dest[needed + i] = buf[i];
 361                           for (i = 0; i < len; ++i)
 362                             dest[needed + buflen + i] =
 363                               weights[idxarr[idxcnt] + i];
 364                         }
 365                       needed += buflen + len;
 366 #endif
 367                       idxarr[idxcnt] += len;
 368                       val = 1;
 369                     }
 370                   else
 371                     /* Note that we don't have to increment `idxarr[idxcnt]'
 372                        since the length is zero.  */
 373                     ++val;
 374                 }
 375               else
 376                 {
 377                   /* Remember where the backwards series started.  */
 378                   if (backw_stop == ~0ul)
 379                     backw_stop = idxcnt;
 380                 }
 381
 382               rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
 383             }
 384
 385           if (backw_stop != ~0ul)
 386             {
 387               /* Handle the pushed elements now.  */
 388               size_t backw;
 389
 390               backw = idxmax - 1;
 391               while (backw > backw_stop)
 392                 {
 393                   size_t len = weights[idxarr[--backw]++];
 394                   if (len != 0)
 395                     {
 396 #ifdef WIDE_CHAR_VERSION
 397                       if (needed + 1 + len < n)
 398                         {
 399                           dest[needed] = val;
 400                           for (i = 0; i < len; ++i)
 401                             dest[needed + 1 + i] =
 402                               weights[idxarr[backw] + i];
 403                         }
 404                       needed += 1 + len;
 405 #else
 406                       buflen = utf8_encode (buf, val);
 407                       if (needed + buflen + len < n)
 408                         {
 409                           for (i = 0; i < buflen; ++i)
 410                             dest[needed + i] = buf[i];
 411                           for (i = 0; i < len; ++i)
 412                             dest[needed + buflen + i] =
 413                               weights[idxarr[backw] + i];
 414                         }
 415                       needed += buflen + len;
 416 #endif
 417                       idxarr[backw] += len;
 418                       val = 1;
 419                     }
 420                   else
 421                     ++val;
 422                 }
 423             }
 424         }
 425
 426       /* Finally store the byte to separate the passes or terminate
 427          the string.  */
 428       if (needed < n)
 429         dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
 430       ++needed;
 431     }
 432
 433   /* This is a little optimization: many collation specifications have
 434      a `position' rule at the end and if no non-ignored character
 435      is found the last \1 byte is immediately followed by a \0 byte
 436      signalling this.  We can avoid the \1 byte(s).  */
 437   if (needed > 2 && needed == last_needed + 1)
 438     {
 439       /* Remove the \1 byte.  */
 440       if (--needed <= n)
 441         dest[needed - 1] = L('\0');
 442     }
 443
 444   /* Free the memory if needed.  */
 445   if (use_malloc)
 446     free (idxarr);
 447
 448   /* Return the number of bytes/words we need, but don't count the NUL
 449      byte/word at the end.  */
 450   return needed - 1;
 451 }
 452 libc_hidden_def (STRXFRM)
 453
 454 #ifndef WIDE_CHAR_VERSION
 455 weak_alias (__strxfrm_l, strxfrm_l)
 456 #endif