posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 #include <gnu/option-groups.h>
  22
  23 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  24                                           size_t length, reg_syntax_t syntax);
  25 static void re_compile_fastmap_iter (regex_t *bufp,
  26                                      const re_dfastate_t *init_state,
  27                                      char *fastmap);
  28 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  29 #ifdef RE_ENABLE_I18N
  30 static void free_charset (re_charset_t *cset);
  31 #endif /* RE_ENABLE_I18N */
  32 static void free_workarea_compile (regex_t *preg);
  33 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  34 #ifdef RE_ENABLE_I18N
  35 static void optimize_utf8 (re_dfa_t *dfa);
  36 #endif
  37 static reg_errcode_t analyze (regex_t *preg);
  38 static reg_errcode_t preorder (bin_tree_t *root,
  39                                reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                void *extra);
  41 static reg_errcode_t postorder (bin_tree_t *root,
  42                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  43                                 void *extra);
  44 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  45 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  46 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  47                                  bin_tree_t *node);
  48 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  49 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  50 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  51 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  52 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  53                                    unsigned int constraint);
  54 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  55 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  56                                          int node, int root);
  57 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  58 static int fetch_number (re_string_t *input, re_token_t *token,
  59                          reg_syntax_t syntax);
  60 static int peek_token (re_token_t *token, re_string_t *input,
  61                         reg_syntax_t syntax) internal_function;
  62 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  63                           reg_syntax_t syntax, reg_errcode_t *err);
  64 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  65                                   re_token_t *token, reg_syntax_t syntax,
  66                                   int nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  68                                  re_token_t *token, reg_syntax_t syntax,
  69                                  int nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  71                                      re_token_t *token, reg_syntax_t syntax,
  72                                      int nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  74                                   re_token_t *token, reg_syntax_t syntax,
  75                                   int nest, reg_errcode_t *err);
  76 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  77                                  re_dfa_t *dfa, re_token_t *token,
  78                                  reg_syntax_t syntax, reg_errcode_t *err);
  79 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  80                                       re_token_t *token, reg_syntax_t syntax,
  81                                       reg_errcode_t *err);
  82 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  83                                             re_string_t *regexp,
  84                                             re_token_t *token, int token_len,
  85                                             re_dfa_t *dfa,
  86                                             reg_syntax_t syntax,
  87                                             int accept_hyphen);
  88 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  89                                           re_string_t *regexp,
  90                                           re_token_t *token);
  91 #ifdef RE_ENABLE_I18N
  92 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  93                                         re_charset_t *mbcset,
  94                                         int *equiv_class_alloc,
  95                                         const unsigned char *name);
  96 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  97                                       bitset_t sbcset,
  98                                       re_charset_t *mbcset,
  99                                       int *char_class_alloc,
 100                                       const unsigned char *class_name,
 101                                       reg_syntax_t syntax);
 102 #else  /* not RE_ENABLE_I18N */
 103 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 104                                         const unsigned char *name);
 105 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 106                                       bitset_t sbcset,
 107                                       const unsigned char *class_name,
 108                                       reg_syntax_t syntax);
 109 #endif /* not RE_ENABLE_I18N */
 110 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 111                                        RE_TRANSLATE_TYPE trans,
 112                                        const unsigned char *class_name,
 113                                        const unsigned char *extra,
 114                                        int non_match, reg_errcode_t *err);
 115 static bin_tree_t *create_tree (re_dfa_t *dfa,
 116                                 bin_tree_t *left, bin_tree_t *right,
 117                                 re_token_type_t type);
 118 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 119                                       bin_tree_t *left, bin_tree_t *right,
 120                                       const re_token_t *token);
 121 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 122 static void free_token (re_token_t *node);
 123 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 124 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 125 \f
 126 /* This table gives an error message for each of the error codes listed
 127    in regex.h.  Obviously the order here has to be same as there.
 128    POSIX doesn't require that we do anything for REG_NOERROR,
 129    but why not be nice?  */
 130
 131 const char __re_error_msgid[] attribute_hidden =
 132   {
 133 #define REG_NOERROR_IDX 0
 134     gettext_noop ("Success")    /* REG_NOERROR */
 135     "\0"
 136 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 137     gettext_noop ("No match")   /* REG_NOMATCH */
 138     "\0"
 139 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 140     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 141     "\0"
 142 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 143     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 144     "\0"
 145 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 146     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 147     "\0"
 148 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 149     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 150     "\0"
 151 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 152     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 153     "\0"
 154 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 155     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 156     "\0"
 157 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 158     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 159     "\0"
 160 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 161     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 162     "\0"
 163 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 164     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 165     "\0"
 166 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 167     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 168     "\0"
 169 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 170     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 171     "\0"
 172 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 173     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 174     "\0"
 175 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 176     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 177     "\0"
 178 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 179     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 180     "\0"
 181 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 182     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 183   };
 184
 185 const size_t __re_error_msgid_idx[] attribute_hidden =
 186   {
 187     REG_NOERROR_IDX,
 188     REG_NOMATCH_IDX,
 189     REG_BADPAT_IDX,
 190     REG_ECOLLATE_IDX,
 191     REG_ECTYPE_IDX,
 192     REG_EESCAPE_IDX,
 193     REG_ESUBREG_IDX,
 194     REG_EBRACK_IDX,
 195     REG_EPAREN_IDX,
 196     REG_EBRACE_IDX,
 197     REG_BADBR_IDX,
 198     REG_ERANGE_IDX,
 199     REG_ESPACE_IDX,
 200     REG_BADRPT_IDX,
 201     REG_EEND_IDX,
 202     REG_ESIZE_IDX,
 203     REG_ERPAREN_IDX
 204   };
 205 \f
 206 /* Entry points for GNU code.  */
 207
 208 /* re_compile_pattern is the GNU regular expression compiler: it
 209    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 210    Returns 0 if the pattern was valid, otherwise an error string.
 211
 212    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 213    are set in BUFP on entry.  */
 214
 215 const char *
 216 re_compile_pattern (pattern, length, bufp)
 217     const char *pattern;
 218     size_t length;
 219     struct re_pattern_buffer *bufp;
 220 {
 221   reg_errcode_t ret;
 222
 223   /* And GNU code determines whether or not to get register information
 224      by passing null for the REGS argument to re_match, etc., not by
 225      setting no_sub, unless RE_NO_SUB is set.  */
 226   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 227
 228   /* Match anchors at newline.  */
 229   bufp->newline_anchor = 1;
 230
 231   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 232
 233   if (!ret)
 234     return NULL;
 235   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 236 }
 237 #ifdef _LIBC
 238 weak_alias (__re_compile_pattern, re_compile_pattern)
 239 #endif
 240
 241 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 242    also be assigned to arbitrarily: each pattern buffer stores its own
 243    syntax, so it can be changed between regex compilations.  */
 244 /* This has no initializer because initialized variables in Emacs
 245    become read-only after dumping.  */
 246 reg_syntax_t re_syntax_options;
 247
 248
 249 /* Specify the precise syntax of regexps for compilation.  This provides
 250    for compatibility for various utilities which historically have
 251    different, incompatible syntaxes.
 252
 253    The argument SYNTAX is a bit mask comprised of the various bits
 254    defined in regex.h.  We return the old syntax.  */
 255
 256 reg_syntax_t
 257 re_set_syntax (syntax)
 258     reg_syntax_t syntax;
 259 {
 260   reg_syntax_t ret = re_syntax_options;
 261
 262   re_syntax_options = syntax;
 263   return ret;
 264 }
 265 #ifdef _LIBC
 266 weak_alias (__re_set_syntax, re_set_syntax)
 267 #endif
 268
 269 int
 270 re_compile_fastmap (bufp)
 271     struct re_pattern_buffer *bufp;
 272 {
 273   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 274   char *fastmap = bufp->fastmap;
 275
 276   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 277   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 278   if (dfa->init_state != dfa->init_state_word)
 279     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 280   if (dfa->init_state != dfa->init_state_nl)
 281     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 282   if (dfa->init_state != dfa->init_state_begbuf)
 283     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 284   bufp->fastmap_accurate = 1;
 285   return 0;
 286 }
 287 #ifdef _LIBC
 288 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 289 #endif
 290
 291 static inline void
 292 __attribute ((always_inline))
 293 re_set_fastmap (char *fastmap, int icase, int ch)
 294 {
 295   fastmap[ch] = 1;
 296   if (icase)
 297     fastmap[tolower (ch)] = 1;
 298 }
 299
 300 /* Helper function for re_compile_fastmap.
 301    Compile fastmap for the initial_state INIT_STATE.  */
 302
 303 static void
 304 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 305                          char *fastmap)
 306 {
 307   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 308   int node_cnt;
 309   int icase = (dfa_mb_cur_max (dfa) == 1 && (bufp->syntax & RE_ICASE));
 310   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 311     {
 312       int node = init_state->nodes.elems[node_cnt];
 313       re_token_type_t type = dfa->nodes[node].type;
 314
 315       if (type == CHARACTER)
 316         {
 317           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 318 #ifdef RE_ENABLE_I18N
 319           if ((bufp->syntax & RE_ICASE) && dfa_mb_cur_max (dfa) > 1)
 320             {
 321               unsigned char *buf = alloca (dfa_mb_cur_max (dfa)), *p;
 322               wchar_t wc;
 323               mbstate_t state;
 324
 325               p = buf;
 326               *p++ = dfa->nodes[node].opr.c;
 327               while (++node < dfa->nodes_len
 328                      && dfa->nodes[node].type == CHARACTER
 329                      && dfa->nodes[node].mb_partial)
 330                 *p++ = dfa->nodes[node].opr.c;
 331               memset (&state, '\0', sizeof (state));
 332               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 333                              &state) == p - buf
 334                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 335                       != (size_t) -1))
 336                 re_set_fastmap (fastmap, 0, buf[0]);
 337             }
 338 #endif
 339         }
 340       else if (type == SIMPLE_BRACKET)
 341         {
 342           int i, ch;
 343           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 344             {
 345               int j;
 346               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 347               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 348                 if (w & ((bitset_word_t) 1 << j))
 349                   re_set_fastmap (fastmap, icase, ch);
 350             }
 351         }
 352
 353       /* When OPTION_EGLIBC_LOCALE_CODE is disabled, the current
 354          locale is always C, which has no rules and no multi-byte
 355          characters.  */
 356 #if defined RE_ENABLE_I18N && __OPTION_EGLIBC_LOCALE_CODE
 357       else if (type == COMPLEX_BRACKET)
 358         {
 359           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 360           int i;
 361
 362 # ifdef _LIBC
 363           /* See if we have to try all bytes which start multiple collation
 364              elements.
 365              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 366                   collation element, and don't catch 'b' since 'b' is
 367                   the only collation element which starts from 'b' (and
 368                   it is caught by SIMPLE_BRACKET).  */
 369               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 370                   && (cset->ncoll_syms || cset->nranges))
 371                 {
 372                   const int32_t *table = (const int32_t *)
 373                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 374                   for (i = 0; i < SBC_MAX; ++i)
 375                     if (table[i] < 0)
 376                       re_set_fastmap (fastmap, icase, i);
 377                 }
 378 # endif /* _LIBC */
 379
 380           /* See if we have to start the match at all multibyte characters,
 381              i.e. where we would not find an invalid sequence.  This only
 382              applies to multibyte character sets; for single byte character
 383              sets, the SIMPLE_BRACKET again suffices.  */
 384           if (dfa_mb_cur_max (dfa) > 1
 385               && (cset->nchar_classes || cset->non_match || cset->nranges
 386 # ifdef _LIBC
 387                   || cset->nequiv_classes
 388 # endif /* _LIBC */
 389                  ))
 390             {
 391               unsigned char c = 0;
 392               do
 393                 {
 394                   mbstate_t mbs;
 395                   memset (&mbs, 0, sizeof (mbs));
 396                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 397                     re_set_fastmap (fastmap, false, (int) c);
 398                 }
 399               while (++c != 0);
 400             }
 401
 402           else
 403             {
 404               /* ... Else catch all bytes which can start the mbchars.  */
 405               for (i = 0; i < cset->nmbchars; ++i)
 406                 {
 407                   char buf[256];
 408                   mbstate_t state;
 409                   memset (&state, '\0', sizeof (state));
 410                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 411                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 412                   if ((bufp->syntax & RE_ICASE) && dfa_mb_cur_max (dfa) > 1)
 413                     {
 414                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 415                           != (size_t) -1)
 416                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 417                     }
 418                 }
 419             }
 420         }
 421 #endif /* RE_ENABLE_I18N && __OPTION_EGLIBC_LOCALE_CODE */
 422       else if (type == OP_PERIOD
 423 #ifdef RE_ENABLE_I18N
 424                || type == OP_UTF8_PERIOD
 425 #endif /* RE_ENABLE_I18N */
 426                || type == END_OF_RE)
 427         {
 428           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 429           if (type == END_OF_RE)
 430             bufp->can_be_null = 1;
 431           return;
 432         }
 433     }
 434 }
 435 \f
 436 /* Entry point for POSIX code.  */
 437 /* regcomp takes a regular expression as a string and compiles it.
 438
 439    PREG is a regex_t *.  We do not expect any fields to be initialized,
 440    since POSIX says we shouldn't.  Thus, we set
 441
 442      `buffer' to the compiled pattern;
 443      `used' to the length of the compiled pattern;
 444      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 445        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 446        RE_SYNTAX_POSIX_BASIC;
 447      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 448      `fastmap' to an allocated space for the fastmap;
 449      `fastmap_accurate' to zero;
 450      `re_nsub' to the number of subexpressions in PATTERN.
 451
 452    PATTERN is the address of the pattern string.
 453
 454    CFLAGS is a series of bits which affect compilation.
 455
 456      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 457      use POSIX basic syntax.
 458
 459      If REG_NEWLINE is set, then . and [^...] don't match newline.
 460      Also, regexec will try a match beginning after every newline.
 461
 462      If REG_ICASE is set, then we considers upper- and lowercase
 463      versions of letters to be equivalent when matching.
 464
 465      If REG_NOSUB is set, then when PREG is passed to regexec, that
 466      routine will report only success or failure, and nothing about the
 467      registers.
 468
 469    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 470    the return codes and their meanings.)  */
 471
 472 int
 473 regcomp (preg, pattern, cflags)
 474     regex_t *__restrict preg;
 475     const char *__restrict pattern;
 476     int cflags;
 477 {
 478   reg_errcode_t ret;
 479   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 480                          : RE_SYNTAX_POSIX_BASIC);
 481
 482   preg->buffer = NULL;
 483   preg->allocated = 0;
 484   preg->used = 0;
 485
 486   /* Try to allocate space for the fastmap.  */
 487   preg->fastmap = re_malloc (char, SBC_MAX);
 488   if (BE (preg->fastmap == NULL, 0))
 489     return REG_ESPACE;
 490
 491   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 492
 493   /* If REG_NEWLINE is set, newlines are treated differently.  */
 494   if (cflags & REG_NEWLINE)
 495     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 496       syntax &= ~RE_DOT_NEWLINE;
 497       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 498       /* It also changes the matching behavior.  */
 499       preg->newline_anchor = 1;
 500     }
 501   else
 502     preg->newline_anchor = 0;
 503   preg->no_sub = !!(cflags & REG_NOSUB);
 504   preg->translate = NULL;
 505
 506   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 507
 508   /* POSIX doesn't distinguish between an unmatched open-group and an
 509      unmatched close-group: both are REG_EPAREN.  */
 510   if (ret == REG_ERPAREN)
 511     ret = REG_EPAREN;
 512
 513   /* We have already checked preg->fastmap != NULL.  */
 514   if (BE (ret == REG_NOERROR, 1))
 515     /* Compute the fastmap now, since regexec cannot modify the pattern
 516        buffer.  This function never fails in this implementation.  */
 517     (void) re_compile_fastmap (preg);
 518   else
 519     {
 520       /* Some error occurred while compiling the expression.  */
 521       re_free (preg->fastmap);
 522       preg->fastmap = NULL;
 523     }
 524
 525   return (int) ret;
 526 }
 527 #ifdef _LIBC
 528 weak_alias (__regcomp, regcomp)
 529 #endif
 530
 531 /* Returns a message corresponding to an error code, ERRCODE, returned
 532    from either regcomp or regexec.   We don't use PREG here.  */
 533
 534 size_t
 535 regerror (errcode, preg, errbuf, errbuf_size)
 536     int errcode;
 537     const regex_t *__restrict preg;
 538     char *__restrict errbuf;
 539     size_t errbuf_size;
 540 {
 541   const char *msg;
 542   size_t msg_size;
 543
 544   if (BE (errcode < 0
 545           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 546                                / sizeof (__re_error_msgid_idx[0])), 0))
 547     /* Only error codes returned by the rest of the code should be passed
 548        to this routine.  If we are given anything else, or if other regex
 549        code generates an invalid error code, then the program has a bug.
 550        Dump core so we can fix it.  */
 551     abort ();
 552
 553   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 554
 555   msg_size = strlen (msg) + 1; /* Includes the null.  */
 556
 557   if (BE (errbuf_size != 0, 1))
 558     {
 559       if (BE (msg_size > errbuf_size, 0))
 560         {
 561 #if defined HAVE_MEMPCPY || defined _LIBC
 562           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 563 #else
 564           memcpy (errbuf, msg, errbuf_size - 1);
 565           errbuf[errbuf_size - 1] = 0;
 566 #endif
 567         }
 568       else
 569         memcpy (errbuf, msg, msg_size);
 570     }
 571
 572   return msg_size;
 573 }
 574 #ifdef _LIBC
 575 weak_alias (__regerror, regerror)
 576 #endif
 577
 578
 579 #ifdef RE_ENABLE_I18N
 580 /* This static array is used for the map to single-byte characters when
 581    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 582    it the same all the time.  UTF-8 is the preferred encoding so this is
 583    a worthwhile optimization.  */
 584 static const bitset_t utf8_sb_map =
 585 {
 586   /* Set the first 128 bits.  */
 587   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 588 };
 589 #endif
 590
 591
 592 static void
 593 free_dfa_content (re_dfa_t *dfa)
 594 {
 595   int i, j;
 596
 597   if (dfa->nodes)
 598     for (i = 0; i < dfa->nodes_len; ++i)
 599       free_token (dfa->nodes + i);
 600   re_free (dfa->nexts);
 601   for (i = 0; i < dfa->nodes_len; ++i)
 602     {
 603       if (dfa->eclosures != NULL)
 604         re_node_set_free (dfa->eclosures + i);
 605       if (dfa->inveclosures != NULL)
 606         re_node_set_free (dfa->inveclosures + i);
 607       if (dfa->edests != NULL)
 608         re_node_set_free (dfa->edests + i);
 609     }
 610   re_free (dfa->edests);
 611   re_free (dfa->eclosures);
 612   re_free (dfa->inveclosures);
 613   re_free (dfa->nodes);
 614
 615   if (dfa->state_table)
 616     for (i = 0; i <= dfa->state_hash_mask; ++i)
 617       {
 618         struct re_state_table_entry *entry = dfa->state_table + i;
 619         for (j = 0; j < entry->num; ++j)
 620           {
 621             re_dfastate_t *state = entry->array[j];
 622             free_state (state);
 623           }
 624         re_free (entry->array);
 625       }
 626   re_free (dfa->state_table);
 627 #ifdef RE_ENABLE_I18N
 628   if (dfa->sb_char != utf8_sb_map)
 629     re_free (dfa->sb_char);
 630 #endif
 631   re_free (dfa->subexp_map);
 632 #ifdef DEBUG
 633   re_free (dfa->re_str);
 634 #endif
 635
 636   re_free (dfa);
 637 }
 638
 639
 640 /* Free dynamically allocated space used by PREG.  */
 641
 642 void
 643 regfree (preg)
 644     regex_t *preg;
 645 {
 646   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 647   if (BE (dfa != NULL, 1))
 648     free_dfa_content (dfa);
 649   preg->buffer = NULL;
 650   preg->allocated = 0;
 651
 652   re_free (preg->fastmap);
 653   preg->fastmap = NULL;
 654
 655   re_free (preg->translate);
 656   preg->translate = NULL;
 657 }
 658 #ifdef _LIBC
 659 weak_alias (__regfree, regfree)
 660 #endif
 661 \f
 662 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 663    them unless specifically requested.  */
 664
 665 #if defined _REGEX_RE_COMP || defined _LIBC
 666
 667 /* BSD has one and only one pattern buffer.  */
 668 static struct re_pattern_buffer re_comp_buf;
 669
 670 char *
 671 # ifdef _LIBC
 672 /* Make these definitions weak in libc, so POSIX programs can redefine
 673    these names if they don't use our functions, and still use
 674    regcomp/regexec above without link errors.  */
 675 weak_function
 676 # endif
 677 re_comp (s)
 678      const char *s;
 679 {
 680   reg_errcode_t ret;
 681   char *fastmap;
 682
 683   if (!s)
 684     {
 685       if (!re_comp_buf.buffer)
 686         return gettext ("No previous regular expression");
 687       return 0;
 688     }
 689
 690   if (re_comp_buf.buffer)
 691     {
 692       fastmap = re_comp_buf.fastmap;
 693       re_comp_buf.fastmap = NULL;
 694       __regfree (&re_comp_buf);
 695       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 696       re_comp_buf.fastmap = fastmap;
 697     }
 698
 699   if (re_comp_buf.fastmap == NULL)
 700     {
 701       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 702       if (re_comp_buf.fastmap == NULL)
 703         return (char *) gettext (__re_error_msgid
 704                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 705     }
 706
 707   /* Since `re_exec' always passes NULL for the `regs' argument, we
 708      don't need to initialize the pattern buffer fields which affect it.  */
 709
 710   /* Match anchors at newlines.  */
 711   re_comp_buf.newline_anchor = 1;
 712
 713   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 714
 715   if (!ret)
 716     return NULL;
 717
 718   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 719   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 720 }
 721
 722 #ifdef _LIBC
 723 libc_freeres_fn (free_mem)
 724 {
 725   __regfree (&re_comp_buf);
 726 }
 727 #endif
 728
 729 #endif /* _REGEX_RE_COMP */
 730 \f
 731 /* Internal entry point.
 732    Compile the regular expression PATTERN, whose length is LENGTH.
 733    SYNTAX indicate regular expression's syntax.  */
 734
 735 static reg_errcode_t
 736 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 737                      reg_syntax_t syntax)
 738 {
 739   reg_errcode_t err = REG_NOERROR;
 740   re_dfa_t *dfa;
 741   re_string_t regexp;
 742
 743   /* Initialize the pattern buffer.  */
 744   preg->fastmap_accurate = 0;
 745   preg->syntax = syntax;
 746   preg->not_bol = preg->not_eol = 0;
 747   preg->used = 0;
 748   preg->re_nsub = 0;
 749   preg->can_be_null = 0;
 750   preg->regs_allocated = REGS_UNALLOCATED;
 751
 752   /* Initialize the dfa.  */
 753   dfa = (re_dfa_t *) preg->buffer;
 754   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 755     {
 756       /* If zero allocated, but buffer is non-null, try to realloc
 757          enough space.  This loses if buffer's address is bogus, but
 758          that is the user's responsibility.  If ->buffer is NULL this
 759          is a simple allocation.  */
 760       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 761       if (dfa == NULL)
 762         return REG_ESPACE;
 763       preg->allocated = sizeof (re_dfa_t);
 764       preg->buffer = (unsigned char *) dfa;
 765     }
 766   preg->used = sizeof (re_dfa_t);
 767
 768   err = init_dfa (dfa, length);
 769   if (BE (err != REG_NOERROR, 0))
 770     {
 771       free_dfa_content (dfa);
 772       preg->buffer = NULL;
 773       preg->allocated = 0;
 774       return err;
 775     }
 776 #ifdef DEBUG
 777   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 778   dfa->re_str = re_malloc (char, length + 1);
 779   strncpy (dfa->re_str, pattern, length + 1);
 780 #endif
 781
 782   __libc_lock_init (dfa->lock);
 783
 784   err = re_string_construct (&regexp, pattern, length, preg->translate,
 785                              syntax & RE_ICASE, dfa);
 786   if (BE (err != REG_NOERROR, 0))
 787     {
 788     re_compile_internal_free_return:
 789       free_workarea_compile (preg);
 790       re_string_destruct (&regexp);
 791       free_dfa_content (dfa);
 792       preg->buffer = NULL;
 793       preg->allocated = 0;
 794       return err;
 795     }
 796
 797   /* Parse the regular expression, and build a structure tree.  */
 798   preg->re_nsub = 0;
 799   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 800   if (BE (dfa->str_tree == NULL, 0))
 801     goto re_compile_internal_free_return;
 802
 803   /* Analyze the tree and create the nfa.  */
 804   err = analyze (preg);
 805   if (BE (err != REG_NOERROR, 0))
 806     goto re_compile_internal_free_return;
 807
 808 #ifdef RE_ENABLE_I18N
 809   /* If possible, do searching in single byte encoding to speed things up.  */
 810   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 811     optimize_utf8 (dfa);
 812 #endif
 813
 814   /* Then create the initial state of the dfa.  */
 815   err = create_initial_state (dfa);
 816
 817   /* Release work areas.  */
 818   free_workarea_compile (preg);
 819   re_string_destruct (&regexp);
 820
 821   if (BE (err != REG_NOERROR, 0))
 822     {
 823       free_dfa_content (dfa);
 824       preg->buffer = NULL;
 825       preg->allocated = 0;
 826     }
 827
 828   return err;
 829 }
 830
 831 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 832    as the initial length of some arrays.  */
 833
 834 static reg_errcode_t
 835 init_dfa (re_dfa_t *dfa, size_t pat_len)
 836 {
 837   unsigned int table_size;
 838 #ifndef _LIBC
 839   char *codeset_name;
 840 #endif
 841
 842   memset (dfa, '\0', sizeof (re_dfa_t));
 843
 844   /* Force allocation of str_tree_storage the first time.  */
 845   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 846
 847   /* Avoid overflows.  */
 848   if (pat_len == SIZE_MAX)
 849     return REG_ESPACE;
 850
 851   dfa->nodes_alloc = pat_len + 1;
 852   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 853
 854   /*  table_size = 2 ^ ceil(log pat_len) */
 855   for (table_size = 1; ; table_size <<= 1)
 856     if (table_size > pat_len)
 857       break;
 858
 859   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 860   dfa->state_hash_mask = table_size - 1;
 861
 862   dfa->mb_cur_max = MB_CUR_MAX;
 863 #ifdef _LIBC
 864   if (dfa_mb_cur_max (dfa) == 6
 865       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 866     dfa->is_utf8 = 1;
 867 # if __OPTION_EGLIBC_LOCALE_CODE
 868   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 869                        != 0);
 870 # else
 871   dfa->map_notascii = 0;
 872 # endif
 873 #else
 874 # ifdef HAVE_LANGINFO_CODESET
 875   codeset_name = nl_langinfo (CODESET);
 876 # else
 877   codeset_name = getenv ("LC_ALL");
 878   if (codeset_name == NULL || codeset_name[0] == '\0')
 879     codeset_name = getenv ("LC_CTYPE");
 880   if (codeset_name == NULL || codeset_name[0] == '\0')
 881     codeset_name = getenv ("LANG");
 882   if (codeset_name == NULL)
 883     codeset_name = "";
 884   else if (strchr (codeset_name, '.') !=  NULL)
 885     codeset_name = strchr (codeset_name, '.') + 1;
 886 # endif
 887
 888   if (strcasecmp (codeset_name, "UTF-8") == 0
 889       || strcasecmp (codeset_name, "UTF8") == 0)
 890     dfa->is_utf8 = 1;
 891
 892   /* We check exhaustively in the loop below if this charset is a
 893      superset of ASCII.  */
 894   dfa->map_notascii = 0;
 895 #endif
 896
 897 #ifdef RE_ENABLE_I18N
 898   if (dfa_mb_cur_max (dfa) > 1)
 899     {
 900       if (dfa->is_utf8)
 901         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 902       else
 903         {
 904           int i, j, ch;
 905
 906           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 907           if (BE (dfa->sb_char == NULL, 0))
 908             return REG_ESPACE;
 909
 910           /* Set the bits corresponding to single byte chars.  */
 911           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 912             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 913               {
 914                 wint_t wch = __btowc (ch);
 915                 if (wch != WEOF)
 916                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 917 # ifndef _LIBC
 918                 if (isascii (ch) && wch != ch)
 919                   dfa->map_notascii = 1;
 920 # endif
 921               }
 922         }
 923     }
 924 #endif
 925
 926   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 927     return REG_ESPACE;
 928   return REG_NOERROR;
 929 }
 930
 931 /* Initialize WORD_CHAR table, which indicate which character is
 932    "word".  In this case "word" means that it is the word construction
 933    character used by some operators like "\<", "\>", etc.  */
 934
 935 static void
 936 internal_function
 937 init_word_char (re_dfa_t *dfa)
 938 {
 939   int i, j, ch;
 940   dfa->word_ops_used = 1;
 941   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 942     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 943       if (isalnum (ch) || ch == '_')
 944         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 945 }
 946
 947 /* Free the work area which are only used while compiling.  */
 948
 949 static void
 950 free_workarea_compile (regex_t *preg)
 951 {
 952   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 953   bin_tree_storage_t *storage, *next;
 954   for (storage = dfa->str_tree_storage; storage; storage = next)
 955     {
 956       next = storage->next;
 957       re_free (storage);
 958     }
 959   dfa->str_tree_storage = NULL;
 960   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 961   dfa->str_tree = NULL;
 962   re_free (dfa->org_indices);
 963   dfa->org_indices = NULL;
 964 }
 965
 966 /* Create initial states for all contexts.  */
 967
 968 static reg_errcode_t
 969 create_initial_state (re_dfa_t *dfa)
 970 {
 971   int first, i;
 972   reg_errcode_t err;
 973   re_node_set init_nodes;
 974
 975   /* Initial states have the epsilon closure of the node which is
 976      the first node of the regular expression.  */
 977   first = dfa->str_tree->first->node_idx;
 978   dfa->init_node = first;
 979   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 980   if (BE (err != REG_NOERROR, 0))
 981     return err;
 982
 983   /* The back-references which are in initial states can epsilon transit,
 984      since in this case all of the subexpressions can be null.
 985      Then we add epsilon closures of the nodes which are the next nodes of
 986      the back-references.  */
 987   if (dfa->nbackref > 0)
 988     for (i = 0; i < init_nodes.nelem; ++i)
 989       {
 990         int node_idx = init_nodes.elems[i];
 991         re_token_type_t type = dfa->nodes[node_idx].type;
 992
 993         int clexp_idx;
 994         if (type != OP_BACK_REF)
 995           continue;
 996         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 997           {
 998             re_token_t *clexp_node;
 999             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1000             if (clexp_node->type == OP_CLOSE_SUBEXP
1001                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1002               break;
1003           }
1004         if (clexp_idx == init_nodes.nelem)
1005           continue;
1006
1007         if (type == OP_BACK_REF)
1008           {
1009             int dest_idx = dfa->edests[node_idx].elems[0];
1010             if (!re_node_set_contains (&init_nodes, dest_idx))
1011               {
1012                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1013                 i = 0;
1014               }
1015           }
1016       }
1017
1018   /* It must be the first time to invoke acquire_state.  */
1019   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1020   /* We don't check ERR here, since the initial state must not be NULL.  */
1021   if (BE (dfa->init_state == NULL, 0))
1022     return err;
1023   if (dfa->init_state->has_constraint)
1024     {
1025       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1026                                                        CONTEXT_WORD);
1027       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1028                                                      CONTEXT_NEWLINE);
1029       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1030                                                          &init_nodes,
1031                                                          CONTEXT_NEWLINE
1032                                                          | CONTEXT_BEGBUF);
1033       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1034               || dfa->init_state_begbuf == NULL, 0))
1035         return err;
1036     }
1037   else
1038     dfa->init_state_word = dfa->init_state_nl
1039       = dfa->init_state_begbuf = dfa->init_state;
1040
1041   re_node_set_free (&init_nodes);
1042   return REG_NOERROR;
1043 }
1044 \f
1045 #ifdef RE_ENABLE_I18N
1046 /* If it is possible to do searching in single byte encoding instead of UTF-8
1047    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1048    DFA nodes where needed.  */
1049
1050 static void
1051 optimize_utf8 (re_dfa_t *dfa)
1052 {
1053   int node, i, mb_chars = 0, has_period = 0;
1054
1055   for (node = 0; node < dfa->nodes_len; ++node)
1056     switch (dfa->nodes[node].type)
1057       {
1058       case CHARACTER:
1059         if (dfa->nodes[node].opr.c >= 0x80)
1060           mb_chars = 1;
1061         break;
1062       case ANCHOR:
1063         switch (dfa->nodes[node].opr.ctx_type)
1064           {
1065           case LINE_FIRST:
1066           case LINE_LAST:
1067           case BUF_FIRST:
1068           case BUF_LAST:
1069             break;
1070           default:
1071             /* Word anchors etc. cannot be handled.  It's okay to test
1072                opr.ctx_type since constraints (for all DFA nodes) are
1073                created by ORing one or more opr.ctx_type values.  */
1074             return;
1075           }
1076         break;
1077       case OP_PERIOD:
1078         has_period = 1;
1079         break;
1080       case OP_BACK_REF:
1081       case OP_ALT:
1082       case END_OF_RE:
1083       case OP_DUP_ASTERISK:
1084       case OP_OPEN_SUBEXP:
1085       case OP_CLOSE_SUBEXP:
1086         break;
1087       case COMPLEX_BRACKET:
1088         return;
1089       case SIMPLE_BRACKET:
1090         /* Just double check.  The non-ASCII range starts at 0x80.  */
1091         assert (0x80 % BITSET_WORD_BITS == 0);
1092         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1093           if (dfa->nodes[node].opr.sbcset[i])
1094             return;
1095         break;
1096       default:
1097         abort ();
1098       }
1099
1100   if (mb_chars || has_period)
1101     for (node = 0; node < dfa->nodes_len; ++node)
1102       {
1103         if (dfa->nodes[node].type == CHARACTER
1104             && dfa->nodes[node].opr.c >= 0x80)
1105           dfa->nodes[node].mb_partial = 0;
1106         else if (dfa->nodes[node].type == OP_PERIOD)
1107           dfa->nodes[node].type = OP_UTF8_PERIOD;
1108       }
1109
1110   /* The search can be in single byte locale.  */
1111   dfa->mb_cur_max = 1;
1112   dfa->is_utf8 = 0;
1113   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1114 }
1115 #endif
1116 \f
1117 /* Analyze the structure tree, and calculate "first", "next", "edest",
1118    "eclosure", and "inveclosure".  */
1119
1120 static reg_errcode_t
1121 analyze (regex_t *preg)
1122 {
1123   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1124   reg_errcode_t ret;
1125
1126   /* Allocate arrays.  */
1127   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1128   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1129   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1130   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1131   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1132           || dfa->eclosures == NULL, 0))
1133     return REG_ESPACE;
1134
1135   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1136   if (dfa->subexp_map != NULL)
1137     {
1138       int i;
1139       for (i = 0; i < preg->re_nsub; i++)
1140         dfa->subexp_map[i] = i;
1141       preorder (dfa->str_tree, optimize_subexps, dfa);
1142       for (i = 0; i < preg->re_nsub; i++)
1143         if (dfa->subexp_map[i] != i)
1144           break;
1145       if (i == preg->re_nsub)
1146         {
1147           free (dfa->subexp_map);
1148           dfa->subexp_map = NULL;
1149         }
1150     }
1151
1152   ret = postorder (dfa->str_tree, lower_subexps, preg);
1153   if (BE (ret != REG_NOERROR, 0))
1154     return ret;
1155   ret = postorder (dfa->str_tree, calc_first, dfa);
1156   if (BE (ret != REG_NOERROR, 0))
1157     return ret;
1158   preorder (dfa->str_tree, calc_next, dfa);
1159   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1160   if (BE (ret != REG_NOERROR, 0))
1161     return ret;
1162   ret = calc_eclosure (dfa);
1163   if (BE (ret != REG_NOERROR, 0))
1164     return ret;
1165
1166   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1167      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1168   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1169       || dfa->nbackref)
1170     {
1171       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1172       if (BE (dfa->inveclosures == NULL, 0))
1173         return REG_ESPACE;
1174       ret = calc_inveclosure (dfa);
1175     }
1176
1177   return ret;
1178 }
1179
1180 /* Our parse trees are very unbalanced, so we cannot use a stack to
1181    implement parse tree visits.  Instead, we use parent pointers and
1182    some hairy code in these two functions.  */
1183 static reg_errcode_t
1184 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1185            void *extra)
1186 {
1187   bin_tree_t *node, *prev;
1188
1189   for (node = root; ; )
1190     {
1191       /* Descend down the tree, preferably to the left (or to the right
1192          if that's the only child).  */
1193       while (node->left || node->right)
1194         if (node->left)
1195           node = node->left;
1196         else
1197           node = node->right;
1198
1199       do
1200         {
1201           reg_errcode_t err = fn (extra, node);
1202           if (BE (err != REG_NOERROR, 0))
1203             return err;
1204           if (node->parent == NULL)
1205             return REG_NOERROR;
1206           prev = node;
1207           node = node->parent;
1208         }
1209       /* Go up while we have a node that is reached from the right.  */
1210       while (node->right == prev || node->right == NULL);
1211       node = node->right;
1212     }
1213 }
1214
1215 static reg_errcode_t
1216 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1217           void *extra)
1218 {
1219   bin_tree_t *node;
1220
1221   for (node = root; ; )
1222     {
1223       reg_errcode_t err = fn (extra, node);
1224       if (BE (err != REG_NOERROR, 0))
1225         return err;
1226
1227       /* Go to the left node, or up and to the right.  */
1228       if (node->left)
1229         node = node->left;
1230       else
1231         {
1232           bin_tree_t *prev = NULL;
1233           while (node->right == prev || node->right == NULL)
1234             {
1235               prev = node;
1236               node = node->parent;
1237               if (!node)
1238                 return REG_NOERROR;
1239             }
1240           node = node->right;
1241         }
1242     }
1243 }
1244
1245 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1246    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1247    backreferences as well.  Requires a preorder visit.  */
1248 static reg_errcode_t
1249 optimize_subexps (void *extra, bin_tree_t *node)
1250 {
1251   re_dfa_t *dfa = (re_dfa_t *) extra;
1252
1253   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1254     {
1255       int idx = node->token.opr.idx;
1256       node->token.opr.idx = dfa->subexp_map[idx];
1257       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1258     }
1259
1260   else if (node->token.type == SUBEXP
1261            && node->left && node->left->token.type == SUBEXP)
1262     {
1263       int other_idx = node->left->token.opr.idx;
1264
1265       node->left = node->left->left;
1266       if (node->left)
1267         node->left->parent = node;
1268
1269       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1270       if (other_idx < BITSET_WORD_BITS)
1271           dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1272     }
1273
1274   return REG_NOERROR;
1275 }
1276
1277 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1278    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1279 static reg_errcode_t
1280 lower_subexps (void *extra, bin_tree_t *node)
1281 {
1282   regex_t *preg = (regex_t *) extra;
1283   reg_errcode_t err = REG_NOERROR;
1284
1285   if (node->left && node->left->token.type == SUBEXP)
1286     {
1287       node->left = lower_subexp (&err, preg, node->left);
1288       if (node->left)
1289         node->left->parent = node;
1290     }
1291   if (node->right && node->right->token.type == SUBEXP)
1292     {
1293       node->right = lower_subexp (&err, preg, node->right);
1294       if (node->right)
1295         node->right->parent = node;
1296     }
1297
1298   return err;
1299 }
1300
1301 static bin_tree_t *
1302 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1303 {
1304   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1305   bin_tree_t *body = node->left;
1306   bin_tree_t *op, *cls, *tree1, *tree;
1307
1308   if (preg->no_sub
1309       /* We do not optimize empty subexpressions, because otherwise we may
1310          have bad CONCAT nodes with NULL children.  This is obviously not
1311          very common, so we do not lose much.  An example that triggers
1312          this case is the sed "script" /\(\)/x.  */
1313       && node->left != NULL
1314       && (node->token.opr.idx >= BITSET_WORD_BITS
1315           || !(dfa->used_bkref_map
1316                & ((bitset_word_t) 1 << node->token.opr.idx))))
1317     return node->left;
1318
1319   /* Convert the SUBEXP node to the concatenation of an
1320      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1321   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1322   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1323   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1324   tree = create_tree (dfa, op, tree1, CONCAT);
1325   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1326     {
1327       *err = REG_ESPACE;
1328       return NULL;
1329     }
1330
1331   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1332   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1333   return tree;
1334 }
1335
1336 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1337    nodes.  Requires a postorder visit.  */
1338 static reg_errcode_t
1339 calc_first (void *extra, bin_tree_t *node)
1340 {
1341   re_dfa_t *dfa = (re_dfa_t *) extra;
1342   if (node->token.type == CONCAT)
1343     {
1344       node->first = node->left->first;
1345       node->node_idx = node->left->node_idx;
1346     }
1347   else
1348     {
1349       node->first = node;
1350       node->node_idx = re_dfa_add_node (dfa, node->token);
1351       if (BE (node->node_idx == -1, 0))
1352         return REG_ESPACE;
1353       if (node->token.type == ANCHOR)
1354         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1355     }
1356   return REG_NOERROR;
1357 }
1358
1359 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1360 static reg_errcode_t
1361 calc_next (void *extra, bin_tree_t *node)
1362 {
1363   switch (node->token.type)
1364     {
1365     case OP_DUP_ASTERISK:
1366       node->left->next = node;
1367       break;
1368     case CONCAT:
1369       node->left->next = node->right->first;
1370       node->right->next = node->next;
1371       break;
1372     default:
1373       if (node->left)
1374         node->left->next = node->next;
1375       if (node->right)
1376         node->right->next = node->next;
1377       break;
1378     }
1379   return REG_NOERROR;
1380 }
1381
1382 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1383 static reg_errcode_t
1384 link_nfa_nodes (void *extra, bin_tree_t *node)
1385 {
1386   re_dfa_t *dfa = (re_dfa_t *) extra;
1387   int idx = node->node_idx;
1388   reg_errcode_t err = REG_NOERROR;
1389
1390   switch (node->token.type)
1391     {
1392     case CONCAT:
1393       break;
1394
1395     case END_OF_RE:
1396       assert (node->next == NULL);
1397       break;
1398
1399     case OP_DUP_ASTERISK:
1400     case OP_ALT:
1401       {
1402         int left, right;
1403         dfa->has_plural_match = 1;
1404         if (node->left != NULL)
1405           left = node->left->first->node_idx;
1406         else
1407           left = node->next->node_idx;
1408         if (node->right != NULL)
1409           right = node->right->first->node_idx;
1410         else
1411           right = node->next->node_idx;
1412         assert (left > -1);
1413         assert (right > -1);
1414         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1415       }
1416       break;
1417
1418     case ANCHOR:
1419     case OP_OPEN_SUBEXP:
1420     case OP_CLOSE_SUBEXP:
1421       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1422       break;
1423
1424     case OP_BACK_REF:
1425       dfa->nexts[idx] = node->next->node_idx;
1426       if (node->token.type == OP_BACK_REF)
1427         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1428       break;
1429
1430     default:
1431       assert (!IS_EPSILON_NODE (node->token.type));
1432       dfa->nexts[idx] = node->next->node_idx;
1433       break;
1434     }
1435
1436   return err;
1437 }
1438
1439 /* Duplicate the epsilon closure of the node ROOT_NODE.
1440    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1441    to their own constraint.  */
1442
1443 static reg_errcode_t
1444 internal_function
1445 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1446                         int root_node, unsigned int init_constraint)
1447 {
1448   int org_node, clone_node, ret;
1449   unsigned int constraint = init_constraint;
1450   for (org_node = top_org_node, clone_node = top_clone_node;;)
1451     {
1452       int org_dest, clone_dest;
1453       if (dfa->nodes[org_node].type == OP_BACK_REF)
1454         {
1455           /* If the back reference epsilon-transit, its destination must
1456              also have the constraint.  Then duplicate the epsilon closure
1457              of the destination of the back reference, and store it in
1458              edests of the back reference.  */
1459           org_dest = dfa->nexts[org_node];
1460           re_node_set_empty (dfa->edests + clone_node);
1461           clone_dest = duplicate_node (dfa, org_dest, constraint);
1462           if (BE (clone_dest == -1, 0))
1463             return REG_ESPACE;
1464           dfa->nexts[clone_node] = dfa->nexts[org_node];
1465           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1466           if (BE (ret < 0, 0))
1467             return REG_ESPACE;
1468         }
1469       else if (dfa->edests[org_node].nelem == 0)
1470         {
1471           /* In case of the node can't epsilon-transit, don't duplicate the
1472              destination and store the original destination as the
1473              destination of the node.  */
1474           dfa->nexts[clone_node] = dfa->nexts[org_node];
1475           break;
1476         }
1477       else if (dfa->edests[org_node].nelem == 1)
1478         {
1479           /* In case of the node can epsilon-transit, and it has only one
1480              destination.  */
1481           org_dest = dfa->edests[org_node].elems[0];
1482           re_node_set_empty (dfa->edests + clone_node);
1483           /* If the node is root_node itself, it means the epsilon clsoure
1484              has a loop.   Then tie it to the destination of the root_node.  */
1485           if (org_node == root_node && clone_node != org_node)
1486             {
1487               ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1488               if (BE (ret < 0, 0))
1489                 return REG_ESPACE;
1490               break;
1491             }
1492           /* In case of the node has another constraint, add it.  */
1493           constraint |= dfa->nodes[org_node].constraint;
1494           clone_dest = duplicate_node (dfa, org_dest, constraint);
1495           if (BE (clone_dest == -1, 0))
1496             return REG_ESPACE;
1497           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1498           if (BE (ret < 0, 0))
1499             return REG_ESPACE;
1500         }
1501       else /* dfa->edests[org_node].nelem == 2 */
1502         {
1503           /* In case of the node can epsilon-transit, and it has two
1504              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1505           org_dest = dfa->edests[org_node].elems[0];
1506           re_node_set_empty (dfa->edests + clone_node);
1507           /* Search for a duplicated node which satisfies the constraint.  */
1508           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1509           if (clone_dest == -1)
1510             {
1511               /* There is no such duplicated node, create a new one.  */
1512               reg_errcode_t err;
1513               clone_dest = duplicate_node (dfa, org_dest, constraint);
1514               if (BE (clone_dest == -1, 0))
1515                 return REG_ESPACE;
1516               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1517               if (BE (ret < 0, 0))
1518                 return REG_ESPACE;
1519               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1520                                             root_node, constraint);
1521               if (BE (err != REG_NOERROR, 0))
1522                 return err;
1523             }
1524           else
1525             {
1526               /* There is a duplicated node which satisfies the constraint,
1527                  use it to avoid infinite loop.  */
1528               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1529               if (BE (ret < 0, 0))
1530                 return REG_ESPACE;
1531             }
1532
1533           org_dest = dfa->edests[org_node].elems[1];
1534           clone_dest = duplicate_node (dfa, org_dest, constraint);
1535           if (BE (clone_dest == -1, 0))
1536             return REG_ESPACE;
1537           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1538           if (BE (ret < 0, 0))
1539             return REG_ESPACE;
1540         }
1541       org_node = org_dest;
1542       clone_node = clone_dest;
1543     }
1544   return REG_NOERROR;
1545 }
1546
1547 /* Search for a node which is duplicated from the node ORG_NODE, and
1548    satisfies the constraint CONSTRAINT.  */
1549
1550 static int
1551 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1552                         unsigned int constraint)
1553 {
1554   int idx;
1555   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1556     {
1557       if (org_node == dfa->org_indices[idx]
1558           && constraint == dfa->nodes[idx].constraint)
1559         return idx; /* Found.  */
1560     }
1561   return -1; /* Not found.  */
1562 }
1563
1564 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1565    Return the index of the new node, or -1 if insufficient storage is
1566    available.  */
1567
1568 static int
1569 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1570 {
1571   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1572   if (BE (dup_idx != -1, 1))
1573     {
1574       dfa->nodes[dup_idx].constraint = constraint;
1575       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1576       dfa->nodes[dup_idx].duplicated = 1;
1577
1578       /* Store the index of the original node.  */
1579       dfa->org_indices[dup_idx] = org_idx;
1580     }
1581   return dup_idx;
1582 }
1583
1584 static reg_errcode_t
1585 calc_inveclosure (re_dfa_t *dfa)
1586 {
1587   int src, idx, ret;
1588   for (idx = 0; idx < dfa->nodes_len; ++idx)
1589     re_node_set_init_empty (dfa->inveclosures + idx);
1590
1591   for (src = 0; src < dfa->nodes_len; ++src)
1592     {
1593       int *elems = dfa->eclosures[src].elems;
1594       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1595         {
1596           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1597           if (BE (ret == -1, 0))
1598             return REG_ESPACE;
1599         }
1600     }
1601
1602   return REG_NOERROR;
1603 }
1604
1605 /* Calculate "eclosure" for all the node in DFA.  */
1606
1607 static reg_errcode_t
1608 calc_eclosure (re_dfa_t *dfa)
1609 {
1610   int node_idx, incomplete;
1611 #ifdef DEBUG
1612   assert (dfa->nodes_len > 0);
1613 #endif
1614   incomplete = 0;
1615   /* For each nodes, calculate epsilon closure.  */
1616   for (node_idx = 0; ; ++node_idx)
1617     {
1618       reg_errcode_t err;
1619       re_node_set eclosure_elem;
1620       if (node_idx == dfa->nodes_len)
1621         {
1622           if (!incomplete)
1623             break;
1624           incomplete = 0;
1625           node_idx = 0;
1626         }
1627
1628 #ifdef DEBUG
1629       assert (dfa->eclosures[node_idx].nelem != -1);
1630 #endif
1631
1632       /* If we have already calculated, skip it.  */
1633       if (dfa->eclosures[node_idx].nelem != 0)
1634         continue;
1635       /* Calculate epsilon closure of `node_idx'.  */
1636       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1637       if (BE (err != REG_NOERROR, 0))
1638         return err;
1639
1640       if (dfa->eclosures[node_idx].nelem == 0)
1641         {
1642           incomplete = 1;
1643           re_node_set_free (&eclosure_elem);
1644         }
1645     }
1646   return REG_NOERROR;
1647 }
1648
1649 /* Calculate epsilon closure of NODE.  */
1650
1651 static reg_errcode_t
1652 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1653 {
1654   reg_errcode_t err;
1655   int i;
1656   re_node_set eclosure;
1657   int ret;
1658   int incomplete = 0;
1659   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1660   if (BE (err != REG_NOERROR, 0))
1661     return err;
1662
1663   /* This indicates that we are calculating this node now.
1664      We reference this value to avoid infinite loop.  */
1665   dfa->eclosures[node].nelem = -1;
1666
1667   /* If the current node has constraints, duplicate all nodes
1668      since they must inherit the constraints.  */
1669   if (dfa->nodes[node].constraint
1670       && dfa->edests[node].nelem
1671       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1672     {
1673       err = duplicate_node_closure (dfa, node, node, node,
1674                                     dfa->nodes[node].constraint);
1675       if (BE (err != REG_NOERROR, 0))
1676         return err;
1677     }
1678
1679   /* Expand each epsilon destination nodes.  */
1680   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1681     for (i = 0; i < dfa->edests[node].nelem; ++i)
1682       {
1683         re_node_set eclosure_elem;
1684         int edest = dfa->edests[node].elems[i];
1685         /* If calculating the epsilon closure of `edest' is in progress,
1686            return intermediate result.  */
1687         if (dfa->eclosures[edest].nelem == -1)
1688           {
1689             incomplete = 1;
1690             continue;
1691           }
1692         /* If we haven't calculated the epsilon closure of `edest' yet,
1693            calculate now. Otherwise use calculated epsilon closure.  */
1694         if (dfa->eclosures[edest].nelem == 0)
1695           {
1696             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1697             if (BE (err != REG_NOERROR, 0))
1698               return err;
1699           }
1700         else
1701           eclosure_elem = dfa->eclosures[edest];
1702         /* Merge the epsilon closure of `edest'.  */
1703         re_node_set_merge (&eclosure, &eclosure_elem);
1704         /* If the epsilon closure of `edest' is incomplete,
1705            the epsilon closure of this node is also incomplete.  */
1706         if (dfa->eclosures[edest].nelem == 0)
1707           {
1708             incomplete = 1;
1709             re_node_set_free (&eclosure_elem);
1710           }
1711       }
1712
1713   /* An epsilon closure includes itself.  */
1714   ret = re_node_set_insert (&eclosure, node);
1715   if (BE (ret < 0, 0))
1716     return REG_ESPACE;
1717   if (incomplete && !root)
1718     dfa->eclosures[node].nelem = 0;
1719   else
1720     dfa->eclosures[node] = eclosure;
1721   *new_set = eclosure;
1722   return REG_NOERROR;
1723 }
1724 \f
1725 /* Functions for token which are used in the parser.  */
1726
1727 /* Fetch a token from INPUT.
1728    We must not use this function inside bracket expressions.  */
1729
1730 static void
1731 internal_function
1732 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1733 {
1734   re_string_skip_bytes (input, peek_token (result, input, syntax));
1735 }
1736
1737 /* Peek a token from INPUT, and return the length of the token.
1738    We must not use this function inside bracket expressions.  */
1739
1740 static int
1741 internal_function
1742 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1743 {
1744   unsigned char c;
1745
1746   if (re_string_eoi (input))
1747     {
1748       token->type = END_OF_RE;
1749       return 0;
1750     }
1751
1752   c = re_string_peek_byte (input, 0);
1753   token->opr.c = c;
1754
1755   token->word_char = 0;
1756 #ifdef RE_ENABLE_I18N
1757   token->mb_partial = 0;
1758   if (string_mb_cur_max (input) > 1 &&
1759       !re_string_first_byte (input, re_string_cur_idx (input)))
1760     {
1761       token->type = CHARACTER;
1762       token->mb_partial = 1;
1763       return 1;
1764     }
1765 #endif
1766   if (c == '\\')
1767     {
1768       unsigned char c2;
1769       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1770         {
1771           token->type = BACK_SLASH;
1772           return 1;
1773         }
1774
1775       c2 = re_string_peek_byte_case (input, 1);
1776       token->opr.c = c2;
1777       token->type = CHARACTER;
1778 #ifdef RE_ENABLE_I18N
1779       if (string_mb_cur_max (input) > 1)
1780         {
1781           wint_t wc = re_string_wchar_at (input,
1782                                           re_string_cur_idx (input) + 1);
1783           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1784         }
1785       else
1786 #endif
1787         token->word_char = IS_WORD_CHAR (c2) != 0;
1788
1789       switch (c2)
1790         {
1791         case '|':
1792           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1793             token->type = OP_ALT;
1794           break;
1795         case '1': case '2': case '3': case '4': case '5':
1796         case '6': case '7': case '8': case '9':
1797           if (!(syntax & RE_NO_BK_REFS))
1798             {
1799               token->type = OP_BACK_REF;
1800               token->opr.idx = c2 - '1';
1801             }
1802           break;
1803         case '<':
1804           if (!(syntax & RE_NO_GNU_OPS))
1805             {
1806               token->type = ANCHOR;
1807               token->opr.ctx_type = WORD_FIRST;
1808             }
1809           break;
1810         case '>':
1811           if (!(syntax & RE_NO_GNU_OPS))
1812             {
1813               token->type = ANCHOR;
1814               token->opr.ctx_type = WORD_LAST;
1815             }
1816           break;
1817         case 'b':
1818           if (!(syntax & RE_NO_GNU_OPS))
1819             {
1820               token->type = ANCHOR;
1821               token->opr.ctx_type = WORD_DELIM;
1822             }
1823           break;
1824         case 'B':
1825           if (!(syntax & RE_NO_GNU_OPS))
1826             {
1827               token->type = ANCHOR;
1828               token->opr.ctx_type = NOT_WORD_DELIM;
1829             }
1830           break;
1831         case 'w':
1832           if (!(syntax & RE_NO_GNU_OPS))
1833             token->type = OP_WORD;
1834           break;
1835         case 'W':
1836           if (!(syntax & RE_NO_GNU_OPS))
1837             token->type = OP_NOTWORD;
1838           break;
1839         case 's':
1840           if (!(syntax & RE_NO_GNU_OPS))
1841             token->type = OP_SPACE;
1842           break;
1843         case 'S':
1844           if (!(syntax & RE_NO_GNU_OPS))
1845             token->type = OP_NOTSPACE;
1846           break;
1847         case '`':
1848           if (!(syntax & RE_NO_GNU_OPS))
1849             {
1850               token->type = ANCHOR;
1851               token->opr.ctx_type = BUF_FIRST;
1852             }
1853           break;
1854         case '\'':
1855           if (!(syntax & RE_NO_GNU_OPS))
1856             {
1857               token->type = ANCHOR;
1858               token->opr.ctx_type = BUF_LAST;
1859             }
1860           break;
1861         case '(':
1862           if (!(syntax & RE_NO_BK_PARENS))
1863             token->type = OP_OPEN_SUBEXP;
1864           break;
1865         case ')':
1866           if (!(syntax & RE_NO_BK_PARENS))
1867             token->type = OP_CLOSE_SUBEXP;
1868           break;
1869         case '+':
1870           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1871             token->type = OP_DUP_PLUS;
1872           break;
1873         case '?':
1874           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1875             token->type = OP_DUP_QUESTION;
1876           break;
1877         case '{':
1878           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1879             token->type = OP_OPEN_DUP_NUM;
1880           break;
1881         case '}':
1882           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1883             token->type = OP_CLOSE_DUP_NUM;
1884           break;
1885         default:
1886           break;
1887         }
1888       return 2;
1889     }
1890
1891   token->type = CHARACTER;
1892 #ifdef RE_ENABLE_I18N
1893   if (string_mb_cur_max (input) > 1)
1894     {
1895       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1896       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1897     }
1898   else
1899 #endif
1900     token->word_char = IS_WORD_CHAR (token->opr.c);
1901
1902   switch (c)
1903     {
1904     case '\n':
1905       if (syntax & RE_NEWLINE_ALT)
1906         token->type = OP_ALT;
1907       break;
1908     case '|':
1909       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1910         token->type = OP_ALT;
1911       break;
1912     case '*':
1913       token->type = OP_DUP_ASTERISK;
1914       break;
1915     case '+':
1916       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1917         token->type = OP_DUP_PLUS;
1918       break;
1919     case '?':
1920       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1921         token->type = OP_DUP_QUESTION;
1922       break;
1923     case '{':
1924       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1925         token->type = OP_OPEN_DUP_NUM;
1926       break;
1927     case '}':
1928       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1929         token->type = OP_CLOSE_DUP_NUM;
1930       break;
1931     case '(':
1932       if (syntax & RE_NO_BK_PARENS)
1933         token->type = OP_OPEN_SUBEXP;
1934       break;
1935     case ')':
1936       if (syntax & RE_NO_BK_PARENS)
1937         token->type = OP_CLOSE_SUBEXP;
1938       break;
1939     case '[':
1940       token->type = OP_OPEN_BRACKET;
1941       break;
1942     case '.':
1943       token->type = OP_PERIOD;
1944       break;
1945     case '^':
1946       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1947           re_string_cur_idx (input) != 0)
1948         {
1949           char prev = re_string_peek_byte (input, -1);
1950           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1951             break;
1952         }
1953       token->type = ANCHOR;
1954       token->opr.ctx_type = LINE_FIRST;
1955       break;
1956     case '$':
1957       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1958           re_string_cur_idx (input) + 1 != re_string_length (input))
1959         {
1960           re_token_t next;
1961           re_string_skip_bytes (input, 1);
1962           peek_token (&next, input, syntax);
1963           re_string_skip_bytes (input, -1);
1964           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1965             break;
1966         }
1967       token->type = ANCHOR;
1968       token->opr.ctx_type = LINE_LAST;
1969       break;
1970     default:
1971       break;
1972     }
1973   return 1;
1974 }
1975
1976 /* Peek a token from INPUT, and return the length of the token.
1977    We must not use this function out of bracket expressions.  */
1978
1979 static int
1980 internal_function
1981 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1982 {
1983   unsigned char c;
1984   if (re_string_eoi (input))
1985     {
1986       token->type = END_OF_RE;
1987       return 0;
1988     }
1989   c = re_string_peek_byte (input, 0);
1990   token->opr.c = c;
1991
1992 #ifdef RE_ENABLE_I18N
1993   if (string_mb_cur_max (input) > 1 &&
1994       !re_string_first_byte (input, re_string_cur_idx (input)))
1995     {
1996       token->type = CHARACTER;
1997       return 1;
1998     }
1999 #endif /* RE_ENABLE_I18N */
2000
2001   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2002       && re_string_cur_idx (input) + 1 < re_string_length (input))
2003     {
2004       /* In this case, '\' escape a character.  */
2005       unsigned char c2;
2006       re_string_skip_bytes (input, 1);
2007       c2 = re_string_peek_byte (input, 0);
2008       token->opr.c = c2;
2009       token->type = CHARACTER;
2010       return 1;
2011     }
2012   if (c == '[') /* '[' is a special char in a bracket exps.  */
2013     {
2014       unsigned char c2;
2015       int token_len;
2016       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2017         c2 = re_string_peek_byte (input, 1);
2018       else
2019         c2 = 0;
2020       token->opr.c = c2;
2021       token_len = 2;
2022       switch (c2)
2023         {
2024         case '.':
2025           token->type = OP_OPEN_COLL_ELEM;
2026           break;
2027         case '=':
2028           token->type = OP_OPEN_EQUIV_CLASS;
2029           break;
2030         case ':':
2031           if (syntax & RE_CHAR_CLASSES)
2032             {
2033               token->type = OP_OPEN_CHAR_CLASS;
2034               break;
2035             }
2036           /* else fall through.  */
2037         default:
2038           token->type = CHARACTER;
2039           token->opr.c = c;
2040           token_len = 1;
2041           break;
2042         }
2043       return token_len;
2044     }
2045   switch (c)
2046     {
2047     case '-':
2048       token->type = OP_CHARSET_RANGE;
2049       break;
2050     case ']':
2051       token->type = OP_CLOSE_BRACKET;
2052       break;
2053     case '^':
2054       token->type = OP_NON_MATCH_LIST;
2055       break;
2056     default:
2057       token->type = CHARACTER;
2058     }
2059   return 1;
2060 }
2061 \f
2062 /* Functions for parser.  */
2063
2064 /* Entry point of the parser.
2065    Parse the regular expression REGEXP and return the structure tree.
2066    If an error is occured, ERR is set by error code, and return NULL.
2067    This function build the following tree, from regular expression <reg_exp>:
2068            CAT
2069            / \
2070           /   \
2071    <reg_exp>  EOR
2072
2073    CAT means concatenation.
2074    EOR means end of regular expression.  */
2075
2076 static bin_tree_t *
2077 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2078        reg_errcode_t *err)
2079 {
2080   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2081   bin_tree_t *tree, *eor, *root;
2082   re_token_t current_token;
2083   dfa->syntax = syntax;
2084   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2085   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2086   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2087     return NULL;
2088   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2089   if (tree != NULL)
2090     root = create_tree (dfa, tree, eor, CONCAT);
2091   else
2092     root = eor;
2093   if (BE (eor == NULL || root == NULL, 0))
2094     {
2095       *err = REG_ESPACE;
2096       return NULL;
2097     }
2098   return root;
2099 }
2100
2101 /* This function build the following tree, from regular expression
2102    <branch1>|<branch2>:
2103            ALT
2104            / \
2105           /   \
2106    <branch1> <branch2>
2107
2108    ALT means alternative, which represents the operator `|'.  */
2109
2110 static bin_tree_t *
2111 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2112                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2113 {
2114   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2115   bin_tree_t *tree, *branch = NULL;
2116   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2117   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2118     return NULL;
2119
2120   while (token->type == OP_ALT)
2121     {
2122       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2123       if (token->type != OP_ALT && token->type != END_OF_RE
2124           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2125         {
2126           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2127           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2128             return NULL;
2129         }
2130       else
2131         branch = NULL;
2132       tree = create_tree (dfa, tree, branch, OP_ALT);
2133       if (BE (tree == NULL, 0))
2134         {
2135           *err = REG_ESPACE;
2136           return NULL;
2137         }
2138     }
2139   return tree;
2140 }
2141
2142 /* This function build the following tree, from regular expression
2143    <exp1><exp2>:
2144         CAT
2145         / \
2146        /   \
2147    <exp1> <exp2>
2148
2149    CAT means concatenation.  */
2150
2151 static bin_tree_t *
2152 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2153               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2154 {
2155   bin_tree_t *tree, *exp;
2156   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2157   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2158   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2159     return NULL;
2160
2161   while (token->type != OP_ALT && token->type != END_OF_RE
2162          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2163     {
2164       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2165       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2166         {
2167           if (tree != NULL)
2168             postorder (tree, free_tree, NULL);
2169           return NULL;
2170         }
2171       if (tree != NULL && exp != NULL)
2172         {
2173           bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2174           if (newtree == NULL)
2175             {
2176               postorder (exp, free_tree, NULL);
2177               postorder (tree, free_tree, NULL);
2178               *err = REG_ESPACE;
2179               return NULL;
2180             }
2181           tree = newtree;
2182         }
2183       else if (tree == NULL)
2184         tree = exp;
2185       /* Otherwise exp == NULL, we don't need to create new tree.  */
2186     }
2187   return tree;
2188 }
2189
2190 /* This function build the following tree, from regular expression a*:
2191          *
2192          |
2193          a
2194 */
2195
2196 static bin_tree_t *
2197 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2198                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2199 {
2200   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2201   bin_tree_t *tree;
2202   switch (token->type)
2203     {
2204     case CHARACTER:
2205       tree = create_token_tree (dfa, NULL, NULL, token);
2206       if (BE (tree == NULL, 0))
2207         {
2208           *err = REG_ESPACE;
2209           return NULL;
2210         }
2211 #ifdef RE_ENABLE_I18N
2212       if (dfa_mb_cur_max (dfa) > 1)
2213         {
2214           while (!re_string_eoi (regexp)
2215                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2216             {
2217               bin_tree_t *mbc_remain;
2218               fetch_token (token, regexp, syntax);
2219               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2220               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2221               if (BE (mbc_remain == NULL || tree == NULL, 0))
2222                 {
2223                   *err = REG_ESPACE;
2224                   return NULL;
2225                 }
2226             }
2227         }
2228 #endif
2229       break;
2230     case OP_OPEN_SUBEXP:
2231       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2232       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2233         return NULL;
2234       break;
2235     case OP_OPEN_BRACKET:
2236       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2237       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2238         return NULL;
2239       break;
2240     case OP_BACK_REF:
2241       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2242         {
2243           *err = REG_ESUBREG;
2244           return NULL;
2245         }
2246       dfa->used_bkref_map |= 1 << token->opr.idx;
2247       tree = create_token_tree (dfa, NULL, NULL, token);
2248       if (BE (tree == NULL, 0))
2249         {
2250           *err = REG_ESPACE;
2251           return NULL;
2252         }
2253       ++dfa->nbackref;
2254       dfa->has_mb_node = 1;
2255       break;
2256     case OP_OPEN_DUP_NUM:
2257       if (syntax & RE_CONTEXT_INVALID_DUP)
2258         {
2259           *err = REG_BADRPT;
2260           return NULL;
2261         }
2262       /* FALLTHROUGH */
2263     case OP_DUP_ASTERISK:
2264     case OP_DUP_PLUS:
2265     case OP_DUP_QUESTION:
2266       if (syntax & RE_CONTEXT_INVALID_OPS)
2267         {
2268           *err = REG_BADRPT;
2269           return NULL;
2270         }
2271       else if (syntax & RE_CONTEXT_INDEP_OPS)
2272         {
2273           fetch_token (token, regexp, syntax);
2274           return parse_expression (regexp, preg, token, syntax, nest, err);
2275         }
2276       /* else fall through  */
2277     case OP_CLOSE_SUBEXP:
2278       if ((token->type == OP_CLOSE_SUBEXP) &&
2279           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2280         {
2281           *err = REG_ERPAREN;
2282           return NULL;
2283         }
2284       /* else fall through  */
2285     case OP_CLOSE_DUP_NUM:
2286       /* We treat it as a normal character.  */
2287
2288       /* Then we can these characters as normal characters.  */
2289       token->type = CHARACTER;
2290       /* mb_partial and word_char bits should be initialized already
2291          by peek_token.  */
2292       tree = create_token_tree (dfa, NULL, NULL, token);
2293       if (BE (tree == NULL, 0))
2294         {
2295           *err = REG_ESPACE;
2296           return NULL;
2297         }
2298       break;
2299     case ANCHOR:
2300       if ((token->opr.ctx_type
2301            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2302           && dfa->word_ops_used == 0)
2303         init_word_char (dfa);
2304       if (token->opr.ctx_type == WORD_DELIM
2305           || token->opr.ctx_type == NOT_WORD_DELIM)
2306         {
2307           bin_tree_t *tree_first, *tree_last;
2308           if (token->opr.ctx_type == WORD_DELIM)
2309             {
2310               token->opr.ctx_type = WORD_FIRST;
2311               tree_first = create_token_tree (dfa, NULL, NULL, token);
2312               token->opr.ctx_type = WORD_LAST;
2313             }
2314           else
2315             {
2316               token->opr.ctx_type = INSIDE_WORD;
2317               tree_first = create_token_tree (dfa, NULL, NULL, token);
2318               token->opr.ctx_type = INSIDE_NOTWORD;
2319             }
2320           tree_last = create_token_tree (dfa, NULL, NULL, token);
2321           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2322           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2323             {
2324               *err = REG_ESPACE;
2325               return NULL;
2326             }
2327         }
2328       else
2329         {
2330           tree = create_token_tree (dfa, NULL, NULL, token);
2331           if (BE (tree == NULL, 0))
2332             {
2333               *err = REG_ESPACE;
2334               return NULL;
2335             }
2336         }
2337       /* We must return here, since ANCHORs can't be followed
2338          by repetition operators.
2339          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2340              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2341       fetch_token (token, regexp, syntax);
2342       return tree;
2343     case OP_PERIOD:
2344       tree = create_token_tree (dfa, NULL, NULL, token);
2345       if (BE (tree == NULL, 0))
2346         {
2347           *err = REG_ESPACE;
2348           return NULL;
2349         }
2350       if (dfa_mb_cur_max (dfa) > 1)
2351         dfa->has_mb_node = 1;
2352       break;
2353     case OP_WORD:
2354     case OP_NOTWORD:
2355       tree = build_charclass_op (dfa, regexp->trans,
2356                                  (const unsigned char *) "alnum",
2357                                  (const unsigned char *) "_",
2358                                  token->type == OP_NOTWORD, err);
2359       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2360         return NULL;
2361       break;
2362     case OP_SPACE:
2363     case OP_NOTSPACE:
2364       tree = build_charclass_op (dfa, regexp->trans,
2365                                  (const unsigned char *) "space",
2366                                  (const unsigned char *) "",
2367                                  token->type == OP_NOTSPACE, err);
2368       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2369         return NULL;
2370       break;
2371     case OP_ALT:
2372     case END_OF_RE:
2373       return NULL;
2374     case BACK_SLASH:
2375       *err = REG_EESCAPE;
2376       return NULL;
2377     default:
2378       /* Must not happen?  */
2379 #ifdef DEBUG
2380       assert (0);
2381 #endif
2382       return NULL;
2383     }
2384   fetch_token (token, regexp, syntax);
2385
2386   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2387          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2388     {
2389       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2390       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2391         return NULL;
2392       /* In BRE consecutive duplications are not allowed.  */
2393       if ((syntax & RE_CONTEXT_INVALID_DUP)
2394           && (token->type == OP_DUP_ASTERISK
2395               || token->type == OP_OPEN_DUP_NUM))
2396         {
2397           *err = REG_BADRPT;
2398           return NULL;
2399         }
2400     }
2401
2402   return tree;
2403 }
2404
2405 /* This function build the following tree, from regular expression
2406    (<reg_exp>):
2407          SUBEXP
2408             |
2409         <reg_exp>
2410 */
2411
2412 static bin_tree_t *
2413 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2414                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2415 {
2416   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2417   bin_tree_t *tree;
2418   size_t cur_nsub;
2419   cur_nsub = preg->re_nsub++;
2420
2421   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2422
2423   /* The subexpression may be a null string.  */
2424   if (token->type == OP_CLOSE_SUBEXP)
2425     tree = NULL;
2426   else
2427     {
2428       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2429       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2430         {
2431           if (tree != NULL)
2432             postorder (tree, free_tree, NULL);
2433           *err = REG_EPAREN;
2434         }
2435       if (BE (*err != REG_NOERROR, 0))
2436         return NULL;
2437     }
2438
2439   if (cur_nsub <= '9' - '1')
2440     dfa->completed_bkref_map |= 1 << cur_nsub;
2441
2442   tree = create_tree (dfa, tree, NULL, SUBEXP);
2443   if (BE (tree == NULL, 0))
2444     {
2445       *err = REG_ESPACE;
2446       return NULL;
2447     }
2448   tree->token.opr.idx = cur_nsub;
2449   return tree;
2450 }
2451
2452 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2453
2454 static bin_tree_t *
2455 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2456               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2457 {
2458   bin_tree_t *tree = NULL, *old_tree = NULL;
2459   int i, start, end, start_idx = re_string_cur_idx (regexp);
2460   re_token_t start_token = *token;
2461
2462   if (token->type == OP_OPEN_DUP_NUM)
2463     {
2464       end = 0;
2465       start = fetch_number (regexp, token, syntax);
2466       if (start == -1)
2467         {
2468           if (token->type == CHARACTER && token->opr.c == ',')
2469             start = 0; /* We treat "{,m}" as "{0,m}".  */
2470           else
2471             {
2472               *err = REG_BADBR; /* <re>{} is invalid.  */
2473               return NULL;
2474             }
2475         }
2476       if (BE (start != -2, 1))
2477         {
2478           /* We treat "{n}" as "{n,n}".  */
2479           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2480                  : ((token->type == CHARACTER && token->opr.c == ',')
2481                     ? fetch_number (regexp, token, syntax) : -2));
2482         }
2483       if (BE (start == -2 || end == -2, 0))
2484         {
2485           /* Invalid sequence.  */
2486           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2487             {
2488               if (token->type == END_OF_RE)
2489                 *err = REG_EBRACE;
2490               else
2491                 *err = REG_BADBR;
2492
2493               return NULL;
2494             }
2495
2496           /* If the syntax bit is set, rollback.  */
2497           re_string_set_index (regexp, start_idx);
2498           *token = start_token;
2499           token->type = CHARACTER;
2500           /* mb_partial and word_char bits should be already initialized by
2501              peek_token.  */
2502           return elem;
2503         }
2504
2505       if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2506         {
2507           /* First number greater than second.  */
2508           *err = REG_BADBR;
2509           return NULL;
2510         }
2511     }
2512   else
2513     {
2514       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2515       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2516     }
2517
2518   fetch_token (token, regexp, syntax);
2519
2520   if (BE (elem == NULL, 0))
2521     return NULL;
2522   if (BE (start == 0 && end == 0, 0))
2523     {
2524       postorder (elem, free_tree, NULL);
2525       return NULL;
2526     }
2527
2528   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2529   if (BE (start > 0, 0))
2530     {
2531       tree = elem;
2532       for (i = 2; i <= start; ++i)
2533         {
2534           elem = duplicate_tree (elem, dfa);
2535           tree = create_tree (dfa, tree, elem, CONCAT);
2536           if (BE (elem == NULL || tree == NULL, 0))
2537             goto parse_dup_op_espace;
2538         }
2539
2540       if (start == end)
2541         return tree;
2542
2543       /* Duplicate ELEM before it is marked optional.  */
2544       elem = duplicate_tree (elem, dfa);
2545       old_tree = tree;
2546     }
2547   else
2548     old_tree = NULL;
2549
2550   if (elem->token.type == SUBEXP)
2551     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2552
2553   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2554   if (BE (tree == NULL, 0))
2555     goto parse_dup_op_espace;
2556
2557   /* This loop is actually executed only when end != -1,
2558      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2559      already created the start+1-th copy.  */
2560   for (i = start + 2; i <= end; ++i)
2561     {
2562       elem = duplicate_tree (elem, dfa);
2563       tree = create_tree (dfa, tree, elem, CONCAT);
2564       if (BE (elem == NULL || tree == NULL, 0))
2565         goto parse_dup_op_espace;
2566
2567       tree = create_tree (dfa, tree, NULL, OP_ALT);
2568       if (BE (tree == NULL, 0))
2569         goto parse_dup_op_espace;
2570     }
2571
2572   if (old_tree)
2573     tree = create_tree (dfa, old_tree, tree, CONCAT);
2574
2575   return tree;
2576
2577  parse_dup_op_espace:
2578   *err = REG_ESPACE;
2579   return NULL;
2580 }
2581
2582 /* Size of the names for collating symbol/equivalence_class/character_class.
2583    I'm not sure, but maybe enough.  */
2584 #define BRACKET_NAME_BUF_SIZE 32
2585
2586 #ifndef _LIBC
2587   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2588      Build the range expression which starts from START_ELEM, and ends
2589      at END_ELEM.  The result are written to MBCSET and SBCSET.
2590      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2591      mbcset->range_ends, is a pointer argument sinse we may
2592      update it.  */
2593
2594 static reg_errcode_t
2595 internal_function
2596 # ifdef RE_ENABLE_I18N
2597 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2598                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2599 # else /* not RE_ENABLE_I18N */
2600 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2601                  bracket_elem_t *end_elem)
2602 # endif /* not RE_ENABLE_I18N */
2603 {
2604   unsigned int start_ch, end_ch;
2605   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2606   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2607           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2608           0))
2609     return REG_ERANGE;
2610
2611   /* We can handle no multi character collating elements without libc
2612      support.  */
2613   if (BE ((start_elem->type == COLL_SYM
2614            && strlen ((char *) start_elem->opr.name) > 1)
2615           || (end_elem->type == COLL_SYM
2616               && strlen ((char *) end_elem->opr.name) > 1), 0))
2617     return REG_ECOLLATE;
2618
2619 # ifdef RE_ENABLE_I18N
2620   {
2621     wchar_t wc;
2622     wint_t start_wc;
2623     wint_t end_wc;
2624     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2625
2626     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2627                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2628                    : 0));
2629     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2630               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2631                  : 0));
2632     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2633                 ? __btowc (start_ch) : start_elem->opr.wch);
2634     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2635               ? __btowc (end_ch) : end_elem->opr.wch);
2636     if (start_wc == WEOF || end_wc == WEOF)
2637       return REG_ECOLLATE;
2638     cmp_buf[0] = start_wc;
2639     cmp_buf[4] = end_wc;
2640     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2641       return REG_ERANGE;
2642
2643     /* Got valid collation sequence values, add them as a new entry.
2644        However, for !_LIBC we have no collation elements: if the
2645        character set is single byte, the single byte character set
2646        that we build below suffices.  parse_bracket_exp passes
2647        no MBCSET if dfa_mb_cur_max (dfa) == 1.  */
2648     if (mbcset)
2649       {
2650         /* Check the space of the arrays.  */
2651         if (BE (*range_alloc == mbcset->nranges, 0))
2652           {
2653             /* There is not enough space, need realloc.  */
2654             wchar_t *new_array_start, *new_array_end;
2655             int new_nranges;
2656
2657             /* +1 in case of mbcset->nranges is 0.  */
2658             new_nranges = 2 * mbcset->nranges + 1;
2659             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2660                are NULL if *range_alloc == 0.  */
2661             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2662                                           new_nranges);
2663             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2664                                         new_nranges);
2665
2666             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2667               return REG_ESPACE;
2668
2669             mbcset->range_starts = new_array_start;
2670             mbcset->range_ends = new_array_end;
2671             *range_alloc = new_nranges;
2672           }
2673
2674         mbcset->range_starts[mbcset->nranges] = start_wc;
2675         mbcset->range_ends[mbcset->nranges++] = end_wc;
2676       }
2677
2678     /* Build the table for single byte characters.  */
2679     for (wc = 0; wc < SBC_MAX; ++wc)
2680       {
2681         cmp_buf[2] = wc;
2682         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2683             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2684           bitset_set (sbcset, wc);
2685       }
2686   }
2687 # else /* not RE_ENABLE_I18N */
2688   {
2689     unsigned int ch;
2690     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2691                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2692                    : 0));
2693     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2694               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2695                  : 0));
2696     if (start_ch > end_ch)
2697       return REG_ERANGE;
2698     /* Build the table for single byte characters.  */
2699     for (ch = 0; ch < SBC_MAX; ++ch)
2700       if (start_ch <= ch  && ch <= end_ch)
2701         bitset_set (sbcset, ch);
2702   }
2703 # endif /* not RE_ENABLE_I18N */
2704   return REG_NOERROR;
2705 }
2706 #endif /* not _LIBC */
2707
2708 #ifndef _LIBC
2709 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2710    Build the collating element which is represented by NAME.
2711    The result are written to MBCSET and SBCSET.
2712    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2713    pointer argument since we may update it.  */
2714
2715 static reg_errcode_t
2716 internal_function
2717 # ifdef RE_ENABLE_I18N
2718 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2719                         int *coll_sym_alloc, const unsigned char *name)
2720 # else /* not RE_ENABLE_I18N */
2721 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2722 # endif /* not RE_ENABLE_I18N */
2723 {
2724   size_t name_len = strlen ((const char *) name);
2725   if (BE (name_len != 1, 0))
2726     return REG_ECOLLATE;
2727   else
2728     {
2729       bitset_set (sbcset, name[0]);
2730       return REG_NOERROR;
2731     }
2732 }
2733 #endif /* not _LIBC */
2734
2735 /* This function parse bracket expression like "[abc]", "[a-c]",
2736    "[[.a-a.]]" etc.  */
2737
2738 static bin_tree_t *
2739 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2740                    reg_syntax_t syntax, reg_errcode_t *err)
2741 {
2742 #ifdef _LIBC
2743 #if __OPTION_EGLIBC_LOCALE_CODE
2744   const unsigned char *collseqmb;
2745 # define COLLSEQMB_LOOKUP(ix) (collseqmb[(ix)])
2746 #else
2747 # define COLLSEQMB_LOOKUP(ix) (ix)
2748 #endif
2749
2750   const char *collseqwc;
2751   uint32_t nrules;
2752   int32_t table_size;
2753   const int32_t *symb_table;
2754   const unsigned char *extra;
2755
2756   /* Local function for parse_bracket_exp used in _LIBC environement.
2757      Seek the collating symbol entry correspondings to NAME.
2758      Return the index of the symbol in the SYMB_TABLE.  */
2759
2760   auto inline int32_t
2761   __attribute ((always_inline))
2762   seek_collating_symbol_entry (name, name_len)
2763          const unsigned char *name;
2764          size_t name_len;
2765     {
2766       int32_t hash = elem_hash ((const char *) name, name_len);
2767       int32_t elem = hash % table_size;
2768       if (symb_table[2 * elem] != 0)
2769         {
2770           int32_t second = hash % (table_size - 2) + 1;
2771
2772           do
2773             {
2774               /* First compare the hashing value.  */
2775               if (symb_table[2 * elem] == hash
2776                   /* Compare the length of the name.  */
2777                   && name_len == extra[symb_table[2 * elem + 1]]
2778                   /* Compare the name.  */
2779                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2780                              name_len) == 0)
2781                 {
2782                   /* Yep, this is the entry.  */
2783                   break;
2784                 }
2785
2786               /* Next entry.  */
2787               elem += second;
2788             }
2789           while (symb_table[2 * elem] != 0);
2790         }
2791       return elem;
2792     }
2793
2794   /* Local function for parse_bracket_exp used in _LIBC environment.
2795      Look up the collation sequence value of BR_ELEM.
2796      Return the value if succeeded, UINT_MAX otherwise.  */
2797
2798   auto inline unsigned int
2799   __attribute ((always_inline))
2800   lookup_collation_sequence_value (br_elem)
2801          bracket_elem_t *br_elem;
2802     {
2803       if (br_elem->type == SB_CHAR)
2804         {
2805           /*
2806           if (MB_CUR_MAX == 1)
2807           */
2808           if (nrules == 0)
2809             return COLLSEQMB_LOOKUP (br_elem->opr.ch);
2810           else
2811             {
2812               wint_t wc = __btowc (br_elem->opr.ch);
2813               return __collseq_table_lookup (collseqwc, wc);
2814             }
2815         }
2816 #if __OPTION_EGLIBC_LOCALE_CODE
2817       else if (br_elem->type == MB_CHAR)
2818         {
2819           if (nrules != 0)
2820             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2821         }
2822 #endif
2823       else if (br_elem->type == COLL_SYM)
2824         {
2825           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2826           if (nrules != 0)
2827             {
2828               int32_t elem, idx;
2829               elem = seek_collating_symbol_entry (br_elem->opr.name,
2830                                                   sym_name_len);
2831               if (symb_table[2 * elem] != 0)
2832                 {
2833                   /* We found the entry.  */
2834                   idx = symb_table[2 * elem + 1];
2835                   /* Skip the name of collating element name.  */
2836                   idx += 1 + extra[idx];
2837                   /* Skip the byte sequence of the collating element.  */
2838                   idx += 1 + extra[idx];
2839                   /* Adjust for the alignment.  */
2840                   idx = (idx + 3) & ~3;
2841                   /* Skip the multibyte collation sequence value.  */
2842                   idx += sizeof (unsigned int);
2843                   /* Skip the wide char sequence of the collating element.  */
2844                   idx += sizeof (unsigned int) *
2845                     (1 + *(unsigned int *) (extra + idx));
2846                   /* Return the collation sequence value.  */
2847                   return *(unsigned int *) (extra + idx);
2848                 }
2849               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2850                 {
2851                   /* No valid character.  Match it as a single byte
2852                      character.  */
2853                   return COLLSEQMB_LOOKUP (br_elem->opr.name[0]);
2854                 }
2855             }
2856           else if (sym_name_len == 1)
2857             return COLLSEQMB_LOOKUP (br_elem->opr.name[0]);
2858         }
2859       return UINT_MAX;
2860     }
2861
2862   /* Local function for parse_bracket_exp used in _LIBC environement.
2863      Build the range expression which starts from START_ELEM, and ends
2864      at END_ELEM.  The result are written to MBCSET and SBCSET.
2865      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2866      mbcset->range_ends, is a pointer argument sinse we may
2867      update it.  */
2868
2869   auto inline reg_errcode_t
2870   __attribute ((always_inline))
2871   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2872          re_charset_t *mbcset;
2873          int *range_alloc;
2874          bitset_t sbcset;
2875          bracket_elem_t *start_elem, *end_elem;
2876     {
2877       unsigned int ch;
2878       uint32_t start_collseq;
2879       uint32_t end_collseq;
2880
2881       /* Equivalence Classes and Character Classes can't be a range
2882          start/end.  */
2883       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2884               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2885               0))
2886         return REG_ERANGE;
2887
2888       start_collseq = lookup_collation_sequence_value (start_elem);
2889       end_collseq = lookup_collation_sequence_value (end_elem);
2890       /* Check start/end collation sequence values.  */
2891       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2892         return REG_ECOLLATE;
2893       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2894         return REG_ERANGE;
2895
2896       /* Got valid collation sequence values, add them as a new entry.
2897          However, if we have no collation elements, and the character set
2898          is single byte, the single byte character set that we
2899          build below suffices. */
2900       if (nrules > 0 || dfa_mb_cur_max (dfa) > 1)
2901         {
2902           /* Check the space of the arrays.  */
2903           if (BE (*range_alloc == mbcset->nranges, 0))
2904             {
2905               /* There is not enough space, need realloc.  */
2906               uint32_t *new_array_start;
2907               uint32_t *new_array_end;
2908               int new_nranges;
2909
2910               /* +1 in case of mbcset->nranges is 0.  */
2911               new_nranges = 2 * mbcset->nranges + 1;
2912               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2913                                             new_nranges);
2914               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2915                                           new_nranges);
2916
2917               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2918                 return REG_ESPACE;
2919
2920               mbcset->range_starts = new_array_start;
2921               mbcset->range_ends = new_array_end;
2922               *range_alloc = new_nranges;
2923             }
2924
2925           mbcset->range_starts[mbcset->nranges] = start_collseq;
2926           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2927         }
2928
2929       /* Build the table for single byte characters.  */
2930       for (ch = 0; ch < SBC_MAX; ch++)
2931         {
2932           uint32_t ch_collseq;
2933           /*
2934           if (MB_CUR_MAX == 1)
2935           */
2936           if (nrules == 0)
2937             ch_collseq = COLLSEQMB_LOOKUP (ch);
2938           else
2939             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2940           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2941             bitset_set (sbcset, ch);
2942         }
2943       return REG_NOERROR;
2944     }
2945
2946   /* Local function for parse_bracket_exp used in _LIBC environement.
2947      Build the collating element which is represented by NAME.
2948      The result are written to MBCSET and SBCSET.
2949      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2950      pointer argument sinse we may update it.  */
2951
2952   auto inline reg_errcode_t
2953   __attribute ((always_inline))
2954   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2955          re_charset_t *mbcset;
2956          int *coll_sym_alloc;
2957          bitset_t sbcset;
2958          const unsigned char *name;
2959     {
2960       int32_t elem, idx;
2961       size_t name_len = strlen ((const char *) name);
2962       if (nrules != 0)
2963         {
2964           elem = seek_collating_symbol_entry (name, name_len);
2965           if (symb_table[2 * elem] != 0)
2966             {
2967               /* We found the entry.  */
2968               idx = symb_table[2 * elem + 1];
2969               /* Skip the name of collating element name.  */
2970               idx += 1 + extra[idx];
2971             }
2972           else if (symb_table[2 * elem] == 0 && name_len == 1)
2973             {
2974               /* No valid character, treat it as a normal
2975                  character.  */
2976               bitset_set (sbcset, name[0]);
2977               return REG_NOERROR;
2978             }
2979           else
2980             return REG_ECOLLATE;
2981
2982           /* Got valid collation sequence, add it as a new entry.  */
2983           /* Check the space of the arrays.  */
2984           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2985             {
2986               /* Not enough, realloc it.  */
2987               /* +1 in case of mbcset->ncoll_syms is 0.  */
2988               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2989               /* Use realloc since mbcset->coll_syms is NULL
2990                  if *alloc == 0.  */
2991               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2992                                                    new_coll_sym_alloc);
2993               if (BE (new_coll_syms == NULL, 0))
2994                 return REG_ESPACE;
2995               mbcset->coll_syms = new_coll_syms;
2996               *coll_sym_alloc = new_coll_sym_alloc;
2997             }
2998           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2999           return REG_NOERROR;
3000         }
3001       else
3002         {
3003           if (BE (name_len != 1, 0))
3004             return REG_ECOLLATE;
3005           else
3006             {
3007               bitset_set (sbcset, name[0]);
3008               return REG_NOERROR;
3009             }
3010         }
3011     }
3012 #endif
3013
3014   re_token_t br_token;
3015   re_bitset_ptr_t sbcset;
3016 #ifdef RE_ENABLE_I18N
3017   re_charset_t *mbcset;
3018   int coll_sym_alloc = 0, range_alloc = 0;
3019 #if __OPTION_EGLIBC_LOCALE_CODE
3020   int mbchar_alloc = 0;
3021 #endif
3022   int equiv_class_alloc = 0, char_class_alloc = 0;
3023 #endif /* not RE_ENABLE_I18N */
3024   int non_match = 0;
3025   bin_tree_t *work_tree;
3026   int token_len;
3027   int first_round = 1;
3028 #ifdef _LIBC
3029 #if __OPTION_EGLIBC_LOCALE_CODE
3030   collseqmb = (const unsigned char *)
3031     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3032   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3033 #else
3034   /* This is true when OPTION_EGLIBC_LOCALE_CODE is disabled, but the
3035      compiler can't figure that out.  */
3036   nrules = 0;
3037 #endif
3038   if (nrules)
3039     {
3040       /*
3041       if (MB_CUR_MAX > 1)
3042       */
3043       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3044       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3045       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3046                                                   _NL_COLLATE_SYMB_TABLEMB);
3047       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3048                                                    _NL_COLLATE_SYMB_EXTRAMB);
3049     }
3050 #endif
3051   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3052 #ifdef RE_ENABLE_I18N
3053   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3054 #endif /* RE_ENABLE_I18N */
3055 #ifdef RE_ENABLE_I18N
3056   if (BE (sbcset == NULL || mbcset == NULL, 0))
3057 #else
3058   if (BE (sbcset == NULL, 0))
3059 #endif /* RE_ENABLE_I18N */
3060     {
3061       re_free (sbcset);
3062 #ifdef RE_ENABLE_I18N
3063       re_free (mbcset);
3064 #endif
3065       *err = REG_ESPACE;
3066       return NULL;
3067     }
3068
3069   token_len = peek_token_bracket (token, regexp, syntax);
3070   if (BE (token->type == END_OF_RE, 0))
3071     {
3072       *err = REG_BADPAT;
3073       goto parse_bracket_exp_free_return;
3074     }
3075   if (token->type == OP_NON_MATCH_LIST)
3076     {
3077 #ifdef RE_ENABLE_I18N
3078       mbcset->non_match = 1;
3079 #endif /* not RE_ENABLE_I18N */
3080       non_match = 1;
3081       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3082         bitset_set (sbcset, '\n');
3083       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3084       token_len = peek_token_bracket (token, regexp, syntax);
3085       if (BE (token->type == END_OF_RE, 0))
3086         {
3087           *err = REG_BADPAT;
3088           goto parse_bracket_exp_free_return;
3089         }
3090     }
3091
3092   /* We treat the first ']' as a normal character.  */
3093   if (token->type == OP_CLOSE_BRACKET)
3094     token->type = CHARACTER;
3095
3096   while (1)
3097     {
3098       bracket_elem_t start_elem, end_elem;
3099       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3100       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3101       reg_errcode_t ret;
3102       int token_len2 = 0, is_range_exp = 0;
3103       re_token_t token2;
3104
3105       start_elem.opr.name = start_name_buf;
3106       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3107                                    syntax, first_round);
3108       if (BE (ret != REG_NOERROR, 0))
3109         {
3110           *err = ret;
3111           goto parse_bracket_exp_free_return;
3112         }
3113       first_round = 0;
3114
3115       /* Get information about the next token.  We need it in any case.  */
3116       token_len = peek_token_bracket (token, regexp, syntax);
3117
3118       /* Do not check for ranges if we know they are not allowed.  */
3119       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3120         {
3121           if (BE (token->type == END_OF_RE, 0))
3122             {
3123               *err = REG_EBRACK;
3124               goto parse_bracket_exp_free_return;
3125             }
3126           if (token->type == OP_CHARSET_RANGE)
3127             {
3128               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3129               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3130               if (BE (token2.type == END_OF_RE, 0))
3131                 {
3132                   *err = REG_EBRACK;
3133                   goto parse_bracket_exp_free_return;
3134                 }
3135               if (token2.type == OP_CLOSE_BRACKET)
3136                 {
3137                   /* We treat the last '-' as a normal character.  */
3138                   re_string_skip_bytes (regexp, -token_len);
3139                   token->type = CHARACTER;
3140                 }
3141               else
3142                 is_range_exp = 1;
3143             }
3144         }
3145
3146       if (is_range_exp == 1)
3147         {
3148           end_elem.opr.name = end_name_buf;
3149           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3150                                        dfa, syntax, 1);
3151           if (BE (ret != REG_NOERROR, 0))
3152             {
3153               *err = ret;
3154               goto parse_bracket_exp_free_return;
3155             }
3156
3157           token_len = peek_token_bracket (token, regexp, syntax);
3158
3159 #ifdef _LIBC
3160           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3161                                   &start_elem, &end_elem);
3162 #else
3163 # ifdef RE_ENABLE_I18N
3164           *err = build_range_exp (sbcset,
3165                                   dfa_mb_cur_max (dfa) > 1 ? mbcset : NULL,
3166                                   &range_alloc, &start_elem, &end_elem);
3167 # else
3168           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3169 # endif
3170 #endif /* RE_ENABLE_I18N */
3171           if (BE (*err != REG_NOERROR, 0))
3172             goto parse_bracket_exp_free_return;
3173         }
3174       else
3175         {
3176           switch (start_elem.type)
3177             {
3178             case SB_CHAR:
3179               bitset_set (sbcset, start_elem.opr.ch);
3180               break;
3181 #if defined RE_ENABLE_I18N && __OPTION_EGLIBC_LOCALE_CODE
3182             case MB_CHAR:
3183               /* Check whether the array has enough space.  */
3184               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3185                 {
3186                   wchar_t *new_mbchars;
3187                   /* Not enough, realloc it.  */
3188                   /* +1 in case of mbcset->nmbchars is 0.  */
3189                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3190                   /* Use realloc since array is NULL if *alloc == 0.  */
3191                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3192                                             mbchar_alloc);
3193                   if (BE (new_mbchars == NULL, 0))
3194                     goto parse_bracket_exp_espace;
3195                   mbcset->mbchars = new_mbchars;
3196                 }
3197               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3198               break;
3199 #endif /* RE_ENABLE_I18N && __OPTION_EGLIBC_LOCALE_CODE */
3200             case EQUIV_CLASS:
3201               *err = build_equiv_class (sbcset,
3202 #ifdef RE_ENABLE_I18N
3203                                         mbcset, &equiv_class_alloc,
3204 #endif /* RE_ENABLE_I18N */
3205                                         start_elem.opr.name);
3206               if (BE (*err != REG_NOERROR, 0))
3207                 goto parse_bracket_exp_free_return;
3208               break;
3209             case COLL_SYM:
3210               *err = build_collating_symbol (sbcset,
3211 #ifdef RE_ENABLE_I18N
3212                                              mbcset, &coll_sym_alloc,
3213 #endif /* RE_ENABLE_I18N */
3214                                              start_elem.opr.name);
3215               if (BE (*err != REG_NOERROR, 0))
3216                 goto parse_bracket_exp_free_return;
3217               break;
3218             case CHAR_CLASS:
3219               *err = build_charclass (regexp->trans, sbcset,
3220 #ifdef RE_ENABLE_I18N
3221                                       mbcset, &char_class_alloc,
3222 #endif /* RE_ENABLE_I18N */
3223                                       start_elem.opr.name, syntax);
3224               if (BE (*err != REG_NOERROR, 0))
3225                goto parse_bracket_exp_free_return;
3226               break;
3227             default:
3228               assert (0);
3229               break;
3230             }
3231         }
3232       if (BE (token->type == END_OF_RE, 0))
3233         {
3234           *err = REG_EBRACK;
3235           goto parse_bracket_exp_free_return;
3236         }
3237       if (token->type == OP_CLOSE_BRACKET)
3238         break;
3239     }
3240
3241   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3242
3243   /* If it is non-matching list.  */
3244   if (non_match)
3245     bitset_not (sbcset);
3246
3247 #ifdef RE_ENABLE_I18N
3248   /* Ensure only single byte characters are set.  */
3249   if (dfa_mb_cur_max (dfa) > 1)
3250     bitset_mask (sbcset, dfa->sb_char);
3251
3252   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3253       || mbcset->nranges || (dfa_mb_cur_max (dfa) > 1 && (mbcset->nchar_classes
3254                                                      || mbcset->non_match)))
3255     {
3256       bin_tree_t *mbc_tree;
3257       int sbc_idx;
3258       /* Build a tree for complex bracket.  */
3259       dfa->has_mb_node = 1;
3260       br_token.type = COMPLEX_BRACKET;
3261       br_token.opr.mbcset = mbcset;
3262       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3263       if (BE (mbc_tree == NULL, 0))
3264         goto parse_bracket_exp_espace;
3265       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3266         if (sbcset[sbc_idx])
3267           break;
3268       /* If there are no bits set in sbcset, there is no point
3269          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3270       if (sbc_idx < BITSET_WORDS)
3271         {
3272           /* Build a tree for simple bracket.  */
3273           br_token.type = SIMPLE_BRACKET;
3274           br_token.opr.sbcset = sbcset;
3275           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3276           if (BE (work_tree == NULL, 0))
3277             goto parse_bracket_exp_espace;
3278
3279           /* Then join them by ALT node.  */
3280           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3281           if (BE (work_tree == NULL, 0))
3282             goto parse_bracket_exp_espace;
3283         }
3284       else
3285         {
3286           re_free (sbcset);
3287           work_tree = mbc_tree;
3288         }
3289     }
3290   else
3291 #endif /* not RE_ENABLE_I18N */
3292     {
3293 #ifdef RE_ENABLE_I18N
3294       free_charset (mbcset);
3295 #endif
3296       /* Build a tree for simple bracket.  */
3297       br_token.type = SIMPLE_BRACKET;
3298       br_token.opr.sbcset = sbcset;
3299       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3300       if (BE (work_tree == NULL, 0))
3301         goto parse_bracket_exp_espace;
3302     }
3303   return work_tree;
3304
3305  parse_bracket_exp_espace:
3306   *err = REG_ESPACE;
3307  parse_bracket_exp_free_return:
3308   re_free (sbcset);
3309 #ifdef RE_ENABLE_I18N
3310   free_charset (mbcset);
3311 #endif /* RE_ENABLE_I18N */
3312   return NULL;
3313 }
3314
3315 /* Parse an element in the bracket expression.  */
3316
3317 static reg_errcode_t
3318 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3319                        re_token_t *token, int token_len, re_dfa_t *dfa,
3320                        reg_syntax_t syntax, int accept_hyphen)
3321 {
3322 #if defined RE_ENABLE_I18N && __OPTION_EGLIBC_LOCALE_CODE
3323   int cur_char_size;
3324   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3325   if (cur_char_size > 1)
3326     {
3327       elem->type = MB_CHAR;
3328       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3329       re_string_skip_bytes (regexp, cur_char_size);
3330       return REG_NOERROR;
3331     }
3332 #endif /* RE_ENABLE_I18N && __OPTION_EGLIBC_LOCALE_CODE */
3333   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3334   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3335       || token->type == OP_OPEN_EQUIV_CLASS)
3336     return parse_bracket_symbol (elem, regexp, token);
3337   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3338     {
3339       /* A '-' must only appear as anything but a range indicator before
3340          the closing bracket.  Everything else is an error.  */
3341       re_token_t token2;
3342       (void) peek_token_bracket (&token2, regexp, syntax);
3343       if (token2.type != OP_CLOSE_BRACKET)
3344         /* The actual error value is not standardized since this whole
3345            case is undefined.  But ERANGE makes good sense.  */
3346         return REG_ERANGE;
3347     }
3348   elem->type = SB_CHAR;
3349   elem->opr.ch = token->opr.c;
3350   return REG_NOERROR;
3351 }
3352
3353 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3354    such as [:<character_class>:], [.<collating_element>.], and
3355    [=<equivalent_class>=].  */
3356
3357 static reg_errcode_t
3358 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3359                       re_token_t *token)
3360 {
3361   unsigned char ch, delim = token->opr.c;
3362   int i = 0;
3363   if (re_string_eoi(regexp))
3364     return REG_EBRACK;
3365   for (;; ++i)
3366     {
3367       if (i >= BRACKET_NAME_BUF_SIZE)
3368         return REG_EBRACK;
3369       if (token->type == OP_OPEN_CHAR_CLASS)
3370         ch = re_string_fetch_byte_case (regexp);
3371       else
3372         ch = re_string_fetch_byte (regexp);
3373       if (re_string_eoi(regexp))
3374         return REG_EBRACK;
3375       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3376         break;
3377       elem->opr.name[i] = ch;
3378     }
3379   re_string_skip_bytes (regexp, 1);
3380   elem->opr.name[i] = '\0';
3381   switch (token->type)
3382     {
3383     case OP_OPEN_COLL_ELEM:
3384       elem->type = COLL_SYM;
3385       break;
3386     case OP_OPEN_EQUIV_CLASS:
3387       elem->type = EQUIV_CLASS;
3388       break;
3389     case OP_OPEN_CHAR_CLASS:
3390       elem->type = CHAR_CLASS;
3391       break;
3392     default:
3393       break;
3394     }
3395   return REG_NOERROR;
3396 }
3397
3398   /* Helper function for parse_bracket_exp.
3399      Build the equivalence class which is represented by NAME.
3400      The result are written to MBCSET and SBCSET.
3401      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3402      is a pointer argument sinse we may update it.  */
3403
3404 static reg_errcode_t
3405 #ifdef RE_ENABLE_I18N
3406 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3407                    int *equiv_class_alloc, const unsigned char *name)
3408 #else /* not RE_ENABLE_I18N */
3409 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3410 #endif /* not RE_ENABLE_I18N */
3411 {
3412   /* When __OPTION_EGLIBC_LOCALE_CODE is disabled, only the C locale
3413      is supported; it has no collation rules.  */
3414 #if defined _LIBC && __OPTION_EGLIBC_LOCALE_CODE
3415   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3416   if (nrules != 0)
3417     {
3418       const int32_t *table, *indirect;
3419       const unsigned char *weights, *extra, *cp;
3420       unsigned char char_buf[2];
3421       int32_t idx1, idx2;
3422       unsigned int ch;
3423       size_t len;
3424       /* This #include defines a local function!  */
3425 # include <locale/weight.h>
3426       /* Calculate the index for equivalence class.  */
3427       cp = name;
3428       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3429       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3430                                                _NL_COLLATE_WEIGHTMB);
3431       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3432                                                    _NL_COLLATE_EXTRAMB);
3433       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3434                                                 _NL_COLLATE_INDIRECTMB);
3435       idx1 = findidx (&cp);
3436       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3437         /* This isn't a valid character.  */
3438         return REG_ECOLLATE;
3439
3440       /* Build single byte matcing table for this equivalence class.  */
3441       char_buf[1] = (unsigned char) '\0';
3442       len = weights[idx1 & 0xffffff];
3443       for (ch = 0; ch < SBC_MAX; ++ch)
3444         {
3445           char_buf[0] = ch;
3446           cp = char_buf;
3447           idx2 = findidx (&cp);
3448 /*
3449           idx2 = table[ch];
3450 */
3451           if (idx2 == 0)
3452             /* This isn't a valid character.  */
3453             continue;
3454           /* Compare only if the length matches and the collation rule
3455              index is the same.  */
3456           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3457             {
3458               int cnt = 0;
3459
3460               while (cnt <= len &&
3461                      weights[(idx1 & 0xffffff) + 1 + cnt]
3462                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3463                 ++cnt;
3464
3465               if (cnt > len)
3466                 bitset_set (sbcset, ch);
3467             }
3468         }
3469       /* Check whether the array has enough space.  */
3470       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3471         {
3472           /* Not enough, realloc it.  */
3473           /* +1 in case of mbcset->nequiv_classes is 0.  */
3474           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3475           /* Use realloc since the array is NULL if *alloc == 0.  */
3476           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3477                                                    int32_t,
3478                                                    new_equiv_class_alloc);
3479           if (BE (new_equiv_classes == NULL, 0))
3480             return REG_ESPACE;
3481           mbcset->equiv_classes = new_equiv_classes;
3482           *equiv_class_alloc = new_equiv_class_alloc;
3483         }
3484       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3485     }
3486   else
3487 #endif /* _LIBC && __OPTION_EGLIBC_LOCALE_CODE */
3488     {
3489       if (BE (strlen ((const char *) name) != 1, 0))
3490         return REG_ECOLLATE;
3491       bitset_set (sbcset, *name);
3492     }
3493   return REG_NOERROR;
3494 }
3495
3496   /* Helper function for parse_bracket_exp.
3497      Build the character class which is represented by NAME.
3498      The result are written to MBCSET and SBCSET.
3499      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3500      is a pointer argument sinse we may update it.  */
3501
3502 static reg_errcode_t
3503 #ifdef RE_ENABLE_I18N
3504 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3505                  re_charset_t *mbcset, int *char_class_alloc,
3506                  const unsigned char *class_name, reg_syntax_t syntax)
3507 #else /* not RE_ENABLE_I18N */
3508 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3509                  const unsigned char *class_name, reg_syntax_t syntax)
3510 #endif /* not RE_ENABLE_I18N */
3511 {
3512   int i;
3513   const char *name = (const char *) class_name;
3514
3515   /* In case of REG_ICASE "upper" and "lower" match the both of
3516      upper and lower cases.  */
3517   if ((syntax & RE_ICASE)
3518       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3519     name = "alpha";
3520
3521 #if defined RE_ENABLE_I18N && __OPTION_EGLIBC_LOCALE_CODE
3522   /* Check the space of the arrays.  */
3523   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3524     {
3525       /* Not enough, realloc it.  */
3526       /* +1 in case of mbcset->nchar_classes is 0.  */
3527       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3528       /* Use realloc since array is NULL if *alloc == 0.  */
3529       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3530                                                new_char_class_alloc);
3531       if (BE (new_char_classes == NULL, 0))
3532         return REG_ESPACE;
3533       mbcset->char_classes = new_char_classes;
3534       *char_class_alloc = new_char_class_alloc;
3535     }
3536   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3537 #endif /* RE_ENABLE_I18N && __OPTION_EGLIBC_LOCALE_CODE */
3538
3539 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3540   do {                                          \
3541     if (BE (trans != NULL, 0))                  \
3542       {                                         \
3543         for (i = 0; i < SBC_MAX; ++i)           \
3544           if (ctype_func (i))                   \
3545             bitset_set (sbcset, trans[i]);      \
3546       }                                         \
3547     else                                        \
3548       {                                         \
3549         for (i = 0; i < SBC_MAX; ++i)           \
3550           if (ctype_func (i))                   \
3551             bitset_set (sbcset, i);             \
3552       }                                         \
3553   } while (0)
3554
3555   if (strcmp (name, "alnum") == 0)
3556     BUILD_CHARCLASS_LOOP (isalnum);
3557   else if (strcmp (name, "cntrl") == 0)
3558     BUILD_CHARCLASS_LOOP (iscntrl);
3559   else if (strcmp (name, "lower") == 0)
3560     BUILD_CHARCLASS_LOOP (islower);
3561   else if (strcmp (name, "space") == 0)
3562     BUILD_CHARCLASS_LOOP (isspace);
3563   else if (strcmp (name, "alpha") == 0)
3564     BUILD_CHARCLASS_LOOP (isalpha);
3565   else if (strcmp (name, "digit") == 0)
3566     BUILD_CHARCLASS_LOOP (isdigit);
3567   else if (strcmp (name, "print") == 0)
3568     BUILD_CHARCLASS_LOOP (isprint);
3569   else if (strcmp (name, "upper") == 0)
3570     BUILD_CHARCLASS_LOOP (isupper);
3571   else if (strcmp (name, "blank") == 0)
3572     BUILD_CHARCLASS_LOOP (isblank);
3573   else if (strcmp (name, "graph") == 0)
3574     BUILD_CHARCLASS_LOOP (isgraph);
3575   else if (strcmp (name, "punct") == 0)
3576     BUILD_CHARCLASS_LOOP (ispunct);
3577   else if (strcmp (name, "xdigit") == 0)
3578     BUILD_CHARCLASS_LOOP (isxdigit);
3579   else
3580     return REG_ECTYPE;
3581
3582   return REG_NOERROR;
3583 }
3584
3585 static bin_tree_t *
3586 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3587                     const unsigned char *class_name,
3588                     const unsigned char *extra, int non_match,
3589                     reg_errcode_t *err)
3590 {
3591   re_bitset_ptr_t sbcset;
3592 #ifdef RE_ENABLE_I18N
3593   re_charset_t *mbcset;
3594   int alloc = 0;
3595 #endif /* not RE_ENABLE_I18N */
3596   reg_errcode_t ret;
3597   re_token_t br_token;
3598   bin_tree_t *tree;
3599
3600   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3601 #ifdef RE_ENABLE_I18N
3602   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3603 #endif /* RE_ENABLE_I18N */
3604
3605 #ifdef RE_ENABLE_I18N
3606   if (BE (sbcset == NULL || mbcset == NULL, 0))
3607 #else /* not RE_ENABLE_I18N */
3608   if (BE (sbcset == NULL, 0))
3609 #endif /* not RE_ENABLE_I18N */
3610     {
3611       *err = REG_ESPACE;
3612       return NULL;
3613     }
3614
3615   if (non_match)
3616     {
3617 #ifdef RE_ENABLE_I18N
3618       mbcset->non_match = 1;
3619 #endif /* not RE_ENABLE_I18N */
3620     }
3621
3622   /* We don't care the syntax in this case.  */
3623   ret = build_charclass (trans, sbcset,
3624 #ifdef RE_ENABLE_I18N
3625                          mbcset, &alloc,
3626 #endif /* RE_ENABLE_I18N */
3627                          class_name, 0);
3628
3629   if (BE (ret != REG_NOERROR, 0))
3630     {
3631       re_free (sbcset);
3632 #ifdef RE_ENABLE_I18N
3633       free_charset (mbcset);
3634 #endif /* RE_ENABLE_I18N */
3635       *err = ret;
3636       return NULL;
3637     }
3638   /* \w match '_' also.  */
3639   for (; *extra; extra++)
3640     bitset_set (sbcset, *extra);
3641
3642   /* If it is non-matching list.  */
3643   if (non_match)
3644     bitset_not (sbcset);
3645
3646 #ifdef RE_ENABLE_I18N
3647   /* Ensure only single byte characters are set.  */
3648   if (dfa_mb_cur_max (dfa) > 1)
3649     bitset_mask (sbcset, dfa->sb_char);
3650 #endif
3651
3652   /* Build a tree for simple bracket.  */
3653   br_token.type = SIMPLE_BRACKET;
3654   br_token.opr.sbcset = sbcset;
3655   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3656   if (BE (tree == NULL, 0))
3657     goto build_word_op_espace;
3658
3659 #ifdef RE_ENABLE_I18N
3660   if (dfa_mb_cur_max (dfa) > 1)
3661     {
3662       bin_tree_t *mbc_tree;
3663       /* Build a tree for complex bracket.  */
3664       br_token.type = COMPLEX_BRACKET;
3665       br_token.opr.mbcset = mbcset;
3666       dfa->has_mb_node = 1;
3667       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3668       if (BE (mbc_tree == NULL, 0))
3669         goto build_word_op_espace;
3670       /* Then join them by ALT node.  */
3671       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3672       if (BE (mbc_tree != NULL, 1))
3673         return tree;
3674     }
3675   else
3676     {
3677       free_charset (mbcset);
3678       return tree;
3679     }
3680 #else /* not RE_ENABLE_I18N */
3681   return tree;
3682 #endif /* not RE_ENABLE_I18N */
3683
3684  build_word_op_espace:
3685   re_free (sbcset);
3686 #ifdef RE_ENABLE_I18N
3687   free_charset (mbcset);
3688 #endif /* RE_ENABLE_I18N */
3689   *err = REG_ESPACE;
3690   return NULL;
3691 }
3692
3693 /* This is intended for the expressions like "a{1,3}".
3694    Fetch a number from `input', and return the number.
3695    Return -1, if the number field is empty like "{,1}".
3696    Return -2, If an error is occured.  */
3697
3698 static int
3699 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3700 {
3701   int num = -1;
3702   unsigned char c;
3703   while (1)
3704     {
3705       fetch_token (token, input, syntax);
3706       c = token->opr.c;
3707       if (BE (token->type == END_OF_RE, 0))
3708         return -2;
3709       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3710         break;
3711       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3712              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3713       num = (num > RE_DUP_MAX) ? -2 : num;
3714     }
3715   return num;
3716 }
3717 \f
3718 #ifdef RE_ENABLE_I18N
3719 static void
3720 free_charset (re_charset_t *cset)
3721 {
3722   re_free (cset->mbchars);
3723 # ifdef _LIBC
3724   re_free (cset->coll_syms);
3725   re_free (cset->equiv_classes);
3726   re_free (cset->range_starts);
3727   re_free (cset->range_ends);
3728 # endif
3729   re_free (cset->char_classes);
3730   re_free (cset);
3731 }
3732 #endif /* RE_ENABLE_I18N */
3733 \f
3734 /* Functions for binary tree operation.  */
3735
3736 /* Create a tree node.  */
3737
3738 static bin_tree_t *
3739 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3740              re_token_type_t type)
3741 {
3742   re_token_t t;
3743   t.type = type;
3744   return create_token_tree (dfa, left, right, &t);
3745 }
3746
3747 static bin_tree_t *
3748 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3749                    const re_token_t *token)
3750 {
3751   bin_tree_t *tree;
3752   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3753     {
3754       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3755
3756       if (storage == NULL)
3757         return NULL;
3758       storage->next = dfa->str_tree_storage;
3759       dfa->str_tree_storage = storage;
3760       dfa->str_tree_storage_idx = 0;
3761     }
3762   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3763
3764   tree->parent = NULL;
3765   tree->left = left;
3766   tree->right = right;
3767   tree->token = *token;
3768   tree->token.duplicated = 0;
3769   tree->token.opt_subexp = 0;
3770   tree->first = NULL;
3771   tree->next = NULL;
3772   tree->node_idx = -1;
3773
3774   if (left != NULL)
3775     left->parent = tree;
3776   if (right != NULL)
3777     right->parent = tree;
3778   return tree;
3779 }
3780
3781 /* Mark the tree SRC as an optional subexpression.
3782    To be called from preorder or postorder.  */
3783
3784 static reg_errcode_t
3785 mark_opt_subexp (void *extra, bin_tree_t *node)
3786 {
3787   int idx = (int) (long) extra;
3788   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3789     node->token.opt_subexp = 1;
3790
3791   return REG_NOERROR;
3792 }
3793
3794 /* Free the allocated memory inside NODE. */
3795
3796 static void
3797 free_token (re_token_t *node)
3798 {
3799 #ifdef RE_ENABLE_I18N
3800   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3801     free_charset (node->opr.mbcset);
3802   else
3803 #endif /* RE_ENABLE_I18N */
3804     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3805       re_free (node->opr.sbcset);
3806 }
3807
3808 /* Worker function for tree walking.  Free the allocated memory inside NODE
3809    and its children. */
3810
3811 static reg_errcode_t
3812 free_tree (void *extra, bin_tree_t *node)
3813 {
3814   free_token (&node->token);
3815   return REG_NOERROR;
3816 }
3817
3818
3819 /* Duplicate the node SRC, and return new node.  This is a preorder
3820    visit similar to the one implemented by the generic visitor, but
3821    we need more infrastructure to maintain two parallel trees --- so,
3822    it's easier to duplicate.  */
3823
3824 static bin_tree_t *
3825 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3826 {
3827   const bin_tree_t *node;
3828   bin_tree_t *dup_root;
3829   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3830
3831   for (node = root; ; )
3832     {
3833       /* Create a new tree and link it back to the current parent.  */
3834       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3835       if (*p_new == NULL)
3836         return NULL;
3837       (*p_new)->parent = dup_node;
3838       (*p_new)->token.duplicated = 1;
3839       dup_node = *p_new;
3840
3841       /* Go to the left node, or up and to the right.  */
3842       if (node->left)
3843         {
3844           node = node->left;
3845           p_new = &dup_node->left;
3846         }
3847       else
3848         {
3849           const bin_tree_t *prev = NULL;
3850           while (node->right == prev || node->right == NULL)
3851             {
3852               prev = node;
3853               node = node->parent;
3854               dup_node = dup_node->parent;
3855               if (!node)
3856                 return dup_root;
3857             }
3858           node = node->right;
3859           p_new = &dup_node->right;
3860         }
3861     }
3862 }