1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 #include <gnu/option-groups.h>
23 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
24 size_t length, reg_syntax_t syntax);
25 static void re_compile_fastmap_iter (regex_t *bufp,
26 const re_dfastate_t *init_state,
28 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30 static void free_charset (re_charset_t *cset);
31 #endif /* RE_ENABLE_I18N */
32 static void free_workarea_compile (regex_t *preg);
33 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35 static void optimize_utf8 (re_dfa_t *dfa);
37 static reg_errcode_t analyze (regex_t *preg);
38 static reg_errcode_t preorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
41 static reg_errcode_t postorder (bin_tree_t *root,
42 reg_errcode_t (fn (void *, bin_tree_t *)),
44 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
45 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
46 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
49 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
50 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
51 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
52 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
53 unsigned int constraint);
54 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
55 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
57 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
58 static int fetch_number (re_string_t *input, re_token_t *token,
60 static int peek_token (re_token_t *token, re_string_t *input,
61 reg_syntax_t syntax) internal_function;
62 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
63 reg_syntax_t syntax, reg_errcode_t *err);
64 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 int nest, reg_errcode_t *err);
67 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
74 re_token_t *token, reg_syntax_t syntax,
75 int nest, reg_errcode_t *err);
76 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
77 re_dfa_t *dfa, re_token_t *token,
78 reg_syntax_t syntax, reg_errcode_t *err);
79 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
80 re_token_t *token, reg_syntax_t syntax,
82 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84 re_token_t *token, int token_len,
88 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
92 static reg_errcode_t build_equiv_class (bitset_t sbcset,
94 int *equiv_class_alloc,
95 const unsigned char *name);
96 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
99 int *char_class_alloc,
100 const unsigned char *class_name,
101 reg_syntax_t syntax);
102 #else /* not RE_ENABLE_I18N */
103 static reg_errcode_t build_equiv_class (bitset_t sbcset,
104 const unsigned char *name);
105 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107 const unsigned char *class_name,
108 reg_syntax_t syntax);
109 #endif /* not RE_ENABLE_I18N */
110 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
111 RE_TRANSLATE_TYPE trans,
112 const unsigned char *class_name,
113 const unsigned char *extra,
114 int non_match, reg_errcode_t *err);
115 static bin_tree_t *create_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 re_token_type_t type);
118 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
119 bin_tree_t *left, bin_tree_t *right,
120 const re_token_t *token);
121 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
122 static void free_token (re_token_t *node);
123 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
124 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
126 /* This table gives an error message for each of the error codes listed
127 in regex.h. Obviously the order here has to be same as there.
128 POSIX doesn't require that we do anything for REG_NOERROR,
129 but why not be nice? */
131 const char __re_error_msgid[] attribute_hidden =
133 #define REG_NOERROR_IDX 0
134 gettext_noop ("Success") /* REG_NOERROR */
136 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
137 gettext_noop ("No match") /* REG_NOMATCH */
139 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
140 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
142 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
143 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
145 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
146 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
148 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
149 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
151 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
152 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
154 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
155 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
157 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
158 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
160 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
161 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
163 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
164 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
166 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
167 gettext_noop ("Invalid range end") /* REG_ERANGE */
169 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
170 gettext_noop ("Memory exhausted") /* REG_ESPACE */
172 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
173 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
175 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
176 gettext_noop ("Premature end of regular expression") /* REG_EEND */
178 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
179 gettext_noop ("Regular expression too big") /* REG_ESIZE */
181 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
182 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
185 const size_t __re_error_msgid_idx[] attribute_hidden =
206 /* Entry points for GNU code. */
208 /* re_compile_pattern is the GNU regular expression compiler: it
209 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
210 Returns 0 if the pattern was valid, otherwise an error string.
212 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
213 are set in BUFP on entry. */
216 re_compile_pattern (pattern, length, bufp)
219 struct re_pattern_buffer *bufp;
223 /* And GNU code determines whether or not to get register information
224 by passing null for the REGS argument to re_match, etc., not by
225 setting no_sub, unless RE_NO_SUB is set. */
226 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
228 /* Match anchors at newline. */
229 bufp->newline_anchor = 1;
231 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
235 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
238 weak_alias (__re_compile_pattern, re_compile_pattern)
241 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
242 also be assigned to arbitrarily: each pattern buffer stores its own
243 syntax, so it can be changed between regex compilations. */
244 /* This has no initializer because initialized variables in Emacs
245 become read-only after dumping. */
246 reg_syntax_t re_syntax_options;
249 /* Specify the precise syntax of regexps for compilation. This provides
250 for compatibility for various utilities which historically have
251 different, incompatible syntaxes.
253 The argument SYNTAX is a bit mask comprised of the various bits
254 defined in regex.h. We return the old syntax. */
257 re_set_syntax (syntax)
260 reg_syntax_t ret = re_syntax_options;
262 re_syntax_options = syntax;
266 weak_alias (__re_set_syntax, re_set_syntax)
270 re_compile_fastmap (bufp)
271 struct re_pattern_buffer *bufp;
273 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
274 char *fastmap = bufp->fastmap;
276 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
277 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
278 if (dfa->init_state != dfa->init_state_word)
279 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
280 if (dfa->init_state != dfa->init_state_nl)
281 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
282 if (dfa->init_state != dfa->init_state_begbuf)
283 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
284 bufp->fastmap_accurate = 1;
288 weak_alias (__re_compile_fastmap, re_compile_fastmap)
292 __attribute ((always_inline))
293 re_set_fastmap (char *fastmap, int icase, int ch)
297 fastmap[tolower (ch)] = 1;
300 /* Helper function for re_compile_fastmap.
301 Compile fastmap for the initial_state INIT_STATE. */
304 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
307 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
309 int icase = (dfa_mb_cur_max (dfa) == 1 && (bufp->syntax & RE_ICASE));
310 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
312 int node = init_state->nodes.elems[node_cnt];
313 re_token_type_t type = dfa->nodes[node].type;
315 if (type == CHARACTER)
317 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
318 #ifdef RE_ENABLE_I18N
319 if ((bufp->syntax & RE_ICASE) && dfa_mb_cur_max (dfa) > 1)
321 unsigned char *buf = alloca (dfa_mb_cur_max (dfa)), *p;
326 *p++ = dfa->nodes[node].opr.c;
327 while (++node < dfa->nodes_len
328 && dfa->nodes[node].type == CHARACTER
329 && dfa->nodes[node].mb_partial)
330 *p++ = dfa->nodes[node].opr.c;
331 memset (&state, '\0', sizeof (state));
332 if (__mbrtowc (&wc, (const char *) buf, p - buf,
334 && (__wcrtomb ((char *) buf, towlower (wc), &state)
336 re_set_fastmap (fastmap, 0, buf[0]);
340 else if (type == SIMPLE_BRACKET)
343 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
346 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
347 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
348 if (w & ((bitset_word_t) 1 << j))
349 re_set_fastmap (fastmap, icase, ch);
353 /* When OPTION_EGLIBC_LOCALE_CODE is disabled, the current
354 locale is always C, which has no rules and no multi-byte
356 #if defined RE_ENABLE_I18N && __OPTION_EGLIBC_LOCALE_CODE
357 else if (type == COMPLEX_BRACKET)
359 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
363 /* See if we have to try all bytes which start multiple collation
365 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
366 collation element, and don't catch 'b' since 'b' is
367 the only collation element which starts from 'b' (and
368 it is caught by SIMPLE_BRACKET). */
369 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
370 && (cset->ncoll_syms || cset->nranges))
372 const int32_t *table = (const int32_t *)
373 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
374 for (i = 0; i < SBC_MAX; ++i)
376 re_set_fastmap (fastmap, icase, i);
380 /* See if we have to start the match at all multibyte characters,
381 i.e. where we would not find an invalid sequence. This only
382 applies to multibyte character sets; for single byte character
383 sets, the SIMPLE_BRACKET again suffices. */
384 if (dfa_mb_cur_max (dfa) > 1
385 && (cset->nchar_classes || cset->non_match || cset->nranges
387 || cset->nequiv_classes
395 memset (&mbs, 0, sizeof (mbs));
396 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
397 re_set_fastmap (fastmap, false, (int) c);
404 /* ... Else catch all bytes which can start the mbchars. */
405 for (i = 0; i < cset->nmbchars; ++i)
409 memset (&state, '\0', sizeof (state));
410 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
411 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
412 if ((bufp->syntax & RE_ICASE) && dfa_mb_cur_max (dfa) > 1)
414 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
416 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
421 #endif /* RE_ENABLE_I18N && __OPTION_EGLIBC_LOCALE_CODE */
422 else if (type == OP_PERIOD
423 #ifdef RE_ENABLE_I18N
424 || type == OP_UTF8_PERIOD
425 #endif /* RE_ENABLE_I18N */
426 || type == END_OF_RE)
428 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
429 if (type == END_OF_RE)
430 bufp->can_be_null = 1;
436 /* Entry point for POSIX code. */
437 /* regcomp takes a regular expression as a string and compiles it.
439 PREG is a regex_t *. We do not expect any fields to be initialized,
440 since POSIX says we shouldn't. Thus, we set
442 `buffer' to the compiled pattern;
443 `used' to the length of the compiled pattern;
444 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
445 REG_EXTENDED bit in CFLAGS is set; otherwise, to
446 RE_SYNTAX_POSIX_BASIC;
447 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
448 `fastmap' to an allocated space for the fastmap;
449 `fastmap_accurate' to zero;
450 `re_nsub' to the number of subexpressions in PATTERN.
452 PATTERN is the address of the pattern string.
454 CFLAGS is a series of bits which affect compilation.
456 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
457 use POSIX basic syntax.
459 If REG_NEWLINE is set, then . and [^...] don't match newline.
460 Also, regexec will try a match beginning after every newline.
462 If REG_ICASE is set, then we considers upper- and lowercase
463 versions of letters to be equivalent when matching.
465 If REG_NOSUB is set, then when PREG is passed to regexec, that
466 routine will report only success or failure, and nothing about the
469 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
470 the return codes and their meanings.) */
473 regcomp (preg, pattern, cflags)
474 regex_t *__restrict preg;
475 const char *__restrict pattern;
479 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
480 : RE_SYNTAX_POSIX_BASIC);
486 /* Try to allocate space for the fastmap. */
487 preg->fastmap = re_malloc (char, SBC_MAX);
488 if (BE (preg->fastmap == NULL, 0))
491 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
493 /* If REG_NEWLINE is set, newlines are treated differently. */
494 if (cflags & REG_NEWLINE)
495 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
496 syntax &= ~RE_DOT_NEWLINE;
497 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
498 /* It also changes the matching behavior. */
499 preg->newline_anchor = 1;
502 preg->newline_anchor = 0;
503 preg->no_sub = !!(cflags & REG_NOSUB);
504 preg->translate = NULL;
506 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
508 /* POSIX doesn't distinguish between an unmatched open-group and an
509 unmatched close-group: both are REG_EPAREN. */
510 if (ret == REG_ERPAREN)
513 /* We have already checked preg->fastmap != NULL. */
514 if (BE (ret == REG_NOERROR, 1))
515 /* Compute the fastmap now, since regexec cannot modify the pattern
516 buffer. This function never fails in this implementation. */
517 (void) re_compile_fastmap (preg);
520 /* Some error occurred while compiling the expression. */
521 re_free (preg->fastmap);
522 preg->fastmap = NULL;
528 weak_alias (__regcomp, regcomp)
531 /* Returns a message corresponding to an error code, ERRCODE, returned
532 from either regcomp or regexec. We don't use PREG here. */
535 regerror (errcode, preg, errbuf, errbuf_size)
537 const regex_t *__restrict preg;
538 char *__restrict errbuf;
545 || errcode >= (int) (sizeof (__re_error_msgid_idx)
546 / sizeof (__re_error_msgid_idx[0])), 0))
547 /* Only error codes returned by the rest of the code should be passed
548 to this routine. If we are given anything else, or if other regex
549 code generates an invalid error code, then the program has a bug.
550 Dump core so we can fix it. */
553 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
555 msg_size = strlen (msg) + 1; /* Includes the null. */
557 if (BE (errbuf_size != 0, 1))
559 if (BE (msg_size > errbuf_size, 0))
561 #if defined HAVE_MEMPCPY || defined _LIBC
562 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
564 memcpy (errbuf, msg, errbuf_size - 1);
565 errbuf[errbuf_size - 1] = 0;
569 memcpy (errbuf, msg, msg_size);
575 weak_alias (__regerror, regerror)
579 #ifdef RE_ENABLE_I18N
580 /* This static array is used for the map to single-byte characters when
581 UTF-8 is used. Otherwise we would allocate memory just to initialize
582 it the same all the time. UTF-8 is the preferred encoding so this is
583 a worthwhile optimization. */
584 static const bitset_t utf8_sb_map =
586 /* Set the first 128 bits. */
587 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
593 free_dfa_content (re_dfa_t *dfa)
598 for (i = 0; i < dfa->nodes_len; ++i)
599 free_token (dfa->nodes + i);
600 re_free (dfa->nexts);
601 for (i = 0; i < dfa->nodes_len; ++i)
603 if (dfa->eclosures != NULL)
604 re_node_set_free (dfa->eclosures + i);
605 if (dfa->inveclosures != NULL)
606 re_node_set_free (dfa->inveclosures + i);
607 if (dfa->edests != NULL)
608 re_node_set_free (dfa->edests + i);
610 re_free (dfa->edests);
611 re_free (dfa->eclosures);
612 re_free (dfa->inveclosures);
613 re_free (dfa->nodes);
615 if (dfa->state_table)
616 for (i = 0; i <= dfa->state_hash_mask; ++i)
618 struct re_state_table_entry *entry = dfa->state_table + i;
619 for (j = 0; j < entry->num; ++j)
621 re_dfastate_t *state = entry->array[j];
624 re_free (entry->array);
626 re_free (dfa->state_table);
627 #ifdef RE_ENABLE_I18N
628 if (dfa->sb_char != utf8_sb_map)
629 re_free (dfa->sb_char);
631 re_free (dfa->subexp_map);
633 re_free (dfa->re_str);
640 /* Free dynamically allocated space used by PREG. */
646 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
647 if (BE (dfa != NULL, 1))
648 free_dfa_content (dfa);
652 re_free (preg->fastmap);
653 preg->fastmap = NULL;
655 re_free (preg->translate);
656 preg->translate = NULL;
659 weak_alias (__regfree, regfree)
662 /* Entry points compatible with 4.2 BSD regex library. We don't define
663 them unless specifically requested. */
665 #if defined _REGEX_RE_COMP || defined _LIBC
667 /* BSD has one and only one pattern buffer. */
668 static struct re_pattern_buffer re_comp_buf;
672 /* Make these definitions weak in libc, so POSIX programs can redefine
673 these names if they don't use our functions, and still use
674 regcomp/regexec above without link errors. */
685 if (!re_comp_buf.buffer)
686 return gettext ("No previous regular expression");
690 if (re_comp_buf.buffer)
692 fastmap = re_comp_buf.fastmap;
693 re_comp_buf.fastmap = NULL;
694 __regfree (&re_comp_buf);
695 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
696 re_comp_buf.fastmap = fastmap;
699 if (re_comp_buf.fastmap == NULL)
701 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
702 if (re_comp_buf.fastmap == NULL)
703 return (char *) gettext (__re_error_msgid
704 + __re_error_msgid_idx[(int) REG_ESPACE]);
707 /* Since `re_exec' always passes NULL for the `regs' argument, we
708 don't need to initialize the pattern buffer fields which affect it. */
710 /* Match anchors at newlines. */
711 re_comp_buf.newline_anchor = 1;
713 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
718 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
719 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
723 libc_freeres_fn (free_mem)
725 __regfree (&re_comp_buf);
729 #endif /* _REGEX_RE_COMP */
731 /* Internal entry point.
732 Compile the regular expression PATTERN, whose length is LENGTH.
733 SYNTAX indicate regular expression's syntax. */
736 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
739 reg_errcode_t err = REG_NOERROR;
743 /* Initialize the pattern buffer. */
744 preg->fastmap_accurate = 0;
745 preg->syntax = syntax;
746 preg->not_bol = preg->not_eol = 0;
749 preg->can_be_null = 0;
750 preg->regs_allocated = REGS_UNALLOCATED;
752 /* Initialize the dfa. */
753 dfa = (re_dfa_t *) preg->buffer;
754 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
756 /* If zero allocated, but buffer is non-null, try to realloc
757 enough space. This loses if buffer's address is bogus, but
758 that is the user's responsibility. If ->buffer is NULL this
759 is a simple allocation. */
760 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
763 preg->allocated = sizeof (re_dfa_t);
764 preg->buffer = (unsigned char *) dfa;
766 preg->used = sizeof (re_dfa_t);
768 err = init_dfa (dfa, length);
769 if (BE (err != REG_NOERROR, 0))
771 free_dfa_content (dfa);
777 /* Note: length+1 will not overflow since it is checked in init_dfa. */
778 dfa->re_str = re_malloc (char, length + 1);
779 strncpy (dfa->re_str, pattern, length + 1);
782 __libc_lock_init (dfa->lock);
784 err = re_string_construct (®exp, pattern, length, preg->translate,
785 syntax & RE_ICASE, dfa);
786 if (BE (err != REG_NOERROR, 0))
788 re_compile_internal_free_return:
789 free_workarea_compile (preg);
790 re_string_destruct (®exp);
791 free_dfa_content (dfa);
797 /* Parse the regular expression, and build a structure tree. */
799 dfa->str_tree = parse (®exp, preg, syntax, &err);
800 if (BE (dfa->str_tree == NULL, 0))
801 goto re_compile_internal_free_return;
803 /* Analyze the tree and create the nfa. */
804 err = analyze (preg);
805 if (BE (err != REG_NOERROR, 0))
806 goto re_compile_internal_free_return;
808 #ifdef RE_ENABLE_I18N
809 /* If possible, do searching in single byte encoding to speed things up. */
810 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
814 /* Then create the initial state of the dfa. */
815 err = create_initial_state (dfa);
817 /* Release work areas. */
818 free_workarea_compile (preg);
819 re_string_destruct (®exp);
821 if (BE (err != REG_NOERROR, 0))
823 free_dfa_content (dfa);
831 /* Initialize DFA. We use the length of the regular expression PAT_LEN
832 as the initial length of some arrays. */
835 init_dfa (re_dfa_t *dfa, size_t pat_len)
837 unsigned int table_size;
842 memset (dfa, '\0', sizeof (re_dfa_t));
844 /* Force allocation of str_tree_storage the first time. */
845 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
847 /* Avoid overflows. */
848 if (pat_len == SIZE_MAX)
851 dfa->nodes_alloc = pat_len + 1;
852 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
854 /* table_size = 2 ^ ceil(log pat_len) */
855 for (table_size = 1; ; table_size <<= 1)
856 if (table_size > pat_len)
859 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
860 dfa->state_hash_mask = table_size - 1;
862 dfa->mb_cur_max = MB_CUR_MAX;
864 if (dfa_mb_cur_max (dfa) == 6
865 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
867 # if __OPTION_EGLIBC_LOCALE_CODE
868 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
871 dfa->map_notascii = 0;
874 # ifdef HAVE_LANGINFO_CODESET
875 codeset_name = nl_langinfo (CODESET);
877 codeset_name = getenv ("LC_ALL");
878 if (codeset_name == NULL || codeset_name[0] == '\0')
879 codeset_name = getenv ("LC_CTYPE");
880 if (codeset_name == NULL || codeset_name[0] == '\0')
881 codeset_name = getenv ("LANG");
882 if (codeset_name == NULL)
884 else if (strchr (codeset_name, '.') != NULL)
885 codeset_name = strchr (codeset_name, '.') + 1;
888 if (strcasecmp (codeset_name, "UTF-8") == 0
889 || strcasecmp (codeset_name, "UTF8") == 0)
892 /* We check exhaustively in the loop below if this charset is a
893 superset of ASCII. */
894 dfa->map_notascii = 0;
897 #ifdef RE_ENABLE_I18N
898 if (dfa_mb_cur_max (dfa) > 1)
901 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
906 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
907 if (BE (dfa->sb_char == NULL, 0))
910 /* Set the bits corresponding to single byte chars. */
911 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
912 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
914 wint_t wch = __btowc (ch);
916 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
918 if (isascii (ch) && wch != ch)
919 dfa->map_notascii = 1;
926 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
931 /* Initialize WORD_CHAR table, which indicate which character is
932 "word". In this case "word" means that it is the word construction
933 character used by some operators like "\<", "\>", etc. */
937 init_word_char (re_dfa_t *dfa)
940 dfa->word_ops_used = 1;
941 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
942 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
943 if (isalnum (ch) || ch == '_')
944 dfa->word_char[i] |= (bitset_word_t) 1 << j;
947 /* Free the work area which are only used while compiling. */
950 free_workarea_compile (regex_t *preg)
952 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
953 bin_tree_storage_t *storage, *next;
954 for (storage = dfa->str_tree_storage; storage; storage = next)
956 next = storage->next;
959 dfa->str_tree_storage = NULL;
960 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
961 dfa->str_tree = NULL;
962 re_free (dfa->org_indices);
963 dfa->org_indices = NULL;
966 /* Create initial states for all contexts. */
969 create_initial_state (re_dfa_t *dfa)
973 re_node_set init_nodes;
975 /* Initial states have the epsilon closure of the node which is
976 the first node of the regular expression. */
977 first = dfa->str_tree->first->node_idx;
978 dfa->init_node = first;
979 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
980 if (BE (err != REG_NOERROR, 0))
983 /* The back-references which are in initial states can epsilon transit,
984 since in this case all of the subexpressions can be null.
985 Then we add epsilon closures of the nodes which are the next nodes of
986 the back-references. */
987 if (dfa->nbackref > 0)
988 for (i = 0; i < init_nodes.nelem; ++i)
990 int node_idx = init_nodes.elems[i];
991 re_token_type_t type = dfa->nodes[node_idx].type;
994 if (type != OP_BACK_REF)
996 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
998 re_token_t *clexp_node;
999 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1000 if (clexp_node->type == OP_CLOSE_SUBEXP
1001 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1004 if (clexp_idx == init_nodes.nelem)
1007 if (type == OP_BACK_REF)
1009 int dest_idx = dfa->edests[node_idx].elems[0];
1010 if (!re_node_set_contains (&init_nodes, dest_idx))
1012 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1018 /* It must be the first time to invoke acquire_state. */
1019 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1020 /* We don't check ERR here, since the initial state must not be NULL. */
1021 if (BE (dfa->init_state == NULL, 0))
1023 if (dfa->init_state->has_constraint)
1025 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1027 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1029 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1033 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1034 || dfa->init_state_begbuf == NULL, 0))
1038 dfa->init_state_word = dfa->init_state_nl
1039 = dfa->init_state_begbuf = dfa->init_state;
1041 re_node_set_free (&init_nodes);
1045 #ifdef RE_ENABLE_I18N
1046 /* If it is possible to do searching in single byte encoding instead of UTF-8
1047 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1048 DFA nodes where needed. */
1051 optimize_utf8 (re_dfa_t *dfa)
1053 int node, i, mb_chars = 0, has_period = 0;
1055 for (node = 0; node < dfa->nodes_len; ++node)
1056 switch (dfa->nodes[node].type)
1059 if (dfa->nodes[node].opr.c >= 0x80)
1063 switch (dfa->nodes[node].opr.ctx_type)
1071 /* Word anchors etc. cannot be handled. It's okay to test
1072 opr.ctx_type since constraints (for all DFA nodes) are
1073 created by ORing one or more opr.ctx_type values. */
1083 case OP_DUP_ASTERISK:
1084 case OP_OPEN_SUBEXP:
1085 case OP_CLOSE_SUBEXP:
1087 case COMPLEX_BRACKET:
1089 case SIMPLE_BRACKET:
1090 /* Just double check. The non-ASCII range starts at 0x80. */
1091 assert (0x80 % BITSET_WORD_BITS == 0);
1092 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1093 if (dfa->nodes[node].opr.sbcset[i])
1100 if (mb_chars || has_period)
1101 for (node = 0; node < dfa->nodes_len; ++node)
1103 if (dfa->nodes[node].type == CHARACTER
1104 && dfa->nodes[node].opr.c >= 0x80)
1105 dfa->nodes[node].mb_partial = 0;
1106 else if (dfa->nodes[node].type == OP_PERIOD)
1107 dfa->nodes[node].type = OP_UTF8_PERIOD;
1110 /* The search can be in single byte locale. */
1111 dfa->mb_cur_max = 1;
1113 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1117 /* Analyze the structure tree, and calculate "first", "next", "edest",
1118 "eclosure", and "inveclosure". */
1120 static reg_errcode_t
1121 analyze (regex_t *preg)
1123 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1126 /* Allocate arrays. */
1127 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1128 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1129 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1130 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1131 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1132 || dfa->eclosures == NULL, 0))
1135 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1136 if (dfa->subexp_map != NULL)
1139 for (i = 0; i < preg->re_nsub; i++)
1140 dfa->subexp_map[i] = i;
1141 preorder (dfa->str_tree, optimize_subexps, dfa);
1142 for (i = 0; i < preg->re_nsub; i++)
1143 if (dfa->subexp_map[i] != i)
1145 if (i == preg->re_nsub)
1147 free (dfa->subexp_map);
1148 dfa->subexp_map = NULL;
1152 ret = postorder (dfa->str_tree, lower_subexps, preg);
1153 if (BE (ret != REG_NOERROR, 0))
1155 ret = postorder (dfa->str_tree, calc_first, dfa);
1156 if (BE (ret != REG_NOERROR, 0))
1158 preorder (dfa->str_tree, calc_next, dfa);
1159 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1160 if (BE (ret != REG_NOERROR, 0))
1162 ret = calc_eclosure (dfa);
1163 if (BE (ret != REG_NOERROR, 0))
1166 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1167 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1168 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1171 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1172 if (BE (dfa->inveclosures == NULL, 0))
1174 ret = calc_inveclosure (dfa);
1180 /* Our parse trees are very unbalanced, so we cannot use a stack to
1181 implement parse tree visits. Instead, we use parent pointers and
1182 some hairy code in these two functions. */
1183 static reg_errcode_t
1184 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1187 bin_tree_t *node, *prev;
1189 for (node = root; ; )
1191 /* Descend down the tree, preferably to the left (or to the right
1192 if that's the only child). */
1193 while (node->left || node->right)
1201 reg_errcode_t err = fn (extra, node);
1202 if (BE (err != REG_NOERROR, 0))
1204 if (node->parent == NULL)
1207 node = node->parent;
1209 /* Go up while we have a node that is reached from the right. */
1210 while (node->right == prev || node->right == NULL);
1215 static reg_errcode_t
1216 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1221 for (node = root; ; )
1223 reg_errcode_t err = fn (extra, node);
1224 if (BE (err != REG_NOERROR, 0))
1227 /* Go to the left node, or up and to the right. */
1232 bin_tree_t *prev = NULL;
1233 while (node->right == prev || node->right == NULL)
1236 node = node->parent;
1245 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1246 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1247 backreferences as well. Requires a preorder visit. */
1248 static reg_errcode_t
1249 optimize_subexps (void *extra, bin_tree_t *node)
1251 re_dfa_t *dfa = (re_dfa_t *) extra;
1253 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1255 int idx = node->token.opr.idx;
1256 node->token.opr.idx = dfa->subexp_map[idx];
1257 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1260 else if (node->token.type == SUBEXP
1261 && node->left && node->left->token.type == SUBEXP)
1263 int other_idx = node->left->token.opr.idx;
1265 node->left = node->left->left;
1267 node->left->parent = node;
1269 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1270 if (other_idx < BITSET_WORD_BITS)
1271 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1277 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1278 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1279 static reg_errcode_t
1280 lower_subexps (void *extra, bin_tree_t *node)
1282 regex_t *preg = (regex_t *) extra;
1283 reg_errcode_t err = REG_NOERROR;
1285 if (node->left && node->left->token.type == SUBEXP)
1287 node->left = lower_subexp (&err, preg, node->left);
1289 node->left->parent = node;
1291 if (node->right && node->right->token.type == SUBEXP)
1293 node->right = lower_subexp (&err, preg, node->right);
1295 node->right->parent = node;
1302 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1304 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1305 bin_tree_t *body = node->left;
1306 bin_tree_t *op, *cls, *tree1, *tree;
1309 /* We do not optimize empty subexpressions, because otherwise we may
1310 have bad CONCAT nodes with NULL children. This is obviously not
1311 very common, so we do not lose much. An example that triggers
1312 this case is the sed "script" /\(\)/x. */
1313 && node->left != NULL
1314 && (node->token.opr.idx >= BITSET_WORD_BITS
1315 || !(dfa->used_bkref_map
1316 & ((bitset_word_t) 1 << node->token.opr.idx))))
1319 /* Convert the SUBEXP node to the concatenation of an
1320 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1321 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1322 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1323 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1324 tree = create_tree (dfa, op, tree1, CONCAT);
1325 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1331 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1332 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1336 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1337 nodes. Requires a postorder visit. */
1338 static reg_errcode_t
1339 calc_first (void *extra, bin_tree_t *node)
1341 re_dfa_t *dfa = (re_dfa_t *) extra;
1342 if (node->token.type == CONCAT)
1344 node->first = node->left->first;
1345 node->node_idx = node->left->node_idx;
1350 node->node_idx = re_dfa_add_node (dfa, node->token);
1351 if (BE (node->node_idx == -1, 0))
1353 if (node->token.type == ANCHOR)
1354 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1359 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1360 static reg_errcode_t
1361 calc_next (void *extra, bin_tree_t *node)
1363 switch (node->token.type)
1365 case OP_DUP_ASTERISK:
1366 node->left->next = node;
1369 node->left->next = node->right->first;
1370 node->right->next = node->next;
1374 node->left->next = node->next;
1376 node->right->next = node->next;
1382 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1383 static reg_errcode_t
1384 link_nfa_nodes (void *extra, bin_tree_t *node)
1386 re_dfa_t *dfa = (re_dfa_t *) extra;
1387 int idx = node->node_idx;
1388 reg_errcode_t err = REG_NOERROR;
1390 switch (node->token.type)
1396 assert (node->next == NULL);
1399 case OP_DUP_ASTERISK:
1403 dfa->has_plural_match = 1;
1404 if (node->left != NULL)
1405 left = node->left->first->node_idx;
1407 left = node->next->node_idx;
1408 if (node->right != NULL)
1409 right = node->right->first->node_idx;
1411 right = node->next->node_idx;
1413 assert (right > -1);
1414 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1419 case OP_OPEN_SUBEXP:
1420 case OP_CLOSE_SUBEXP:
1421 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1425 dfa->nexts[idx] = node->next->node_idx;
1426 if (node->token.type == OP_BACK_REF)
1427 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1431 assert (!IS_EPSILON_NODE (node->token.type));
1432 dfa->nexts[idx] = node->next->node_idx;
1439 /* Duplicate the epsilon closure of the node ROOT_NODE.
1440 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1441 to their own constraint. */
1443 static reg_errcode_t
1445 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1446 int root_node, unsigned int init_constraint)
1448 int org_node, clone_node, ret;
1449 unsigned int constraint = init_constraint;
1450 for (org_node = top_org_node, clone_node = top_clone_node;;)
1452 int org_dest, clone_dest;
1453 if (dfa->nodes[org_node].type == OP_BACK_REF)
1455 /* If the back reference epsilon-transit, its destination must
1456 also have the constraint. Then duplicate the epsilon closure
1457 of the destination of the back reference, and store it in
1458 edests of the back reference. */
1459 org_dest = dfa->nexts[org_node];
1460 re_node_set_empty (dfa->edests + clone_node);
1461 clone_dest = duplicate_node (dfa, org_dest, constraint);
1462 if (BE (clone_dest == -1, 0))
1464 dfa->nexts[clone_node] = dfa->nexts[org_node];
1465 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1466 if (BE (ret < 0, 0))
1469 else if (dfa->edests[org_node].nelem == 0)
1471 /* In case of the node can't epsilon-transit, don't duplicate the
1472 destination and store the original destination as the
1473 destination of the node. */
1474 dfa->nexts[clone_node] = dfa->nexts[org_node];
1477 else if (dfa->edests[org_node].nelem == 1)
1479 /* In case of the node can epsilon-transit, and it has only one
1481 org_dest = dfa->edests[org_node].elems[0];
1482 re_node_set_empty (dfa->edests + clone_node);
1483 /* If the node is root_node itself, it means the epsilon clsoure
1484 has a loop. Then tie it to the destination of the root_node. */
1485 if (org_node == root_node && clone_node != org_node)
1487 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1488 if (BE (ret < 0, 0))
1492 /* In case of the node has another constraint, add it. */
1493 constraint |= dfa->nodes[org_node].constraint;
1494 clone_dest = duplicate_node (dfa, org_dest, constraint);
1495 if (BE (clone_dest == -1, 0))
1497 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1498 if (BE (ret < 0, 0))
1501 else /* dfa->edests[org_node].nelem == 2 */
1503 /* In case of the node can epsilon-transit, and it has two
1504 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1505 org_dest = dfa->edests[org_node].elems[0];
1506 re_node_set_empty (dfa->edests + clone_node);
1507 /* Search for a duplicated node which satisfies the constraint. */
1508 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1509 if (clone_dest == -1)
1511 /* There is no such duplicated node, create a new one. */
1513 clone_dest = duplicate_node (dfa, org_dest, constraint);
1514 if (BE (clone_dest == -1, 0))
1516 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1517 if (BE (ret < 0, 0))
1519 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1520 root_node, constraint);
1521 if (BE (err != REG_NOERROR, 0))
1526 /* There is a duplicated node which satisfies the constraint,
1527 use it to avoid infinite loop. */
1528 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1529 if (BE (ret < 0, 0))
1533 org_dest = dfa->edests[org_node].elems[1];
1534 clone_dest = duplicate_node (dfa, org_dest, constraint);
1535 if (BE (clone_dest == -1, 0))
1537 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1538 if (BE (ret < 0, 0))
1541 org_node = org_dest;
1542 clone_node = clone_dest;
1547 /* Search for a node which is duplicated from the node ORG_NODE, and
1548 satisfies the constraint CONSTRAINT. */
1551 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1552 unsigned int constraint)
1555 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1557 if (org_node == dfa->org_indices[idx]
1558 && constraint == dfa->nodes[idx].constraint)
1559 return idx; /* Found. */
1561 return -1; /* Not found. */
1564 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1565 Return the index of the new node, or -1 if insufficient storage is
1569 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1571 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1572 if (BE (dup_idx != -1, 1))
1574 dfa->nodes[dup_idx].constraint = constraint;
1575 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1576 dfa->nodes[dup_idx].duplicated = 1;
1578 /* Store the index of the original node. */
1579 dfa->org_indices[dup_idx] = org_idx;
1584 static reg_errcode_t
1585 calc_inveclosure (re_dfa_t *dfa)
1588 for (idx = 0; idx < dfa->nodes_len; ++idx)
1589 re_node_set_init_empty (dfa->inveclosures + idx);
1591 for (src = 0; src < dfa->nodes_len; ++src)
1593 int *elems = dfa->eclosures[src].elems;
1594 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1596 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1597 if (BE (ret == -1, 0))
1605 /* Calculate "eclosure" for all the node in DFA. */
1607 static reg_errcode_t
1608 calc_eclosure (re_dfa_t *dfa)
1610 int node_idx, incomplete;
1612 assert (dfa->nodes_len > 0);
1615 /* For each nodes, calculate epsilon closure. */
1616 for (node_idx = 0; ; ++node_idx)
1619 re_node_set eclosure_elem;
1620 if (node_idx == dfa->nodes_len)
1629 assert (dfa->eclosures[node_idx].nelem != -1);
1632 /* If we have already calculated, skip it. */
1633 if (dfa->eclosures[node_idx].nelem != 0)
1635 /* Calculate epsilon closure of `node_idx'. */
1636 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1637 if (BE (err != REG_NOERROR, 0))
1640 if (dfa->eclosures[node_idx].nelem == 0)
1643 re_node_set_free (&eclosure_elem);
1649 /* Calculate epsilon closure of NODE. */
1651 static reg_errcode_t
1652 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1656 re_node_set eclosure;
1659 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1660 if (BE (err != REG_NOERROR, 0))
1663 /* This indicates that we are calculating this node now.
1664 We reference this value to avoid infinite loop. */
1665 dfa->eclosures[node].nelem = -1;
1667 /* If the current node has constraints, duplicate all nodes
1668 since they must inherit the constraints. */
1669 if (dfa->nodes[node].constraint
1670 && dfa->edests[node].nelem
1671 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1673 err = duplicate_node_closure (dfa, node, node, node,
1674 dfa->nodes[node].constraint);
1675 if (BE (err != REG_NOERROR, 0))
1679 /* Expand each epsilon destination nodes. */
1680 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1681 for (i = 0; i < dfa->edests[node].nelem; ++i)
1683 re_node_set eclosure_elem;
1684 int edest = dfa->edests[node].elems[i];
1685 /* If calculating the epsilon closure of `edest' is in progress,
1686 return intermediate result. */
1687 if (dfa->eclosures[edest].nelem == -1)
1692 /* If we haven't calculated the epsilon closure of `edest' yet,
1693 calculate now. Otherwise use calculated epsilon closure. */
1694 if (dfa->eclosures[edest].nelem == 0)
1696 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1697 if (BE (err != REG_NOERROR, 0))
1701 eclosure_elem = dfa->eclosures[edest];
1702 /* Merge the epsilon closure of `edest'. */
1703 re_node_set_merge (&eclosure, &eclosure_elem);
1704 /* If the epsilon closure of `edest' is incomplete,
1705 the epsilon closure of this node is also incomplete. */
1706 if (dfa->eclosures[edest].nelem == 0)
1709 re_node_set_free (&eclosure_elem);
1713 /* An epsilon closure includes itself. */
1714 ret = re_node_set_insert (&eclosure, node);
1715 if (BE (ret < 0, 0))
1717 if (incomplete && !root)
1718 dfa->eclosures[node].nelem = 0;
1720 dfa->eclosures[node] = eclosure;
1721 *new_set = eclosure;
1725 /* Functions for token which are used in the parser. */
1727 /* Fetch a token from INPUT.
1728 We must not use this function inside bracket expressions. */
1732 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1734 re_string_skip_bytes (input, peek_token (result, input, syntax));
1737 /* Peek a token from INPUT, and return the length of the token.
1738 We must not use this function inside bracket expressions. */
1742 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1746 if (re_string_eoi (input))
1748 token->type = END_OF_RE;
1752 c = re_string_peek_byte (input, 0);
1755 token->word_char = 0;
1756 #ifdef RE_ENABLE_I18N
1757 token->mb_partial = 0;
1758 if (string_mb_cur_max (input) > 1 &&
1759 !re_string_first_byte (input, re_string_cur_idx (input)))
1761 token->type = CHARACTER;
1762 token->mb_partial = 1;
1769 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1771 token->type = BACK_SLASH;
1775 c2 = re_string_peek_byte_case (input, 1);
1777 token->type = CHARACTER;
1778 #ifdef RE_ENABLE_I18N
1779 if (string_mb_cur_max (input) > 1)
1781 wint_t wc = re_string_wchar_at (input,
1782 re_string_cur_idx (input) + 1);
1783 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1787 token->word_char = IS_WORD_CHAR (c2) != 0;
1792 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1793 token->type = OP_ALT;
1795 case '1': case '2': case '3': case '4': case '5':
1796 case '6': case '7': case '8': case '9':
1797 if (!(syntax & RE_NO_BK_REFS))
1799 token->type = OP_BACK_REF;
1800 token->opr.idx = c2 - '1';
1804 if (!(syntax & RE_NO_GNU_OPS))
1806 token->type = ANCHOR;
1807 token->opr.ctx_type = WORD_FIRST;
1811 if (!(syntax & RE_NO_GNU_OPS))
1813 token->type = ANCHOR;
1814 token->opr.ctx_type = WORD_LAST;
1818 if (!(syntax & RE_NO_GNU_OPS))
1820 token->type = ANCHOR;
1821 token->opr.ctx_type = WORD_DELIM;
1825 if (!(syntax & RE_NO_GNU_OPS))
1827 token->type = ANCHOR;
1828 token->opr.ctx_type = NOT_WORD_DELIM;
1832 if (!(syntax & RE_NO_GNU_OPS))
1833 token->type = OP_WORD;
1836 if (!(syntax & RE_NO_GNU_OPS))
1837 token->type = OP_NOTWORD;
1840 if (!(syntax & RE_NO_GNU_OPS))
1841 token->type = OP_SPACE;
1844 if (!(syntax & RE_NO_GNU_OPS))
1845 token->type = OP_NOTSPACE;
1848 if (!(syntax & RE_NO_GNU_OPS))
1850 token->type = ANCHOR;
1851 token->opr.ctx_type = BUF_FIRST;
1855 if (!(syntax & RE_NO_GNU_OPS))
1857 token->type = ANCHOR;
1858 token->opr.ctx_type = BUF_LAST;
1862 if (!(syntax & RE_NO_BK_PARENS))
1863 token->type = OP_OPEN_SUBEXP;
1866 if (!(syntax & RE_NO_BK_PARENS))
1867 token->type = OP_CLOSE_SUBEXP;
1870 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1871 token->type = OP_DUP_PLUS;
1874 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1875 token->type = OP_DUP_QUESTION;
1878 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1879 token->type = OP_OPEN_DUP_NUM;
1882 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1883 token->type = OP_CLOSE_DUP_NUM;
1891 token->type = CHARACTER;
1892 #ifdef RE_ENABLE_I18N
1893 if (string_mb_cur_max (input) > 1)
1895 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1896 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1900 token->word_char = IS_WORD_CHAR (token->opr.c);
1905 if (syntax & RE_NEWLINE_ALT)
1906 token->type = OP_ALT;
1909 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1910 token->type = OP_ALT;
1913 token->type = OP_DUP_ASTERISK;
1916 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1917 token->type = OP_DUP_PLUS;
1920 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1921 token->type = OP_DUP_QUESTION;
1924 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1925 token->type = OP_OPEN_DUP_NUM;
1928 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1929 token->type = OP_CLOSE_DUP_NUM;
1932 if (syntax & RE_NO_BK_PARENS)
1933 token->type = OP_OPEN_SUBEXP;
1936 if (syntax & RE_NO_BK_PARENS)
1937 token->type = OP_CLOSE_SUBEXP;
1940 token->type = OP_OPEN_BRACKET;
1943 token->type = OP_PERIOD;
1946 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1947 re_string_cur_idx (input) != 0)
1949 char prev = re_string_peek_byte (input, -1);
1950 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1953 token->type = ANCHOR;
1954 token->opr.ctx_type = LINE_FIRST;
1957 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1958 re_string_cur_idx (input) + 1 != re_string_length (input))
1961 re_string_skip_bytes (input, 1);
1962 peek_token (&next, input, syntax);
1963 re_string_skip_bytes (input, -1);
1964 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1967 token->type = ANCHOR;
1968 token->opr.ctx_type = LINE_LAST;
1976 /* Peek a token from INPUT, and return the length of the token.
1977 We must not use this function out of bracket expressions. */
1981 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1984 if (re_string_eoi (input))
1986 token->type = END_OF_RE;
1989 c = re_string_peek_byte (input, 0);
1992 #ifdef RE_ENABLE_I18N
1993 if (string_mb_cur_max (input) > 1 &&
1994 !re_string_first_byte (input, re_string_cur_idx (input)))
1996 token->type = CHARACTER;
1999 #endif /* RE_ENABLE_I18N */
2001 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2002 && re_string_cur_idx (input) + 1 < re_string_length (input))
2004 /* In this case, '\' escape a character. */
2006 re_string_skip_bytes (input, 1);
2007 c2 = re_string_peek_byte (input, 0);
2009 token->type = CHARACTER;
2012 if (c == '[') /* '[' is a special char in a bracket exps. */
2016 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2017 c2 = re_string_peek_byte (input, 1);
2025 token->type = OP_OPEN_COLL_ELEM;
2028 token->type = OP_OPEN_EQUIV_CLASS;
2031 if (syntax & RE_CHAR_CLASSES)
2033 token->type = OP_OPEN_CHAR_CLASS;
2036 /* else fall through. */
2038 token->type = CHARACTER;
2048 token->type = OP_CHARSET_RANGE;
2051 token->type = OP_CLOSE_BRACKET;
2054 token->type = OP_NON_MATCH_LIST;
2057 token->type = CHARACTER;
2062 /* Functions for parser. */
2064 /* Entry point of the parser.
2065 Parse the regular expression REGEXP and return the structure tree.
2066 If an error is occured, ERR is set by error code, and return NULL.
2067 This function build the following tree, from regular expression <reg_exp>:
2073 CAT means concatenation.
2074 EOR means end of regular expression. */
2077 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2080 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2081 bin_tree_t *tree, *eor, *root;
2082 re_token_t current_token;
2083 dfa->syntax = syntax;
2084 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2085 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2086 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2088 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2090 root = create_tree (dfa, tree, eor, CONCAT);
2093 if (BE (eor == NULL || root == NULL, 0))
2101 /* This function build the following tree, from regular expression
2102 <branch1>|<branch2>:
2108 ALT means alternative, which represents the operator `|'. */
2111 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2112 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2114 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2115 bin_tree_t *tree, *branch = NULL;
2116 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2117 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2120 while (token->type == OP_ALT)
2122 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2123 if (token->type != OP_ALT && token->type != END_OF_RE
2124 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2126 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2127 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2132 tree = create_tree (dfa, tree, branch, OP_ALT);
2133 if (BE (tree == NULL, 0))
2142 /* This function build the following tree, from regular expression
2149 CAT means concatenation. */
2152 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2153 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2155 bin_tree_t *tree, *exp;
2156 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2157 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2158 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2161 while (token->type != OP_ALT && token->type != END_OF_RE
2162 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2164 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2165 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2168 postorder (tree, free_tree, NULL);
2171 if (tree != NULL && exp != NULL)
2173 bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2174 if (newtree == NULL)
2176 postorder (exp, free_tree, NULL);
2177 postorder (tree, free_tree, NULL);
2183 else if (tree == NULL)
2185 /* Otherwise exp == NULL, we don't need to create new tree. */
2190 /* This function build the following tree, from regular expression a*:
2197 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2198 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2200 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2202 switch (token->type)
2205 tree = create_token_tree (dfa, NULL, NULL, token);
2206 if (BE (tree == NULL, 0))
2211 #ifdef RE_ENABLE_I18N
2212 if (dfa_mb_cur_max (dfa) > 1)
2214 while (!re_string_eoi (regexp)
2215 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2217 bin_tree_t *mbc_remain;
2218 fetch_token (token, regexp, syntax);
2219 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2220 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2221 if (BE (mbc_remain == NULL || tree == NULL, 0))
2230 case OP_OPEN_SUBEXP:
2231 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2232 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2235 case OP_OPEN_BRACKET:
2236 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2237 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2241 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2246 dfa->used_bkref_map |= 1 << token->opr.idx;
2247 tree = create_token_tree (dfa, NULL, NULL, token);
2248 if (BE (tree == NULL, 0))
2254 dfa->has_mb_node = 1;
2256 case OP_OPEN_DUP_NUM:
2257 if (syntax & RE_CONTEXT_INVALID_DUP)
2263 case OP_DUP_ASTERISK:
2265 case OP_DUP_QUESTION:
2266 if (syntax & RE_CONTEXT_INVALID_OPS)
2271 else if (syntax & RE_CONTEXT_INDEP_OPS)
2273 fetch_token (token, regexp, syntax);
2274 return parse_expression (regexp, preg, token, syntax, nest, err);
2276 /* else fall through */
2277 case OP_CLOSE_SUBEXP:
2278 if ((token->type == OP_CLOSE_SUBEXP) &&
2279 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2284 /* else fall through */
2285 case OP_CLOSE_DUP_NUM:
2286 /* We treat it as a normal character. */
2288 /* Then we can these characters as normal characters. */
2289 token->type = CHARACTER;
2290 /* mb_partial and word_char bits should be initialized already
2292 tree = create_token_tree (dfa, NULL, NULL, token);
2293 if (BE (tree == NULL, 0))
2300 if ((token->opr.ctx_type
2301 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2302 && dfa->word_ops_used == 0)
2303 init_word_char (dfa);
2304 if (token->opr.ctx_type == WORD_DELIM
2305 || token->opr.ctx_type == NOT_WORD_DELIM)
2307 bin_tree_t *tree_first, *tree_last;
2308 if (token->opr.ctx_type == WORD_DELIM)
2310 token->opr.ctx_type = WORD_FIRST;
2311 tree_first = create_token_tree (dfa, NULL, NULL, token);
2312 token->opr.ctx_type = WORD_LAST;
2316 token->opr.ctx_type = INSIDE_WORD;
2317 tree_first = create_token_tree (dfa, NULL, NULL, token);
2318 token->opr.ctx_type = INSIDE_NOTWORD;
2320 tree_last = create_token_tree (dfa, NULL, NULL, token);
2321 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2322 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2330 tree = create_token_tree (dfa, NULL, NULL, token);
2331 if (BE (tree == NULL, 0))
2337 /* We must return here, since ANCHORs can't be followed
2338 by repetition operators.
2339 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2340 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2341 fetch_token (token, regexp, syntax);
2344 tree = create_token_tree (dfa, NULL, NULL, token);
2345 if (BE (tree == NULL, 0))
2350 if (dfa_mb_cur_max (dfa) > 1)
2351 dfa->has_mb_node = 1;
2355 tree = build_charclass_op (dfa, regexp->trans,
2356 (const unsigned char *) "alnum",
2357 (const unsigned char *) "_",
2358 token->type == OP_NOTWORD, err);
2359 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2364 tree = build_charclass_op (dfa, regexp->trans,
2365 (const unsigned char *) "space",
2366 (const unsigned char *) "",
2367 token->type == OP_NOTSPACE, err);
2368 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2378 /* Must not happen? */
2384 fetch_token (token, regexp, syntax);
2386 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2387 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2389 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2390 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2392 /* In BRE consecutive duplications are not allowed. */
2393 if ((syntax & RE_CONTEXT_INVALID_DUP)
2394 && (token->type == OP_DUP_ASTERISK
2395 || token->type == OP_OPEN_DUP_NUM))
2405 /* This function build the following tree, from regular expression
2413 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2414 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2416 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2419 cur_nsub = preg->re_nsub++;
2421 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2423 /* The subexpression may be a null string. */
2424 if (token->type == OP_CLOSE_SUBEXP)
2428 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2429 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2432 postorder (tree, free_tree, NULL);
2435 if (BE (*err != REG_NOERROR, 0))
2439 if (cur_nsub <= '9' - '1')
2440 dfa->completed_bkref_map |= 1 << cur_nsub;
2442 tree = create_tree (dfa, tree, NULL, SUBEXP);
2443 if (BE (tree == NULL, 0))
2448 tree->token.opr.idx = cur_nsub;
2452 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2455 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2456 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2458 bin_tree_t *tree = NULL, *old_tree = NULL;
2459 int i, start, end, start_idx = re_string_cur_idx (regexp);
2460 re_token_t start_token = *token;
2462 if (token->type == OP_OPEN_DUP_NUM)
2465 start = fetch_number (regexp, token, syntax);
2468 if (token->type == CHARACTER && token->opr.c == ',')
2469 start = 0; /* We treat "{,m}" as "{0,m}". */
2472 *err = REG_BADBR; /* <re>{} is invalid. */
2476 if (BE (start != -2, 1))
2478 /* We treat "{n}" as "{n,n}". */
2479 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2480 : ((token->type == CHARACTER && token->opr.c == ',')
2481 ? fetch_number (regexp, token, syntax) : -2));
2483 if (BE (start == -2 || end == -2, 0))
2485 /* Invalid sequence. */
2486 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2488 if (token->type == END_OF_RE)
2496 /* If the syntax bit is set, rollback. */
2497 re_string_set_index (regexp, start_idx);
2498 *token = start_token;
2499 token->type = CHARACTER;
2500 /* mb_partial and word_char bits should be already initialized by
2505 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2507 /* First number greater than second. */
2514 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2515 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2518 fetch_token (token, regexp, syntax);
2520 if (BE (elem == NULL, 0))
2522 if (BE (start == 0 && end == 0, 0))
2524 postorder (elem, free_tree, NULL);
2528 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2529 if (BE (start > 0, 0))
2532 for (i = 2; i <= start; ++i)
2534 elem = duplicate_tree (elem, dfa);
2535 tree = create_tree (dfa, tree, elem, CONCAT);
2536 if (BE (elem == NULL || tree == NULL, 0))
2537 goto parse_dup_op_espace;
2543 /* Duplicate ELEM before it is marked optional. */
2544 elem = duplicate_tree (elem, dfa);
2550 if (elem->token.type == SUBEXP)
2551 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2553 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2554 if (BE (tree == NULL, 0))
2555 goto parse_dup_op_espace;
2557 /* This loop is actually executed only when end != -1,
2558 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2559 already created the start+1-th copy. */
2560 for (i = start + 2; i <= end; ++i)
2562 elem = duplicate_tree (elem, dfa);
2563 tree = create_tree (dfa, tree, elem, CONCAT);
2564 if (BE (elem == NULL || tree == NULL, 0))
2565 goto parse_dup_op_espace;
2567 tree = create_tree (dfa, tree, NULL, OP_ALT);
2568 if (BE (tree == NULL, 0))
2569 goto parse_dup_op_espace;
2573 tree = create_tree (dfa, old_tree, tree, CONCAT);
2577 parse_dup_op_espace:
2582 /* Size of the names for collating symbol/equivalence_class/character_class.
2583 I'm not sure, but maybe enough. */
2584 #define BRACKET_NAME_BUF_SIZE 32
2587 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2588 Build the range expression which starts from START_ELEM, and ends
2589 at END_ELEM. The result are written to MBCSET and SBCSET.
2590 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2591 mbcset->range_ends, is a pointer argument sinse we may
2594 static reg_errcode_t
2596 # ifdef RE_ENABLE_I18N
2597 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2598 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2599 # else /* not RE_ENABLE_I18N */
2600 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2601 bracket_elem_t *end_elem)
2602 # endif /* not RE_ENABLE_I18N */
2604 unsigned int start_ch, end_ch;
2605 /* Equivalence Classes and Character Classes can't be a range start/end. */
2606 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2607 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2611 /* We can handle no multi character collating elements without libc
2613 if (BE ((start_elem->type == COLL_SYM
2614 && strlen ((char *) start_elem->opr.name) > 1)
2615 || (end_elem->type == COLL_SYM
2616 && strlen ((char *) end_elem->opr.name) > 1), 0))
2617 return REG_ECOLLATE;
2619 # ifdef RE_ENABLE_I18N
2624 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2626 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2627 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2629 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2630 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2632 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2633 ? __btowc (start_ch) : start_elem->opr.wch);
2634 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2635 ? __btowc (end_ch) : end_elem->opr.wch);
2636 if (start_wc == WEOF || end_wc == WEOF)
2637 return REG_ECOLLATE;
2638 cmp_buf[0] = start_wc;
2639 cmp_buf[4] = end_wc;
2640 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2643 /* Got valid collation sequence values, add them as a new entry.
2644 However, for !_LIBC we have no collation elements: if the
2645 character set is single byte, the single byte character set
2646 that we build below suffices. parse_bracket_exp passes
2647 no MBCSET if dfa_mb_cur_max (dfa) == 1. */
2650 /* Check the space of the arrays. */
2651 if (BE (*range_alloc == mbcset->nranges, 0))
2653 /* There is not enough space, need realloc. */
2654 wchar_t *new_array_start, *new_array_end;
2657 /* +1 in case of mbcset->nranges is 0. */
2658 new_nranges = 2 * mbcset->nranges + 1;
2659 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2660 are NULL if *range_alloc == 0. */
2661 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2663 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2666 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2669 mbcset->range_starts = new_array_start;
2670 mbcset->range_ends = new_array_end;
2671 *range_alloc = new_nranges;
2674 mbcset->range_starts[mbcset->nranges] = start_wc;
2675 mbcset->range_ends[mbcset->nranges++] = end_wc;
2678 /* Build the table for single byte characters. */
2679 for (wc = 0; wc < SBC_MAX; ++wc)
2682 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2683 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2684 bitset_set (sbcset, wc);
2687 # else /* not RE_ENABLE_I18N */
2690 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2691 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2693 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2694 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2696 if (start_ch > end_ch)
2698 /* Build the table for single byte characters. */
2699 for (ch = 0; ch < SBC_MAX; ++ch)
2700 if (start_ch <= ch && ch <= end_ch)
2701 bitset_set (sbcset, ch);
2703 # endif /* not RE_ENABLE_I18N */
2706 #endif /* not _LIBC */
2709 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2710 Build the collating element which is represented by NAME.
2711 The result are written to MBCSET and SBCSET.
2712 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2713 pointer argument since we may update it. */
2715 static reg_errcode_t
2717 # ifdef RE_ENABLE_I18N
2718 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2719 int *coll_sym_alloc, const unsigned char *name)
2720 # else /* not RE_ENABLE_I18N */
2721 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2722 # endif /* not RE_ENABLE_I18N */
2724 size_t name_len = strlen ((const char *) name);
2725 if (BE (name_len != 1, 0))
2726 return REG_ECOLLATE;
2729 bitset_set (sbcset, name[0]);
2733 #endif /* not _LIBC */
2735 /* This function parse bracket expression like "[abc]", "[a-c]",
2739 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2740 reg_syntax_t syntax, reg_errcode_t *err)
2743 #if __OPTION_EGLIBC_LOCALE_CODE
2744 const unsigned char *collseqmb;
2745 # define COLLSEQMB_LOOKUP(ix) (collseqmb[(ix)])
2747 # define COLLSEQMB_LOOKUP(ix) (ix)
2750 const char *collseqwc;
2753 const int32_t *symb_table;
2754 const unsigned char *extra;
2756 /* Local function for parse_bracket_exp used in _LIBC environement.
2757 Seek the collating symbol entry correspondings to NAME.
2758 Return the index of the symbol in the SYMB_TABLE. */
2761 __attribute ((always_inline))
2762 seek_collating_symbol_entry (name, name_len)
2763 const unsigned char *name;
2766 int32_t hash = elem_hash ((const char *) name, name_len);
2767 int32_t elem = hash % table_size;
2768 if (symb_table[2 * elem] != 0)
2770 int32_t second = hash % (table_size - 2) + 1;
2774 /* First compare the hashing value. */
2775 if (symb_table[2 * elem] == hash
2776 /* Compare the length of the name. */
2777 && name_len == extra[symb_table[2 * elem + 1]]
2778 /* Compare the name. */
2779 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2782 /* Yep, this is the entry. */
2789 while (symb_table[2 * elem] != 0);
2794 /* Local function for parse_bracket_exp used in _LIBC environment.
2795 Look up the collation sequence value of BR_ELEM.
2796 Return the value if succeeded, UINT_MAX otherwise. */
2798 auto inline unsigned int
2799 __attribute ((always_inline))
2800 lookup_collation_sequence_value (br_elem)
2801 bracket_elem_t *br_elem;
2803 if (br_elem->type == SB_CHAR)
2806 if (MB_CUR_MAX == 1)
2809 return COLLSEQMB_LOOKUP (br_elem->opr.ch);
2812 wint_t wc = __btowc (br_elem->opr.ch);
2813 return __collseq_table_lookup (collseqwc, wc);
2816 #if __OPTION_EGLIBC_LOCALE_CODE
2817 else if (br_elem->type == MB_CHAR)
2820 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2823 else if (br_elem->type == COLL_SYM)
2825 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2829 elem = seek_collating_symbol_entry (br_elem->opr.name,
2831 if (symb_table[2 * elem] != 0)
2833 /* We found the entry. */
2834 idx = symb_table[2 * elem + 1];
2835 /* Skip the name of collating element name. */
2836 idx += 1 + extra[idx];
2837 /* Skip the byte sequence of the collating element. */
2838 idx += 1 + extra[idx];
2839 /* Adjust for the alignment. */
2840 idx = (idx + 3) & ~3;
2841 /* Skip the multibyte collation sequence value. */
2842 idx += sizeof (unsigned int);
2843 /* Skip the wide char sequence of the collating element. */
2844 idx += sizeof (unsigned int) *
2845 (1 + *(unsigned int *) (extra + idx));
2846 /* Return the collation sequence value. */
2847 return *(unsigned int *) (extra + idx);
2849 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2851 /* No valid character. Match it as a single byte
2853 return COLLSEQMB_LOOKUP (br_elem->opr.name[0]);
2856 else if (sym_name_len == 1)
2857 return COLLSEQMB_LOOKUP (br_elem->opr.name[0]);
2862 /* Local function for parse_bracket_exp used in _LIBC environement.
2863 Build the range expression which starts from START_ELEM, and ends
2864 at END_ELEM. The result are written to MBCSET and SBCSET.
2865 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2866 mbcset->range_ends, is a pointer argument sinse we may
2869 auto inline reg_errcode_t
2870 __attribute ((always_inline))
2871 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2872 re_charset_t *mbcset;
2875 bracket_elem_t *start_elem, *end_elem;
2878 uint32_t start_collseq;
2879 uint32_t end_collseq;
2881 /* Equivalence Classes and Character Classes can't be a range
2883 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2884 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2888 start_collseq = lookup_collation_sequence_value (start_elem);
2889 end_collseq = lookup_collation_sequence_value (end_elem);
2890 /* Check start/end collation sequence values. */
2891 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2892 return REG_ECOLLATE;
2893 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2896 /* Got valid collation sequence values, add them as a new entry.
2897 However, if we have no collation elements, and the character set
2898 is single byte, the single byte character set that we
2899 build below suffices. */
2900 if (nrules > 0 || dfa_mb_cur_max (dfa) > 1)
2902 /* Check the space of the arrays. */
2903 if (BE (*range_alloc == mbcset->nranges, 0))
2905 /* There is not enough space, need realloc. */
2906 uint32_t *new_array_start;
2907 uint32_t *new_array_end;
2910 /* +1 in case of mbcset->nranges is 0. */
2911 new_nranges = 2 * mbcset->nranges + 1;
2912 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2914 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2917 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2920 mbcset->range_starts = new_array_start;
2921 mbcset->range_ends = new_array_end;
2922 *range_alloc = new_nranges;
2925 mbcset->range_starts[mbcset->nranges] = start_collseq;
2926 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2929 /* Build the table for single byte characters. */
2930 for (ch = 0; ch < SBC_MAX; ch++)
2932 uint32_t ch_collseq;
2934 if (MB_CUR_MAX == 1)
2937 ch_collseq = COLLSEQMB_LOOKUP (ch);
2939 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2940 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2941 bitset_set (sbcset, ch);
2946 /* Local function for parse_bracket_exp used in _LIBC environement.
2947 Build the collating element which is represented by NAME.
2948 The result are written to MBCSET and SBCSET.
2949 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2950 pointer argument sinse we may update it. */
2952 auto inline reg_errcode_t
2953 __attribute ((always_inline))
2954 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2955 re_charset_t *mbcset;
2956 int *coll_sym_alloc;
2958 const unsigned char *name;
2961 size_t name_len = strlen ((const char *) name);
2964 elem = seek_collating_symbol_entry (name, name_len);
2965 if (symb_table[2 * elem] != 0)
2967 /* We found the entry. */
2968 idx = symb_table[2 * elem + 1];
2969 /* Skip the name of collating element name. */
2970 idx += 1 + extra[idx];
2972 else if (symb_table[2 * elem] == 0 && name_len == 1)
2974 /* No valid character, treat it as a normal
2976 bitset_set (sbcset, name[0]);
2980 return REG_ECOLLATE;
2982 /* Got valid collation sequence, add it as a new entry. */
2983 /* Check the space of the arrays. */
2984 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2986 /* Not enough, realloc it. */
2987 /* +1 in case of mbcset->ncoll_syms is 0. */
2988 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2989 /* Use realloc since mbcset->coll_syms is NULL
2991 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2992 new_coll_sym_alloc);
2993 if (BE (new_coll_syms == NULL, 0))
2995 mbcset->coll_syms = new_coll_syms;
2996 *coll_sym_alloc = new_coll_sym_alloc;
2998 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3003 if (BE (name_len != 1, 0))
3004 return REG_ECOLLATE;
3007 bitset_set (sbcset, name[0]);
3014 re_token_t br_token;
3015 re_bitset_ptr_t sbcset;
3016 #ifdef RE_ENABLE_I18N
3017 re_charset_t *mbcset;
3018 int coll_sym_alloc = 0, range_alloc = 0;
3019 #if __OPTION_EGLIBC_LOCALE_CODE
3020 int mbchar_alloc = 0;
3022 int equiv_class_alloc = 0, char_class_alloc = 0;
3023 #endif /* not RE_ENABLE_I18N */
3025 bin_tree_t *work_tree;
3027 int first_round = 1;
3029 #if __OPTION_EGLIBC_LOCALE_CODE
3030 collseqmb = (const unsigned char *)
3031 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3032 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3034 /* This is true when OPTION_EGLIBC_LOCALE_CODE is disabled, but the
3035 compiler can't figure that out. */
3043 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3044 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3045 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3046 _NL_COLLATE_SYMB_TABLEMB);
3047 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3048 _NL_COLLATE_SYMB_EXTRAMB);
3051 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3052 #ifdef RE_ENABLE_I18N
3053 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3054 #endif /* RE_ENABLE_I18N */
3055 #ifdef RE_ENABLE_I18N
3056 if (BE (sbcset == NULL || mbcset == NULL, 0))
3058 if (BE (sbcset == NULL, 0))
3059 #endif /* RE_ENABLE_I18N */
3062 #ifdef RE_ENABLE_I18N
3069 token_len = peek_token_bracket (token, regexp, syntax);
3070 if (BE (token->type == END_OF_RE, 0))
3073 goto parse_bracket_exp_free_return;
3075 if (token->type == OP_NON_MATCH_LIST)
3077 #ifdef RE_ENABLE_I18N
3078 mbcset->non_match = 1;
3079 #endif /* not RE_ENABLE_I18N */
3081 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3082 bitset_set (sbcset, '\n');
3083 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3084 token_len = peek_token_bracket (token, regexp, syntax);
3085 if (BE (token->type == END_OF_RE, 0))
3088 goto parse_bracket_exp_free_return;
3092 /* We treat the first ']' as a normal character. */
3093 if (token->type == OP_CLOSE_BRACKET)
3094 token->type = CHARACTER;
3098 bracket_elem_t start_elem, end_elem;
3099 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3100 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3102 int token_len2 = 0, is_range_exp = 0;
3105 start_elem.opr.name = start_name_buf;
3106 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3107 syntax, first_round);
3108 if (BE (ret != REG_NOERROR, 0))
3111 goto parse_bracket_exp_free_return;
3115 /* Get information about the next token. We need it in any case. */
3116 token_len = peek_token_bracket (token, regexp, syntax);
3118 /* Do not check for ranges if we know they are not allowed. */
3119 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3121 if (BE (token->type == END_OF_RE, 0))
3124 goto parse_bracket_exp_free_return;
3126 if (token->type == OP_CHARSET_RANGE)
3128 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3129 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3130 if (BE (token2.type == END_OF_RE, 0))
3133 goto parse_bracket_exp_free_return;
3135 if (token2.type == OP_CLOSE_BRACKET)
3137 /* We treat the last '-' as a normal character. */
3138 re_string_skip_bytes (regexp, -token_len);
3139 token->type = CHARACTER;
3146 if (is_range_exp == 1)
3148 end_elem.opr.name = end_name_buf;
3149 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3151 if (BE (ret != REG_NOERROR, 0))
3154 goto parse_bracket_exp_free_return;
3157 token_len = peek_token_bracket (token, regexp, syntax);
3160 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3161 &start_elem, &end_elem);
3163 # ifdef RE_ENABLE_I18N
3164 *err = build_range_exp (sbcset,
3165 dfa_mb_cur_max (dfa) > 1 ? mbcset : NULL,
3166 &range_alloc, &start_elem, &end_elem);
3168 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3170 #endif /* RE_ENABLE_I18N */
3171 if (BE (*err != REG_NOERROR, 0))
3172 goto parse_bracket_exp_free_return;
3176 switch (start_elem.type)
3179 bitset_set (sbcset, start_elem.opr.ch);
3181 #if defined RE_ENABLE_I18N && __OPTION_EGLIBC_LOCALE_CODE
3183 /* Check whether the array has enough space. */
3184 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3186 wchar_t *new_mbchars;
3187 /* Not enough, realloc it. */
3188 /* +1 in case of mbcset->nmbchars is 0. */
3189 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3190 /* Use realloc since array is NULL if *alloc == 0. */
3191 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3193 if (BE (new_mbchars == NULL, 0))
3194 goto parse_bracket_exp_espace;
3195 mbcset->mbchars = new_mbchars;
3197 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3199 #endif /* RE_ENABLE_I18N && __OPTION_EGLIBC_LOCALE_CODE */
3201 *err = build_equiv_class (sbcset,
3202 #ifdef RE_ENABLE_I18N
3203 mbcset, &equiv_class_alloc,
3204 #endif /* RE_ENABLE_I18N */
3205 start_elem.opr.name);
3206 if (BE (*err != REG_NOERROR, 0))
3207 goto parse_bracket_exp_free_return;
3210 *err = build_collating_symbol (sbcset,
3211 #ifdef RE_ENABLE_I18N
3212 mbcset, &coll_sym_alloc,
3213 #endif /* RE_ENABLE_I18N */
3214 start_elem.opr.name);
3215 if (BE (*err != REG_NOERROR, 0))
3216 goto parse_bracket_exp_free_return;
3219 *err = build_charclass (regexp->trans, sbcset,
3220 #ifdef RE_ENABLE_I18N
3221 mbcset, &char_class_alloc,
3222 #endif /* RE_ENABLE_I18N */
3223 start_elem.opr.name, syntax);
3224 if (BE (*err != REG_NOERROR, 0))
3225 goto parse_bracket_exp_free_return;
3232 if (BE (token->type == END_OF_RE, 0))
3235 goto parse_bracket_exp_free_return;
3237 if (token->type == OP_CLOSE_BRACKET)
3241 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3243 /* If it is non-matching list. */
3245 bitset_not (sbcset);
3247 #ifdef RE_ENABLE_I18N
3248 /* Ensure only single byte characters are set. */
3249 if (dfa_mb_cur_max (dfa) > 1)
3250 bitset_mask (sbcset, dfa->sb_char);
3252 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3253 || mbcset->nranges || (dfa_mb_cur_max (dfa) > 1 && (mbcset->nchar_classes
3254 || mbcset->non_match)))
3256 bin_tree_t *mbc_tree;
3258 /* Build a tree for complex bracket. */
3259 dfa->has_mb_node = 1;
3260 br_token.type = COMPLEX_BRACKET;
3261 br_token.opr.mbcset = mbcset;
3262 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3263 if (BE (mbc_tree == NULL, 0))
3264 goto parse_bracket_exp_espace;
3265 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3266 if (sbcset[sbc_idx])
3268 /* If there are no bits set in sbcset, there is no point
3269 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3270 if (sbc_idx < BITSET_WORDS)
3272 /* Build a tree for simple bracket. */
3273 br_token.type = SIMPLE_BRACKET;
3274 br_token.opr.sbcset = sbcset;
3275 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3276 if (BE (work_tree == NULL, 0))
3277 goto parse_bracket_exp_espace;
3279 /* Then join them by ALT node. */
3280 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3281 if (BE (work_tree == NULL, 0))
3282 goto parse_bracket_exp_espace;
3287 work_tree = mbc_tree;
3291 #endif /* not RE_ENABLE_I18N */
3293 #ifdef RE_ENABLE_I18N
3294 free_charset (mbcset);
3296 /* Build a tree for simple bracket. */
3297 br_token.type = SIMPLE_BRACKET;
3298 br_token.opr.sbcset = sbcset;
3299 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3300 if (BE (work_tree == NULL, 0))
3301 goto parse_bracket_exp_espace;
3305 parse_bracket_exp_espace:
3307 parse_bracket_exp_free_return:
3309 #ifdef RE_ENABLE_I18N
3310 free_charset (mbcset);
3311 #endif /* RE_ENABLE_I18N */
3315 /* Parse an element in the bracket expression. */
3317 static reg_errcode_t
3318 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3319 re_token_t *token, int token_len, re_dfa_t *dfa,
3320 reg_syntax_t syntax, int accept_hyphen)
3322 #if defined RE_ENABLE_I18N && __OPTION_EGLIBC_LOCALE_CODE
3324 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3325 if (cur_char_size > 1)
3327 elem->type = MB_CHAR;
3328 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3329 re_string_skip_bytes (regexp, cur_char_size);
3332 #endif /* RE_ENABLE_I18N && __OPTION_EGLIBC_LOCALE_CODE */
3333 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3334 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3335 || token->type == OP_OPEN_EQUIV_CLASS)
3336 return parse_bracket_symbol (elem, regexp, token);
3337 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3339 /* A '-' must only appear as anything but a range indicator before
3340 the closing bracket. Everything else is an error. */
3342 (void) peek_token_bracket (&token2, regexp, syntax);
3343 if (token2.type != OP_CLOSE_BRACKET)
3344 /* The actual error value is not standardized since this whole
3345 case is undefined. But ERANGE makes good sense. */
3348 elem->type = SB_CHAR;
3349 elem->opr.ch = token->opr.c;
3353 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3354 such as [:<character_class>:], [.<collating_element>.], and
3355 [=<equivalent_class>=]. */
3357 static reg_errcode_t
3358 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3361 unsigned char ch, delim = token->opr.c;
3363 if (re_string_eoi(regexp))
3367 if (i >= BRACKET_NAME_BUF_SIZE)
3369 if (token->type == OP_OPEN_CHAR_CLASS)
3370 ch = re_string_fetch_byte_case (regexp);
3372 ch = re_string_fetch_byte (regexp);
3373 if (re_string_eoi(regexp))
3375 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3377 elem->opr.name[i] = ch;
3379 re_string_skip_bytes (regexp, 1);
3380 elem->opr.name[i] = '\0';
3381 switch (token->type)
3383 case OP_OPEN_COLL_ELEM:
3384 elem->type = COLL_SYM;
3386 case OP_OPEN_EQUIV_CLASS:
3387 elem->type = EQUIV_CLASS;
3389 case OP_OPEN_CHAR_CLASS:
3390 elem->type = CHAR_CLASS;
3398 /* Helper function for parse_bracket_exp.
3399 Build the equivalence class which is represented by NAME.
3400 The result are written to MBCSET and SBCSET.
3401 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3402 is a pointer argument sinse we may update it. */
3404 static reg_errcode_t
3405 #ifdef RE_ENABLE_I18N
3406 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3407 int *equiv_class_alloc, const unsigned char *name)
3408 #else /* not RE_ENABLE_I18N */
3409 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3410 #endif /* not RE_ENABLE_I18N */
3412 /* When __OPTION_EGLIBC_LOCALE_CODE is disabled, only the C locale
3413 is supported; it has no collation rules. */
3414 #if defined _LIBC && __OPTION_EGLIBC_LOCALE_CODE
3415 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3418 const int32_t *table, *indirect;
3419 const unsigned char *weights, *extra, *cp;
3420 unsigned char char_buf[2];
3424 /* This #include defines a local function! */
3425 # include <locale/weight.h>
3426 /* Calculate the index for equivalence class. */
3428 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3429 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3430 _NL_COLLATE_WEIGHTMB);
3431 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3432 _NL_COLLATE_EXTRAMB);
3433 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3434 _NL_COLLATE_INDIRECTMB);
3435 idx1 = findidx (&cp);
3436 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3437 /* This isn't a valid character. */
3438 return REG_ECOLLATE;
3440 /* Build single byte matcing table for this equivalence class. */
3441 char_buf[1] = (unsigned char) '\0';
3442 len = weights[idx1 & 0xffffff];
3443 for (ch = 0; ch < SBC_MAX; ++ch)
3447 idx2 = findidx (&cp);
3452 /* This isn't a valid character. */
3454 /* Compare only if the length matches and the collation rule
3455 index is the same. */
3456 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3460 while (cnt <= len &&
3461 weights[(idx1 & 0xffffff) + 1 + cnt]
3462 == weights[(idx2 & 0xffffff) + 1 + cnt])
3466 bitset_set (sbcset, ch);
3469 /* Check whether the array has enough space. */
3470 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3472 /* Not enough, realloc it. */
3473 /* +1 in case of mbcset->nequiv_classes is 0. */
3474 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3475 /* Use realloc since the array is NULL if *alloc == 0. */
3476 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3478 new_equiv_class_alloc);
3479 if (BE (new_equiv_classes == NULL, 0))
3481 mbcset->equiv_classes = new_equiv_classes;
3482 *equiv_class_alloc = new_equiv_class_alloc;
3484 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3487 #endif /* _LIBC && __OPTION_EGLIBC_LOCALE_CODE */
3489 if (BE (strlen ((const char *) name) != 1, 0))
3490 return REG_ECOLLATE;
3491 bitset_set (sbcset, *name);
3496 /* Helper function for parse_bracket_exp.
3497 Build the character class which is represented by NAME.
3498 The result are written to MBCSET and SBCSET.
3499 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3500 is a pointer argument sinse we may update it. */
3502 static reg_errcode_t
3503 #ifdef RE_ENABLE_I18N
3504 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3505 re_charset_t *mbcset, int *char_class_alloc,
3506 const unsigned char *class_name, reg_syntax_t syntax)
3507 #else /* not RE_ENABLE_I18N */
3508 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3509 const unsigned char *class_name, reg_syntax_t syntax)
3510 #endif /* not RE_ENABLE_I18N */
3513 const char *name = (const char *) class_name;
3515 /* In case of REG_ICASE "upper" and "lower" match the both of
3516 upper and lower cases. */
3517 if ((syntax & RE_ICASE)
3518 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3521 #if defined RE_ENABLE_I18N && __OPTION_EGLIBC_LOCALE_CODE
3522 /* Check the space of the arrays. */
3523 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3525 /* Not enough, realloc it. */
3526 /* +1 in case of mbcset->nchar_classes is 0. */
3527 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3528 /* Use realloc since array is NULL if *alloc == 0. */
3529 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3530 new_char_class_alloc);
3531 if (BE (new_char_classes == NULL, 0))
3533 mbcset->char_classes = new_char_classes;
3534 *char_class_alloc = new_char_class_alloc;
3536 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3537 #endif /* RE_ENABLE_I18N && __OPTION_EGLIBC_LOCALE_CODE */
3539 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3541 if (BE (trans != NULL, 0)) \
3543 for (i = 0; i < SBC_MAX; ++i) \
3544 if (ctype_func (i)) \
3545 bitset_set (sbcset, trans[i]); \
3549 for (i = 0; i < SBC_MAX; ++i) \
3550 if (ctype_func (i)) \
3551 bitset_set (sbcset, i); \
3555 if (strcmp (name, "alnum") == 0)
3556 BUILD_CHARCLASS_LOOP (isalnum);
3557 else if (strcmp (name, "cntrl") == 0)
3558 BUILD_CHARCLASS_LOOP (iscntrl);
3559 else if (strcmp (name, "lower") == 0)
3560 BUILD_CHARCLASS_LOOP (islower);
3561 else if (strcmp (name, "space") == 0)
3562 BUILD_CHARCLASS_LOOP (isspace);
3563 else if (strcmp (name, "alpha") == 0)
3564 BUILD_CHARCLASS_LOOP (isalpha);
3565 else if (strcmp (name, "digit") == 0)
3566 BUILD_CHARCLASS_LOOP (isdigit);
3567 else if (strcmp (name, "print") == 0)
3568 BUILD_CHARCLASS_LOOP (isprint);
3569 else if (strcmp (name, "upper") == 0)
3570 BUILD_CHARCLASS_LOOP (isupper);
3571 else if (strcmp (name, "blank") == 0)
3572 BUILD_CHARCLASS_LOOP (isblank);
3573 else if (strcmp (name, "graph") == 0)
3574 BUILD_CHARCLASS_LOOP (isgraph);
3575 else if (strcmp (name, "punct") == 0)
3576 BUILD_CHARCLASS_LOOP (ispunct);
3577 else if (strcmp (name, "xdigit") == 0)
3578 BUILD_CHARCLASS_LOOP (isxdigit);
3586 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3587 const unsigned char *class_name,
3588 const unsigned char *extra, int non_match,
3591 re_bitset_ptr_t sbcset;
3592 #ifdef RE_ENABLE_I18N
3593 re_charset_t *mbcset;
3595 #endif /* not RE_ENABLE_I18N */
3597 re_token_t br_token;
3600 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3601 #ifdef RE_ENABLE_I18N
3602 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3603 #endif /* RE_ENABLE_I18N */
3605 #ifdef RE_ENABLE_I18N
3606 if (BE (sbcset == NULL || mbcset == NULL, 0))
3607 #else /* not RE_ENABLE_I18N */
3608 if (BE (sbcset == NULL, 0))
3609 #endif /* not RE_ENABLE_I18N */
3617 #ifdef RE_ENABLE_I18N
3618 mbcset->non_match = 1;
3619 #endif /* not RE_ENABLE_I18N */
3622 /* We don't care the syntax in this case. */
3623 ret = build_charclass (trans, sbcset,
3624 #ifdef RE_ENABLE_I18N
3626 #endif /* RE_ENABLE_I18N */
3629 if (BE (ret != REG_NOERROR, 0))
3632 #ifdef RE_ENABLE_I18N
3633 free_charset (mbcset);
3634 #endif /* RE_ENABLE_I18N */
3638 /* \w match '_' also. */
3639 for (; *extra; extra++)
3640 bitset_set (sbcset, *extra);
3642 /* If it is non-matching list. */
3644 bitset_not (sbcset);
3646 #ifdef RE_ENABLE_I18N
3647 /* Ensure only single byte characters are set. */
3648 if (dfa_mb_cur_max (dfa) > 1)
3649 bitset_mask (sbcset, dfa->sb_char);
3652 /* Build a tree for simple bracket. */
3653 br_token.type = SIMPLE_BRACKET;
3654 br_token.opr.sbcset = sbcset;
3655 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3656 if (BE (tree == NULL, 0))
3657 goto build_word_op_espace;
3659 #ifdef RE_ENABLE_I18N
3660 if (dfa_mb_cur_max (dfa) > 1)
3662 bin_tree_t *mbc_tree;
3663 /* Build a tree for complex bracket. */
3664 br_token.type = COMPLEX_BRACKET;
3665 br_token.opr.mbcset = mbcset;
3666 dfa->has_mb_node = 1;
3667 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3668 if (BE (mbc_tree == NULL, 0))
3669 goto build_word_op_espace;
3670 /* Then join them by ALT node. */
3671 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3672 if (BE (mbc_tree != NULL, 1))
3677 free_charset (mbcset);
3680 #else /* not RE_ENABLE_I18N */
3682 #endif /* not RE_ENABLE_I18N */
3684 build_word_op_espace:
3686 #ifdef RE_ENABLE_I18N
3687 free_charset (mbcset);
3688 #endif /* RE_ENABLE_I18N */
3693 /* This is intended for the expressions like "a{1,3}".
3694 Fetch a number from `input', and return the number.
3695 Return -1, if the number field is empty like "{,1}".
3696 Return -2, If an error is occured. */
3699 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3705 fetch_token (token, input, syntax);
3707 if (BE (token->type == END_OF_RE, 0))
3709 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3711 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3712 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3713 num = (num > RE_DUP_MAX) ? -2 : num;
3718 #ifdef RE_ENABLE_I18N
3720 free_charset (re_charset_t *cset)
3722 re_free (cset->mbchars);
3724 re_free (cset->coll_syms);
3725 re_free (cset->equiv_classes);
3726 re_free (cset->range_starts);
3727 re_free (cset->range_ends);
3729 re_free (cset->char_classes);
3732 #endif /* RE_ENABLE_I18N */
3734 /* Functions for binary tree operation. */
3736 /* Create a tree node. */
3739 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3740 re_token_type_t type)
3744 return create_token_tree (dfa, left, right, &t);
3748 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3749 const re_token_t *token)
3752 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3754 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3756 if (storage == NULL)
3758 storage->next = dfa->str_tree_storage;
3759 dfa->str_tree_storage = storage;
3760 dfa->str_tree_storage_idx = 0;
3762 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3764 tree->parent = NULL;
3766 tree->right = right;
3767 tree->token = *token;
3768 tree->token.duplicated = 0;
3769 tree->token.opt_subexp = 0;
3772 tree->node_idx = -1;
3775 left->parent = tree;
3777 right->parent = tree;
3781 /* Mark the tree SRC as an optional subexpression.
3782 To be called from preorder or postorder. */
3784 static reg_errcode_t
3785 mark_opt_subexp (void *extra, bin_tree_t *node)
3787 int idx = (int) (long) extra;
3788 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3789 node->token.opt_subexp = 1;
3794 /* Free the allocated memory inside NODE. */
3797 free_token (re_token_t *node)
3799 #ifdef RE_ENABLE_I18N
3800 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3801 free_charset (node->opr.mbcset);
3803 #endif /* RE_ENABLE_I18N */
3804 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3805 re_free (node->opr.sbcset);
3808 /* Worker function for tree walking. Free the allocated memory inside NODE
3809 and its children. */
3811 static reg_errcode_t
3812 free_tree (void *extra, bin_tree_t *node)
3814 free_token (&node->token);
3819 /* Duplicate the node SRC, and return new node. This is a preorder
3820 visit similar to the one implemented by the generic visitor, but
3821 we need more infrastructure to maintain two parallel trees --- so,
3822 it's easier to duplicate. */
3825 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3827 const bin_tree_t *node;
3828 bin_tree_t *dup_root;
3829 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3831 for (node = root; ; )
3833 /* Create a new tree and link it back to the current parent. */
3834 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3837 (*p_new)->parent = dup_node;
3838 (*p_new)->token.duplicated = 1;
3841 /* Go to the left node, or up and to the right. */
3845 p_new = &dup_node->left;
3849 const bin_tree_t *prev = NULL;
3850 while (node->right == prev || node->right == NULL)
3853 node = node->parent;
3854 dup_node = dup_node->parent;
3859 p_new = &dup_node->right;