X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~ian/git?p=pcre3.git;a=blobdiff_plain;f=pcre_compile.c;h=1bc2b7f030a1a1c9eb728cf804f97f258d59b3c6;hp=c76ca1418d97679764ff32c4370cc74a3f06e741;hb=4c26ecf8fa23b71fb8803a1e0e2d2258b4fb71e0;hpb=e5f50570097752e9d8d68df700473362e385bda6 diff --git a/pcre_compile.c b/pcre_compile.c index c76ca14..1bc2b7f 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2014 University of Cambridge + Copyright (c) 1997-2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -47,8 +47,8 @@ supporting internal functions that are not used by other modules. */ #endif #define NLBLOCK cd /* Block containing newline information */ -#define PSSTART start_pattern /* Field containing processed string start */ -#define PSEND end_pattern /* Field containing processed string end */ +#define PSSTART start_pattern /* Field containing pattern start */ +#define PSEND end_pattern /* Field containing pattern end */ #include "pcre_internal.h" @@ -174,7 +174,7 @@ static const short int escapes[] = { -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, - CHAR_GRAVE_ACCENT, 7, + CHAR_GRAVE_ACCENT, ESC_a, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, @@ -202,9 +202,9 @@ static const short int escapes[] = { /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?', /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0, /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"', -/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, +/* 80 */ 0, ESC_a, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0, -/* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p, +/* 90 */ 0, 0, -ESC_k, 0, 0, ESC_n, 0, -ESC_p, /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0, /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0, /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0, @@ -219,6 +219,12 @@ static const short int escapes[] = { /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0 }; + +/* We also need a table of characters that may follow \c in an EBCDIC +environment for characters 0-31. */ + +static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_"; + #endif @@ -458,7 +464,7 @@ static const char error_texts[] = "range out of order in character class\0" "nothing to repeat\0" /* 10 */ - "operand of unlimited repeat could match the empty string\0" /** DEAD **/ + "internal error: invalid forward reference offset\0" "internal error: unexpected repeat\0" "unrecognized character after (? or (?-\0" "POSIX named classes are supported only within a class\0" @@ -527,7 +533,11 @@ static const char error_texts[] = "different names for subpatterns of the same number are not allowed\0" "(*MARK) must have an argument\0" "this version of PCRE is not compiled with Unicode property support\0" +#ifndef EBCDIC "\\c must be followed by an ASCII character\0" +#else + "\\c must be followed by a letter or one of [\\]^_?\0" +#endif "\\k is not followed by a braced, angle-bracketed, or quoted name\0" /* 70 */ "internal error: unknown opcode in find_fixedlength()\0" @@ -549,6 +559,8 @@ static const char error_texts[] = "group name must start with a non-digit\0" /* 85 */ "parentheses are too deeply nested (stack check)\0" + "digits missing in \\x{} or \\o{}\0" + "regular expression is too complicated\0" ; /* Table to identify digits and hex digits. This is used when compiling @@ -1259,6 +1271,7 @@ else case CHAR_o: if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else + if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else { ptr += 2; c = 0; @@ -1328,6 +1341,11 @@ else if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) { ptr += 2; + if (*ptr == CHAR_RIGHT_CURLY_BRACKET) + { + *errorcodeptr = ERR86; + break; + } c = 0; overflow = FALSE; while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) @@ -1418,7 +1436,16 @@ else c ^= 0x40; #else /* EBCDIC coding */ if (c >= CHAR_a && c <= CHAR_z) c += 64; - c ^= 0xC0; + if (c == CHAR_QUESTION_MARK) + c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff; + else + { + for (i = 0; i < 32; i++) + { + if (c == ebcdic_escape_c[i]) break; + } + if (i < 32) c = i; else *errorcodeptr = ERR68; + } #endif break; @@ -1583,7 +1610,7 @@ read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr) int min = 0; int max = -1; -while (IS_DIGIT(*p)) +while (IS_DIGIT(*p)) { min = min * 10 + (int)(*p++ - CHAR_0); if (min > 65535) @@ -1591,14 +1618,14 @@ while (IS_DIGIT(*p)) *errorcodeptr = ERR5; return p; } - } + } if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else { if (*(++p) != CHAR_RIGHT_CURLY_BRACKET) { max = 0; - while(IS_DIGIT(*p)) + while(IS_DIGIT(*p)) { max = max * 10 + (int)(*p++ - CHAR_0); if (max > 65535) @@ -1606,7 +1633,7 @@ if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else *errorcodeptr = ERR5; return p; } - } + } if (max < min) { *errorcodeptr = ERR4; @@ -1697,6 +1724,7 @@ Arguments: utf TRUE in UTF-8 / UTF-16 / UTF-32 mode atend TRUE if called when the pattern is complete cd the "compile data" structure + recurses chain of recurse_check to catch mutual recursion Returns: the fixed length, or -1 if there is no fixed length, @@ -1706,10 +1734,11 @@ Returns: the fixed length, */ static int -find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd) +find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd, + recurse_check *recurses) { int length = -1; - +recurse_check this_recurse; register int branchlength = 0; register pcre_uchar *cc = code + 1 + LINK_SIZE; @@ -1734,7 +1763,8 @@ for (;;) case OP_ONCE: case OP_ONCE_NC: case OP_COND: - d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd); + d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd, + recurses); if (d < 0) return d; branchlength += d; do cc += GET(cc, 1); while (*cc == OP_ALT); @@ -1768,7 +1798,15 @@ for (;;) cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */ do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */ if (cc > cs && cc < ce) return -1; /* Recursion */ - d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd); + else /* Check for mutual recursion */ + { + recurse_check *r = recurses; + for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; + if (r != NULL) return -1; /* Mutual recursion */ + } + this_recurse.prev = recurses; + this_recurse.group = cs; + d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd, &this_recurse); if (d < 0) return d; branchlength += d; cc += 1 + LINK_SIZE; @@ -1781,7 +1819,7 @@ for (;;) case OP_ASSERTBACK: case OP_ASSERTBACK_NOT: do cc += GET(cc, 1); while (*cc == OP_ALT); - cc += PRIV(OP_lengths)[*cc]; + cc += 1 + LINK_SIZE; break; /* Skip over things that don't match chars */ @@ -2122,32 +2160,60 @@ for (;;) { case OP_CHAR: case OP_CHARI: + case OP_NOT: + case OP_NOTI: case OP_EXACT: case OP_EXACTI: + case OP_NOTEXACT: + case OP_NOTEXACTI: case OP_UPTO: case OP_UPTOI: + case OP_NOTUPTO: + case OP_NOTUPTOI: case OP_MINUPTO: case OP_MINUPTOI: + case OP_NOTMINUPTO: + case OP_NOTMINUPTOI: case OP_POSUPTO: case OP_POSUPTOI: + case OP_NOTPOSUPTO: + case OP_NOTPOSUPTOI: case OP_STAR: case OP_STARI: + case OP_NOTSTAR: + case OP_NOTSTARI: case OP_MINSTAR: case OP_MINSTARI: + case OP_NOTMINSTAR: + case OP_NOTMINSTARI: case OP_POSSTAR: case OP_POSSTARI: + case OP_NOTPOSSTAR: + case OP_NOTPOSSTARI: case OP_PLUS: case OP_PLUSI: + case OP_NOTPLUS: + case OP_NOTPLUSI: case OP_MINPLUS: case OP_MINPLUSI: + case OP_NOTMINPLUS: + case OP_NOTMINPLUSI: case OP_POSPLUS: case OP_POSPLUSI: + case OP_NOTPOSPLUS: + case OP_NOTPOSPLUSI: case OP_QUERY: case OP_QUERYI: + case OP_NOTQUERY: + case OP_NOTQUERYI: case OP_MINQUERY: case OP_MINQUERYI: + case OP_NOTMINQUERY: + case OP_NOTMINQUERYI: case OP_POSQUERY: case OP_POSQUERYI: + case OP_NOTPOSQUERY: + case OP_NOTPOSQUERYI: if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); break; } @@ -2327,11 +2393,6 @@ Arguments: Returns: TRUE if what is matched could be empty */ -typedef struct recurse_check { - struct recurse_check *prev; - const pcre_uchar *group; -} recurse_check; - static BOOL could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode, BOOL utf, compile_data *cd, recurse_check *recurses) @@ -2367,6 +2428,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); if (c == OP_RECURSE) { const pcre_uchar *scode = cd->start_code + GET(code, 1); + const pcre_uchar *endgroup = scode; BOOL empty_branch; /* Test for forward reference or uncompleted reference. This is disabled @@ -2381,20 +2443,16 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); if (GET(scode, 1) == 0) return TRUE; /* Unclosed */ } - /* If we are scanning a completed pattern, there are no forward references - and all groups are complete. We need to detect whether this is a recursive - call, as otherwise there will be an infinite loop. If it is a recursion, - just skip over it. Simple recursions are easily detected. For mutual - recursions we keep a chain on the stack. */ + /* If the reference is to a completed group, we need to detect whether this + is a recursive call, as otherwise there will be an infinite loop. If it is + a recursion, just skip over it. Simple recursions are easily detected. For + mutual recursions we keep a chain on the stack. */ + do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT); + if (code >= scode && code <= endgroup) continue; /* Simple recursion */ else { recurse_check *r = recurses; - const pcre_uchar *endgroup = scode; - - do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT); - if (code >= scode && code <= endgroup) continue; /* Simple recursion */ - for (r = recurses; r != NULL; r = r->prev) if (r->group == scode) break; if (r != NULL) continue; /* Mutual recursion */ @@ -2449,7 +2507,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); if (c == OP_BRA || c == OP_BRAPOS || c == OP_CBRA || c == OP_CBRAPOS || c == OP_ONCE || c == OP_ONCE_NC || - c == OP_COND) + c == OP_COND || c == OP_SCOND) { BOOL empty_branch; if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */ @@ -2465,8 +2523,8 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); empty_branch = FALSE; do { - if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, NULL)) - empty_branch = TRUE; + if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, + recurses)) empty_branch = TRUE; code += GET(code, 1); } while (*code == OP_ALT); @@ -3035,7 +3093,7 @@ switch(c) end += 1 + 2 * IMM2_SIZE; break; } - list[2] = end - code; + list[2] = (pcre_uint32)(end - code); return end; } return NULL; /* Opcode not accepted */ @@ -3061,7 +3119,7 @@ Returns: TRUE if the auto-possessification is possible static BOOL compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd, - const pcre_uint32 *base_list, const pcre_uchar *base_end) + const pcre_uint32 *base_list, const pcre_uchar *base_end, int *rec_limit) { pcre_uchar c; pcre_uint32 list[8]; @@ -3076,6 +3134,10 @@ const pcre_uint8 *class_bitset; const pcre_uint8 *set1, *set2, *set_end; pcre_uint32 chr; BOOL accepted, invert_bits; +BOOL entered_a_group = FALSE; + +if (*rec_limit == 0) return FALSE; +--(*rec_limit); /* Note: the base_list[1] contains whether the current opcode has greedy (represented by a non-zero value) quantifier. This is a different from @@ -3129,8 +3191,10 @@ for(;;) case OP_ONCE: case OP_ONCE_NC: /* Atomic sub-patterns and assertions can always auto-possessify their - last iterator. */ - return TRUE; + last iterator. However, if the group was entered as a result of checking + a previous iterator, this is not possible. */ + + return !entered_a_group; } code += PRIV(OP_lengths)[c]; @@ -3145,10 +3209,13 @@ for(;;) while (*next_code == OP_ALT) { - if (!compare_opcodes(code, utf, cd, base_list, base_end)) return FALSE; + if (!compare_opcodes(code, utf, cd, base_list, base_end, rec_limit)) + return FALSE; code = next_code + 1 + LINK_SIZE; next_code += GET(next_code, 1); } + + entered_a_group = TRUE; continue; case OP_BRAZERO: @@ -3163,11 +3230,14 @@ for(;;) /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */ next_code += 1 + LINK_SIZE; - if (!compare_opcodes(next_code, utf, cd, base_list, base_end)) + if (!compare_opcodes(next_code, utf, cd, base_list, base_end, rec_limit)) return FALSE; code += PRIV(OP_lengths)[c]; continue; + + default: + break; } /* Check for a supported opcode, and load its properties. */ @@ -3406,8 +3476,7 @@ for(;;) rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP && autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP]; - if (!accepted) - return FALSE; + if (!accepted) return FALSE; if (list[1] == 0) return TRUE; /* Might be an empty repeat. */ @@ -3594,11 +3663,20 @@ register pcre_uchar c; const pcre_uchar *end; pcre_uchar *repeat_opcode; pcre_uint32 list[8]; +int rec_limit; for (;;) { c = *code; + /* When a pattern with bad UTF-8 encoding is compiled with NO_UTF_CHECK, + it may compile without complaining, but may get into a loop here if the code + pointer points to a bad value. This is, of course a documentated possibility, + when NO_UTF_CHECK is set, so it isn't a bug, but we can detect this case and + just give up on this optimization. */ + + if (c >= OP_TABLE_LENGTH) return; + if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) { c -= get_repeat_base(c) - OP_STAR; @@ -3606,7 +3684,8 @@ for (;;) get_chr_property_list(code, utf, cd->fcc, list) : NULL; list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO; - if (end != NULL && compare_opcodes(end, utf, cd, list, end)) + rec_limit = 1000; + if (end != NULL && compare_opcodes(end, utf, cd, list, end, &rec_limit)) { switch(c) { @@ -3662,7 +3741,8 @@ for (;;) list[1] = (c & 1) == 0; - if (compare_opcodes(end, utf, cd, list, end)) + rec_limit = 1000; + if (compare_opcodes(end, utf, cd, list, end, &rec_limit)) { switch (c) { @@ -3826,11 +3906,11 @@ didn't consider this to be a POSIX class. Likewise for [:1234:]. The problem in trying to be exactly like Perl is in the handling of escapes. We have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code -below handles the special case of \], but does not try to do any other escape -processing. This makes it different from Perl for cases such as [:l\ower:] -where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize -"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does, -I think. +below handles the special cases \\ and \], but does not try to do any other +escape processing. This makes it different from Perl for cases such as +[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does +not recognize "l\ower". This is a lesser evil than not diagnosing bad classes +when Perl does, I think. A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not. It seems that the appearance of a nested POSIX class supersedes an apparent @@ -3857,21 +3937,16 @@ pcre_uchar terminator; /* Don't combine these lines; the Solaris cc */ terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */ for (++ptr; *ptr != CHAR_NULL; ptr++) { - if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) + if (*ptr == CHAR_BACKSLASH && + (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || + ptr[1] == CHAR_BACKSLASH)) ptr++; - else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE; - else + else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) || + *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE; + else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) { - if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) - { - *endptr = ptr; - return TRUE; - } - if (*ptr == CHAR_LEFT_SQUARE_BRACKET && - (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT || - ptr[1] == CHAR_EQUALS_SIGN) && - check_posix_syntax(ptr, endptr)) - return FALSE; + *endptr = ptr; + return TRUE; } } return FALSE; @@ -3925,48 +4000,42 @@ have their offsets adjusted. That one of the jobs of this function. Before it is called, the partially compiled regex must be temporarily terminated with OP_END. -This function has been extended with the possibility of forward references for -recursions and subroutine calls. It must also check the list of such references -for the group we are dealing with. If it finds that one of the recursions in -the current group is on this list, it adjusts the offset in the list, not the -value in the reference (which is a group number). +This function has been extended to cope with forward references for recursions +and subroutine calls. It must check the list of such references for the +group we are dealing with. If it finds that one of the recursions in the +current group is on this list, it does not adjust the value in the reference +(which is a group number). After the group has been scanned, all the offsets in +the forward reference list for the group are adjusted. Arguments: group points to the start of the group adjust the amount by which the group is to be moved utf TRUE in UTF-8 / UTF-16 / UTF-32 mode cd contains pointers to tables etc. - save_hwm the hwm forward reference pointer at the start of the group + save_hwm_offset the hwm forward reference offset at the start of the group Returns: nothing */ static void adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd, - pcre_uchar *save_hwm) + size_t save_hwm_offset) { +int offset; +pcre_uchar *hc; pcre_uchar *ptr = group; while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL) { - int offset; - pcre_uchar *hc; - - /* See if this recursion is on the forward reference list. If so, adjust the - reference. */ - - for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE) + for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm; + hc += LINK_SIZE) { offset = (int)GET(hc, 0); - if (cd->start_code + offset == ptr + 1) - { - PUT(hc, 0, offset + adjust); - break; - } + if (cd->start_code + offset == ptr + 1) break; } - /* Otherwise, adjust the recursion offset if it's after the start of this - group. */ + /* If we have not found this recursion on the forward reference list, adjust + the recursion's offset if it's after the start of this group. */ if (hc >= cd->hwm) { @@ -3976,6 +4045,15 @@ while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL) ptr += 1 + LINK_SIZE; } + +/* Now adjust all forward reference offsets for the group. */ + +for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm; + hc += LINK_SIZE) + { + offset = (int)GET(hc, 0); + PUT(hc, 0, offset + adjust); + } } @@ -4160,7 +4238,11 @@ if ((options & PCRE_CASELESS) != 0) range. Otherwise, use a recursive call to add the additional range. */ else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */ - else if (od > end && oc <= end + 1) end = od; /* Extend upwards */ + else if (od > end && oc <= end + 1) + { + end = od; /* Extend upwards */ + if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff); + } else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od); } } @@ -4400,7 +4482,7 @@ const pcre_uchar *tempptr; const pcre_uchar *nestptr = NULL; pcre_uchar *previous = NULL; pcre_uchar *previous_callout = NULL; -pcre_uchar *save_hwm = NULL; +size_t item_hwm_offset = 0; pcre_uint8 classbits[32]; /* We can fish out the UTF-8 setting once and for all into a BOOL, but we @@ -4510,7 +4592,8 @@ for (;; ptr++) if (code > cd->start_workspace + cd->workspace_size - WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */ { - *errorcodeptr = ERR52; + *errorcodeptr = (code >= cd->start_workspace + cd->workspace_size)? + ERR52 : ERR87; goto FAILED; } @@ -4558,8 +4641,7 @@ for (;; ptr++) /* In the real compile phase, just check the workspace used by the forward reference list. */ - else if (cd->hwm > cd->start_workspace + cd->workspace_size - - WORK_SIZE_SAFETY_MARGIN) + else if (cd->hwm > cd->start_workspace + cd->workspace_size) { *errorcodeptr = ERR52; goto FAILED; @@ -4680,7 +4762,8 @@ for (;; ptr++) previous = NULL; if ((options & PCRE_MULTILINE) != 0) { - if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE; + if (firstcharflags == REQ_UNSET) + zerofirstcharflags = firstcharflags = REQ_NONE; *code++ = OP_CIRCM; } else *code++ = OP_CIRC; @@ -4701,6 +4784,7 @@ for (;; ptr++) zeroreqchar = reqchar; zeroreqcharflags = reqcharflags; previous = code; + item_hwm_offset = cd->hwm - cd->start_workspace; *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY; break; @@ -4752,6 +4836,7 @@ for (;; ptr++) /* Handle a real character class. */ previous = code; + item_hwm_offset = cd->hwm - cd->start_workspace; /* PCRE supports POSIX class stuff inside a class. Perl gives an error if they are encountered at the top level, so we'll do that too. */ @@ -4857,10 +4942,11 @@ for (;; ptr++) (which is on the stack). We have to remember that there was XCLASS data, however. */ + if (class_uchardata > class_uchardata_base) xclass = TRUE; + if (lengthptr != NULL && class_uchardata > class_uchardata_base) { - xclass = TRUE; - *lengthptr += class_uchardata - class_uchardata_base; + *lengthptr += (int)(class_uchardata - class_uchardata_base); class_uchardata = class_uchardata_base; } #endif @@ -4962,10 +5048,26 @@ for (;; ptr++) ptr = tempptr + 1; continue; - /* For all other POSIX classes, no special action is taken in UCP - mode. Fall through to the non_UCP case. */ + /* For the other POSIX classes (ascii, xdigit) we are going to fall + through to the non-UCP case and build a bit map for characters with + code points less than 256. If we are in a negated POSIX class + within a non-negated overall class, characters with code points + greater than 255 must all match. In the special case where we have + not yet generated any xclass data, and this is the final item in + the overall class, we need do nothing: later on, the opcode + OP_NCLASS will be used to indicate that characters greater than 255 + are acceptable. If we have already seen an xclass item or one may + follow (we have to assume that it might if this is not the end of + the class), explicitly match all wide codepoints. */ default: + if (!negate_class && local_negate && + (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET)) + { + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf)(0x100, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata); + } break; } } @@ -5129,9 +5231,9 @@ for (;; ptr++) cd, PRIV(vspace_list)); continue; -#ifdef SUPPORT_UCP case ESC_p: case ESC_P: +#ifdef SUPPORT_UCP { BOOL negated; unsigned int ptype = 0, pdata = 0; @@ -5145,6 +5247,9 @@ for (;; ptr++) class_has_8bitchar--; /* Undo! */ continue; } +#else + *errorcodeptr = ERR45; + goto FAILED; #endif /* Unrecognized escapes are faulted if PCRE is running in its strict mode. By default, for compatibility with Perl, they are @@ -5301,16 +5406,20 @@ for (;; ptr++) CLASS_SINGLE_CHARACTER: if (class_one_char < 2) class_one_char++; - /* If class_one_char is 1, we have the first single character in the - class, and there have been no prior ranges, or XCLASS items generated by - escapes. If this is the final character in the class, we can optimize by - turning the item into a 1-character OP_CHAR[I] if it's positive, or - OP_NOT[I] if it's negative. In the positive case, it can cause firstchar - to be set. Otherwise, there can be no first char if this item is first, - whatever repeat count may follow. In the case of reqchar, save the - previous value for reinstating. */ + /* If xclass_has_prop is false and class_one_char is 1, we have the first + single character in the class, and there have been no prior ranges, or + XCLASS items generated by escapes. If this is the final character in the + class, we can optimize by turning the item into a 1-character OP_CHAR[I] + if it's positive, or OP_NOT[I] if it's negative. In the positive case, it + can cause firstchar to be set. Otherwise, there can be no first char if + this item is first, whatever repeat count may follow. In the case of + reqchar, save the previous value for reinstating. */ - if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) + if (!inescq && +#ifdef SUPPORT_UCP + !xclass_has_prop && +#endif + class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) { ptr++; zeroreqchar = reqchar; @@ -5426,9 +5535,10 @@ for (;; ptr++) actual compiled code. */ #ifdef SUPPORT_UTF - if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0)) + if (xclass && (xclass_has_prop || !should_flip_negation || + (options & PCRE_UCP) != 0)) #elif !defined COMPILE_PCRE8 - if (xclass && !should_flip_negation) + if (xclass && (xclass_has_prop || !should_flip_negation)) #endif #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 { @@ -5458,6 +5568,12 @@ for (;; ptr++) PUT(previous, 1, (int)(code - previous)); break; /* End of class handling */ } + + /* Even though any XCLASS list is now discarded, we must allow for + its memory. */ + + if (lengthptr != NULL) + *lengthptr += (int)(class_uchardata - class_uchardata_base); #endif /* If there are no characters > 255, or they are all to be included or @@ -5858,6 +5974,7 @@ for (;; ptr++) { register int i; int len = (int)(code - previous); + size_t base_hwm_offset = item_hwm_offset; pcre_uchar *bralink = NULL; pcre_uchar *brazeroptr = NULL; @@ -5912,7 +6029,7 @@ for (;; ptr++) if (repeat_max <= 1) /* Covers 0, 1, and unlimited */ { *code = OP_END; - adjust_recurse(previous, 1, utf, cd, save_hwm); + adjust_recurse(previous, 1, utf, cd, item_hwm_offset); memmove(previous + 1, previous, IN_UCHARS(len)); code++; if (repeat_max == 0) @@ -5936,7 +6053,7 @@ for (;; ptr++) { int offset; *code = OP_END; - adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm); + adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, item_hwm_offset); memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len)); code += 2 + LINK_SIZE; *previous++ = OP_BRAZERO + repeat_type; @@ -5999,26 +6116,25 @@ for (;; ptr++) for (i = 1; i < repeat_min; i++) { pcre_uchar *hc; - pcre_uchar *this_hwm = cd->hwm; + size_t this_hwm_offset = cd->hwm - cd->start_workspace; memcpy(code, previous, IN_UCHARS(len)); while (cd->hwm > cd->start_workspace + cd->workspace_size - - WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm)) + WORK_SIZE_SAFETY_MARGIN - + (this_hwm_offset - base_hwm_offset)) { - int save_offset = save_hwm - cd->start_workspace; - int this_offset = this_hwm - cd->start_workspace; *errorcodeptr = expand_workspace(cd); if (*errorcodeptr != 0) goto FAILED; - save_hwm = (pcre_uchar *)cd->start_workspace + save_offset; - this_hwm = (pcre_uchar *)cd->start_workspace + this_offset; } - for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE) + for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset; + hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset; + hc += LINK_SIZE) { PUT(cd->hwm, 0, GET(hc, 0) + len); cd->hwm += LINK_SIZE; } - save_hwm = this_hwm; + base_hwm_offset = this_hwm_offset; code += len; } } @@ -6063,7 +6179,7 @@ for (;; ptr++) else for (i = repeat_max - 1; i >= 0; i--) { pcre_uchar *hc; - pcre_uchar *this_hwm = cd->hwm; + size_t this_hwm_offset = cd->hwm - cd->start_workspace; *code++ = OP_BRAZERO + repeat_type; @@ -6085,22 +6201,21 @@ for (;; ptr++) copying them. */ while (cd->hwm > cd->start_workspace + cd->workspace_size - - WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm)) + WORK_SIZE_SAFETY_MARGIN - + (this_hwm_offset - base_hwm_offset)) { - int save_offset = save_hwm - cd->start_workspace; - int this_offset = this_hwm - cd->start_workspace; *errorcodeptr = expand_workspace(cd); if (*errorcodeptr != 0) goto FAILED; - save_hwm = (pcre_uchar *)cd->start_workspace + save_offset; - this_hwm = (pcre_uchar *)cd->start_workspace + this_offset; } - for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE) + for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset; + hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset; + hc += LINK_SIZE) { PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1)); cd->hwm += LINK_SIZE; } - save_hwm = this_hwm; + base_hwm_offset = this_hwm_offset; code += len; } @@ -6183,6 +6298,12 @@ for (;; ptr++) while (*scode == OP_ALT); } + /* A conditional group with only one branch has an implicit empty + alternative branch. */ + + if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT) + *bracode = OP_SCOND; + /* Handle possessive quantifiers. */ if (possessive_quantifier) @@ -6196,11 +6317,11 @@ for (;; ptr++) { int nlen = (int)(code - bracode); *code = OP_END; - adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm); + adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, item_hwm_offset); memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen)); code += 1 + LINK_SIZE; nlen += 1 + LINK_SIZE; - *bracode = OP_BRAPOS; + *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS; *code++ = OP_KETRPOS; PUTINC(code, 0, nlen); PUT(bracode, 1, nlen); @@ -6330,7 +6451,7 @@ for (;; ptr++) else { *code = OP_END; - adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm); + adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset); memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len)); code += 1 + LINK_SIZE; len += 1 + LINK_SIZE; @@ -6379,7 +6500,7 @@ for (;; ptr++) default: *code = OP_END; - adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm); + adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset); memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len)); code += 1 + LINK_SIZE; len += 1 + LINK_SIZE; @@ -6408,15 +6529,25 @@ for (;; ptr++) parenthesis forms. */ case CHAR_LEFT_PARENTHESIS: - newoptions = options; - skipbytes = 0; - bravalue = OP_CBRA; - save_hwm = cd->hwm; - reset_bracount = FALSE; + ptr++; - /* First deal with various "verbs" that can be introduced by '*'. */ + /* First deal with comments. Putting this code right at the start ensures + that comments have no bad side effects. */ + + if (ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN) + { + ptr += 2; + while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; + if (*ptr == CHAR_NULL) + { + *errorcodeptr = ERR18; + goto FAILED; + } + continue; + } + + /* Now deal with various "verbs" that can be introduced by '*'. */ - ptr++; if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':' || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0)))) { @@ -6475,8 +6606,21 @@ for (;; ptr++) cd->had_accept = TRUE; for (oc = cd->open_caps; oc != NULL; oc = oc->next) { - *code++ = OP_CLOSE; - PUT2INC(code, 0, oc->number); + if (lengthptr != NULL) + { +#ifdef COMPILE_PCRE8 + *lengthptr += 1 + IMM2_SIZE; +#elif defined COMPILE_PCRE16 + *lengthptr += 2 + IMM2_SIZE; +#elif defined COMPILE_PCRE32 + *lengthptr += 4 + IMM2_SIZE; +#endif + } + else + { + *code++ = OP_CLOSE; + PUT2INC(code, 0, oc->number); + } } setverb = *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT; @@ -6505,9 +6649,17 @@ for (;; ptr++) goto FAILED; } setverb = *code++ = verbs[i].op_arg; - *code++ = arglen; - memcpy(code, arg, IN_UCHARS(arglen)); - code += arglen; + if (lengthptr != NULL) /* In pass 1 just add in the length */ + { /* to avoid potential workspace */ + *lengthptr += arglen; /* overflow. */ + *code++ = 0; + } + else + { + *code++ = arglen; + memcpy(code, arg, IN_UCHARS(arglen)); + code += arglen; + } *code++ = 0; } @@ -6537,10 +6689,18 @@ for (;; ptr++) goto FAILED; } + /* Initialize for "real" parentheses */ + + newoptions = options; + skipbytes = 0; + bravalue = OP_CBRA; + item_hwm_offset = cd->hwm - cd->start_workspace; + reset_bracount = FALSE; + /* Deal with the extended parentheses; all are introduced by '?', and the appearance of any of them means that this is not a capturing group. */ - else if (*ptr == CHAR_QUESTION_MARK) + if (*ptr == CHAR_QUESTION_MARK) { int i, set, unset, namelen; int *optset; @@ -6549,20 +6709,10 @@ for (;; ptr++) switch (*(++ptr)) { - case CHAR_NUMBER_SIGN: /* Comment; skip to ket */ - ptr++; - while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; - if (*ptr == CHAR_NULL) - { - *errorcodeptr = ERR18; - goto FAILED; - } - continue; - - /* ------------------------------------------------------------ */ case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */ reset_bracount = TRUE; + cd->dupgroups = TRUE; /* Record (?| encountered */ /* Fall through */ /* ------------------------------------------------------------ */ @@ -6608,8 +6758,13 @@ for (;; ptr++) if (tempptr[1] == CHAR_QUESTION_MARK && (tempptr[2] == CHAR_EQUALS_SIGN || tempptr[2] == CHAR_EXCLAMATION_MARK || - tempptr[2] == CHAR_LESS_THAN_SIGN)) + (tempptr[2] == CHAR_LESS_THAN_SIGN && + (tempptr[3] == CHAR_EQUALS_SIGN || + tempptr[3] == CHAR_EXCLAMATION_MARK)))) + { + cd->iscondassert = TRUE; break; + } /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all need to skip at least 1+IMM2_SIZE bytes at the start of the group. */ @@ -6658,6 +6813,12 @@ for (;; ptr++) { while (IS_DIGIT(*ptr)) { + if (recno > INT_MAX / 10 - 1) /* Integer overflow */ + { + while (IS_DIGIT(*ptr)) ptr++; + *errorcodeptr = ERR61; + goto FAILED; + } recno = recno * 10 + (int)(*ptr - CHAR_0); ptr++; } @@ -6686,7 +6847,7 @@ for (;; ptr++) ptr++; } namelen = (int)(ptr - name); - if (lengthptr != NULL) *lengthptr += IMM2_SIZE; + if (lengthptr != NULL) skipbytes += IMM2_SIZE; } /* Check the terminator */ @@ -6722,6 +6883,7 @@ for (;; ptr++) goto FAILED; } PUT2(code, 2+LINK_SIZE, recno); + if (recno > cd->top_backref) cd->top_backref = recno; break; } @@ -6744,12 +6906,15 @@ for (;; ptr++) int offset = i++; int count = 1; recno = GET2(slot, 0); /* Number from first found */ + if (recno > cd->top_backref) cd->top_backref = recno; for (; i < cd->names_found; i++) { slot += cd->name_entry_size; - if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break; + if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0 || + (slot+IMM2_SIZE)[namelen] != 0) break; count++; } + if (count > 1) { PUT2(code, 2+LINK_SIZE, offset); @@ -6788,6 +6953,11 @@ for (;; ptr++) *errorcodeptr = ERR15; goto FAILED; } + if (recno > INT_MAX / 10 - 1) /* Integer overflow */ + { + *errorcodeptr = ERR61; + goto FAILED; + } recno = recno * 10 + name[i] - CHAR_0; } if (recno == 0) recno = RREF_ANY; @@ -7064,6 +7234,7 @@ for (;; ptr++) if (lengthptr != NULL) { named_group *ng; + recno = 0; if (namelen == 0) { @@ -7081,23 +7252,65 @@ for (;; ptr++) goto FAILED; } - /* The name table does not exist in the first pass; instead we must - scan the list of names encountered so far in order to get the - number. If the name is not found, set the value to 0 for a forward - reference. */ - - ng = cd->named_groups; - for (i = 0; i < cd->names_found; i++, ng++) - { - if (namelen == ng->length && - STRNCMP_UC_UC(name, ng->name, namelen) == 0) - break; - } - recno = (i < cd->names_found)? ng->number : 0; - /* Count named back references. */ if (!is_recurse) cd->namedrefcount++; + + /* We have to allow for a named reference to a duplicated name (this + cannot be determined until the second pass). This needs an extra + 16-bit data item. */ + + *lengthptr += IMM2_SIZE; + + /* If this is a forward reference and we are within a (?|...) group, + the reference may end up as the number of a group which we are + currently inside, that is, it could be a recursive reference. In the + real compile this will be picked up and the reference wrapped with + OP_ONCE to make it atomic, so we must space in case this occurs. */ + + /* In fact, this can happen for a non-forward reference because + another group with the same number might be created later. This + issue is fixed "properly" in PCRE2. As PCRE1 is now in maintenance + only mode, we finesse the bug by allowing more memory always. */ + + *lengthptr += 2 + 2*LINK_SIZE; + + /* It is even worse than that. The current reference may be to an + existing named group with a different number (so apparently not + recursive) but which later on is also attached to a group with the + current number. This can only happen if $(| has been previous + encountered. In that case, we allow yet more memory, just in case. + (Again, this is fixed "properly" in PCRE2. */ + + if (cd->dupgroups) *lengthptr += 4 + 4*LINK_SIZE; + + /* Otherwise, check for recursion here. The name table does not exist + in the first pass; instead we must scan the list of names encountered + so far in order to get the number. If the name is not found, leave + the value of recno as 0 for a forward reference. */ + + else + { + ng = cd->named_groups; + for (i = 0; i < cd->names_found; i++, ng++) + { + if (namelen == ng->length && + STRNCMP_UC_UC(name, ng->name, namelen) == 0) + { + open_capitem *oc; + recno = ng->number; + if (is_recurse) break; + for (oc = cd->open_caps; oc != NULL; oc = oc->next) + { + if (oc->number == recno) + { + oc->flag = TRUE; + break; + } + } + } + } + } } /* In the real compile, search the name table. We check the name @@ -7152,6 +7365,7 @@ for (;; ptr++) { if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE; previous = code; + item_hwm_offset = cd->hwm - cd->start_workspace; *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF; PUT2INC(code, 0, index); PUT2INC(code, 0, count); @@ -7189,9 +7403,14 @@ for (;; ptr++) /* ------------------------------------------------------------ */ - case CHAR_R: /* Recursion */ - ptr++; /* Same as (?0) */ - /* Fall through */ + case CHAR_R: /* Recursion, same as (?0) */ + recno = 0; + if (*(++ptr) != CHAR_RIGHT_PARENTHESIS) + { + *errorcodeptr = ERR29; + goto FAILED; + } + goto HANDLE_RECURSION; /* ------------------------------------------------------------ */ @@ -7228,7 +7447,15 @@ for (;; ptr++) recno = 0; while(IS_DIGIT(*ptr)) + { + if (recno > INT_MAX / 10 - 1) /* Integer overflow */ + { + while (IS_DIGIT(*ptr)) ptr++; + *errorcodeptr = ERR61; + goto FAILED; + } recno = recno * 10 + *ptr++ - CHAR_0; + } if (*ptr != (pcre_uchar)terminator) { @@ -7265,6 +7492,7 @@ for (;; ptr++) HANDLE_RECURSION: previous = code; + item_hwm_offset = cd->hwm - cd->start_workspace; called = cd->start_code; /* When we are actually compiling, find the bracket that is being @@ -7452,12 +7680,26 @@ for (;; ptr++) goto FAILED; } - /* Assertions used not to be repeatable, but this was changed for Perl - compatibility, so all kinds can now be repeated. We copy code into a + /* All assertions used not to be repeatable, but this was changed for Perl + compatibility. All kinds can now be repeated except for assertions that are + conditions (Perl also forbids these to be repeated). We copy code into a non-register variable (tempcode) in order to be able to pass its address - because some compilers complain otherwise. */ + because some compilers complain otherwise. At the start of a conditional + group whose condition is an assertion, cd->iscondassert is set. We unset it + here so as to allow assertions later in the group to be quantified. */ + + if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT && + cd->iscondassert) + { + previous = NULL; + cd->iscondassert = FALSE; + } + else + { + previous = code; + item_hwm_offset = cd->hwm - cd->start_workspace; + } - previous = code; /* For handling repetition */ *code = bravalue; tempcode = code; tempreqvary = cd->req_varyopt; /* Save value before bracket */ @@ -7704,7 +7946,7 @@ for (;; ptr++) const pcre_uchar *p; pcre_uint32 cf; - save_hwm = cd->hwm; /* Normally this is set when '(' is read */ + item_hwm_offset = cd->hwm - cd->start_workspace; /* Normally this is set when '(' is read */ terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)? CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE; @@ -7733,7 +7975,7 @@ for (;; ptr++) if (*p != (pcre_uchar)terminator) { *errorcodeptr = ERR57; - break; + goto FAILED; } ptr++; goto HANDLE_NUMERICAL_RECURSION; @@ -7748,7 +7990,7 @@ for (;; ptr++) ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET)) { *errorcodeptr = ERR69; - break; + goto FAILED; } is_recurse = FALSE; terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)? @@ -7772,6 +8014,7 @@ for (;; ptr++) HANDLE_REFERENCE: if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE; previous = code; + item_hwm_offset = cd->hwm - cd->start_workspace; *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF; PUT2INC(code, 0, recno); cd->backref_map |= (recno < 32)? (1 << recno) : 1; @@ -7801,6 +8044,7 @@ for (;; ptr++) if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr)) goto FAILED; previous = code; + item_hwm_offset = cd->hwm - cd->start_workspace; *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP; *code++ = ptype; *code++ = pdata; @@ -7841,6 +8085,7 @@ for (;; ptr++) { previous = (escape > ESC_b && escape < ESC_Z)? code : NULL; + item_hwm_offset = cd->hwm - cd->start_workspace; *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape; } } @@ -7884,6 +8129,7 @@ for (;; ptr++) ONE_CHAR: previous = code; + item_hwm_offset = cd->hwm - cd->start_workspace; /* For caseless UTF-8 mode when UCP support is available, check whether this character has more than one other case. If so, generate a special @@ -8031,6 +8277,7 @@ int length; unsigned int orig_bracount; unsigned int max_bracount; branch_chain bc; +size_t save_hwm_offset; /* If set, call the external function that checks for stack availability. */ @@ -8048,6 +8295,8 @@ bc.current_branch = code; firstchar = reqchar = 0; firstcharflags = reqcharflags = REQ_UNSET; +save_hwm_offset = cd->hwm - cd->start_workspace; + /* Accumulate the length for use in the pre-compile phase. Start with the length of the BRA and KET and any extra bytes that are required at the beginning. We accumulate in a local variable to save frequent testing of @@ -8189,7 +8438,7 @@ for (;;) int fixed_length; *code = OP_END; fixed_length = find_fixedlength(last_branch, (options & PCRE_UTF8) != 0, - FALSE, cd); + FALSE, cd, NULL); DPRINTF(("fixed length = %d\n", fixed_length)); if (fixed_length == -3) { @@ -8241,12 +8490,16 @@ for (;;) /* If it was a capturing subpattern, check to see if it contained any recursive back references. If so, we must wrap it in atomic brackets. - In any event, remove the block from the chain. */ + Because we are moving code along, we must ensure that any pending recursive + references are updated. In any event, remove the block from the chain. */ if (capnumber > 0) { if (cd->open_caps->flag) { + *code = OP_END; + adjust_recurse(start_bracket, 1 + LINK_SIZE, + (options & PCRE_UTF8) != 0, cd, save_hwm_offset); memmove(start_bracket + 1 + LINK_SIZE, start_bracket, IN_UCHARS(code - start_bracket)); *start_bracket = OP_ONCE; @@ -8470,6 +8723,7 @@ do { case OP_RREF: case OP_DNRREF: case OP_DEF: + case OP_FAIL: return FALSE; default: /* Assertion */ @@ -9051,9 +9305,11 @@ cd->names_found = 0; cd->name_entry_size = 0; cd->name_table = NULL; cd->dupnames = FALSE; +cd->dupgroups = FALSE; cd->namedrefcount = 0; cd->start_code = cworkspace; cd->hwm = cworkspace; +cd->iscondassert = FALSE; cd->start_workspace = cworkspace; cd->workspace_size = COMPILE_WORK_SIZE; cd->named_groups = named_groups; @@ -9091,13 +9347,6 @@ if (length > MAX_PATTERN_SIZE) goto PCRE_EARLY_ERROR_RETURN; } -/* If there are groups with duplicate names and there are also references by -name, we must allow for the possibility of named references to duplicated -groups. These require an extra data item each. */ - -if (cd->dupnames && cd->namedrefcount > 0) - length += cd->namedrefcount * IMM2_SIZE * sizeof(pcre_uchar); - /* Compute the size of the data block for storing the compiled pattern. Integer overflow should no longer be possible because nowadays we limit the maximum value of cd->names_found and cd->name_entry_size. */ @@ -9156,6 +9405,7 @@ cd->name_table = (pcre_uchar *)re + re->name_table_offset; codestart = cd->name_table + re->name_entry_size * re->name_count; cd->start_code = codestart; cd->hwm = (pcre_uchar *)(cd->start_workspace); +cd->iscondassert = FALSE; cd->req_varyopt = 0; cd->had_accept = FALSE; cd->had_pruneorskip = FALSE; @@ -9228,6 +9478,16 @@ if (cd->hwm > cd->start_workspace) int offset, recno; cd->hwm -= LINK_SIZE; offset = GET(cd->hwm, 0); + + /* Check that the hwm handling hasn't gone wrong. This whole area is + rewritten in PCRE2 because there are some obscure cases. */ + + if (offset == 0 || codestart[offset-1] != OP_RECURSE) + { + errorcode = ERR10; + break; + } + recno = GET(codestart, offset); if (recno != prev_recno) { @@ -9251,11 +9511,18 @@ subpattern. */ if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15; -/* Unless disabled, check whether single character iterators can be -auto-possessified. The function overwrites the appropriate opcode values. */ +/* Unless disabled, check whether any single character iterators can be +auto-possessified. The function overwrites the appropriate opcode values, so +the type of the pointer must be cast. NOTE: the intermediate variable "temp" is +used in this code because at least one compiler gives a warning about loss of +"const" attribute if the cast (pcre_uchar *)codestart is used directly in the +function call. */ -if ((options & PCRE_NO_AUTO_POSSESS) == 0) - auto_possessify((pcre_uchar *)codestart, utf, cd); +if (errorcode == 0 && (options & PCRE_NO_AUTO_POSSESS) == 0) + { + pcre_uchar *temp = (pcre_uchar *)codestart; + auto_possessify(temp, utf, cd); + } /* If there were any lookbehind assertions that contained OP_RECURSE (recursions or subroutine calls), a flag is set for them to be checked here, @@ -9285,7 +9552,7 @@ if (errorcode == 0 && cd->check_lookbehind) int end_op = *be; *be = OP_END; fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE, - cd); + cd, NULL); *be = end_op; DPRINTF(("fixed length = %d\n", fixed_length)); if (fixed_length < 0) @@ -9478,4 +9745,3 @@ return (pcre32 *)re; } /* End of pcre_compile.c */ -