and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
- Copyright (c) 1997-2014 University of Cambridge
+ Copyright (c) 1997-2016 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
#endif
#define NLBLOCK cd /* Block containing newline information */
-#define PSSTART start_pattern /* Field containing processed string start */
-#define PSEND end_pattern /* Field containing processed string end */
+#define PSSTART start_pattern /* Field containing pattern start */
+#define PSEND end_pattern /* Field containing pattern end */
#include "pcre_internal.h"
-ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
- CHAR_GRAVE_ACCENT, 7,
+ CHAR_GRAVE_ACCENT, ESC_a,
-ESC_b, 0,
-ESC_d, ESC_e,
ESC_f, 0,
/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
-/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
+/* 80 */ 0, ESC_a, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
/* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
-/* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
+/* 90 */ 0, 0, -ESC_k, 0, 0, ESC_n, 0, -ESC_p,
/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
/* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
};
+
+/* We also need a table of characters that may follow \c in an EBCDIC
+environment for characters 0-31. */
+
+static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
+
#endif
"range out of order in character class\0"
"nothing to repeat\0"
/* 10 */
- "operand of unlimited repeat could match the empty string\0" /** DEAD **/
+ "internal error: invalid forward reference offset\0"
"internal error: unexpected repeat\0"
"unrecognized character after (? or (?-\0"
"POSIX named classes are supported only within a class\0"
"different names for subpatterns of the same number are not allowed\0"
"(*MARK) must have an argument\0"
"this version of PCRE is not compiled with Unicode property support\0"
+#ifndef EBCDIC
"\\c must be followed by an ASCII character\0"
+#else
+ "\\c must be followed by a letter or one of [\\]^_?\0"
+#endif
"\\k is not followed by a braced, angle-bracketed, or quoted name\0"
/* 70 */
"internal error: unknown opcode in find_fixedlength()\0"
"group name must start with a non-digit\0"
/* 85 */
"parentheses are too deeply nested (stack check)\0"
+ "digits missing in \\x{} or \\o{}\0"
+ "regular expression is too complicated\0"
;
/* Table to identify digits and hex digits. This is used when compiling
case CHAR_o:
if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
+ if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
{
ptr += 2;
c = 0;
if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
{
ptr += 2;
+ if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
+ {
+ *errorcodeptr = ERR86;
+ break;
+ }
c = 0;
overflow = FALSE;
while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
c ^= 0x40;
#else /* EBCDIC coding */
if (c >= CHAR_a && c <= CHAR_z) c += 64;
- c ^= 0xC0;
+ if (c == CHAR_QUESTION_MARK)
+ c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
+ else
+ {
+ for (i = 0; i < 32; i++)
+ {
+ if (c == ebcdic_escape_c[i]) break;
+ }
+ if (i < 32) c = i; else *errorcodeptr = ERR68;
+ }
#endif
break;
int min = 0;
int max = -1;
-while (IS_DIGIT(*p))
+while (IS_DIGIT(*p))
{
min = min * 10 + (int)(*p++ - CHAR_0);
if (min > 65535)
*errorcodeptr = ERR5;
return p;
}
- }
+ }
if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
{
if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
{
max = 0;
- while(IS_DIGIT(*p))
+ while(IS_DIGIT(*p))
{
max = max * 10 + (int)(*p++ - CHAR_0);
if (max > 65535)
*errorcodeptr = ERR5;
return p;
}
- }
+ }
if (max < min)
{
*errorcodeptr = ERR4;
utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
atend TRUE if called when the pattern is complete
cd the "compile data" structure
+ recurses chain of recurse_check to catch mutual recursion
Returns: the fixed length,
or -1 if there is no fixed length,
*/
static int
-find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
+find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd,
+ recurse_check *recurses)
{
int length = -1;
-
+recurse_check this_recurse;
register int branchlength = 0;
register pcre_uchar *cc = code + 1 + LINK_SIZE;
case OP_ONCE:
case OP_ONCE_NC:
case OP_COND:
- d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
+ d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd,
+ recurses);
if (d < 0) return d;
branchlength += d;
do cc += GET(cc, 1); while (*cc == OP_ALT);
cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */
do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
if (cc > cs && cc < ce) return -1; /* Recursion */
- d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
+ else /* Check for mutual recursion */
+ {
+ recurse_check *r = recurses;
+ for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
+ if (r != NULL) return -1; /* Mutual recursion */
+ }
+ this_recurse.prev = recurses;
+ this_recurse.group = cs;
+ d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd, &this_recurse);
if (d < 0) return d;
branchlength += d;
cc += 1 + LINK_SIZE;
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
do cc += GET(cc, 1); while (*cc == OP_ALT);
- cc += PRIV(OP_lengths)[*cc];
+ cc += 1 + LINK_SIZE;
break;
/* Skip over things that don't match chars */
{
case OP_CHAR:
case OP_CHARI:
+ case OP_NOT:
+ case OP_NOTI:
case OP_EXACT:
case OP_EXACTI:
+ case OP_NOTEXACT:
+ case OP_NOTEXACTI:
case OP_UPTO:
case OP_UPTOI:
+ case OP_NOTUPTO:
+ case OP_NOTUPTOI:
case OP_MINUPTO:
case OP_MINUPTOI:
+ case OP_NOTMINUPTO:
+ case OP_NOTMINUPTOI:
case OP_POSUPTO:
case OP_POSUPTOI:
+ case OP_NOTPOSUPTO:
+ case OP_NOTPOSUPTOI:
case OP_STAR:
case OP_STARI:
+ case OP_NOTSTAR:
+ case OP_NOTSTARI:
case OP_MINSTAR:
case OP_MINSTARI:
+ case OP_NOTMINSTAR:
+ case OP_NOTMINSTARI:
case OP_POSSTAR:
case OP_POSSTARI:
+ case OP_NOTPOSSTAR:
+ case OP_NOTPOSSTARI:
case OP_PLUS:
case OP_PLUSI:
+ case OP_NOTPLUS:
+ case OP_NOTPLUSI:
case OP_MINPLUS:
case OP_MINPLUSI:
+ case OP_NOTMINPLUS:
+ case OP_NOTMINPLUSI:
case OP_POSPLUS:
case OP_POSPLUSI:
+ case OP_NOTPOSPLUS:
+ case OP_NOTPOSPLUSI:
case OP_QUERY:
case OP_QUERYI:
+ case OP_NOTQUERY:
+ case OP_NOTQUERYI:
case OP_MINQUERY:
case OP_MINQUERYI:
+ case OP_NOTMINQUERY:
+ case OP_NOTMINQUERYI:
case OP_POSQUERY:
case OP_POSQUERYI:
+ case OP_NOTPOSQUERY:
+ case OP_NOTPOSQUERYI:
if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
break;
}
Returns: TRUE if what is matched could be empty
*/
-typedef struct recurse_check {
- struct recurse_check *prev;
- const pcre_uchar *group;
-} recurse_check;
-
static BOOL
could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
BOOL utf, compile_data *cd, recurse_check *recurses)
do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
if (code >= scode && code <= endgroup) continue; /* Simple recursion */
else
- {
+ {
recurse_check *r = recurses;
for (r = recurses; r != NULL; r = r->prev)
if (r->group == scode) break;
if (r != NULL) continue; /* Mutual recursion */
- }
+ }
/* Completed reference; scan the referenced group, remembering it on the
stack chain to detect mutual recursions. */
if (c == OP_BRA || c == OP_BRAPOS ||
c == OP_CBRA || c == OP_CBRAPOS ||
c == OP_ONCE || c == OP_ONCE_NC ||
- c == OP_COND)
+ c == OP_COND || c == OP_SCOND)
{
BOOL empty_branch;
if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
empty_branch = FALSE;
do
{
- if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, NULL))
- empty_branch = TRUE;
+ if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd,
+ recurses)) empty_branch = TRUE;
code += GET(code, 1);
}
while (*code == OP_ALT);
end += 1 + 2 * IMM2_SIZE;
break;
}
- list[2] = end - code;
+ list[2] = (pcre_uint32)(end - code);
return end;
}
return NULL; /* Opcode not accepted */
static BOOL
compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
- const pcre_uint32 *base_list, const pcre_uchar *base_end)
+ const pcre_uint32 *base_list, const pcre_uchar *base_end, int *rec_limit)
{
pcre_uchar c;
pcre_uint32 list[8];
const pcre_uint8 *set1, *set2, *set_end;
pcre_uint32 chr;
BOOL accepted, invert_bits;
+BOOL entered_a_group = FALSE;
+
+if (*rec_limit == 0) return FALSE;
+--(*rec_limit);
/* Note: the base_list[1] contains whether the current opcode has greedy
(represented by a non-zero value) quantifier. This is a different from
case OP_ONCE:
case OP_ONCE_NC:
/* Atomic sub-patterns and assertions can always auto-possessify their
- last iterator. */
- return TRUE;
+ last iterator. However, if the group was entered as a result of checking
+ a previous iterator, this is not possible. */
+
+ return !entered_a_group;
}
code += PRIV(OP_lengths)[c];
while (*next_code == OP_ALT)
{
- if (!compare_opcodes(code, utf, cd, base_list, base_end)) return FALSE;
+ if (!compare_opcodes(code, utf, cd, base_list, base_end, rec_limit))
+ return FALSE;
code = next_code + 1 + LINK_SIZE;
next_code += GET(next_code, 1);
}
+
+ entered_a_group = TRUE;
continue;
case OP_BRAZERO:
/* The bracket content will be checked by the
OP_BRA/OP_CBRA case above. */
next_code += 1 + LINK_SIZE;
- if (!compare_opcodes(next_code, utf, cd, base_list, base_end))
+ if (!compare_opcodes(next_code, utf, cd, base_list, base_end, rec_limit))
return FALSE;
code += PRIV(OP_lengths)[c];
continue;
+
+ default:
+ break;
}
/* Check for a supported opcode, and load its properties. */
rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
- if (!accepted)
- return FALSE;
+ if (!accepted) return FALSE;
if (list[1] == 0) return TRUE;
/* Might be an empty repeat. */
const pcre_uchar *end;
pcre_uchar *repeat_opcode;
pcre_uint32 list[8];
+int rec_limit;
for (;;)
{
c = *code;
+ /* When a pattern with bad UTF-8 encoding is compiled with NO_UTF_CHECK,
+ it may compile without complaining, but may get into a loop here if the code
+ pointer points to a bad value. This is, of course, a documented possibility,
+ when NO_UTF_CHECK is set, so it isn't a bug, but we can detect this case and
+ just give up on this optimization. */
+
+ if (c >= OP_TABLE_LENGTH) return;
+
if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
{
c -= get_repeat_base(c) - OP_STAR;
get_chr_property_list(code, utf, cd->fcc, list) : NULL;
list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
- if (end != NULL && compare_opcodes(end, utf, cd, list, end))
+ rec_limit = 1000;
+ if (end != NULL && compare_opcodes(end, utf, cd, list, end, &rec_limit))
{
switch(c)
{
list[1] = (c & 1) == 0;
- if (compare_opcodes(end, utf, cd, list, end))
+ rec_limit = 1000;
+ if (compare_opcodes(end, utf, cd, list, end, &rec_limit))
{
switch (c)
{
The problem in trying to be exactly like Perl is in the handling of escapes. We
have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
-below handles the special case of \], but does not try to do any other escape
-processing. This makes it different from Perl for cases such as [:l\ower:]
-where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
-"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
-I think.
+below handles the special cases \\ and \], but does not try to do any other
+escape processing. This makes it different from Perl for cases such as
+[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
+not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
+when Perl does, I think.
A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
It seems that the appearance of a nested POSIX class supersedes an apparent
terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
for (++ptr; *ptr != CHAR_NULL; ptr++)
{
- if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
+ if (*ptr == CHAR_BACKSLASH &&
+ (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET ||
+ ptr[1] == CHAR_BACKSLASH))
ptr++;
- else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
- else
+ else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
+ *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
+ else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
{
- if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
- {
- *endptr = ptr;
- return TRUE;
- }
- if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
- (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
- ptr[1] == CHAR_EQUALS_SIGN) &&
- check_posix_syntax(ptr, endptr))
- return FALSE;
+ *endptr = ptr;
+ return TRUE;
}
}
return FALSE;
is called, the partially compiled regex must be temporarily terminated with
OP_END.
-This function has been extended with the possibility of forward references for
-recursions and subroutine calls. It must also check the list of such references
-for the group we are dealing with. If it finds that one of the recursions in
-the current group is on this list, it adjusts the offset in the list, not the
-value in the reference (which is a group number).
+This function has been extended to cope with forward references for recursions
+and subroutine calls. It must check the list of such references for the
+group we are dealing with. If it finds that one of the recursions in the
+current group is on this list, it does not adjust the value in the reference
+(which is a group number). After the group has been scanned, all the offsets in
+the forward reference list for the group are adjusted.
Arguments:
group points to the start of the group
adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
size_t save_hwm_offset)
{
+int offset;
+pcre_uchar *hc;
pcre_uchar *ptr = group;
while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
{
- int offset;
- pcre_uchar *hc;
-
- /* See if this recursion is on the forward reference list. If so, adjust the
- reference. */
-
- for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
+ for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
hc += LINK_SIZE)
{
offset = (int)GET(hc, 0);
- if (cd->start_code + offset == ptr + 1)
- {
- PUT(hc, 0, offset + adjust);
- break;
- }
+ if (cd->start_code + offset == ptr + 1) break;
}
- /* Otherwise, adjust the recursion offset if it's after the start of this
- group. */
+ /* If we have not found this recursion on the forward reference list, adjust
+ the recursion's offset if it's after the start of this group. */
if (hc >= cd->hwm)
{
ptr += 1 + LINK_SIZE;
}
+
+/* Now adjust all forward reference offsets for the group. */
+
+for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
+ hc += LINK_SIZE)
+ {
+ offset = (int)GET(hc, 0);
+ PUT(hc, 0, offset + adjust);
+ }
}
range. Otherwise, use a recursive call to add the additional range. */
else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
- else if (od > end && oc <= end + 1) end = od; /* Extend upwards */
+ else if (od > end && oc <= end + 1)
+ {
+ end = od; /* Extend upwards */
+ if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
+ }
else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
}
}
const pcre_uchar *nestptr = NULL;
pcre_uchar *previous = NULL;
pcre_uchar *previous_callout = NULL;
-size_t save_hwm_offset = 0;
+size_t item_hwm_offset = 0;
pcre_uint8 classbits[32];
/* We can fish out the UTF-8 setting once and for all into a BOOL, but we
if (code > cd->start_workspace + cd->workspace_size -
WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
{
- *errorcodeptr = ERR52;
+ *errorcodeptr = (code >= cd->start_workspace + cd->workspace_size)?
+ ERR52 : ERR87;
goto FAILED;
}
/* In the real compile phase, just check the workspace used by the forward
reference list. */
- else if (cd->hwm > cd->start_workspace + cd->workspace_size -
- WORK_SIZE_SAFETY_MARGIN)
+ else if (cd->hwm > cd->start_workspace + cd->workspace_size)
{
*errorcodeptr = ERR52;
goto FAILED;
previous = NULL;
if ((options & PCRE_MULTILINE) != 0)
{
- if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
+ if (firstcharflags == REQ_UNSET)
+ zerofirstcharflags = firstcharflags = REQ_NONE;
*code++ = OP_CIRCM;
}
else *code++ = OP_CIRC;
zeroreqchar = reqchar;
zeroreqcharflags = reqcharflags;
previous = code;
+ item_hwm_offset = cd->hwm - cd->start_workspace;
*code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
break;
/* Handle a real character class. */
previous = code;
+ item_hwm_offset = cd->hwm - cd->start_workspace;
/* PCRE supports POSIX class stuff inside a class. Perl gives an error if
they are encountered at the top level, so we'll do that too. */
(which is on the stack). We have to remember that there was XCLASS data,
however. */
+ if (class_uchardata > class_uchardata_base) xclass = TRUE;
+
if (lengthptr != NULL && class_uchardata > class_uchardata_base)
{
- xclass = TRUE;
- *lengthptr += class_uchardata - class_uchardata_base;
+ *lengthptr += (int)(class_uchardata - class_uchardata_base);
class_uchardata = class_uchardata_base;
}
#endif
ptr = tempptr + 1;
continue;
- /* For all other POSIX classes, no special action is taken in UCP
- mode. Fall through to the non_UCP case. */
+ /* For the other POSIX classes (ascii, xdigit) we are going to fall
+ through to the non-UCP case and build a bit map for characters with
+ code points less than 256. If we are in a negated POSIX class
+ within a non-negated overall class, characters with code points
+ greater than 255 must all match. In the special case where we have
+ not yet generated any xclass data, and this is the final item in
+ the overall class, we need do nothing: later on, the opcode
+ OP_NCLASS will be used to indicate that characters greater than 255
+ are acceptable. If we have already seen an xclass item or one may
+ follow (we have to assume that it might if this is not the end of
+ the class), explicitly match all wide codepoints. */
default:
+ if (!negate_class && local_negate &&
+ (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET))
+ {
+ *class_uchardata++ = XCL_RANGE;
+ class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
+ }
break;
}
}
cd, PRIV(vspace_list));
continue;
-#ifdef SUPPORT_UCP
case ESC_p:
case ESC_P:
+#ifdef SUPPORT_UCP
{
BOOL negated;
unsigned int ptype = 0, pdata = 0;
class_has_8bitchar--; /* Undo! */
continue;
}
+#else
+ *errorcodeptr = ERR45;
+ goto FAILED;
#endif
/* Unrecognized escapes are faulted if PCRE is running in its
strict mode. By default, for compatibility with Perl, they are
CLASS_SINGLE_CHARACTER:
if (class_one_char < 2) class_one_char++;
- /* If class_one_char is 1, we have the first single character in the
- class, and there have been no prior ranges, or XCLASS items generated by
- escapes. If this is the final character in the class, we can optimize by
- turning the item into a 1-character OP_CHAR[I] if it's positive, or
- OP_NOT[I] if it's negative. In the positive case, it can cause firstchar
- to be set. Otherwise, there can be no first char if this item is first,
- whatever repeat count may follow. In the case of reqchar, save the
- previous value for reinstating. */
+ /* If xclass_has_prop is false and class_one_char is 1, we have the first
+ single character in the class, and there have been no prior ranges, or
+ XCLASS items generated by escapes. If this is the final character in the
+ class, we can optimize by turning the item into a 1-character OP_CHAR[I]
+ if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
+ can cause firstchar to be set. Otherwise, there can be no first char if
+ this item is first, whatever repeat count may follow. In the case of
+ reqchar, save the previous value for reinstating. */
- if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
+ if (!inescq &&
+#ifdef SUPPORT_UCP
+ !xclass_has_prop &&
+#endif
+ class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
{
ptr++;
zeroreqchar = reqchar;
actual compiled code. */
#ifdef SUPPORT_UTF
- if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
+ if (xclass && (xclass_has_prop || !should_flip_negation ||
+ (options & PCRE_UCP) != 0))
#elif !defined COMPILE_PCRE8
- if (xclass && !should_flip_negation)
+ if (xclass && (xclass_has_prop || !should_flip_negation))
#endif
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
{
PUT(previous, 1, (int)(code - previous));
break; /* End of class handling */
}
+
+ /* Even though any XCLASS list is now discarded, we must allow for
+ its memory. */
+
+ if (lengthptr != NULL)
+ *lengthptr += (int)(class_uchardata - class_uchardata_base);
#endif
/* If there are no characters > 255, or they are all to be included or
{
register int i;
int len = (int)(code - previous);
+ size_t base_hwm_offset = item_hwm_offset;
pcre_uchar *bralink = NULL;
pcre_uchar *brazeroptr = NULL;
if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
{
*code = OP_END;
- adjust_recurse(previous, 1, utf, cd, save_hwm_offset);
+ adjust_recurse(previous, 1, utf, cd, item_hwm_offset);
memmove(previous + 1, previous, IN_UCHARS(len));
code++;
if (repeat_max == 0)
{
int offset;
*code = OP_END;
- adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm_offset);
+ adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, item_hwm_offset);
memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
code += 2 + LINK_SIZE;
*previous++ = OP_BRAZERO + repeat_type;
while (cd->hwm > cd->start_workspace + cd->workspace_size -
WORK_SIZE_SAFETY_MARGIN -
- (this_hwm_offset - save_hwm_offset))
+ (this_hwm_offset - base_hwm_offset))
{
*errorcodeptr = expand_workspace(cd);
if (*errorcodeptr != 0) goto FAILED;
}
- for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset;
+ for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
hc += LINK_SIZE)
{
PUT(cd->hwm, 0, GET(hc, 0) + len);
cd->hwm += LINK_SIZE;
}
- save_hwm_offset = this_hwm_offset;
+ base_hwm_offset = this_hwm_offset;
code += len;
}
}
while (cd->hwm > cd->start_workspace + cd->workspace_size -
WORK_SIZE_SAFETY_MARGIN -
- (this_hwm_offset - save_hwm_offset))
+ (this_hwm_offset - base_hwm_offset))
{
*errorcodeptr = expand_workspace(cd);
if (*errorcodeptr != 0) goto FAILED;
}
- for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset;
+ for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
hc += LINK_SIZE)
{
PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
cd->hwm += LINK_SIZE;
}
- save_hwm_offset = this_hwm_offset;
+ base_hwm_offset = this_hwm_offset;
code += len;
}
while (*scode == OP_ALT);
}
+ /* A conditional group with only one branch has an implicit empty
+ alternative branch. */
+
+ if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
+ *bracode = OP_SCOND;
+
/* Handle possessive quantifiers. */
if (possessive_quantifier)
{
int nlen = (int)(code - bracode);
*code = OP_END;
- adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm_offset);
+ adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
code += 1 + LINK_SIZE;
nlen += 1 + LINK_SIZE;
- *bracode = OP_BRAPOS;
+ *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
*code++ = OP_KETRPOS;
PUTINC(code, 0, nlen);
PUT(bracode, 1, nlen);
else
{
*code = OP_END;
- adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm_offset);
+ adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
code += 1 + LINK_SIZE;
len += 1 + LINK_SIZE;
default:
*code = OP_END;
- adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm_offset);
+ adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
code += 1 + LINK_SIZE;
len += 1 + LINK_SIZE;
parenthesis forms. */
case CHAR_LEFT_PARENTHESIS:
- newoptions = options;
- skipbytes = 0;
- bravalue = OP_CBRA;
- save_hwm_offset = cd->hwm - cd->start_workspace;
- reset_bracount = FALSE;
+ ptr++;
- /* First deal with various "verbs" that can be introduced by '*'. */
+ /* First deal with comments. Putting this code right at the start ensures
+ that comments have no bad side effects. */
+
+ if (ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
+ {
+ ptr += 2;
+ while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
+ if (*ptr == CHAR_NULL)
+ {
+ *errorcodeptr = ERR18;
+ goto FAILED;
+ }
+ continue;
+ }
+
+ /* Now deal with various "verbs" that can be introduced by '*'. */
- ptr++;
if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
|| (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
{
cd->had_accept = TRUE;
for (oc = cd->open_caps; oc != NULL; oc = oc->next)
{
- *code++ = OP_CLOSE;
- PUT2INC(code, 0, oc->number);
+ if (lengthptr != NULL)
+ {
+#ifdef COMPILE_PCRE8
+ *lengthptr += 1 + IMM2_SIZE;
+#elif defined COMPILE_PCRE16
+ *lengthptr += 2 + IMM2_SIZE;
+#elif defined COMPILE_PCRE32
+ *lengthptr += 4 + IMM2_SIZE;
+#endif
+ }
+ else
+ {
+ *code++ = OP_CLOSE;
+ PUT2INC(code, 0, oc->number);
+ }
}
setverb = *code++ =
(cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
goto FAILED;
}
setverb = *code++ = verbs[i].op_arg;
- *code++ = arglen;
- memcpy(code, arg, IN_UCHARS(arglen));
- code += arglen;
+ if (lengthptr != NULL) /* In pass 1 just add in the length */
+ { /* to avoid potential workspace */
+ *lengthptr += arglen; /* overflow. */
+ *code++ = 0;
+ }
+ else
+ {
+ *code++ = arglen;
+ memcpy(code, arg, IN_UCHARS(arglen));
+ code += arglen;
+ }
*code++ = 0;
}
goto FAILED;
}
+ /* Initialize for "real" parentheses */
+
+ newoptions = options;
+ skipbytes = 0;
+ bravalue = OP_CBRA;
+ item_hwm_offset = cd->hwm - cd->start_workspace;
+ reset_bracount = FALSE;
+
/* Deal with the extended parentheses; all are introduced by '?', and the
appearance of any of them means that this is not a capturing group. */
- else if (*ptr == CHAR_QUESTION_MARK)
+ if (*ptr == CHAR_QUESTION_MARK)
{
int i, set, unset, namelen;
int *optset;
switch (*(++ptr))
{
- case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
- ptr++;
- while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
- if (*ptr == CHAR_NULL)
- {
- *errorcodeptr = ERR18;
- goto FAILED;
- }
- continue;
-
-
/* ------------------------------------------------------------ */
case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
reset_bracount = TRUE;
+ cd->dupgroups = TRUE; /* Record (?| encountered */
/* Fall through */
/* ------------------------------------------------------------ */
if (tempptr[1] == CHAR_QUESTION_MARK &&
(tempptr[2] == CHAR_EQUALS_SIGN ||
tempptr[2] == CHAR_EXCLAMATION_MARK ||
- tempptr[2] == CHAR_LESS_THAN_SIGN))
+ (tempptr[2] == CHAR_LESS_THAN_SIGN &&
+ (tempptr[3] == CHAR_EQUALS_SIGN ||
+ tempptr[3] == CHAR_EXCLAMATION_MARK))))
+ {
+ cd->iscondassert = TRUE;
break;
+ }
/* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
{
while (IS_DIGIT(*ptr))
{
+ if (recno > INT_MAX / 10 - 1) /* Integer overflow */
+ {
+ while (IS_DIGIT(*ptr)) ptr++;
+ *errorcodeptr = ERR61;
+ goto FAILED;
+ }
recno = recno * 10 + (int)(*ptr - CHAR_0);
ptr++;
}
ptr++;
}
namelen = (int)(ptr - name);
- if (lengthptr != NULL) *lengthptr += IMM2_SIZE;
+ if (lengthptr != NULL) skipbytes += IMM2_SIZE;
}
/* Check the terminator */
goto FAILED;
}
PUT2(code, 2+LINK_SIZE, recno);
+ if (recno > cd->top_backref) cd->top_backref = recno;
break;
}
int offset = i++;
int count = 1;
recno = GET2(slot, 0); /* Number from first found */
+ if (recno > cd->top_backref) cd->top_backref = recno;
for (; i < cd->names_found; i++)
{
slot += cd->name_entry_size;
- if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break;
+ if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0 ||
+ (slot+IMM2_SIZE)[namelen] != 0) break;
count++;
}
+
if (count > 1)
{
PUT2(code, 2+LINK_SIZE, offset);
*errorcodeptr = ERR15;
goto FAILED;
}
+ if (recno > INT_MAX / 10 - 1) /* Integer overflow */
+ {
+ *errorcodeptr = ERR61;
+ goto FAILED;
+ }
recno = recno * 10 + name[i] - CHAR_0;
}
if (recno == 0) recno = RREF_ANY;
if (lengthptr != NULL)
{
named_group *ng;
+ recno = 0;
if (namelen == 0)
{
goto FAILED;
}
- /* The name table does not exist in the first pass; instead we must
- scan the list of names encountered so far in order to get the
- number. If the name is not found, set the value to 0 for a forward
- reference. */
+ /* Count named back references. */
- recno = 0;
- ng = cd->named_groups;
- for (i = 0; i < cd->names_found; i++, ng++)
+ if (!is_recurse) cd->namedrefcount++;
+
+ /* We have to allow for a named reference to a duplicated name (this
+ cannot be determined until the second pass). This needs an extra
+ 16-bit data item. */
+
+ *lengthptr += IMM2_SIZE;
+
+ /* If this is a forward reference and we are within a (?|...) group,
+ the reference may end up as the number of a group which we are
+ currently inside, that is, it could be a recursive reference. In the
+ real compile this will be picked up and the reference wrapped with
+ OP_ONCE to make it atomic, so we must allow space in case this occurs. */
+
+ /* In fact, this can happen for a non-forward reference because
+ another group with the same number might be created later. This
+ issue is fixed "properly" in PCRE2. As PCRE1 is now in maintenance
+ only mode, we finesse the bug by allowing more memory always. */
+
+ *lengthptr += 2 + 2*LINK_SIZE;
+
+ /* It is even worse than that. The current reference may be to an
+ existing named group with a different number (so apparently not
+ recursive) but which later on is also attached to a group with the
+ current number. This can only happen if (?| has been previously
+ encountered. In that case, we allow yet more memory, just in case.
+ (Again, this is fixed "properly" in PCRE2.) */
+
+ if (cd->dupgroups) *lengthptr += 4 + 4*LINK_SIZE;
+
+ /* Otherwise, check for recursion here. The name table does not exist
+ in the first pass; instead we must scan the list of names encountered
+ so far in order to get the number. If the name is not found, leave
+ the value of recno as 0 for a forward reference. */
+
+ /* This patch (removing "else") fixes a problem when a reference is
+ to multiple identically named nested groups from within the nest.
+ Once again, it is not the "proper" fix, and it results in an
+ over-allocation of memory. */
+
+ /* else */
{
- if (namelen == ng->length &&
- STRNCMP_UC_UC(name, ng->name, namelen) == 0)
+ ng = cd->named_groups;
+ for (i = 0; i < cd->names_found; i++, ng++)
{
- open_capitem *oc;
- recno = ng->number;
- if (is_recurse) break;
- for (oc = cd->open_caps; oc != NULL; oc = oc->next)
- {
- if (oc->number == recno)
- {
- oc->flag = TRUE;
- break;
- }
- }
- }
+ if (namelen == ng->length &&
+ STRNCMP_UC_UC(name, ng->name, namelen) == 0)
+ {
+ open_capitem *oc;
+ recno = ng->number;
+ if (is_recurse) break;
+ for (oc = cd->open_caps; oc != NULL; oc = oc->next)
+ {
+ if (oc->number == recno)
+ {
+ oc->flag = TRUE;
+ break;
+ }
+ }
+ }
+ }
}
-
- /* Count named back references. */
-
- if (!is_recurse) cd->namedrefcount++;
}
/* In the real compile, search the name table. We check the name
{
if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
previous = code;
+ item_hwm_offset = cd->hwm - cd->start_workspace;
*code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
PUT2INC(code, 0, index);
PUT2INC(code, 0, count);
/* ------------------------------------------------------------ */
- case CHAR_R: /* Recursion */
- ptr++; /* Same as (?0) */
- /* Fall through */
+ case CHAR_R: /* Recursion, same as (?0) */
+ recno = 0;
+ if (*(++ptr) != CHAR_RIGHT_PARENTHESIS)
+ {
+ *errorcodeptr = ERR29;
+ goto FAILED;
+ }
+ goto HANDLE_RECURSION;
/* ------------------------------------------------------------ */
recno = 0;
while(IS_DIGIT(*ptr))
+ {
+ if (recno > INT_MAX / 10 - 1) /* Integer overflow */
+ {
+ while (IS_DIGIT(*ptr)) ptr++;
+ *errorcodeptr = ERR61;
+ goto FAILED;
+ }
recno = recno * 10 + *ptr++ - CHAR_0;
+ }
if (*ptr != (pcre_uchar)terminator)
{
HANDLE_RECURSION:
previous = code;
+ item_hwm_offset = cd->hwm - cd->start_workspace;
called = cd->start_code;
/* When we are actually compiling, find the bracket that is being
goto FAILED;
}
- /* Assertions used not to be repeatable, but this was changed for Perl
- compatibility, so all kinds can now be repeated. We copy code into a
+ /* All assertions used not to be repeatable, but this was changed for Perl
+ compatibility. All kinds can now be repeated except for assertions that are
+ conditions (Perl also forbids these to be repeated). We copy code into a
non-register variable (tempcode) in order to be able to pass its address
- because some compilers complain otherwise. */
+ because some compilers complain otherwise. At the start of a conditional
+ group whose condition is an assertion, cd->iscondassert is set. We unset it
+ here so as to allow assertions later in the group to be quantified. */
+
+ if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT &&
+ cd->iscondassert)
+ {
+ previous = NULL;
+ cd->iscondassert = FALSE;
+ }
+ else
+ {
+ previous = code;
+ item_hwm_offset = cd->hwm - cd->start_workspace;
+ }
- previous = code; /* For handling repetition */
*code = bravalue;
tempcode = code;
tempreqvary = cd->req_varyopt; /* Save value before bracket */
const pcre_uchar *p;
pcre_uint32 cf;
- save_hwm_offset = cd->hwm - cd->start_workspace; /* Normally this is set when '(' is read */
+ item_hwm_offset = cd->hwm - cd->start_workspace; /* Normally this is set when '(' is read */
terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
if (*p != (pcre_uchar)terminator)
{
*errorcodeptr = ERR57;
- break;
+ goto FAILED;
}
ptr++;
goto HANDLE_NUMERICAL_RECURSION;
ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
{
*errorcodeptr = ERR69;
- break;
+ goto FAILED;
}
is_recurse = FALSE;
terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
HANDLE_REFERENCE:
if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
previous = code;
+ item_hwm_offset = cd->hwm - cd->start_workspace;
*code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
PUT2INC(code, 0, recno);
cd->backref_map |= (recno < 32)? (1 << recno) : 1;
if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
goto FAILED;
previous = code;
+ item_hwm_offset = cd->hwm - cd->start_workspace;
*code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
*code++ = ptype;
*code++ = pdata;
{
previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
+ item_hwm_offset = cd->hwm - cd->start_workspace;
*code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
}
}
ONE_CHAR:
previous = code;
+ item_hwm_offset = cd->hwm - cd->start_workspace;
/* For caseless UTF-8 mode when UCP support is available, check whether
this character has more than one other case. If so, generate a special
int fixed_length;
*code = OP_END;
fixed_length = find_fixedlength(last_branch, (options & PCRE_UTF8) != 0,
- FALSE, cd);
+ FALSE, cd, NULL);
DPRINTF(("fixed length = %d\n", fixed_length));
if (fixed_length == -3)
{
case OP_RREF:
case OP_DNRREF:
case OP_DEF:
+ case OP_FAIL:
return FALSE;
default: /* Assertion */
cd->name_entry_size = 0;
cd->name_table = NULL;
cd->dupnames = FALSE;
+cd->dupgroups = FALSE;
cd->namedrefcount = 0;
cd->start_code = cworkspace;
cd->hwm = cworkspace;
+cd->iscondassert = FALSE;
cd->start_workspace = cworkspace;
cd->workspace_size = COMPILE_WORK_SIZE;
cd->named_groups = named_groups;
goto PCRE_EARLY_ERROR_RETURN;
}
-/* If there are groups with duplicate names and there are also references by
-name, we must allow for the possibility of named references to duplicated
-groups. These require an extra data item each. */
-
-if (cd->dupnames && cd->namedrefcount > 0)
- length += cd->namedrefcount * IMM2_SIZE * sizeof(pcre_uchar);
-
/* Compute the size of the data block for storing the compiled pattern. Integer
overflow should no longer be possible because nowadays we limit the maximum
value of cd->names_found and cd->name_entry_size. */
codestart = cd->name_table + re->name_entry_size * re->name_count;
cd->start_code = codestart;
cd->hwm = (pcre_uchar *)(cd->start_workspace);
+cd->iscondassert = FALSE;
cd->req_varyopt = 0;
cd->had_accept = FALSE;
cd->had_pruneorskip = FALSE;
int offset, recno;
cd->hwm -= LINK_SIZE;
offset = GET(cd->hwm, 0);
+
+ /* Check that the hwm handling hasn't gone wrong. This whole area is
+ rewritten in PCRE2 because there are some obscure cases. */
+
+ if (offset == 0 || codestart[offset-1] != OP_RECURSE)
+ {
+ errorcode = ERR10;
+ break;
+ }
+
recno = GET(codestart, offset);
if (recno != prev_recno)
{
if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
-/* Unless disabled, check whether single character iterators can be
-auto-possessified. The function overwrites the appropriate opcode values. */
+/* Unless disabled, check whether any single character iterators can be
+auto-possessified. The function overwrites the appropriate opcode values, so
+the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
+used in this code because at least one compiler gives a warning about loss of
+"const" attribute if the cast (pcre_uchar *)codestart is used directly in the
+function call. */
-if ((options & PCRE_NO_AUTO_POSSESS) == 0)
- auto_possessify((pcre_uchar *)codestart, utf, cd);
+if (errorcode == 0 && (options & PCRE_NO_AUTO_POSSESS) == 0)
+ {
+ pcre_uchar *temp = (pcre_uchar *)codestart;
+ auto_possessify(temp, utf, cd);
+ }
/* If there were any lookbehind assertions that contained OP_RECURSE
(recursions or subroutine calls), a flag is set for them to be checked here,
int end_op = *be;
*be = OP_END;
fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
- cd);
+ cd, NULL);
*be = end_op;
DPRINTF(("fixed length = %d\n", fixed_length));
if (fixed_length < 0)
}
/* End of pcre_compile.c */
-