Patch from Graham Inggs to add symbols support (Closes: #767374)

[pcre3.git] / pcre_compile.c
diff --git a/pcre_compile.c b/pcre_compile.c

index c76ca1418d97679764ff32c4370cc74a3f06e741..1bc2b7f030a1a1c9eb728cf804f97f258d59b3c6 100644 (file)
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -6,7 +6,7 @@
  and semantics are as close as possible to those of the Perl 5 language.
  
                         Written by Philip Hazel
-           Copyright (c) 1997-2014 University of Cambridge
+           Copyright (c) 1997-2016 University of Cambridge
  
  -----------------------------------------------------------------------------
  Redistribution and use in source and binary forms, with or without
@@ -47,8 +47,8 @@ supporting internal functions that are not used by other modules. */
  #endif
  
  #define NLBLOCK cd             /* Block containing newline information */
-#define PSSTART start_pattern  /* Field containing processed string start */
-#define PSEND   end_pattern    /* Field containing processed string end */
+#define PSSTART start_pattern  /* Field containing pattern start */
+#define PSEND   end_pattern    /* Field containing pattern end */
  
  #include "pcre_internal.h"
  
@@ -174,7 +174,7 @@ static const short int escapes[] = {
       -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
       CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
       CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
-     CHAR_GRAVE_ACCENT,       7,
+     CHAR_GRAVE_ACCENT,       ESC_a,
       -ESC_b,                  0,
       -ESC_d,                  ESC_e,
       ESC_f,                   0,
@@ -202,9 +202,9 @@ static const short int escapes[] = {
  /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
-/*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
+/*  80 */     0, ESC_a, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
-/*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
+/*  90 */     0,     0, -ESC_k,       0,      0, ESC_n,      0, -ESC_p,
  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
@@ -219,6 +219,12 @@ static const short int escapes[] = {
  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
  };
+
+/* We also need a table of characters that may follow \c in an EBCDIC
+environment for characters 0-31. */
+
+static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
+
  #endif
  
  
@@ -458,7 +464,7 @@ static const char error_texts[] =
    "range out of order in character class\0"
    "nothing to repeat\0"
    /* 10 */
-  "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
+  "internal error: invalid forward reference offset\0"
    "internal error: unexpected repeat\0"
    "unrecognized character after (? or (?-\0"
    "POSIX named classes are supported only within a class\0"
@@ -527,7 +533,11 @@ static const char error_texts[] =
    "different names for subpatterns of the same number are not allowed\0"
    "(*MARK) must have an argument\0"
    "this version of PCRE is not compiled with Unicode property support\0"
+#ifndef EBCDIC
    "\\c must be followed by an ASCII character\0"
+#else
+  "\\c must be followed by a letter or one of [\\]^_?\0"
+#endif
    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
    /* 70 */
    "internal error: unknown opcode in find_fixedlength()\0"
@@ -549,6 +559,8 @@ static const char error_texts[] =
    "group name must start with a non-digit\0"
    /* 85 */
    "parentheses are too deeply nested (stack check)\0"
+  "digits missing in \\x{} or \\o{}\0"
+  "regular expression is too complicated\0"
    ;
  
  /* Table to identify digits and hex digits. This is used when compiling
@@ -1259,6 +1271,7 @@ else
  
      case CHAR_o:
      if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
+    if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
        {
        ptr += 2;
        c = 0;
@@ -1328,6 +1341,11 @@ else
        if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
          {
          ptr += 2;
+        if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
+          {
+          *errorcodeptr = ERR86;
+          break;
+          }
          c = 0;
          overflow = FALSE;
          while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
@@ -1418,7 +1436,16 @@ else
      c ^= 0x40;
  #else             /* EBCDIC coding */
      if (c >= CHAR_a && c <= CHAR_z) c += 64;
-    c ^= 0xC0;
+    if (c == CHAR_QUESTION_MARK)
+      c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
+    else
+      {
+      for (i = 0; i < 32; i++)
+        {
+        if (c == ebcdic_escape_c[i]) break;
+        }
+      if (i < 32) c = i; else *errorcodeptr = ERR68;
+      }
  #endif
      break;
  
@@ -1583,7 +1610,7 @@ read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
  int min = 0;
  int max = -1;
  
-while (IS_DIGIT(*p)) 
+while (IS_DIGIT(*p))
    {
    min = min * 10 + (int)(*p++ - CHAR_0);
    if (min > 65535)
@@ -1591,14 +1618,14 @@ while (IS_DIGIT(*p))
      *errorcodeptr = ERR5;
      return p;
      }
-  }   
+  }
  
  if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
    {
    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
      {
      max = 0;
-    while(IS_DIGIT(*p)) 
+    while(IS_DIGIT(*p))
        {
        max = max * 10 + (int)(*p++ - CHAR_0);
        if (max > 65535)
@@ -1606,7 +1633,7 @@ if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
          *errorcodeptr = ERR5;
          return p;
          }
-      }   
+      }
      if (max < min)
        {
        *errorcodeptr = ERR4;
@@ -1697,6 +1724,7 @@ Arguments:
    utf      TRUE in UTF-8 / UTF-16 / UTF-32 mode
    atend    TRUE if called when the pattern is complete
    cd       the "compile data" structure
+  recurses    chain of recurse_check to catch mutual recursion
  
  Returns:   the fixed length,
               or -1 if there is no fixed length,
@@ -1706,10 +1734,11 @@ Returns:   the fixed length,
  */
  
  static int
-find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
+find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd,
+  recurse_check *recurses)
  {
  int length = -1;
-
+recurse_check this_recurse;
  register int branchlength = 0;
  register pcre_uchar *cc = code + 1 + LINK_SIZE;
  
@@ -1734,7 +1763,8 @@ for (;;)
      case OP_ONCE:
      case OP_ONCE_NC:
      case OP_COND:
-    d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
+    d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd,
+      recurses);
      if (d < 0) return d;
      branchlength += d;
      do cc += GET(cc, 1); while (*cc == OP_ALT);
@@ -1768,7 +1798,15 @@ for (;;)
      cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
      do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
      if (cc > cs && cc < ce) return -1;                    /* Recursion */
-    d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
+    else   /* Check for mutual recursion */
+      {
+      recurse_check *r = recurses;
+      for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
+      if (r != NULL) return -1;   /* Mutual recursion */
+      }
+    this_recurse.prev = recurses;
+    this_recurse.group = cs;
+    d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd, &this_recurse);
      if (d < 0) return d;
      branchlength += d;
      cc += 1 + LINK_SIZE;
@@ -1781,7 +1819,7 @@ for (;;)
      case OP_ASSERTBACK:
      case OP_ASSERTBACK_NOT:
      do cc += GET(cc, 1); while (*cc == OP_ALT);
-    cc += PRIV(OP_lengths)[*cc];
+    cc += 1 + LINK_SIZE;
      break;
  
      /* Skip over things that don't match chars */
@@ -2122,32 +2160,60 @@ for (;;)
        {
        case OP_CHAR:
        case OP_CHARI:
+      case OP_NOT:
+      case OP_NOTI:
        case OP_EXACT:
        case OP_EXACTI:
+      case OP_NOTEXACT:
+      case OP_NOTEXACTI:
        case OP_UPTO:
        case OP_UPTOI:
+      case OP_NOTUPTO:
+      case OP_NOTUPTOI:
        case OP_MINUPTO:
        case OP_MINUPTOI:
+      case OP_NOTMINUPTO:
+      case OP_NOTMINUPTOI:
        case OP_POSUPTO:
        case OP_POSUPTOI:
+      case OP_NOTPOSUPTO:
+      case OP_NOTPOSUPTOI:
        case OP_STAR:
        case OP_STARI:
+      case OP_NOTSTAR:
+      case OP_NOTSTARI:
        case OP_MINSTAR:
        case OP_MINSTARI:
+      case OP_NOTMINSTAR:
+      case OP_NOTMINSTARI:
        case OP_POSSTAR:
        case OP_POSSTARI:
+      case OP_NOTPOSSTAR:
+      case OP_NOTPOSSTARI:
        case OP_PLUS:
        case OP_PLUSI:
+      case OP_NOTPLUS:
+      case OP_NOTPLUSI:
        case OP_MINPLUS:
        case OP_MINPLUSI:
+      case OP_NOTMINPLUS:
+      case OP_NOTMINPLUSI:
        case OP_POSPLUS:
        case OP_POSPLUSI:
+      case OP_NOTPOSPLUS:
+      case OP_NOTPOSPLUSI:
        case OP_QUERY:
        case OP_QUERYI:
+      case OP_NOTQUERY:
+      case OP_NOTQUERYI:
        case OP_MINQUERY:
        case OP_MINQUERYI:
+      case OP_NOTMINQUERY:
+      case OP_NOTMINQUERYI:
        case OP_POSQUERY:
        case OP_POSQUERYI:
+      case OP_NOTPOSQUERY:
+      case OP_NOTPOSQUERYI:
        if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
        break;
        }
@@ -2327,11 +2393,6 @@ Arguments:
  Returns:      TRUE if what is matched could be empty
  */
  
-typedef struct recurse_check {
-  struct recurse_check *prev;
-  const pcre_uchar *group;
-} recurse_check;
-
  static BOOL
  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
    BOOL utf, compile_data *cd, recurse_check *recurses)
@@ -2367,6 +2428,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
    if (c == OP_RECURSE)
      {
      const pcre_uchar *scode = cd->start_code + GET(code, 1);
+    const pcre_uchar *endgroup = scode;
      BOOL empty_branch;
  
      /* Test for forward reference or uncompleted reference. This is disabled
@@ -2381,20 +2443,16 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
        if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
        }
  
-    /* If we are scanning a completed pattern, there are no forward references
-    and all groups are complete. We need to detect whether this is a recursive
-    call, as otherwise there will be an infinite loop. If it is a recursion,
-    just skip over it. Simple recursions are easily detected. For mutual
-    recursions we keep a chain on the stack. */
+    /* If the reference is to a completed group, we need to detect whether this
+    is a recursive call, as otherwise there will be an infinite loop. If it is
+    a recursion, just skip over it. Simple recursions are easily detected. For
+    mutual recursions we keep a chain on the stack. */
  
+    do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
+    if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
      else
        {
        recurse_check *r = recurses;
-      const pcre_uchar *endgroup = scode;
-
-      do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
-      if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
-
        for (r = recurses; r != NULL; r = r->prev)
          if (r->group == scode) break;
        if (r != NULL) continue;   /* Mutual recursion */
@@ -2449,7 +2507,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
    if (c == OP_BRA  || c == OP_BRAPOS ||
        c == OP_CBRA || c == OP_CBRAPOS ||
        c == OP_ONCE || c == OP_ONCE_NC ||
-      c == OP_COND)
+      c == OP_COND || c == OP_SCOND)
      {
      BOOL empty_branch;
      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
@@ -2465,8 +2523,8 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
        empty_branch = FALSE;
        do
          {
-        if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, NULL))
-          empty_branch = TRUE;
+        if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd,
+          recurses)) empty_branch = TRUE;
          code += GET(code, 1);
          }
        while (*code == OP_ALT);
@@ -3035,7 +3093,7 @@ switch(c)
      end += 1 + 2 * IMM2_SIZE;
      break;
      }
-  list[2] = end - code;
+  list[2] = (pcre_uint32)(end - code);
    return end;
    }
  return NULL;    /* Opcode not accepted */
@@ -3061,7 +3119,7 @@ Returns:      TRUE if the auto-possessification is possible
  
  static BOOL
  compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
-  const pcre_uint32 *base_list, const pcre_uchar *base_end)
+  const pcre_uint32 *base_list, const pcre_uchar *base_end, int *rec_limit)
  {
  pcre_uchar c;
  pcre_uint32 list[8];
@@ -3076,6 +3134,10 @@ const pcre_uint8 *class_bitset;
  const pcre_uint8 *set1, *set2, *set_end;
  pcre_uint32 chr;
  BOOL accepted, invert_bits;
+BOOL entered_a_group = FALSE;
+
+if (*rec_limit == 0) return FALSE;
+--(*rec_limit);
  
  /* Note: the base_list[1] contains whether the current opcode has greedy
  (represented by a non-zero value) quantifier. This is a different from
@@ -3129,8 +3191,10 @@ for(;;)
        case OP_ONCE:
        case OP_ONCE_NC:
        /* Atomic sub-patterns and assertions can always auto-possessify their
-      last iterator. */
-      return TRUE;
+      last iterator. However, if the group was entered as a result of checking
+      a previous iterator, this is not possible. */
+
+      return !entered_a_group;
        }
  
      code += PRIV(OP_lengths)[c];
@@ -3145,10 +3209,13 @@ for(;;)
  
      while (*next_code == OP_ALT)
        {
-      if (!compare_opcodes(code, utf, cd, base_list, base_end)) return FALSE;
+      if (!compare_opcodes(code, utf, cd, base_list, base_end, rec_limit))
+        return FALSE;
        code = next_code + 1 + LINK_SIZE;
        next_code += GET(next_code, 1);
        }
+
+    entered_a_group = TRUE;
      continue;
  
      case OP_BRAZERO:
@@ -3163,11 +3230,14 @@ for(;;)
      /* The bracket content will be checked by the
      OP_BRA/OP_CBRA case above. */
      next_code += 1 + LINK_SIZE;
-    if (!compare_opcodes(next_code, utf, cd, base_list, base_end))
+    if (!compare_opcodes(next_code, utf, cd, base_list, base_end, rec_limit))
        return FALSE;
  
      code += PRIV(OP_lengths)[c];
      continue;
+
+    default:
+    break;
      }
  
    /* Check for a supported opcode, and load its properties. */
@@ -3406,8 +3476,7 @@ for(;;)
             rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
             autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
  
-    if (!accepted)
-      return FALSE;
+    if (!accepted) return FALSE;
  
      if (list[1] == 0) return TRUE;
      /* Might be an empty repeat. */
@@ -3594,11 +3663,20 @@ register pcre_uchar c;
  const pcre_uchar *end;
  pcre_uchar *repeat_opcode;
  pcre_uint32 list[8];
+int rec_limit;
  
  for (;;)
    {
    c = *code;
  
+  /* When a pattern with bad UTF-8 encoding is compiled with NO_UTF_CHECK,
+  it may compile without complaining, but may get into a loop here if the code
+  pointer points to a bad value. This is, of course, a documented possibility,
+  when NO_UTF_CHECK is set, so it isn't a bug, but we can detect this case and
+  just give up on this optimization. */
+
+  if (c >= OP_TABLE_LENGTH) return;
+
    if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
      {
      c -= get_repeat_base(c) - OP_STAR;
@@ -3606,7 +3684,8 @@ for (;;)
        get_chr_property_list(code, utf, cd->fcc, list) : NULL;
      list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
  
-    if (end != NULL && compare_opcodes(end, utf, cd, list, end))
+    rec_limit = 1000;
+    if (end != NULL && compare_opcodes(end, utf, cd, list, end, &rec_limit))
        {
        switch(c)
          {
@@ -3662,7 +3741,8 @@ for (;;)
  
        list[1] = (c & 1) == 0;
  
-      if (compare_opcodes(end, utf, cd, list, end))
+      rec_limit = 1000;
+      if (compare_opcodes(end, utf, cd, list, end, &rec_limit))
          {
          switch (c)
            {
@@ -3826,11 +3906,11 @@ didn't consider this to be a POSIX class. Likewise for [:1234:].
  The problem in trying to be exactly like Perl is in the handling of escapes. We
  have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
  class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
-below handles the special case of \], but does not try to do any other escape
-processing. This makes it different from Perl for cases such as [:l\ower:]
-where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
-"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
-I think.
+below handles the special cases \\ and \], but does not try to do any other
+escape processing. This makes it different from Perl for cases such as
+[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
+not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
+when Perl does, I think.
  
  A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
  It seems that the appearance of a nested POSIX class supersedes an apparent
@@ -3857,21 +3937,16 @@ pcre_uchar terminator;          /* Don't combine these lines; the Solaris cc */
  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
  for (++ptr; *ptr != CHAR_NULL; ptr++)
    {
-  if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
+  if (*ptr == CHAR_BACKSLASH &&
+      (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET ||
+       ptr[1] == CHAR_BACKSLASH))
      ptr++;
-  else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
-  else
+  else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
+            *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
+  else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
      {
-    if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
-      {
-      *endptr = ptr;
-      return TRUE;
-      }
-    if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
-         (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
-          ptr[1] == CHAR_EQUALS_SIGN) &&
-        check_posix_syntax(ptr, endptr))
-      return FALSE;
+    *endptr = ptr;
+    return TRUE;
      }
    }
  return FALSE;
@@ -3925,48 +4000,42 @@ have their offsets adjusted. That one of the jobs of this function. Before it
  is called, the partially compiled regex must be temporarily terminated with
  OP_END.
  
-This function has been extended with the possibility of forward references for
-recursions and subroutine calls. It must also check the list of such references
-for the group we are dealing with. If it finds that one of the recursions in
-the current group is on this list, it adjusts the offset in the list, not the
-value in the reference (which is a group number).
+This function has been extended to cope with forward references for recursions
+and subroutine calls. It must check the list of such references for the
+group we are dealing with. If it finds that one of the recursions in the
+current group is on this list, it does not adjust the value in the reference
+(which is a group number). After the group has been scanned, all the offsets in
+the forward reference list for the group are adjusted.
  
  Arguments:
    group      points to the start of the group
    adjust     the amount by which the group is to be moved
    utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode
    cd         contains pointers to tables etc.
-  save_hwm   the hwm forward reference pointer at the start of the group
+  save_hwm_offset   the hwm forward reference offset at the start of the group
  
  Returns:     nothing
  */
  
  static void
  adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
-  pcre_uchar *save_hwm)
+  size_t save_hwm_offset)
  {
+int offset;
+pcre_uchar *hc;
  pcre_uchar *ptr = group;
  
  while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
    {
-  int offset;
-  pcre_uchar *hc;
-
-  /* See if this recursion is on the forward reference list. If so, adjust the
-  reference. */
-
-  for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
+  for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
+       hc += LINK_SIZE)
      {
      offset = (int)GET(hc, 0);
-    if (cd->start_code + offset == ptr + 1)
-      {
-      PUT(hc, 0, offset + adjust);
-      break;
-      }
+    if (cd->start_code + offset == ptr + 1) break;
      }
  
-  /* Otherwise, adjust the recursion offset if it's after the start of this
-  group. */
+  /* If we have not found this recursion on the forward reference list, adjust
+  the recursion's offset if it's after the start of this group. */
  
    if (hc >= cd->hwm)
      {
@@ -3976,6 +4045,15 @@ while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
  
    ptr += 1 + LINK_SIZE;
    }
+
+/* Now adjust all forward reference offsets for the group. */
+
+for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
+     hc += LINK_SIZE)
+  {
+  offset = (int)GET(hc, 0);
+  PUT(hc, 0, offset + adjust);
+  }
  }
  
  
@@ -4160,7 +4238,11 @@ if ((options & PCRE_CASELESS) != 0)
        range. Otherwise, use a recursive call to add the additional range. */
  
        else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
-      else if (od > end && oc <= end + 1) end = od;       /* Extend upwards */
+      else if (od > end && oc <= end + 1)
+        {
+        end = od;       /* Extend upwards */
+        if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
+        }
        else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
        }
      }
@@ -4400,7 +4482,7 @@ const pcre_uchar *tempptr;
  const pcre_uchar *nestptr = NULL;
  pcre_uchar *previous = NULL;
  pcre_uchar *previous_callout = NULL;
-pcre_uchar *save_hwm = NULL;
+size_t item_hwm_offset = 0;
  pcre_uint8 classbits[32];
  
  /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
@@ -4510,7 +4592,8 @@ for (;; ptr++)
      if (code > cd->start_workspace + cd->workspace_size -
          WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
        {
-      *errorcodeptr = ERR52;
+      *errorcodeptr = (code >= cd->start_workspace + cd->workspace_size)?
+        ERR52 : ERR87;
        goto FAILED;
        }
  
@@ -4558,8 +4641,7 @@ for (;; ptr++)
    /* In the real compile phase, just check the workspace used by the forward
    reference list. */
  
-  else if (cd->hwm > cd->start_workspace + cd->workspace_size -
-           WORK_SIZE_SAFETY_MARGIN)
+  else if (cd->hwm > cd->start_workspace + cd->workspace_size)
      {
      *errorcodeptr = ERR52;
      goto FAILED;
@@ -4680,7 +4762,8 @@ for (;; ptr++)
      previous = NULL;
      if ((options & PCRE_MULTILINE) != 0)
        {
-      if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
+      if (firstcharflags == REQ_UNSET)
+        zerofirstcharflags = firstcharflags = REQ_NONE;
        *code++ = OP_CIRCM;
        }
      else *code++ = OP_CIRC;
@@ -4701,6 +4784,7 @@ for (;; ptr++)
      zeroreqchar = reqchar;
      zeroreqcharflags = reqcharflags;
      previous = code;
+    item_hwm_offset = cd->hwm - cd->start_workspace;
      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
      break;
  
@@ -4752,6 +4836,7 @@ for (;; ptr++)
      /* Handle a real character class. */
  
      previous = code;
+    item_hwm_offset = cd->hwm - cd->start_workspace;
  
      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
      they are encountered at the top level, so we'll do that too. */
@@ -4857,10 +4942,11 @@ for (;; ptr++)
        (which is on the stack). We have to remember that there was XCLASS data,
        however. */
  
+      if (class_uchardata > class_uchardata_base) xclass = TRUE;
+
        if (lengthptr != NULL && class_uchardata > class_uchardata_base)
          {
-        xclass = TRUE;
-        *lengthptr += class_uchardata - class_uchardata_base;
+        *lengthptr += (int)(class_uchardata - class_uchardata_base);
          class_uchardata = class_uchardata_base;
          }
  #endif
@@ -4962,10 +5048,26 @@ for (;; ptr++)
              ptr = tempptr + 1;
              continue;
  
-            /* For all other POSIX classes, no special action is taken in UCP
-            mode. Fall through to the non_UCP case. */
+            /* For the other POSIX classes (ascii, xdigit) we are going to fall
+            through to the non-UCP case and build a bit map for characters with
+            code points less than 256. If we are in a negated POSIX class
+            within a non-negated overall class, characters with code points
+            greater than 255 must all match. In the special case where we have
+            not yet generated any xclass data, and this is the final item in
+            the overall class, we need do nothing: later on, the opcode
+            OP_NCLASS will be used to indicate that characters greater than 255
+            are acceptable. If we have already seen an xclass item or one may
+            follow (we have to assume that it might if this is not the end of
+            the class), explicitly match all wide codepoints. */
  
              default:
+            if (!negate_class && local_negate &&
+                (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET))
+              {
+              *class_uchardata++ = XCL_RANGE;
+              class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
+              }
              break;
              }
            }
@@ -5129,9 +5231,9 @@ for (;; ptr++)
                cd, PRIV(vspace_list));
              continue;
  
-#ifdef SUPPORT_UCP
              case ESC_p:
              case ESC_P:
+#ifdef SUPPORT_UCP
                {
                BOOL negated;
                unsigned int ptype = 0, pdata = 0;
@@ -5145,6 +5247,9 @@ for (;; ptr++)
                class_has_8bitchar--;                /* Undo! */
                continue;
                }
+#else
+            *errorcodeptr = ERR45;
+            goto FAILED;
  #endif
              /* Unrecognized escapes are faulted if PCRE is running in its
              strict mode. By default, for compatibility with Perl, they are
@@ -5301,16 +5406,20 @@ for (;; ptr++)
        CLASS_SINGLE_CHARACTER:
        if (class_one_char < 2) class_one_char++;
  
-      /* If class_one_char is 1, we have the first single character in the
-      class, and there have been no prior ranges, or XCLASS items generated by
-      escapes. If this is the final character in the class, we can optimize by
-      turning the item into a 1-character OP_CHAR[I] if it's positive, or
-      OP_NOT[I] if it's negative. In the positive case, it can cause firstchar
-      to be set. Otherwise, there can be no first char if this item is first,
-      whatever repeat count may follow. In the case of reqchar, save the
-      previous value for reinstating. */
+      /* If xclass_has_prop is false and class_one_char is 1, we have the first
+      single character in the class, and there have been no prior ranges, or
+      XCLASS items generated by escapes. If this is the final character in the
+      class, we can optimize by turning the item into a 1-character OP_CHAR[I]
+      if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
+      can cause firstchar to be set. Otherwise, there can be no first char if
+      this item is first, whatever repeat count may follow. In the case of
+      reqchar, save the previous value for reinstating. */
  
-      if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
+      if (!inescq &&
+#ifdef SUPPORT_UCP
+          !xclass_has_prop &&
+#endif
+          class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
          {
          ptr++;
          zeroreqchar = reqchar;
@@ -5426,9 +5535,10 @@ for (;; ptr++)
      actual compiled code. */
  
  #ifdef SUPPORT_UTF
-    if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
+    if (xclass && (xclass_has_prop || !should_flip_negation ||
+        (options & PCRE_UCP) != 0))
  #elif !defined COMPILE_PCRE8
-    if (xclass && !should_flip_negation)
+    if (xclass && (xclass_has_prop || !should_flip_negation))
  #endif
  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
        {
@@ -5458,6 +5568,12 @@ for (;; ptr++)
        PUT(previous, 1, (int)(code - previous));
        break;   /* End of class handling */
        }
+
+    /* Even though any XCLASS list is now discarded, we must allow for
+    its memory. */
+
+    if (lengthptr != NULL)
+      *lengthptr += (int)(class_uchardata - class_uchardata_base);
  #endif
  
      /* If there are no characters > 255, or they are all to be included or
@@ -5858,6 +5974,7 @@ for (;; ptr++)
        {
        register int i;
        int len = (int)(code - previous);
+      size_t base_hwm_offset = item_hwm_offset;
        pcre_uchar *bralink = NULL;
        pcre_uchar *brazeroptr = NULL;
  
@@ -5912,7 +6029,7 @@ for (;; ptr++)
          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
            {
            *code = OP_END;
-          adjust_recurse(previous, 1, utf, cd, save_hwm);
+          adjust_recurse(previous, 1, utf, cd, item_hwm_offset);
            memmove(previous + 1, previous, IN_UCHARS(len));
            code++;
            if (repeat_max == 0)
@@ -5936,7 +6053,7 @@ for (;; ptr++)
            {
            int offset;
            *code = OP_END;
-          adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
+          adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, item_hwm_offset);
            memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
            code += 2 + LINK_SIZE;
            *previous++ = OP_BRAZERO + repeat_type;
@@ -5999,26 +6116,25 @@ for (;; ptr++)
              for (i = 1; i < repeat_min; i++)
                {
                pcre_uchar *hc;
-              pcre_uchar *this_hwm = cd->hwm;
+              size_t this_hwm_offset = cd->hwm - cd->start_workspace;
                memcpy(code, previous, IN_UCHARS(len));
  
                while (cd->hwm > cd->start_workspace + cd->workspace_size -
-                     WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
+                     WORK_SIZE_SAFETY_MARGIN -
+                     (this_hwm_offset - base_hwm_offset))
                  {
-                int save_offset = save_hwm - cd->start_workspace;
-                int this_offset = this_hwm - cd->start_workspace;
                  *errorcodeptr = expand_workspace(cd);
                  if (*errorcodeptr != 0) goto FAILED;
-                save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
-                this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
                  }
  
-              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
+              for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
+                   hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
+                   hc += LINK_SIZE)
                  {
                  PUT(cd->hwm, 0, GET(hc, 0) + len);
                  cd->hwm += LINK_SIZE;
                  }
-              save_hwm = this_hwm;
+              base_hwm_offset = this_hwm_offset;
                code += len;
                }
              }
@@ -6063,7 +6179,7 @@ for (;; ptr++)
          else for (i = repeat_max - 1; i >= 0; i--)
            {
            pcre_uchar *hc;
-          pcre_uchar *this_hwm = cd->hwm;
+          size_t this_hwm_offset = cd->hwm - cd->start_workspace;
  
            *code++ = OP_BRAZERO + repeat_type;
  
@@ -6085,22 +6201,21 @@ for (;; ptr++)
            copying them. */
  
            while (cd->hwm > cd->start_workspace + cd->workspace_size -
-                 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
+                 WORK_SIZE_SAFETY_MARGIN -
+                 (this_hwm_offset - base_hwm_offset))
              {
-            int save_offset = save_hwm - cd->start_workspace;
-            int this_offset = this_hwm - cd->start_workspace;
              *errorcodeptr = expand_workspace(cd);
              if (*errorcodeptr != 0) goto FAILED;
-            save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
-            this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
              }
  
-          for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
+          for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
+               hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
+               hc += LINK_SIZE)
              {
              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
              cd->hwm += LINK_SIZE;
              }
-          save_hwm = this_hwm;
+          base_hwm_offset = this_hwm_offset;
            code += len;
            }
  
@@ -6183,6 +6298,12 @@ for (;; ptr++)
              while (*scode == OP_ALT);
              }
  
+          /* A conditional group with only one branch has an implicit empty
+          alternative branch. */
+
+          if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
+            *bracode = OP_SCOND;
+
            /* Handle possessive quantifiers. */
  
            if (possessive_quantifier)
@@ -6196,11 +6317,11 @@ for (;; ptr++)
                {
                int nlen = (int)(code - bracode);
                *code = OP_END;
-              adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
+              adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
                memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
                code += 1 + LINK_SIZE;
                nlen += 1 + LINK_SIZE;
-              *bracode = OP_BRAPOS;
+              *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
                *code++ = OP_KETRPOS;
                PUTINC(code, 0, nlen);
                PUT(bracode, 1, nlen);
@@ -6330,7 +6451,7 @@ for (;; ptr++)
          else
            {
            *code = OP_END;
-          adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
+          adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
            memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
            code += 1 + LINK_SIZE;
            len += 1 + LINK_SIZE;
@@ -6379,7 +6500,7 @@ for (;; ptr++)
  
          default:
          *code = OP_END;
-        adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
+        adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
          memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
          code += 1 + LINK_SIZE;
          len += 1 + LINK_SIZE;
@@ -6408,15 +6529,25 @@ for (;; ptr++)
      parenthesis forms.  */
  
      case CHAR_LEFT_PARENTHESIS:
-    newoptions = options;
-    skipbytes = 0;
-    bravalue = OP_CBRA;
-    save_hwm = cd->hwm;
-    reset_bracount = FALSE;
+    ptr++;
  
-    /* First deal with various "verbs" that can be introduced by '*'. */
+    /* First deal with comments. Putting this code right at the start ensures
+    that comments have no bad side effects. */
+
+    if (ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
+      {
+      ptr += 2;
+      while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
+      if (*ptr == CHAR_NULL)
+        {
+        *errorcodeptr = ERR18;
+        goto FAILED;
+        }
+      continue;
+      }
+
+    /* Now deal with various "verbs" that can be introduced by '*'. */
  
-    ptr++;
      if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
           || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
        {
@@ -6475,8 +6606,21 @@ for (;; ptr++)
              cd->had_accept = TRUE;
              for (oc = cd->open_caps; oc != NULL; oc = oc->next)
                {
-              *code++ = OP_CLOSE;
-              PUT2INC(code, 0, oc->number);
+              if (lengthptr != NULL)
+                {
+#ifdef COMPILE_PCRE8
+                *lengthptr += 1 + IMM2_SIZE;
+#elif defined COMPILE_PCRE16
+                *lengthptr += 2 + IMM2_SIZE;
+#elif defined COMPILE_PCRE32
+                *lengthptr += 4 + IMM2_SIZE;
+#endif
+                }
+              else
+                {
+                *code++ = OP_CLOSE;
+                PUT2INC(code, 0, oc->number);
+                }
                }
              setverb = *code++ =
                (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
@@ -6505,9 +6649,17 @@ for (;; ptr++)
                goto FAILED;
                }
              setverb = *code++ = verbs[i].op_arg;
-            *code++ = arglen;
-            memcpy(code, arg, IN_UCHARS(arglen));
-            code += arglen;
+            if (lengthptr != NULL)    /* In pass 1 just add in the length */
+              {                       /* to avoid potential workspace */
+              *lengthptr += arglen;   /* overflow. */
+              *code++ = 0;
+              }
+            else
+              {
+              *code++ = arglen;
+              memcpy(code, arg, IN_UCHARS(arglen));
+              code += arglen;
+              }
              *code++ = 0;
              }
  
@@ -6537,10 +6689,18 @@ for (;; ptr++)
        goto FAILED;
        }
  
+    /* Initialize for "real" parentheses */
+
+    newoptions = options;
+    skipbytes = 0;
+    bravalue = OP_CBRA;
+    item_hwm_offset = cd->hwm - cd->start_workspace;
+    reset_bracount = FALSE;
+
      /* Deal with the extended parentheses; all are introduced by '?', and the
      appearance of any of them means that this is not a capturing group. */
  
-    else if (*ptr == CHAR_QUESTION_MARK)
+    if (*ptr == CHAR_QUESTION_MARK)
        {
        int i, set, unset, namelen;
        int *optset;
@@ -6549,20 +6709,10 @@ for (;; ptr++)
  
        switch (*(++ptr))
          {
-        case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
-        ptr++;
-        while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
-        if (*ptr == CHAR_NULL)
-          {
-          *errorcodeptr = ERR18;
-          goto FAILED;
-          }
-        continue;
-
-
          /* ------------------------------------------------------------ */
          case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
          reset_bracount = TRUE;
+        cd->dupgroups = TRUE;     /* Record (?| encountered */
          /* Fall through */
  
          /* ------------------------------------------------------------ */
@@ -6608,8 +6758,13 @@ for (;; ptr++)
          if (tempptr[1] == CHAR_QUESTION_MARK &&
                (tempptr[2] == CHAR_EQUALS_SIGN ||
                 tempptr[2] == CHAR_EXCLAMATION_MARK ||
-               tempptr[2] == CHAR_LESS_THAN_SIGN))
+                 (tempptr[2] == CHAR_LESS_THAN_SIGN &&
+                   (tempptr[3] == CHAR_EQUALS_SIGN ||
+                    tempptr[3] == CHAR_EXCLAMATION_MARK))))
+          {
+          cd->iscondassert = TRUE;
            break;
+          }
  
          /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
          need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
@@ -6658,6 +6813,12 @@ for (;; ptr++)
            {
            while (IS_DIGIT(*ptr))
              {
+            if (recno > INT_MAX / 10 - 1)  /* Integer overflow */
+              {
+              while (IS_DIGIT(*ptr)) ptr++;
+              *errorcodeptr = ERR61;
+              goto FAILED;
+              }
              recno = recno * 10 + (int)(*ptr - CHAR_0);
              ptr++;
              }
@@ -6686,7 +6847,7 @@ for (;; ptr++)
              ptr++;
              }
            namelen = (int)(ptr - name);
-          if (lengthptr != NULL) *lengthptr += IMM2_SIZE;
+          if (lengthptr != NULL) skipbytes += IMM2_SIZE;
            }
  
          /* Check the terminator */
@@ -6722,6 +6883,7 @@ for (;; ptr++)
              goto FAILED;
              }
            PUT2(code, 2+LINK_SIZE, recno);
+          if (recno > cd->top_backref) cd->top_backref = recno;
            break;
            }
  
@@ -6744,12 +6906,15 @@ for (;; ptr++)
            int offset = i++;
            int count = 1;
            recno = GET2(slot, 0);   /* Number from first found */
+          if (recno > cd->top_backref) cd->top_backref = recno;
            for (; i < cd->names_found; i++)
              {
              slot += cd->name_entry_size;
-            if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break;
+            if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0 ||
+              (slot+IMM2_SIZE)[namelen] != 0) break;
              count++;
              }
+
            if (count > 1)
              {
              PUT2(code, 2+LINK_SIZE, offset);
@@ -6788,6 +6953,11 @@ for (;; ptr++)
                *errorcodeptr = ERR15;
                goto FAILED;
                }
+            if (recno > INT_MAX / 10 - 1)   /* Integer overflow */
+              {
+              *errorcodeptr = ERR61;
+              goto FAILED;
+              }
              recno = recno * 10 + name[i] - CHAR_0;
              }
            if (recno == 0) recno = RREF_ANY;
@@ -7064,6 +7234,7 @@ for (;; ptr++)
          if (lengthptr != NULL)
            {
            named_group *ng;
+          recno = 0;
  
            if (namelen == 0)
              {
@@ -7081,23 +7252,65 @@ for (;; ptr++)
              goto FAILED;
              }
  
-          /* The name table does not exist in the first pass; instead we must
-          scan the list of names encountered so far in order to get the
-          number. If the name is not found, set the value to 0 for a forward
-          reference. */
-
-          ng = cd->named_groups;
-          for (i = 0; i < cd->names_found; i++, ng++)
-            {
-            if (namelen == ng->length &&
-                STRNCMP_UC_UC(name, ng->name, namelen) == 0)
-              break;
-            }
-          recno = (i < cd->names_found)? ng->number : 0;
-
            /* Count named back references. */
  
            if (!is_recurse) cd->namedrefcount++;
+
+          /* We have to allow for a named reference to a duplicated name (this
+          cannot be determined until the second pass). This needs an extra
+          16-bit data item. */
+
+          *lengthptr += IMM2_SIZE;
+
+          /* If this is a forward reference and we are within a (?|...) group,
+          the reference may end up as the number of a group which we are
+          currently inside, that is, it could be a recursive reference. In the
+          real compile this will be picked up and the reference wrapped with
+          OP_ONCE to make it atomic, so we allow space in case this occurs. */
+
+          /* In fact, this can happen for a non-forward reference because
+          another group with the same number might be created later. This
+          issue is fixed "properly" in PCRE2. As PCRE1 is now in maintenance
+          only mode, we finesse the bug by allowing more memory always. */
+
+          *lengthptr += 2 + 2*LINK_SIZE;
+
+          /* It is even worse than that. The current reference may be to an
+          existing named group with a different number (so apparently not
+          recursive) but which later on is also attached to a group with the
+          current number. This can only happen if (?| has been previously
+          encountered. In that case, we allow yet more memory, just in case.
+          (Again, this is fixed "properly" in PCRE2.) */
+
+          if (cd->dupgroups) *lengthptr += 4 + 4*LINK_SIZE;
+
+          /* Otherwise, check for recursion here. The name table does not exist
+          in the first pass; instead we must scan the list of names encountered
+          so far in order to get the number. If the name is not found, leave
+          the value of recno as 0 for a forward reference. */
+
+          else
+            {
+            ng = cd->named_groups;
+            for (i = 0; i < cd->names_found; i++, ng++)
+              {
+              if (namelen == ng->length &&
+                  STRNCMP_UC_UC(name, ng->name, namelen) == 0)
+                {
+                open_capitem *oc;
+                recno = ng->number;
+                if (is_recurse) break;
+                for (oc = cd->open_caps; oc != NULL; oc = oc->next)
+                  {
+                  if (oc->number == recno)
+                    {
+                    oc->flag = TRUE;
+                    break;
+                    }
+                  }
+                }
+              }
+            }
            }
  
          /* In the real compile, search the name table. We check the name
@@ -7152,6 +7365,7 @@ for (;; ptr++)
              {
              if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
              previous = code;
+            item_hwm_offset = cd->hwm - cd->start_workspace;
              *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
              PUT2INC(code, 0, index);
              PUT2INC(code, 0, count);
@@ -7189,9 +7403,14 @@ for (;; ptr++)
  
  
          /* ------------------------------------------------------------ */
-        case CHAR_R:              /* Recursion */
-        ptr++;                    /* Same as (?0)      */
-        /* Fall through */
+        case CHAR_R:              /* Recursion, same as (?0) */
+        recno = 0;
+        if (*(++ptr) != CHAR_RIGHT_PARENTHESIS)
+          {
+          *errorcodeptr = ERR29;
+          goto FAILED;
+          }
+        goto HANDLE_RECURSION;
  
  
          /* ------------------------------------------------------------ */
@@ -7228,7 +7447,15 @@ for (;; ptr++)
  
            recno = 0;
            while(IS_DIGIT(*ptr))
+            {
+            if (recno > INT_MAX / 10 - 1) /* Integer overflow */
+              {
+              while (IS_DIGIT(*ptr)) ptr++;
+              *errorcodeptr = ERR61;
+              goto FAILED;
+              }
              recno = recno * 10 + *ptr++ - CHAR_0;
+            }
  
            if (*ptr != (pcre_uchar)terminator)
              {
@@ -7265,6 +7492,7 @@ for (;; ptr++)
            HANDLE_RECURSION:
  
            previous = code;
+          item_hwm_offset = cd->hwm - cd->start_workspace;
            called = cd->start_code;
  
            /* When we are actually compiling, find the bracket that is being
@@ -7452,12 +7680,26 @@ for (;; ptr++)
        goto FAILED;
        }
  
-    /* Assertions used not to be repeatable, but this was changed for Perl
-    compatibility, so all kinds can now be repeated. We copy code into a
+    /* All assertions used not to be repeatable, but this was changed for Perl
+    compatibility. All kinds can now be repeated except for assertions that are
+    conditions (Perl also forbids these to be repeated). We copy code into a
      non-register variable (tempcode) in order to be able to pass its address
-    because some compilers complain otherwise. */
+    because some compilers complain otherwise. At the start of a conditional
+    group whose condition is an assertion, cd->iscondassert is set. We unset it
+    here so as to allow assertions later in the group to be quantified. */
+
+    if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT &&
+        cd->iscondassert)
+      {
+      previous = NULL;
+      cd->iscondassert = FALSE;
+      }
+    else
+      {
+      previous = code;
+      item_hwm_offset = cd->hwm - cd->start_workspace;
+      }
  
-    previous = code;                      /* For handling repetition */
      *code = bravalue;
      tempcode = code;
      tempreqvary = cd->req_varyopt;        /* Save value before bracket */
@@ -7704,7 +7946,7 @@ for (;; ptr++)
          const pcre_uchar *p;
          pcre_uint32 cf;
  
-        save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
+        item_hwm_offset = cd->hwm - cd->start_workspace;   /* Normally this is set when '(' is read */
          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
  
@@ -7733,7 +7975,7 @@ for (;; ptr++)
          if (*p != (pcre_uchar)terminator)
            {
            *errorcodeptr = ERR57;
-          break;
+          goto FAILED;
            }
          ptr++;
          goto HANDLE_NUMERICAL_RECURSION;
@@ -7748,7 +7990,7 @@ for (;; ptr++)
            ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
            {
            *errorcodeptr = ERR69;
-          break;
+          goto FAILED;
            }
          is_recurse = FALSE;
          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
@@ -7772,6 +8014,7 @@ for (;; ptr++)
          HANDLE_REFERENCE:
          if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
          previous = code;
+        item_hwm_offset = cd->hwm - cd->start_workspace;
          *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
          PUT2INC(code, 0, recno);
          cd->backref_map |= (recno < 32)? (1 << recno) : 1;
@@ -7801,6 +8044,7 @@ for (;; ptr++)
          if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
            goto FAILED;
          previous = code;
+        item_hwm_offset = cd->hwm - cd->start_workspace;
          *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
          *code++ = ptype;
          *code++ = pdata;
@@ -7841,6 +8085,7 @@ for (;; ptr++)
  
            {
            previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
+          item_hwm_offset = cd->hwm - cd->start_workspace;
            *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
            }
          }
@@ -7884,6 +8129,7 @@ for (;; ptr++)
  
      ONE_CHAR:
      previous = code;
+    item_hwm_offset = cd->hwm - cd->start_workspace;
  
      /* For caseless UTF-8 mode when UCP support is available, check whether
      this character has more than one other case. If so, generate a special
@@ -8031,6 +8277,7 @@ int length;
  unsigned int orig_bracount;
  unsigned int max_bracount;
  branch_chain bc;
+size_t save_hwm_offset;
  
  /* If set, call the external function that checks for stack availability. */
  
@@ -8048,6 +8295,8 @@ bc.current_branch = code;
  firstchar = reqchar = 0;
  firstcharflags = reqcharflags = REQ_UNSET;
  
+save_hwm_offset = cd->hwm - cd->start_workspace;
+
  /* Accumulate the length for use in the pre-compile phase. Start with the
  length of the BRA and KET and any extra bytes that are required at the
  beginning. We accumulate in a local variable to save frequent testing of
@@ -8189,7 +8438,7 @@ for (;;)
        int fixed_length;
        *code = OP_END;
        fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
-        FALSE, cd);
+        FALSE, cd, NULL);
        DPRINTF(("fixed length = %d\n", fixed_length));
        if (fixed_length == -3)
          {
@@ -8241,12 +8490,16 @@ for (;;)
  
      /* If it was a capturing subpattern, check to see if it contained any
      recursive back references. If so, we must wrap it in atomic brackets.
-    In any event, remove the block from the chain. */
+    Because we are moving code along, we must ensure that any pending recursive
+    references are updated. In any event, remove the block from the chain. */
  
      if (capnumber > 0)
        {
        if (cd->open_caps->flag)
          {
+        *code = OP_END;
+        adjust_recurse(start_bracket, 1 + LINK_SIZE,
+          (options & PCRE_UTF8) != 0, cd, save_hwm_offset);
          memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
            IN_UCHARS(code - start_bracket));
          *start_bracket = OP_ONCE;
@@ -8470,6 +8723,7 @@ do {
         case OP_RREF:
         case OP_DNRREF:
         case OP_DEF:
+       case OP_FAIL:
         return FALSE;
  
         default:     /* Assertion */
@@ -9051,9 +9305,11 @@ cd->names_found = 0;
  cd->name_entry_size = 0;
  cd->name_table = NULL;
  cd->dupnames = FALSE;
+cd->dupgroups = FALSE;
  cd->namedrefcount = 0;
  cd->start_code = cworkspace;
  cd->hwm = cworkspace;
+cd->iscondassert = FALSE;
  cd->start_workspace = cworkspace;
  cd->workspace_size = COMPILE_WORK_SIZE;
  cd->named_groups = named_groups;
@@ -9091,13 +9347,6 @@ if (length > MAX_PATTERN_SIZE)
    goto PCRE_EARLY_ERROR_RETURN;
    }
  
-/* If there are groups with duplicate names and there are also references by
-name, we must allow for the possibility of named references to duplicated
-groups. These require an extra data item each. */
-
-if (cd->dupnames && cd->namedrefcount > 0)
-  length += cd->namedrefcount * IMM2_SIZE * sizeof(pcre_uchar);
-
  /* Compute the size of the data block for storing the compiled pattern. Integer
  overflow should no longer be possible because nowadays we limit the maximum
  value of cd->names_found and cd->name_entry_size. */
@@ -9156,6 +9405,7 @@ cd->name_table = (pcre_uchar *)re + re->name_table_offset;
  codestart = cd->name_table + re->name_entry_size * re->name_count;
  cd->start_code = codestart;
  cd->hwm = (pcre_uchar *)(cd->start_workspace);
+cd->iscondassert = FALSE;
  cd->req_varyopt = 0;
  cd->had_accept = FALSE;
  cd->had_pruneorskip = FALSE;
@@ -9228,6 +9478,16 @@ if (cd->hwm > cd->start_workspace)
      int offset, recno;
      cd->hwm -= LINK_SIZE;
      offset = GET(cd->hwm, 0);
+
+    /* Check that the hwm handling hasn't gone wrong. This whole area is
+    rewritten in PCRE2 because there are some obscure cases. */
+
+    if (offset == 0 || codestart[offset-1] != OP_RECURSE)
+      {
+      errorcode = ERR10;
+      break;
+      }
+
      recno = GET(codestart, offset);
      if (recno != prev_recno)
        {
@@ -9251,11 +9511,18 @@ subpattern. */
  
  if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
  
-/* Unless disabled, check whether single character iterators can be
-auto-possessified. The function overwrites the appropriate opcode values. */
+/* Unless disabled, check whether any single character iterators can be
+auto-possessified. The function overwrites the appropriate opcode values, so
+the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
+used in this code because at least one compiler gives a warning about loss of
+"const" attribute if the cast (pcre_uchar *)codestart is used directly in the
+function call. */
  
-if ((options & PCRE_NO_AUTO_POSSESS) == 0)
-  auto_possessify((pcre_uchar *)codestart, utf, cd);
+if (errorcode == 0 && (options & PCRE_NO_AUTO_POSSESS) == 0)
+  {
+  pcre_uchar *temp = (pcre_uchar *)codestart;
+  auto_possessify(temp, utf, cd);
+  }
  
  /* If there were any lookbehind assertions that contained OP_RECURSE
  (recursions or subroutine calls), a flag is set for them to be checked here,
@@ -9285,7 +9552,7 @@ if (errorcode == 0 && cd->check_lookbehind)
        int end_op = *be;
        *be = OP_END;
        fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
-        cd);
+        cd, NULL);
        *be = end_op;
        DPRINTF(("fixed length = %d\n", fixed_length));
        if (fixed_length < 0)
@@ -9478,4 +9745,3 @@ return (pcre32 *)re;
  }
  
  /* End of pcre_compile.c */
-