pcre_compile.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /* PCRE is a library of functions to support regular expressions whose syntax
   6 and semantics are as close as possible to those of the Perl 5 language.
   7
   8                        Written by Philip Hazel
   9            Copyright (c) 1997-2016 University of Cambridge
  10
  11 -----------------------------------------------------------------------------
  12 Redistribution and use in source and binary forms, with or without
  13 modification, are permitted provided that the following conditions are met:
  14
  15     * Redistributions of source code must retain the above copyright notice,
  16       this list of conditions and the following disclaimer.
  17
  18     * Redistributions in binary form must reproduce the above copyright
  19       notice, this list of conditions and the following disclaimer in the
  20       documentation and/or other materials provided with the distribution.
  21
  22     * Neither the name of the University of Cambridge nor the names of its
  23       contributors may be used to endorse or promote products derived from
  24       this software without specific prior written permission.
  25
  26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  36 POSSIBILITY OF SUCH DAMAGE.
  37 -----------------------------------------------------------------------------
  38 */
  39
  40
  41 /* This module contains the external function pcre_compile(), along with
  42 supporting internal functions that are not used by other modules. */
  43
  44
  45 #ifdef HAVE_CONFIG_H
  46 #include "config.h"
  47 #endif
  48
  /* These three names are set up ahead of the pcre_internal.h include that
  follows; presumably macros in that header refer to them - confirm there. */

  #define NLBLOCK cd             /* The block that holds the newline details */
  #define PSSTART start_pattern  /* The field that holds the pattern's start */
  #define PSEND   end_pattern    /* The field that holds the pattern's end */
  52
  53 #include "pcre_internal.h"
  54
  55
  56 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
  57 is also used by pcretest. PCRE_DEBUG is not defined when building a production
  58 library. We do not need to select pcre16_printint.c specially, because the
  59 COMPILE_PCREx macro will already be appropriately set. */
  60
  61 #ifdef PCRE_DEBUG
  62 /* pcre_printint.c should not include any headers */
  63 #define PCRE_INCLUDED
  64 #include "pcre_printint.c"
  65 #undef PCRE_INCLUDED
  66 #endif
  67
  68
  /* Macro for setting an individual bit in a class bitmap: a is the bitmap (an
  array of, or a pointer to, bytes) and b is the bit number. Bit b lives in byte
  b/8 at position b&7. The bitmap argument and the whole expansion are
  parenthesized so that an expression argument such as "map + 32" indexes the
  intended object, and so that the macro can be used wherever an expression is
  allowed. Note that b is evaluated twice, so it must not have side effects. */

  #define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)&7)))
  72
  /* Upper bound used when checking that the integer holding the compiled
  pattern length is not about to overflow. It is deliberately set a little below
  INT_MAX so that the bytes added for group terminators can be added in without
  a check of their own every time. */

  #define OFLOW_MAX (INT_MAX - 20)
  79
  80 /* Definitions to allow mutual recursion */
  81
  82 static int
  83   add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
  84     const pcre_uint32 *, unsigned int);
  85
  86 static BOOL
  87   compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
  88     pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
  89     compile_data *, int *);
  90
  91
  92
  93 /*************************************************
  94 *      Code parameters and static tables         *
  95 *************************************************/
  96
  /* COMPILE_WORK_SIZE is the size of the stack workspace for the first,
  pre-compile phase, whose job is to work out how much memory is required. The
  regex is partly compiled into this space, and compiled parts are thrown away
  as soon as possible, so an overrun should never happen; the code nevertheless
  checks for one. The largest amount seen in use is 218, so the figure is very
  generous.

  The second, real compile phase reuses the workspace to remember forward
  references to groups, which are filled in at the end. Each entry takes
  LINK_SIZE bytes, so even with LINK_SIZE set to 4 most patterns fit easily.
  Repeated forward references can fill it up, though: /(?1){0,1999}(b)/ is an
  example, and one user did hit the limit. In that situation the workspace is
  now expanded using malloc(). COMPILE_WORK_SIZE is therefore a minimum, and
  COMPILE_WORK_SIZE_MAX is the maximum imposed for safety. The minimum is
  expressed in terms of LINK_SIZE so that malloc() comes into play at the same
  number of forward references in all cases. */

  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
  #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
 117
 /* Number of slots in the initial vector that remembers named groups during
 the pre-compile. The vector starts out on the stack; when it proves too small
 it is expanded using malloc(), much as the workspace is. */

 #define NAMED_GROUP_LIST_SIZE  20
 124
 /* Safety margin for the overrun tests: they compare against a slightly
 smaller size, so that an overrun is detected before anything actually runs
 off the end of the data block. */

 #define WORK_SIZE_SAFETY_MARGIN (100)
 129
 /* Private flag bits that are added to firstchar and reqchar. */

 #define REQ_CASELESS    0x01            /* Marks caselessness */
 #define REQ_VARY        0x02            /* Reqchar followed non-literal item */

 /* Negative values that the firstchar and reqchar flags can take. */

 #define REQ_UNSET       (-2)
 #define REQ_NONE        (-1)
 137
 /* Flag for repeated characters. The suffix is written as a capital L so that
 it cannot be misread as the digit 1. */

 #define UTF_LENGTH     0x10000000L      /* The char contains its length. */
 141
 142 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
 143 are simple data values; negative values are for special things like \d and so
 144 on. Zero means further processing is needed (for things like \x), or the escape
 145 is invalid. */
 146
 147 #ifndef EBCDIC
 148
 149 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
 150 in UTF-8 mode. */
 151
 152 static const short int escapes[] = {
 153      0,                       0,
 154      0,                       0,
 155      0,                       0,
 156      0,                       0,
 157      0,                       0,
 158      CHAR_COLON,              CHAR_SEMICOLON,
 159      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
 160      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
 161      CHAR_COMMERCIAL_AT,      -ESC_A,
 162      -ESC_B,                  -ESC_C,
 163      -ESC_D,                  -ESC_E,
 164      0,                       -ESC_G,
 165      -ESC_H,                  0,
 166      0,                       -ESC_K,
 167      0,                       0,
 168      -ESC_N,                  0,
 169      -ESC_P,                  -ESC_Q,
 170      -ESC_R,                  -ESC_S,
 171      0,                       0,
 172      -ESC_V,                  -ESC_W,
 173      -ESC_X,                  0,
 174      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
 175      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
 176      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
 177      CHAR_GRAVE_ACCENT,       ESC_a,
 178      -ESC_b,                  0,
 179      -ESC_d,                  ESC_e,
 180      ESC_f,                   0,
 181      -ESC_h,                  0,
 182      0,                       -ESC_k,
 183      0,                       0,
 184      ESC_n,                   0,
 185      -ESC_p,                  0,
 186      ESC_r,                   -ESC_s,
 187      ESC_tee,                 0,
 188      -ESC_v,                  -ESC_w,
 189      0,                       0,
 190      -ESC_z
 191 };
 192
 193 #else
 194
 195 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
 196
 197 static const short int escapes[] = {
 198 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
 199 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
 200 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
 201 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
 202 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
 203 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
 204 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
 205 /*  80 */     0, ESC_a, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
 206 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
 207 /*  90 */     0,     0, -ESC_k,       0,      0, ESC_n,      0, -ESC_p,
 208 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
 209 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
 210 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
 211 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
 212 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
 213 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
 214 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
 215 /*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
 216 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
 217 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
 218 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
 219 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
 220 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
 221 };
 222
 /* In an EBCDIC environment we also need a table of the characters that may
 follow \c, for characters 0-31: the string holds 32 characters, followed by
 the usual terminating zero. Presumably the position of a character in the
 string is the value that it yields - confirm against the code that searches
 the table. The table is only ever looked up, so it is declared const. */

 static const unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
 227
 228 #endif
 229
 230
 /* Special "verbs" such as (*PRUNE) are described by a table of the items
 defined here, alongside a single string that holds all of the names. The table
 is short, so it is searched linearly. Keeping the names in one string cuts
 down the number of relocations when a shared library is dynamically linked;
 the string is built from string macros so that it works in UTF-8 mode on
 EBCDIC platforms. */

 struct verbitem {
   int   len;                 /* Length of the verb's name */
   int   op;                  /* Op without an argument; -1 if one is mandatory */
   int   op_arg;              /* Op with an argument; -1 if none is allowed */
 };

 typedef struct verbitem verbitem;
 242
 243 static const char verbnames[] =
 244   "\0"                       /* Empty name is a shorthand for MARK */
 245   STRING_MARK0
 246   STRING_ACCEPT0
 247   STRING_COMMIT0
 248   STRING_F0
 249   STRING_FAIL0
 250   STRING_PRUNE0
 251   STRING_SKIP0
 252   STRING_THEN;
 253
 254 static const verbitem verbs[] = {
 255   { 0, -1,        OP_MARK },
 256   { 4, -1,        OP_MARK },
 257   { 6, OP_ACCEPT, -1 },
 258   { 6, OP_COMMIT, -1 },
 259   { 1, OP_FAIL,   -1 },
 260   { 4, OP_FAIL,   -1 },
 261   { 5, OP_PRUNE,  OP_PRUNE_ARG },
 262   { 4, OP_SKIP,   OP_SKIP_ARG  },
 263   { 4, OP_THEN,   OP_THEN_ARG  }
 264 };
 265
 266 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
 267
 268
 269 /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
 270 another regex library. */
 271
 272 static const pcre_uchar sub_start_of_word[] = {
 273   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
 274   CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
 275
 276 static const pcre_uchar sub_end_of_word[] = {
 277   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
 278   CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
 279   CHAR_RIGHT_PARENTHESIS, '\0' };
 280
 281
 282 /* Tables of names of POSIX character classes and their lengths. The names are
 283 now all in a single string, to reduce the number of relocations when a shared
 284 library is dynamically loaded. The list of lengths is terminated by a zero
 285 length entry. The first three must be alpha, lower, upper, as this is assumed
 286 for handling case independence. The indices for graph, print, and punct are
 287 needed, so identify them. */
 288
 289 static const char posix_names[] =
 290   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
 291   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
 292   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
 293   STRING_word0  STRING_xdigit;
 294
 295 static const pcre_uint8 posix_name_lengths[] = {
 296   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
 297
 298 #define PC_GRAPH  8
 299 #define PC_PRINT  9
 300 #define PC_PUNCT 10
 301
 302
 303 /* Table of class bit maps for each POSIX class. Each class is formed from a
 304 base map, with an optional addition or removal of another map. Then, for some
 305 classes, there is some additional tweaking: for [:blank:] the vertical space
 306 characters are removed, and for [:alpha:] and [:alnum:] the underscore
 307 character is removed. The triples in the table consist of the base map offset,
 308 second map offset or -1 if no second map, and a non-negative value for map
 309 addition or a negative value for map subtraction (if there are two maps). The
 310 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
 311 remove vertical space characters, 2 => remove underscore. */
 312
 313 static const int posix_class_maps[] = {
 314   cbit_word,  cbit_digit, -2,             /* alpha */
 315   cbit_lower, -1,          0,             /* lower */
 316   cbit_upper, -1,          0,             /* upper */
 317   cbit_word,  -1,          2,             /* alnum - word without underscore */
 318   cbit_print, cbit_cntrl,  0,             /* ascii */
 319   cbit_space, -1,          1,             /* blank - a GNU extension */
 320   cbit_cntrl, -1,          0,             /* cntrl */
 321   cbit_digit, -1,          0,             /* digit */
 322   cbit_graph, -1,          0,             /* graph */
 323   cbit_print, -1,          0,             /* print */
 324   cbit_punct, -1,          0,             /* punct */
 325   cbit_space, -1,          0,             /* space */
 326   cbit_word,  -1,          0,             /* word - a Perl extension */
 327   cbit_xdigit,-1,          0              /* xdigit */
 328 };
 329
 330 /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
 331 Unicode property escapes. */
 332
 333 #ifdef SUPPORT_UCP
 334 static const pcre_uchar string_PNd[]  = {
 335   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
 336   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 337 static const pcre_uchar string_pNd[]  = {
 338   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
 339   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 340 static const pcre_uchar string_PXsp[] = {
 341   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
 342   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 343 static const pcre_uchar string_pXsp[] = {
 344   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
 345   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 346 static const pcre_uchar string_PXwd[] = {
 347   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
 348   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 349 static const pcre_uchar string_pXwd[] = {
 350   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
 351   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 352
 353 static const pcre_uchar *substitutes[] = {
 354   string_PNd,           /* \D */
 355   string_pNd,           /* \d */
 356   string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
 357   string_pXsp,          /* \s */   /* space and POSIX space are the same. */
 358   string_PXwd,          /* \W */
 359   string_pXwd           /* \w */
 360 };
 361
 362 /* The POSIX class substitutes must be in the order of the POSIX class names,
 363 defined above, and there are both positive and negative cases. NULL means no
 364 general substitute of a Unicode property escape (\p or \P). However, for some
 365 POSIX classes (e.g. graph, print, punct) a special property code is compiled
 366 directly. */
 367
 368 static const pcre_uchar string_pL[] =   {
 369   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
 370   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 371 static const pcre_uchar string_pLl[] =  {
 372   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
 373   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 374 static const pcre_uchar string_pLu[] =  {
 375   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
 376   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 377 static const pcre_uchar string_pXan[] = {
 378   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
 379   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 380 static const pcre_uchar string_h[] =    {
 381   CHAR_BACKSLASH, CHAR_h, '\0' };
 382 static const pcre_uchar string_pXps[] = {
 383   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
 384   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 385 static const pcre_uchar string_PL[] =   {
 386   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
 387   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 388 static const pcre_uchar string_PLl[] =  {
 389   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
 390   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 391 static const pcre_uchar string_PLu[] =  {
 392   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
 393   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 394 static const pcre_uchar string_PXan[] = {
 395   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
 396   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 397 static const pcre_uchar string_H[] =    {
 398   CHAR_BACKSLASH, CHAR_H, '\0' };
 399 static const pcre_uchar string_PXps[] = {
 400   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
 401   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 402
 403 static const pcre_uchar *posix_substitutes[] = {
 404   string_pL,            /* alpha */
 405   string_pLl,           /* lower */
 406   string_pLu,           /* upper */
 407   string_pXan,          /* alnum */
 408   NULL,                 /* ascii */
 409   string_h,             /* blank */
 410   NULL,                 /* cntrl */
 411   string_pNd,           /* digit */
 412   NULL,                 /* graph */
 413   NULL,                 /* print */
 414   NULL,                 /* punct */
 415   string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
 416   string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
 417   NULL,                 /* xdigit */
 418   /* Negated cases */
 419   string_PL,            /* ^alpha */
 420   string_PLl,           /* ^lower */
 421   string_PLu,           /* ^upper */
 422   string_PXan,          /* ^alnum */
 423   NULL,                 /* ^ascii */
 424   string_H,             /* ^blank */
 425   NULL,                 /* ^cntrl */
 426   string_PNd,           /* ^digit */
 427   NULL,                 /* ^graph */
 428   NULL,                 /* ^print */
 429   NULL,                 /* ^punct */
 430   string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
 431   string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
 432   NULL                  /* ^xdigit */
 433 };
 434 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
 435 #endif
 436
 /* STRING() turns its argument into a string literal exactly as it is written;
 XSTRING() allows the argument to be macro-expanded first and then turns the
 result into a string literal. */

 #define STRING(text)   #text
 #define XSTRING(value) STRING(value)
 439
 440 /* The texts of compile-time error messages. These are "char *" because they
 441 are passed to the outside world. Do not ever re-use any error number, because
 442 they are documented. Always add a new error instead. Messages marked DEAD below
 443 are no longer used. This used to be a table of strings, but in order to reduce
 444 the number of relocations needed when a shared library is loaded dynamically,
 445 it is now one long string. We cannot use a table of offsets, because the
 446 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
 447 simply count through to the one we want - this isn't a performance issue
 448 because these strings are used only when there is a compilation error.
 449
 450 Each substring ends with \0 to insert a null character. This includes the final
 451 substring, so that the whole string ends with \0\0, which can be detected when
 452 counting through. */
 453
 454 static const char error_texts[] =
 455   "no error\0"
 456   "\\ at end of pattern\0"
 457   "\\c at end of pattern\0"
 458   "unrecognized character follows \\\0"
 459   "numbers out of order in {} quantifier\0"
 460   /* 5 */
 461   "number too big in {} quantifier\0"
 462   "missing terminating ] for character class\0"
 463   "invalid escape sequence in character class\0"
 464   "range out of order in character class\0"
 465   "nothing to repeat\0"
 466   /* 10 */
 467   "internal error: invalid forward reference offset\0"
 468   "internal error: unexpected repeat\0"
 469   "unrecognized character after (? or (?-\0"
 470   "POSIX named classes are supported only within a class\0"
 471   "missing )\0"
 472   /* 15 */
 473   "reference to non-existent subpattern\0"
 474   "erroffset passed as NULL\0"
 475   "unknown option bit(s) set\0"
 476   "missing ) after comment\0"
 477   "parentheses nested too deeply\0"  /** DEAD **/
 478   /* 20 */
 479   "regular expression is too large\0"
 480   "failed to get memory\0"
 481   "unmatched parentheses\0"
 482   "internal error: code overflow\0"
 483   "unrecognized character after (?<\0"
 484   /* 25 */
 485   "lookbehind assertion is not fixed length\0"
 486   "malformed number or name after (?(\0"
 487   "conditional group contains more than two branches\0"
 488   "assertion expected after (?(\0"
 489   "(?R or (?[+-]digits must be followed by )\0"
 490   /* 30 */
 491   "unknown POSIX class name\0"
 492   "POSIX collating elements are not supported\0"
 493   "this version of PCRE is compiled without UTF support\0"
 494   "spare error\0"  /** DEAD **/
 495   "character value in \\x{} or \\o{} is too large\0"
 496   /* 35 */
 497   "invalid condition (?(0)\0"
 498   "\\C not allowed in lookbehind assertion\0"
 499   "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
 500   "number after (?C is > 255\0"
 501   "closing ) for (?C expected\0"
 502   /* 40 */
 503   "recursive call could loop indefinitely\0"
 504   "unrecognized character after (?P\0"
 505   "syntax error in subpattern name (missing terminator)\0"
 506   "two named subpatterns have the same name\0"
 507   "invalid UTF-8 string\0"
 508   /* 45 */
 509   "support for \\P, \\p, and \\X has not been compiled\0"
 510   "malformed \\P or \\p sequence\0"
 511   "unknown property name after \\P or \\p\0"
 512   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
 513   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
 514   /* 50 */
 515   "repeated subpattern is too long\0"    /** DEAD **/
 516   "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
 517   "internal error: overran compiling workspace\0"
 518   "internal error: previously-checked referenced subpattern not found\0"
 519   "DEFINE group contains more than one branch\0"
 520   /* 55 */
 521   "repeating a DEFINE group is not allowed\0"  /** DEAD **/
 522   "inconsistent NEWLINE options\0"
 523   "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
 524   "a numbered reference must not be zero\0"
 525   "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
 526   /* 60 */
 527   "(*VERB) not recognized or malformed\0"
 528   "number is too big\0"
 529   "subpattern name expected\0"
 530   "digit expected after (?+\0"
 531   "] is an invalid data character in JavaScript compatibility mode\0"
 532   /* 65 */
 533   "different names for subpatterns of the same number are not allowed\0"
 534   "(*MARK) must have an argument\0"
 535   "this version of PCRE is not compiled with Unicode property support\0"
 536 #ifndef EBCDIC
 537   "\\c must be followed by an ASCII character\0"
 538 #else
 539   "\\c must be followed by a letter or one of [\\]^_?\0"
 540 #endif
 541   "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
 542   /* 70 */
 543   "internal error: unknown opcode in find_fixedlength()\0"
 544   "\\N is not supported in a class\0"
 545   "too many forward references\0"
 546   "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
 547   "invalid UTF-16 string\0"
 548   /* 75 */
 549   "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
 550   "character value in \\u.... sequence is too large\0"
 551   "invalid UTF-32 string\0"
 552   "setting UTF is disabled by the application\0"
 553   "non-hex character in \\x{} (closing brace missing?)\0"
 554   /* 80 */
 555   "non-octal character in \\o{} (closing brace missing?)\0"
 556   "missing opening brace after \\o\0"
 557   "parentheses are too deeply nested\0"
 558   "invalid range in character class\0"
 559   "group name must start with a non-digit\0"
 560   /* 85 */
 561   "parentheses are too deeply nested (stack check)\0"
 562   "digits missing in \\x{} or \\o{}\0"
 563   "regular expression is too complicated\0"
 564   ;
 565
 566 /* Table to identify digits and hex digits. This is used when compiling
 567 patterns. Note that the tables in chartables are dependent on the locale, and
 568 may mark arbitrary characters as digits - but the PCRE compiling code expects
 569 to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
 570 a private table here. It costs 256 bytes, but it is a lot faster than doing
 571 character value tests (at least in some simple cases I timed), and in some
 572 applications one wants PCRE to compile efficiently as well as match
 573 efficiently.
 574
 575 For convenience, we use the same bit definitions as in chartables:
 576
 577   0x04   decimal digit
 578   0x08   hexadecimal digit
 579
 580 Then we can use ctype_digit and ctype_xdigit in the code. */
 581
 /* Decimal digits are tested with a plain range comparison rather than a
 memory read: that is much faster and the resulting code is simpler (the
 compiler turns it into a subtraction and an unsigned comparison). The argument
 may be evaluated twice, so it must not have side effects. */

 #define IS_DIGIT(x) (CHAR_0 <= (x) && (x) <= CHAR_9)
 587
 588 #ifndef EBCDIC
 589
 590 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
 591 UTF-8 mode. */
 592
 593 static const pcre_uint8 digitab[] =
 594   {
 595   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
 596   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
 597   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
 598   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
 599   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
 600   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
 601   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
 602   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
 603   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
 604   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
 605   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
 606   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
 607   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
 608   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
 609   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
 610   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
 611   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
 612   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
 613   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
 614   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
 615   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
 616   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
 617   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
 618   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
 619   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
 620   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
 621   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
 622   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
 623   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
 624   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
 625   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
 626   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
 627
 628 #else
 629
 630 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
 631
 632 static const pcre_uint8 digitab[] =
 633   {
 634   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
 635   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
 636   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
 637   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
 638   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
 639   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
 640   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
 641   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
 642   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
 643   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
 644   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
 645   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
 646   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
 647   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
 648   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
 649   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
 650   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
 651   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
 652   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
 653   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
 654   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
 655   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
 656   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
 657   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
 658   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
 659   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
 660   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
 661   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
 662   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
 663   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
 664   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
 665   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
 666
 667 static const pcre_uint8 ebcdic_chartab[] = { /* partial duplicate of chartable, indexed by EBCDIC character code; 8 entries per row, row comments give the index range or the character at either end */
 668   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
 669   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
 670   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
 671   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
 672   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
 673   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
 674   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
 675   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
 676   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
 677   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
 678   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
 679   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
 680   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
 681   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
 682   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
 683   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
 684   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
 685   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
 686   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
 687   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
 688   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
 689   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
 690   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
 691   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
 692   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
 693   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
 694   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
 695   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
 696   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
 697   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
 698   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
 699   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
 700 #endif
 701
 702
 703 /* This table is used to check whether auto-possessification is possible
 704 between adjacent character-type opcodes. The left-hand (repeated) opcode is
 705 used to select the row, and the right-hand opcode is used to select the column.
 706 A value of 1 means that auto-possessification is OK. For example, the second
 707 value in the first row means that \D+\d can be turned into \D++\d.
 708
 709 The Unicode property types (\P and \p) have to be present to fill out the table
 710 because of what their opcode values are, but the table values should always be
 711 zero because property types are handled separately in the code. The last four
 712 columns apply to items that cannot be repeated, so there is no need to have
 713 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
 714 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
 715
 716 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)   /* row count: one row per left-hand (repeated) opcode */
 717 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)  /* column count: one column per right-hand opcode */
 718
 719 static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {  /* [left (repeated) opcode][right opcode]; 1 = auto-possessification is OK */
 720 /* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
 721   { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
 722   { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
 723   { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
 724   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
 725   { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
 726   { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
 727   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
 728   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
 729   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
 730   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
 731   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
 732   { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
 733   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
 734   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
 735   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
 736   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
 737   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
 738 };
 739
 740
 741 /* This table is used to check whether auto-possessification is possible
 742 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
 743 left-hand (repeated) opcode is used to select the row, and the right-hand
 744 opcode is used to select the column. The values are as follows:
 745
 746   0   Always return FALSE (never auto-possessify)
 747   1   Character groups are distinct (possessify if both are OP_PROP)
 748   2   Check character categories in the same group (general or particular)
 749   3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
 750
 751   4   Check left general category vs right particular category
 752   5   Check right general category vs left particular category
 753
 754   6   Left alphanum vs right general category
 755   7   Left space vs right general category
 756   8   Left word vs right general category
 757
 758   9   Right alphanum vs left general category
 759  10   Right space vs left general category
 760  11   Right word vs left general category
 761
 762  12   Left alphanum vs right particular category
 763  13   Left space vs right particular category
 764  14   Left word vs right particular category
 765
 766  15   Right alphanum vs left particular category
 767  16   Right space vs left particular category
 768  17   Right word vs left particular category
 769 */
 770
 771 static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {  /* [left (repeated) property type][right property type]; each entry is one of the action codes 0-17 */
 772 /* ANY LAMP GC  PC  SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
 773   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_ANY */
 774   { 0,  3,  0,  0,  0,    3,    1,      1,   0,    0,   0 },  /* PT_LAMP */
 775   { 0,  0,  2,  4,  0,    9,   10,     10,  11,    0,   0 },  /* PT_GC */
 776   { 0,  0,  5,  2,  0,   15,   16,     16,  17,    0,   0 },  /* PT_PC */
 777   { 0,  0,  0,  0,  2,    0,    0,      0,   0,    0,   0 },  /* PT_SC */
 778   { 0,  3,  6, 12,  0,    3,    1,      1,   0,    0,   0 },  /* PT_ALNUM */
 779   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_SPACE */
 780   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_PXSPACE */
 781   { 0,  0,  8, 14,  0,    0,    1,      1,   3,    0,   0 },  /* PT_WORD */
 782   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_CLIST */
 783   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   3 }   /* PT_UCNC */
 784 };
 785
 786 /* This table is used to check whether auto-possessification is possible
 787 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
 788 specifies a general category and the other specifies a particular category. The
 789 row is selected by the general category and the column by the particular
 790 category. The value is 1 if the particular category is not part of the general
 791 category. */
 792
 793 static const pcre_uint8 catposstab[7][30] = {  /* [general category][particular category]; 1 = the particular category is not part of the general one */
 794 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
 795   { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
 796   { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
 797   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
 798   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
 799   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
 800   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
 801   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
 802 };
 803
 804 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
 805 a general or particular category. The properties in each row are those
 806 that apply to the character set in question. Duplication means that a little
 807 unnecessary work is done when checking, but this keeps things much simpler
 808 because they can all use the same code. For more details see the comment where
 809 this table is used.
 810
 811 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
 812 "space", but from Perl 5.18 it's included, so both categories are treated the
 813 same here. */
 814
 815 static const pcre_uint8 posspropstab[3][4] = {  /* one row per character set; each row holds four ucp_ category codes */
 816   { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
 817   { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
 818   { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
 819 };
 820
 821 /* This table is used when converting repeating opcodes into possessified
 822 versions as a result of an explicit possessive quantifier such as ++. A zero
 823 value means there is no possessified version - in those cases the item in
 824 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
 825 because all relevant opcodes are less than that. */
 826
 827 static const pcre_uint8 opcode_possessify[] = {  /* one entry per opcode, in opcode order; value is the possessive opcode, or 0 if there is none */
 828   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
 829   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
 830
 831   0,                       /* NOTI */
 832   OP_POSSTAR, 0,           /* STAR, MINSTAR */
 833   OP_POSPLUS, 0,           /* PLUS, MINPLUS */
 834   OP_POSQUERY, 0,          /* QUERY, MINQUERY */
 835   OP_POSUPTO, 0,           /* UPTO, MINUPTO */
 836   0,                       /* EXACT */
 837   0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
 838
 839   OP_POSSTARI, 0,          /* STARI, MINSTARI */
 840   OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
 841   OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
 842   OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
 843   0,                       /* EXACTI */
 844   0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
 845
 846   OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
 847   OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
 848   OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
 849   OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
 850   0,                       /* NOTEXACT */
 851   0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
 852
 853   OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
 854   OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
 855   OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
 856   OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
 857   0,                       /* NOTEXACTI */
 858   0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
 859
 860   OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
 861   OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
 862   OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
 863   OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
 864   0,                       /* TYPEEXACT */
 865   0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
 866
 867   OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
 868   OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
 869   OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
 870   OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
 871   0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
 872
 873   0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
 874   0, 0,                    /* REF, REFI */
 875   0, 0,                    /* DNREF, DNREFI */
 876   0, 0                     /* RECURSE, CALLOUT */
 877 };
 878
 879
 880
 881 /*************************************************
 882 *            Find an error text                  *
 883 *************************************************/
 884
 885 /* The error texts are now all in one long string, to save on relocations. As
 886 some of the text is of unknown length, we can't use a table of offsets.
 887 Instead, just count through the strings. This is not a performance issue
 888 because it happens only when there has been a compilation error.
 889
 890 Argument:   the error number
 891 Returns:    pointer to the error string
 892 */
 893
 894 static const char *
 895 find_error_text(int n)
 896 {
 897 const char *s = error_texts;
 898 for (; n > 0; n--)
 899   {
 900   while (*s++ != CHAR_NULL) {};
 901   if (*s == CHAR_NULL) return "Error text not found (please report)";
 902   }
 903 return s;
 904 }
 905
 906
 907
 908 /*************************************************
 909 *           Expand the workspace                 *
 910 *************************************************/
 911
 912 /* This function is called during the second compiling phase, if the number of
 913 forward references fills the existing workspace, which is originally a block on
 914 the stack. A larger block is obtained from malloc() unless the ultimate limit
 915 has been reached or the increase will be rather small.
 916
 917 Argument: pointer to the compile data block
 918 Returns:  0 if all went well, else an error number
 919 */
 920
 921 static int
 922 expand_workspace(compile_data *cd)
 923 {
 924 pcre_uchar *newspace;
 925 int newsize = cd->workspace_size * 2;
 926
 927 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
 928 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
 929     newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
 930  return ERR72;
 931
 932 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
 933 if (newspace == NULL) return ERR21;
 934 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
 935 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
 936 if (cd->workspace_size > COMPILE_WORK_SIZE)
 937   (PUBL(free))((void *)cd->start_workspace);
 938 cd->start_workspace = newspace;
 939 cd->workspace_size = newsize;
 940 return 0;
 941 }
 942
 943
 944
 945 /*************************************************
 946 *            Check for counted repeat            *
 947 *************************************************/
 948
 949 /* This function is called when a '{' is encountered in a place where it might
 950 start a quantifier. It looks ahead to see if it really is a quantifier or not.
 951 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
 952 where the ddds are digits.
 953
 954 Arguments:
 955   p         pointer to the first char after '{'
 956
 957 Returns:    TRUE or FALSE
 958 */
 959
 960 static BOOL
 961 is_counted_repeat(const pcre_uchar *p)
 962 {
 963 if (!IS_DIGIT(*p)) return FALSE;
 964 p++;
 965 while (IS_DIGIT(*p)) p++;
 966 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
 967
 968 if (*p++ != CHAR_COMMA) return FALSE;
 969 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
 970
 971 if (!IS_DIGIT(*p)) return FALSE;
 972 p++;
 973 while (IS_DIGIT(*p)) p++;
 974
 975 return (*p == CHAR_RIGHT_CURLY_BRACKET);
 976 }
 977
 978
 979
 980 /*************************************************
 981 *            Handle escapes                      *
 982 *************************************************/
 983
 984 /* This function is called when a \ has been encountered. It either returns a
 985 positive value for a simple escape such as \n, or 0 for a data character which
 986 will be placed in chptr. A backreference to group n is returned as negative n.
 987 When UTF-8 is enabled, a positive value greater than 255 may be returned in
 988 chptr. On entry, ptr is pointing at the \. On exit, it is on the final
 989 character of the escape sequence.
 990
 991 Arguments:
 992   ptrptr         points to the pattern position pointer
 993   chptr          points to a returned data character
 994   errorcodeptr   points to the errorcode variable
 995   bracount       number of previous extracting brackets
 996   options        the options bits
 997   isclass        TRUE if inside a character class
 998
 999 Returns:         zero => a data character
1000                  positive => a special escape sequence
1001                  negative => a back reference
1002                  on error, errorcodeptr is set
1003 */
1004
1005 static int
1006 check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
1007   int bracount, int options, BOOL isclass)
1008 {
1009 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
1010 BOOL utf = (options & PCRE_UTF8) != 0;
1011 const pcre_uchar *ptr = *ptrptr + 1;
1012 pcre_uint32 c;
1013 int escape = 0;
1014 int i;
1015
1016 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
1017 ptr--;                            /* Set pointer back to the last byte */
1018
1019 /* If backslash is at the end of the pattern, it's an error. */
1020
1021 if (c == CHAR_NULL) *errorcodeptr = ERR1;
1022
1023 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
1024 in a table. A non-zero result is something that can be returned immediately.
1025 Otherwise further processing may be required. */
1026
1027 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1028 /* Not alphanumeric */
1029 else if (c < CHAR_0 || c > CHAR_z) {}
1030 else if ((i = escapes[c - CHAR_0]) != 0)
1031   { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1032
1033 #else           /* EBCDIC coding */
1034 /* Not alphanumeric */
1035 else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
1036 else if ((i = escapes[c - 0x48]) != 0)  { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1037 #endif
1038
1039 /* Escapes that need further processing, or are illegal. */
1040
1041 else
1042   {
1043   const pcre_uchar *oldptr;
1044   BOOL braced, negated, overflow;
1045   int s;
1046
1047   switch (c)
1048     {
1049     /* A number of Perl escapes are not handled by PCRE. We give an explicit
1050     error. */
1051
1052     case CHAR_l:
1053     case CHAR_L:
1054     *errorcodeptr = ERR37;
1055     break;
1056
1057     case CHAR_u:
1058     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1059       {
1060       /* In JavaScript, \u must be followed by four hexadecimal numbers.
1061       Otherwise it is a lowercase u letter. */
1062       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1063         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
1064         && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
1065         && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
1066         {
1067         c = 0;
1068         for (i = 0; i < 4; ++i)
1069           {
1070           register pcre_uint32 cc = *(++ptr);
1071 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1072           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1073           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1074 #else           /* EBCDIC coding */
1075           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1076           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1077 #endif
1078           }
1079
1080 #if defined COMPILE_PCRE8
1081         if (c > (utf ? 0x10ffffU : 0xffU))
1082 #elif defined COMPILE_PCRE16
1083         if (c > (utf ? 0x10ffffU : 0xffffU))
1084 #elif defined COMPILE_PCRE32
1085         if (utf && c > 0x10ffffU)
1086 #endif
1087           {
1088           *errorcodeptr = ERR76;
1089           }
1090         else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1091         }
1092       }
1093     else
1094       *errorcodeptr = ERR37;
1095     break;
1096
1097     case CHAR_U:
1098     /* In JavaScript, \U is an uppercase U letter. */
1099     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
1100     break;
1101
1102     /* In a character class, \g is just a literal "g". Outside a character
1103     class, \g must be followed by one of a number of specific things:
1104
1105     (1) A number, either plain or braced. If positive, it is an absolute
1106     backreference. If negative, it is a relative backreference. This is a Perl
1107     5.10 feature.
1108
1109     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1110     is part of Perl's movement towards a unified syntax for back references. As
1111     this is synonymous with \k{name}, we fudge it up by pretending it really
1112     was \k.
1113
1114     (3) For Oniguruma compatibility we also support \g followed by a name or a
1115     number either in angle brackets or in single quotes. However, these are
1116     (possibly recursive) subroutine calls, _not_ backreferences. Just return
1117     the ESC_g code (cf \k). */
1118
1119     case CHAR_g:
1120     if (isclass) break;
1121     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
1122       {
1123       escape = ESC_g;
1124       break;
1125       }
1126
1127     /* Handle the Perl-compatible cases */
1128
1129     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1130       {
1131       const pcre_uchar *p;
1132       for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1133         if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1134       if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1135         {
1136         escape = ESC_k;
1137         break;
1138         }
1139       braced = TRUE;
1140       ptr++;
1141       }
1142     else braced = FALSE;
1143
1144     if (ptr[1] == CHAR_MINUS)
1145       {
1146       negated = TRUE;
1147       ptr++;
1148       }
1149     else negated = FALSE;
1150
1151     /* The integer range is limited by the machine's int representation. */
1152     s = 0;
1153     overflow = FALSE;
1154     while (IS_DIGIT(ptr[1]))
1155       {
1156       if (s > INT_MAX / 10 - 1) /* Integer overflow */
1157         {
1158         overflow = TRUE;
1159         break;
1160         }
1161       s = s * 10 + (int)(*(++ptr) - CHAR_0);
1162       }
1163     if (overflow) /* Integer overflow */
1164       {
1165       while (IS_DIGIT(ptr[1]))
1166         ptr++;
1167       *errorcodeptr = ERR61;
1168       break;
1169       }
1170
1171     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
1172       {
1173       *errorcodeptr = ERR57;
1174       break;
1175       }
1176
1177     if (s == 0)
1178       {
1179       *errorcodeptr = ERR58;
1180       break;
1181       }
1182
1183     if (negated)
1184       {
1185       if (s > bracount)
1186         {
1187         *errorcodeptr = ERR15;
1188         break;
1189         }
1190       s = bracount - (s - 1);
1191       }
1192
1193     escape = -s;
1194     break;
1195
1196     /* The handling of escape sequences consisting of a string of digits
1197     starting with one that is not zero is not straightforward. Perl has changed
1198     over the years. Nowadays \g{} for backreferences and \o{} for octal are
1199     recommended to avoid the ambiguities in the old syntax.
1200
1201     Outside a character class, the digits are read as a decimal number. If the
1202     number is less than 8 (used to be 10), or if there are that many previous
1203     extracting left brackets, then it is a back reference. Otherwise, up to
1204     three octal digits are read to form an escaped byte. Thus \123 is likely to
1205     be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1206     the octal value is greater than 377, the least significant 8 bits are
1207     taken. \8 and \9 are treated as the literal characters 8 and 9.
1208
1209     Inside a character class, \ followed by a digit is always either a literal
1210     8 or 9 or an octal number. */
1211
1212     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1213     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1214
1215     if (!isclass)
1216       {
1217       oldptr = ptr;
1218       /* The integer range is limited by the machine's int representation. */
1219       s = (int)(c -CHAR_0);
1220       overflow = FALSE;
1221       while (IS_DIGIT(ptr[1]))
1222         {
1223         if (s > INT_MAX / 10 - 1) /* Integer overflow */
1224           {
1225           overflow = TRUE;
1226           break;
1227           }
1228         s = s * 10 + (int)(*(++ptr) - CHAR_0);
1229         }
1230       if (overflow) /* Integer overflow */
1231         {
1232         while (IS_DIGIT(ptr[1]))
1233           ptr++;
1234         *errorcodeptr = ERR61;
1235         break;
1236         }
1237       if (s < 8 || s <= bracount)  /* Check for back reference */
1238         {
1239         escape = -s;
1240         break;
1241         }
1242       ptr = oldptr;      /* Put the pointer back and fall through */
1243       }
1244
1245     /* Handle a digit following \ when the number is not a back reference. If
1246     the first digit is 8 or 9, Perl used to generate a binary zero byte and
1247     then treat the digit as a following literal. At least by Perl 5.18 this
1248     changed so as not to insert the binary zero. */
1249
1250     if ((c = *ptr) >= CHAR_8) break;
1251
1252     /* Fall through with a digit less than 8 */
1253
1254     /* \0 always starts an octal number, but we may drop through to here with a
1255     larger first octal digit. The original code used just to take the least
1256     significant 8 bits of octal numbers (I think this is what early Perls used
1257     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1258     but no more than 3 octal digits. */
1259
1260     case CHAR_0:
1261     c -= CHAR_0;
1262     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1263         c = c * 8 + *(++ptr) - CHAR_0;
1264 #ifdef COMPILE_PCRE8
1265     if (!utf && c > 0xff) *errorcodeptr = ERR51;
1266 #endif
1267     break;
1268
1269     /* \o is a relatively new Perl feature, supporting a more general way of
1270     specifying character codes in octal. The only supported form is \o{ddd}. */
1271
1272     case CHAR_o:
1273     if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1274     if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
1275       {
1276       ptr += 2;
1277       c = 0;
1278       overflow = FALSE;
1279       while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1280         {
1281         register pcre_uint32 cc = *ptr++;
1282         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1283 #ifdef COMPILE_PCRE32
1284         if (c >= 0x20000000l) { overflow = TRUE; break; }
1285 #endif
1286         c = (c << 3) + cc - CHAR_0 ;
1287 #if defined COMPILE_PCRE8
1288         if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1289 #elif defined COMPILE_PCRE16
1290         if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1291 #elif defined COMPILE_PCRE32
1292         if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1293 #endif
1294         }
1295       if (overflow)
1296         {
1297         while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1298         *errorcodeptr = ERR34;
1299         }
1300       else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1301         {
1302         if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1303         }
1304       else *errorcodeptr = ERR80;
1305       }
1306     break;
1307
1308     /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1309     numbers. Otherwise it is a lowercase x letter. */
1310
1311     case CHAR_x:
1312     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1313       {
1314       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1315         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1316         {
1317         c = 0;
1318         for (i = 0; i < 2; ++i)
1319           {
1320           register pcre_uint32 cc = *(++ptr);
1321 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1322           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1323           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1324 #else           /* EBCDIC coding */
1325           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1326           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1327 #endif
1328           }
1329         }
1330       }    /* End JavaScript handling */
1331
1332     /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1333     greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1334     digits. If not, { used to be treated as a data character. However, Perl
1335     seems to read hex digits up to the first non-such, and ignore the rest, so
1336     that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1337     now gives an error. */
1338
1339     else
1340       {
1341       if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1342         {
1343         ptr += 2;
1344         if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1345           {
1346           *errorcodeptr = ERR86;
1347           break;
1348           }
1349         c = 0;
1350         overflow = FALSE;
1351         while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1352           {
1353           register pcre_uint32 cc = *ptr++;
1354           if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1355
1356 #ifdef COMPILE_PCRE32
1357           if (c >= 0x10000000l) { overflow = TRUE; break; }
1358 #endif
1359
1360 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1361           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1362           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1363 #else           /* EBCDIC coding */
1364           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1365           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1366 #endif
1367
1368 #if defined COMPILE_PCRE8
1369           if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1370 #elif defined COMPILE_PCRE16
1371           if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1372 #elif defined COMPILE_PCRE32
1373           if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1374 #endif
1375           }
1376
1377         if (overflow)
1378           {
1379           while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1380           *errorcodeptr = ERR34;
1381           }
1382
1383         else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1384           {
1385           if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1386           }
1387
1388         /* If the sequence of hex digits does not end with '}', give an error.
1389         We used just to recognize this construct and fall through to the normal
1390         \x handling, but nowadays Perl gives an error, which seems much more
1391         sensible, so we do too. */
1392
1393         else *errorcodeptr = ERR79;
1394         }   /* End of \x{} processing */
1395
1396       /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1397
1398       else
1399         {
1400         c = 0;
1401         while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1402           {
1403           pcre_uint32 cc;                          /* Some compilers don't like */
1404           cc = *(++ptr);                           /* ++ in initializers */
1405 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1406           if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
1407           c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1408 #else           /* EBCDIC coding */
1409           if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
1410           c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1411 #endif
1412           }
1413         }     /* End of \xdd handling */
1414       }       /* End of Perl-style \x handling */
1415     break;
1416
1417     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1418     An error is given if the byte following \c is not an ASCII character. This
1419     coding is ASCII-specific, but then the whole concept of \cx is
1420     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1421
1422     case CHAR_c:
1423     c = *(++ptr);
1424     if (c == CHAR_NULL)
1425       {
1426       *errorcodeptr = ERR2;
1427       break;
1428       }
1429 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
1430     if (c > 127)  /* Excludes all non-ASCII in either mode */
1431       {
1432       *errorcodeptr = ERR68;
1433       break;
1434       }
1435     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1436     c ^= 0x40;
1437 #else             /* EBCDIC coding */
1438     if (c >= CHAR_a && c <= CHAR_z) c += 64;
1439     if (c == CHAR_QUESTION_MARK)
1440       c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
1441     else
1442       {
1443       for (i = 0; i < 32; i++)
1444         {
1445         if (c == ebcdic_escape_c[i]) break;
1446         }
1447       if (i < 32) c = i; else *errorcodeptr = ERR68;
1448       }
1449 #endif
1450     break;
1451
1452     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1453     other alphanumeric following \ is an error if PCRE_EXTRA was set;
1454     otherwise, for Perl compatibility, it is a literal. This code looks a bit
1455     odd, but there used to be some cases other than the default, and there may
1456     be again in future, so I haven't "optimized" it. */
1457
1458     default:
1459     if ((options & PCRE_EXTRA) != 0) switch(c)
1460       {
1461       default:
1462       *errorcodeptr = ERR3;
1463       break;
1464       }
1465     break;
1466     }
1467   }
1468
1469 /* Perl supports \N{name} for character names, as well as plain \N for "not
1470 newline". PCRE does not support \N{name}. However, it does support
1471 quantification such as \N{2,3}. */
1472
1473 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1474      !is_counted_repeat(ptr+2))
1475   *errorcodeptr = ERR37;
1476
1477 /* If PCRE_UCP is set, we change the values for \d etc. */
1478
1479 if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1480   escape += (ESC_DU - ESC_D);
1481
1482 /* Set the pointer to the final character before returning. */
1483
1484 *ptrptr = ptr;
1485 *chptr = c;
1486 return escape;
1487 }
1488
1489
1490
1491 #ifdef SUPPORT_UCP
1492 /*************************************************
1493 *               Handle \P and \p                 *
1494 *************************************************/
1495
1496 /* This function is called after \P or \p has been encountered, provided that
1497 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1498 pointing at the P or p. On exit, it is pointing at the final character of the
1499 escape sequence.
1500
1501 Argument:
1502   ptrptr         points to the pattern position pointer
1503   negptr         points to a boolean that is set TRUE for negation else FALSE
1504   ptypeptr       points to an unsigned int that is set to the type value
1505   pdataptr       points to an unsigned int that is set to the detailed property value
1506   errorcodeptr   points to the error code variable
1507
1508 Returns:         TRUE if the type value was found, or FALSE for an invalid type
1509 */
1510
1511 static BOOL
1512 get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
1513   unsigned int *pdataptr, int *errorcodeptr)
1514 {
1515 pcre_uchar c;
1516 int i, bot, top;
1517 const pcre_uchar *ptr = *ptrptr;
1518 pcre_uchar name[32];
1519
1520 c = *(++ptr);
1521 if (c == CHAR_NULL) goto ERROR_RETURN;
1522
1523 *negptr = FALSE;
1524
1525 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1526 negation. */
1527
1528 if (c == CHAR_LEFT_CURLY_BRACKET)
1529   {
1530   if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1531     {
1532     *negptr = TRUE;
1533     ptr++;
1534     }
1535   for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1536     {
1537     c = *(++ptr);
1538     if (c == CHAR_NULL) goto ERROR_RETURN;
1539     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1540     name[i] = c;
1541     }
1542   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1543   name[i] = 0;
1544   }
1545
1546 /* Otherwise there is just one following character */
1547
1548 else
1549   {
1550   name[0] = c;
1551   name[1] = 0;
1552   }
1553
1554 *ptrptr = ptr;
1555
1556 /* Search for a recognized property name using binary chop */
1557
1558 bot = 0;
1559 top = PRIV(utt_size);
1560
1561 while (bot < top)
1562   {
1563   int r;
1564   i = (bot + top) >> 1;
1565   r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1566   if (r == 0)
1567     {
1568     *ptypeptr = PRIV(utt)[i].type;
1569     *pdataptr = PRIV(utt)[i].value;
1570     return TRUE;
1571     }
1572   if (r > 0) bot = i + 1; else top = i;
1573   }
1574
1575 *errorcodeptr = ERR47;
1576 *ptrptr = ptr;
1577 return FALSE;
1578
1579 ERROR_RETURN:
1580 *errorcodeptr = ERR46;
1581 *ptrptr = ptr;
1582 return FALSE;
1583 }
1584 #endif
1585
1586
1587
1588 /*************************************************
1589 *         Read repeat counts                     *
1590 *************************************************/
1591
1592 /* Read an item of the form {n,m} and return the values. This is called only
1593 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1594 so the syntax is guaranteed to be correct, but we need to check the values.
1595
1596 Arguments:
1597   p              pointer to first char after '{'
1598   minp           pointer to int for min
1599   maxp           pointer to int for max
1600                  returned as -1 if no max
1601   errorcodeptr   points to error code variable
1602
1603 Returns:         pointer to '}' on success;
1604                  current ptr on error, with errorcodeptr set non-zero
1605 */
1606
1607 static const pcre_uchar *
1608 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1609 {
1610 int min = 0;
1611 int max = -1;
1612
1613 while (IS_DIGIT(*p))
1614   {
1615   min = min * 10 + (int)(*p++ - CHAR_0);
1616   if (min > 65535)
1617     {
1618     *errorcodeptr = ERR5;
1619     return p;
1620     }
1621   }
1622
1623 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1624   {
1625   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1626     {
1627     max = 0;
1628     while(IS_DIGIT(*p))
1629       {
1630       max = max * 10 + (int)(*p++ - CHAR_0);
1631       if (max > 65535)
1632         {
1633         *errorcodeptr = ERR5;
1634         return p;
1635         }
1636       }
1637     if (max < min)
1638       {
1639       *errorcodeptr = ERR4;
1640       return p;
1641       }
1642     }
1643   }
1644
1645 *minp = min;
1646 *maxp = max;
1647 return p;
1648 }
1649
1650
1651
1652 /*************************************************
1653 *      Find first significant op code            *
1654 *************************************************/
1655
1656 /* This is called by several functions that scan a compiled expression looking
1657 for a fixed first character, or an anchoring op code etc. It skips over things
1658 that do not influence this. For some calls, it makes sense to skip negative
1659 forward and all backward assertions, and also the \b assertion; for others it
1660 does not.
1661
1662 Arguments:
1663   code         pointer to the start of the group
1664   skipassert   TRUE if certain assertions are to be skipped
1665
1666 Returns:       pointer to the first significant opcode
1667 */
1668
1669 static const pcre_uchar*
1670 first_significant_code(const pcre_uchar *code, BOOL skipassert)
1671 {
1672 for (;;)
1673   {
1674   switch ((int)*code)
1675     {
1676     case OP_ASSERT_NOT:
1677     case OP_ASSERTBACK:
1678     case OP_ASSERTBACK_NOT:
1679     if (!skipassert) return code;
1680     do code += GET(code, 1); while (*code == OP_ALT);
1681     code += PRIV(OP_lengths)[*code];
1682     break;
1683
1684     case OP_WORD_BOUNDARY:
1685     case OP_NOT_WORD_BOUNDARY:
1686     if (!skipassert) return code;
1687     /* Fall through */
1688
1689     case OP_CALLOUT:
1690     case OP_CREF:
1691     case OP_DNCREF:
1692     case OP_RREF:
1693     case OP_DNRREF:
1694     case OP_DEF:
1695     code += PRIV(OP_lengths)[*code];
1696     break;
1697
1698     default:
1699     return code;
1700     }
1701   }
1702 /* Control never reaches here */
1703 }
1704
1705
1706
1707 /*************************************************
1708 *        Find the fixed length of a branch       *
1709 *************************************************/
1710
1711 /* Scan a branch and compute the fixed length of subject that will match it,
1712 if the length is fixed. This is needed for dealing with backward assertions.
1713 In UTF8 mode, the result is in characters rather than bytes. The branch is
1714 temporarily terminated with OP_END when this function is called.
1715
1716 This function is called when a backward assertion is encountered, so that if it
1717 fails, the error message can point to the correct place in the pattern.
1718 However, we cannot do this when the assertion contains subroutine calls,
1719 because they can be forward references. We solve this by remembering this case
1720 and doing the check at the end; a flag specifies which mode we are running in.
1721
1722 Arguments:
1723   code     points to the start of the pattern (the bracket)
1724   utf      TRUE in UTF-8 / UTF-16 / UTF-32 mode
1725   atend    TRUE if called when the pattern is complete
1726   cd       the "compile data" structure
1727   recurses    chain of recurse_check to catch mutual recursion
1728
1729 Returns:   the fixed length,
1730              or -1 if there is no fixed length,
1731              or -2 if \C was encountered (in UTF-8 mode only)
1732              or -3 if an OP_RECURSE item was encountered and atend is FALSE
1733              or -4 if an unknown opcode was encountered (internal error)
1734 */
1735
1736 static int
1737 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd,
1738   recurse_check *recurses)
1739 {
1740 int length = -1;
1741 recurse_check this_recurse;
1742 register int branchlength = 0;
1743 register pcre_uchar *cc = code + 1 + LINK_SIZE;
1744
1745 /* Scan along the opcodes for this branch. If we get to the end of the
1746 branch, check the length against that of the other branches. */
1747
1748 for (;;)
1749   {
1750   int d;
1751   pcre_uchar *ce, *cs;
1752   register pcre_uchar op = *cc;
1753
1754   switch (op)
1755     {
1756     /* We only need to continue for OP_CBRA (normal capturing bracket) and
1757     OP_BRA (normal non-capturing bracket) because the other variants of these
1758     opcodes are all concerned with unlimited repeated groups, which of course
1759     are not of fixed length. */
1760
1761     case OP_CBRA:
1762     case OP_BRA:
1763     case OP_ONCE:
1764     case OP_ONCE_NC:
1765     case OP_COND:
1766     d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd,
1767       recurses);
1768     if (d < 0) return d;
1769     branchlength += d;
1770     do cc += GET(cc, 1); while (*cc == OP_ALT);
1771     cc += 1 + LINK_SIZE;
1772     break;
1773
1774     /* Reached end of a branch; if it's a ket it is the end of a nested call.
1775     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1776     an ALT. If it is END it's the end of the outer call. All can be handled by
1777     the same code. Note that we must not include the OP_KETRxxx opcodes here,
1778     because they all imply an unlimited repeat. */
1779
1780     case OP_ALT:
1781     case OP_KET:
1782     case OP_END:
1783     case OP_ACCEPT:
1784     case OP_ASSERT_ACCEPT:
1785     if (length < 0) length = branchlength;
1786       else if (length != branchlength) return -1;
1787     if (*cc != OP_ALT) return length;
1788     cc += 1 + LINK_SIZE;
1789     branchlength = 0;
1790     break;
1791
1792     /* A true recursion implies not fixed length, but a subroutine call may
1793     be OK. If the subroutine is a forward reference, we can't deal with
1794     it until the end of the pattern, so return -3. */
1795
1796     case OP_RECURSE:
1797     if (!atend) return -3;
1798     cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1799     do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1800     if (cc > cs && cc < ce) return -1;                    /* Recursion */
1801     else   /* Check for mutual recursion */
1802       {
1803       recurse_check *r = recurses;
1804       for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
1805       if (r != NULL) return -1;   /* Mutual recursion */
1806       }
1807     this_recurse.prev = recurses;
1808     this_recurse.group = cs;
1809     d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd, &this_recurse);
1810     if (d < 0) return d;
1811     branchlength += d;
1812     cc += 1 + LINK_SIZE;
1813     break;
1814
1815     /* Skip over assertive subpatterns */
1816
1817     case OP_ASSERT:
1818     case OP_ASSERT_NOT:
1819     case OP_ASSERTBACK:
1820     case OP_ASSERTBACK_NOT:
1821     do cc += GET(cc, 1); while (*cc == OP_ALT);
1822     cc += 1 + LINK_SIZE;
1823     break;
1824
1825     /* Skip over things that don't match chars */
1826
1827     case OP_MARK:
1828     case OP_PRUNE_ARG:
1829     case OP_SKIP_ARG:
1830     case OP_THEN_ARG:
1831     cc += cc[1] + PRIV(OP_lengths)[*cc];
1832     break;
1833
1834     case OP_CALLOUT:
1835     case OP_CIRC:
1836     case OP_CIRCM:
1837     case OP_CLOSE:
1838     case OP_COMMIT:
1839     case OP_CREF:
1840     case OP_DEF:
1841     case OP_DNCREF:
1842     case OP_DNRREF:
1843     case OP_DOLL:
1844     case OP_DOLLM:
1845     case OP_EOD:
1846     case OP_EODN:
1847     case OP_FAIL:
1848     case OP_NOT_WORD_BOUNDARY:
1849     case OP_PRUNE:
1850     case OP_REVERSE:
1851     case OP_RREF:
1852     case OP_SET_SOM:
1853     case OP_SKIP:
1854     case OP_SOD:
1855     case OP_SOM:
1856     case OP_THEN:
1857     case OP_WORD_BOUNDARY:
1858     cc += PRIV(OP_lengths)[*cc];
1859     break;
1860
1861     /* Handle literal characters */
1862
1863     case OP_CHAR:
1864     case OP_CHARI:
1865     case OP_NOT:
1866     case OP_NOTI:
1867     branchlength++;
1868     cc += 2;
1869 #ifdef SUPPORT_UTF
1870     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1871 #endif
1872     break;
1873
1874     /* Handle exact repetitions. The count is already in characters, but we
1875     need to skip over a multibyte character in UTF8 mode.  */
1876
1877     case OP_EXACT:
1878     case OP_EXACTI:
1879     case OP_NOTEXACT:
1880     case OP_NOTEXACTI:
1881     branchlength += (int)GET2(cc,1);
1882     cc += 2 + IMM2_SIZE;
1883 #ifdef SUPPORT_UTF
1884     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1885 #endif
1886     break;
1887
1888     case OP_TYPEEXACT:
1889     branchlength += GET2(cc,1);
1890     if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1891       cc += 2;
1892     cc += 1 + IMM2_SIZE + 1;
1893     break;
1894
1895     /* Handle single-char matchers */
1896
1897     case OP_PROP:
1898     case OP_NOTPROP:
1899     cc += 2;
1900     /* Fall through */
1901
1902     case OP_HSPACE:
1903     case OP_VSPACE:
1904     case OP_NOT_HSPACE:
1905     case OP_NOT_VSPACE:
1906     case OP_NOT_DIGIT:
1907     case OP_DIGIT:
1908     case OP_NOT_WHITESPACE:
1909     case OP_WHITESPACE:
1910     case OP_NOT_WORDCHAR:
1911     case OP_WORDCHAR:
1912     case OP_ANY:
1913     case OP_ALLANY:
1914     branchlength++;
1915     cc++;
1916     break;
1917
1918     /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1919     otherwise \C is coded as OP_ALLANY. */
1920
1921     case OP_ANYBYTE:
1922     return -2;
1923
1924     /* Check a class for variable quantification */
1925
1926     case OP_CLASS:
1927     case OP_NCLASS:
1928 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1929     case OP_XCLASS:
1930     /* The original code caused an unsigned overflow in 64 bit systems,
1931     so now we use a conditional statement. */
1932     if (op == OP_XCLASS)
1933       cc += GET(cc, 1);
1934     else
1935       cc += PRIV(OP_lengths)[OP_CLASS];
1936 #else
1937     cc += PRIV(OP_lengths)[OP_CLASS];
1938 #endif
1939
1940     switch (*cc)
1941       {
1942       case OP_CRSTAR:
1943       case OP_CRMINSTAR:
1944       case OP_CRPLUS:
1945       case OP_CRMINPLUS:
1946       case OP_CRQUERY:
1947       case OP_CRMINQUERY:
1948       case OP_CRPOSSTAR:
1949       case OP_CRPOSPLUS:
1950       case OP_CRPOSQUERY:
1951       return -1;
1952
1953       case OP_CRRANGE:
1954       case OP_CRMINRANGE:
1955       case OP_CRPOSRANGE:
1956       if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1957       branchlength += (int)GET2(cc,1);
1958       cc += 1 + 2 * IMM2_SIZE;
1959       break;
1960
1961       default:
1962       branchlength++;
1963       }
1964     break;
1965
1966     /* Anything else is variable length */
1967
1968     case OP_ANYNL:
1969     case OP_BRAMINZERO:
1970     case OP_BRAPOS:
1971     case OP_BRAPOSZERO:
1972     case OP_BRAZERO:
1973     case OP_CBRAPOS:
1974     case OP_EXTUNI:
1975     case OP_KETRMAX:
1976     case OP_KETRMIN:
1977     case OP_KETRPOS:
1978     case OP_MINPLUS:
1979     case OP_MINPLUSI:
1980     case OP_MINQUERY:
1981     case OP_MINQUERYI:
1982     case OP_MINSTAR:
1983     case OP_MINSTARI:
1984     case OP_MINUPTO:
1985     case OP_MINUPTOI:
1986     case OP_NOTMINPLUS:
1987     case OP_NOTMINPLUSI:
1988     case OP_NOTMINQUERY:
1989     case OP_NOTMINQUERYI:
1990     case OP_NOTMINSTAR:
1991     case OP_NOTMINSTARI:
1992     case OP_NOTMINUPTO:
1993     case OP_NOTMINUPTOI:
1994     case OP_NOTPLUS:
1995     case OP_NOTPLUSI:
1996     case OP_NOTPOSPLUS:
1997     case OP_NOTPOSPLUSI:
1998     case OP_NOTPOSQUERY:
1999     case OP_NOTPOSQUERYI:
2000     case OP_NOTPOSSTAR:
2001     case OP_NOTPOSSTARI:
2002     case OP_NOTPOSUPTO:
2003     case OP_NOTPOSUPTOI:
2004     case OP_NOTQUERY:
2005     case OP_NOTQUERYI:
2006     case OP_NOTSTAR:
2007     case OP_NOTSTARI:
2008     case OP_NOTUPTO:
2009     case OP_NOTUPTOI:
2010     case OP_PLUS:
2011     case OP_PLUSI:
2012     case OP_POSPLUS:
2013     case OP_POSPLUSI:
2014     case OP_POSQUERY:
2015     case OP_POSQUERYI:
2016     case OP_POSSTAR:
2017     case OP_POSSTARI:
2018     case OP_POSUPTO:
2019     case OP_POSUPTOI:
2020     case OP_QUERY:
2021     case OP_QUERYI:
2022     case OP_REF:
2023     case OP_REFI:
2024     case OP_DNREF:
2025     case OP_DNREFI:
2026     case OP_SBRA:
2027     case OP_SBRAPOS:
2028     case OP_SCBRA:
2029     case OP_SCBRAPOS:
2030     case OP_SCOND:
2031     case OP_SKIPZERO:
2032     case OP_STAR:
2033     case OP_STARI:
2034     case OP_TYPEMINPLUS:
2035     case OP_TYPEMINQUERY:
2036     case OP_TYPEMINSTAR:
2037     case OP_TYPEMINUPTO:
2038     case OP_TYPEPLUS:
2039     case OP_TYPEPOSPLUS:
2040     case OP_TYPEPOSQUERY:
2041     case OP_TYPEPOSSTAR:
2042     case OP_TYPEPOSUPTO:
2043     case OP_TYPEQUERY:
2044     case OP_TYPESTAR:
2045     case OP_TYPEUPTO:
2046     case OP_UPTO:
2047     case OP_UPTOI:
2048     return -1;
2049
2050     /* Catch unrecognized opcodes so that when new ones are added they
2051     are not forgotten, as has happened in the past. */
2052
2053     default:
2054     return -4;
2055     }
2056   }
2057 /* Control never gets here */
2058 }
2059
2060
2061
2062 /*************************************************
2063 *    Scan compiled regex for specific bracket    *
2064 *************************************************/
2065
2066 /* This little function scans through a compiled pattern until it finds a
2067 capturing bracket with the given number, or, if the number is negative, an
2068 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2069 so that it can be called from pcre_study() when finding the minimum matching
2070 length.
2071
2072 Arguments:
2073   code        points to start of expression
2074   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2075   number      the required bracket number or negative to find a lookbehind
2076
2077 Returns:      pointer to the opcode for the bracket, or NULL if not found
2078 */
2079
2080 const pcre_uchar *
2081 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2082 {
2083 for (;;)
2084   {
2085   register pcre_uchar c = *code;
2086
2087   if (c == OP_END) return NULL;
2088
2089   /* XCLASS is used for classes that cannot be represented just by a bit
2090   map. This includes negated single high-valued characters. The length in
2091   the table is zero; the actual length is stored in the compiled code. */
2092
2093   if (c == OP_XCLASS) code += GET(code, 1);
2094
2095   /* Handle recursion */
2096
2097   else if (c == OP_REVERSE)
2098     {
2099     if (number < 0) return (pcre_uchar *)code;
2100     code += PRIV(OP_lengths)[c];
2101     }
2102
2103   /* Handle capturing bracket */
2104
2105   else if (c == OP_CBRA || c == OP_SCBRA ||
2106            c == OP_CBRAPOS || c == OP_SCBRAPOS)
2107     {
2108     int n = (int)GET2(code, 1+LINK_SIZE);
2109     if (n == number) return (pcre_uchar *)code;
2110     code += PRIV(OP_lengths)[c];
2111     }
2112
2113   /* Otherwise, we can get the item's length from the table, except that for
2114   repeated character types, we have to test for \p and \P, which have an extra
2115   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2116   must add in its length. */
2117
2118   else
2119     {
2120     switch(c)
2121       {
2122       case OP_TYPESTAR:
2123       case OP_TYPEMINSTAR:
2124       case OP_TYPEPLUS:
2125       case OP_TYPEMINPLUS:
2126       case OP_TYPEQUERY:
2127       case OP_TYPEMINQUERY:
2128       case OP_TYPEPOSSTAR:
2129       case OP_TYPEPOSPLUS:
2130       case OP_TYPEPOSQUERY:
2131       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2132       break;
2133
2134       case OP_TYPEUPTO:
2135       case OP_TYPEMINUPTO:
2136       case OP_TYPEEXACT:
2137       case OP_TYPEPOSUPTO:
2138       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2139         code += 2;
2140       break;
2141
2142       case OP_MARK:
2143       case OP_PRUNE_ARG:
2144       case OP_SKIP_ARG:
2145       case OP_THEN_ARG:
2146       code += code[1];
2147       break;
2148       }
2149
2150     /* Add in the fixed length from the table */
2151
2152     code += PRIV(OP_lengths)[c];
2153
2154   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2155   a multi-byte character. The length in the table is a minimum, so we have to
2156   arrange to skip the extra bytes. */
2157
2158 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2159     if (utf) switch(c)
2160       {
2161       case OP_CHAR:
2162       case OP_CHARI:
2163       case OP_NOT:
2164       case OP_NOTI:
2165       case OP_EXACT:
2166       case OP_EXACTI:
2167       case OP_NOTEXACT:
2168       case OP_NOTEXACTI:
2169       case OP_UPTO:
2170       case OP_UPTOI:
2171       case OP_NOTUPTO:
2172       case OP_NOTUPTOI:
2173       case OP_MINUPTO:
2174       case OP_MINUPTOI:
2175       case OP_NOTMINUPTO:
2176       case OP_NOTMINUPTOI:
2177       case OP_POSUPTO:
2178       case OP_POSUPTOI:
2179       case OP_NOTPOSUPTO:
2180       case OP_NOTPOSUPTOI:
2181       case OP_STAR:
2182       case OP_STARI:
2183       case OP_NOTSTAR:
2184       case OP_NOTSTARI:
2185       case OP_MINSTAR:
2186       case OP_MINSTARI:
2187       case OP_NOTMINSTAR:
2188       case OP_NOTMINSTARI:
2189       case OP_POSSTAR:
2190       case OP_POSSTARI:
2191       case OP_NOTPOSSTAR:
2192       case OP_NOTPOSSTARI:
2193       case OP_PLUS:
2194       case OP_PLUSI:
2195       case OP_NOTPLUS:
2196       case OP_NOTPLUSI:
2197       case OP_MINPLUS:
2198       case OP_MINPLUSI:
2199       case OP_NOTMINPLUS:
2200       case OP_NOTMINPLUSI:
2201       case OP_POSPLUS:
2202       case OP_POSPLUSI:
2203       case OP_NOTPOSPLUS:
2204       case OP_NOTPOSPLUSI:
2205       case OP_QUERY:
2206       case OP_QUERYI:
2207       case OP_NOTQUERY:
2208       case OP_NOTQUERYI:
2209       case OP_MINQUERY:
2210       case OP_MINQUERYI:
2211       case OP_NOTMINQUERY:
2212       case OP_NOTMINQUERYI:
2213       case OP_POSQUERY:
2214       case OP_POSQUERYI:
2215       case OP_NOTPOSQUERY:
2216       case OP_NOTPOSQUERYI:
2217       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2218       break;
2219       }
2220 #else
2221     (void)(utf);  /* Keep compiler happy by referencing function argument */
2222 #endif
2223     }
2224   }
2225 }
2226
2227
2228
2229 /*************************************************
2230 *   Scan compiled regex for recursion reference  *
2231 *************************************************/
2232
2233 /* This little function scans through a compiled pattern until it finds an
2234 instance of OP_RECURSE.
2235
2236 Arguments:
2237   code        points to start of expression
2238   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2239
2240 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2241 */
2242
2243 static const pcre_uchar *
2244 find_recurse(const pcre_uchar *code, BOOL utf)
2245 {
2246 for (;;)
2247   {
2248   register pcre_uchar c = *code;
2249   if (c == OP_END) return NULL;
2250   if (c == OP_RECURSE) return code;
2251
2252   /* XCLASS is used for classes that cannot be represented just by a bit
2253   map. This includes negated single high-valued characters. The length in
2254   the table is zero; the actual length is stored in the compiled code. */
2255
2256   if (c == OP_XCLASS) code += GET(code, 1);
2257
2258   /* Otherwise, we can get the item's length from the table, except that for
2259   repeated character types, we have to test for \p and \P, which have an extra
2260   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2261   must add in its length. */
2262
2263   else
2264     {
2265     switch(c)
2266       {
2267       case OP_TYPESTAR:
2268       case OP_TYPEMINSTAR:
2269       case OP_TYPEPLUS:
2270       case OP_TYPEMINPLUS:
2271       case OP_TYPEQUERY:
2272       case OP_TYPEMINQUERY:
2273       case OP_TYPEPOSSTAR:
2274       case OP_TYPEPOSPLUS:
2275       case OP_TYPEPOSQUERY:
2276       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2277       break;
2278
2279       case OP_TYPEPOSUPTO:
2280       case OP_TYPEUPTO:
2281       case OP_TYPEMINUPTO:
2282       case OP_TYPEEXACT:
2283       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2284         code += 2;
2285       break;
2286
2287       case OP_MARK:
2288       case OP_PRUNE_ARG:
2289       case OP_SKIP_ARG:
2290       case OP_THEN_ARG:
2291       code += code[1];
2292       break;
2293       }
2294
2295     /* Add in the fixed length from the table */
2296
2297     code += PRIV(OP_lengths)[c];
2298
2299     /* In UTF-8 mode, opcodes that are followed by a character may be followed
2300     by a multi-byte character. The length in the table is a minimum, so we have
2301     to arrange to skip the extra bytes. */
2302
2303 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2304     if (utf) switch(c)
2305       {
2306       case OP_CHAR:
2307       case OP_CHARI:
2308       case OP_NOT:
2309       case OP_NOTI:
2310       case OP_EXACT:
2311       case OP_EXACTI:
2312       case OP_NOTEXACT:
2313       case OP_NOTEXACTI:
2314       case OP_UPTO:
2315       case OP_UPTOI:
2316       case OP_NOTUPTO:
2317       case OP_NOTUPTOI:
2318       case OP_MINUPTO:
2319       case OP_MINUPTOI:
2320       case OP_NOTMINUPTO:
2321       case OP_NOTMINUPTOI:
2322       case OP_POSUPTO:
2323       case OP_POSUPTOI:
2324       case OP_NOTPOSUPTO:
2325       case OP_NOTPOSUPTOI:
2326       case OP_STAR:
2327       case OP_STARI:
2328       case OP_NOTSTAR:
2329       case OP_NOTSTARI:
2330       case OP_MINSTAR:
2331       case OP_MINSTARI:
2332       case OP_NOTMINSTAR:
2333       case OP_NOTMINSTARI:
2334       case OP_POSSTAR:
2335       case OP_POSSTARI:
2336       case OP_NOTPOSSTAR:
2337       case OP_NOTPOSSTARI:
2338       case OP_PLUS:
2339       case OP_PLUSI:
2340       case OP_NOTPLUS:
2341       case OP_NOTPLUSI:
2342       case OP_MINPLUS:
2343       case OP_MINPLUSI:
2344       case OP_NOTMINPLUS:
2345       case OP_NOTMINPLUSI:
2346       case OP_POSPLUS:
2347       case OP_POSPLUSI:
2348       case OP_NOTPOSPLUS:
2349       case OP_NOTPOSPLUSI:
2350       case OP_QUERY:
2351       case OP_QUERYI:
2352       case OP_NOTQUERY:
2353       case OP_NOTQUERYI:
2354       case OP_MINQUERY:
2355       case OP_MINQUERYI:
2356       case OP_NOTMINQUERY:
2357       case OP_NOTMINQUERYI:
2358       case OP_POSQUERY:
2359       case OP_POSQUERYI:
2360       case OP_NOTPOSQUERY:
2361       case OP_NOTPOSQUERYI:
2362       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2363       break;
2364       }
2365 #else
2366     (void)(utf);  /* Keep compiler happy by referencing function argument */
2367 #endif
2368     }
2369   }
2370 }
2371
2372
2373
2374 /*************************************************
2375 *    Scan compiled branch for non-emptiness      *
2376 *************************************************/
2377
2378 /* This function scans through a branch of a compiled pattern to see whether it
2379 can match the empty string or not. It is called from could_be_empty()
2380 below and from compile_branch() when checking for an unlimited repeat of a
2381 group that can match nothing. Note that first_significant_code() skips over
2382 backward and negative forward assertions when its final argument is TRUE. If we
2383 hit an unclosed bracket, we return "empty" - this means we've struck an inner
2384 bracket whose current branch will already have been scanned.
2385
2386 Arguments:
2387   code        points to start of search
2388   endcode     points to where to stop
2389   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2390   cd          contains pointers to tables etc.
2391   recurses    chain of recurse_check to catch mutual recursion
2392
2393 Returns:      TRUE if what is matched could be empty
2394 */
2395
2396 static BOOL
2397 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2398   BOOL utf, compile_data *cd, recurse_check *recurses)
2399 {
2400 register pcre_uchar c;          /* Opcode at the current scan position */
2401 recurse_check this_recurse;     /* Chain link for a group while it is scanned */
2402
2403 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2404      code < endcode;
2405      code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2406   {
2407   const pcre_uchar *ccode;      /* Item that follows a character class */
2408
2409   c = *code;
2410
2411   /* Skip over forward assertions; the other assertions are skipped by
2412   first_significant_code() with a TRUE final argument. */
2413
2414   if (c == OP_ASSERT)
2415     {
2416     do code += GET(code, 1); while (*code == OP_ALT);
2417     c = *code;      /* The loop step advances by this item's length */
2418     continue;
2419     }
2420
2421   /* For a recursion/subroutine call, if its end has been reached, which
2422   implies a backward reference subroutine call, we can scan it. If it's a
2423   forward reference subroutine call, we can't. To detect forward reference
2424   we have to scan up the list that is kept in the workspace. This function is
2425   called only when doing the real compile, not during the pre-compile that
2426   measures the size of the compiled pattern. */
2427
2428   if (c == OP_RECURSE)
2429     {
2430     const pcre_uchar *scode = cd->start_code + GET(code, 1);
2431     const pcre_uchar *endgroup = scode;   /* Moved past all branches below */
2432     BOOL empty_branch;
2433
2434     /* Test for forward reference or uncompleted reference. This is disabled
2435     when called to scan a completed pattern by setting cd->start_workspace to
2436     NULL. */
2437
2438     if (cd->start_workspace != NULL)
2439       {
2440       const pcre_uchar *tcode;    /* Steps through the workspace entries */
2441       for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2442         if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2443       if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2444       }
2445
2446     /* If the reference is to a completed group, we need to detect whether this
2447     is a recursive call, as otherwise there will be an infinite loop. If it is
2448     a recursion, just skip over it. Simple recursions are easily detected. For
2449     mutual recursions we keep a chain on the stack. */
2450
2451     do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2452     if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
2453     else
2454       {
2455       recurse_check *r = recurses;
2456       for (r = recurses; r != NULL; r = r->prev)
2457         if (r->group == scode) break;   /* Group is already on the chain */
2458       if (r != NULL) continue;   /* Mutual recursion */
2459       }
2460
2461     /* Completed reference; scan the referenced group, remembering it on the
2462     stack chain to detect mutual recursions. */
2463
2464     empty_branch = FALSE;
2465     this_recurse.prev = recurses;
2466     this_recurse.group = scode;
2467
2468     do
2469       {
2470       if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2471         {
2472         empty_branch = TRUE;
2473         break;                  /* One empty branch is enough */
2474         }
2475       scode += GET(scode, 1);   /* Follow the link to the next branch */
2476       }
2477     while (*scode == OP_ALT);
2478
2479     if (!empty_branch) return FALSE;  /* All branches are non-empty */
2480     continue;
2481     }
2482
2483   /* Groups with zero repeats can of course be empty; skip them. */
2484
2485   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2486       c == OP_BRAPOSZERO)
2487     {
2488     code += PRIV(OP_lengths)[c];   /* Step over the zero-repeat opcode itself */
2489     do code += GET(code, 1); while (*code == OP_ALT);
2490     c = *code;
2491     continue;
2492     }
2493
2494   /* A nested group that is already marked as "could be empty" can just be
2495   skipped. */
2496
2497   if (c == OP_SBRA  || c == OP_SBRAPOS ||
2498       c == OP_SCBRA || c == OP_SCBRAPOS)
2499     {
2500     do code += GET(code, 1); while (*code == OP_ALT);
2501     c = *code;
2502     continue;
2503     }
2504
2505   /* For other groups, scan the branches. */
2506
2507   if (c == OP_BRA  || c == OP_BRAPOS ||
2508       c == OP_CBRA || c == OP_CBRAPOS ||
2509       c == OP_ONCE || c == OP_ONCE_NC ||
2510       c == OP_COND || c == OP_SCOND)
2511     {
2512     BOOL empty_branch;
2513     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
2514
2515     /* If a conditional group has only one branch, there is a second, implied,
2516     empty branch, so just skip over the conditional, because it could be empty.
2517     Otherwise, scan the individual branches of the group. */
2518
2519     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2520       code += GET(code, 1);
2521     else
2522       {
2523       empty_branch = FALSE;
2524       do
2525         {
2526         if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd,
2527           recurses)) empty_branch = TRUE;
2528         code += GET(code, 1);   /* Done even once an empty branch is found */
2529         }
2530       while (*code == OP_ALT);
2531       if (!empty_branch) return FALSE;   /* All branches are non-empty */
2532       }
2533
2534     c = *code;
2535     continue;
2536     }
2537
2538   /* Handle the other opcodes */
2539
2540   switch (c)
2541     {
2542     /* Check for quantifiers after a class. XCLASS is used for classes that
2543     cannot be represented just by a bit map. This includes negated single
2544     high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2545     actual length is stored in the compiled code, so we must update "code"
2546     here. */
2547
2548 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2549     case OP_XCLASS:
2550     ccode = code += GET(code, 1);   /* Both now point just past the class */
2551     goto CHECK_CLASS_REPEAT;
2552 #endif
2553
2554     case OP_CLASS:
2555     case OP_NCLASS:
2556     ccode = code + PRIV(OP_lengths)[OP_CLASS];
2557
2558 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2559     CHECK_CLASS_REPEAT:
2560 #endif
2561
2562     switch (*ccode)
2563       {
2564       case OP_CRSTAR:            /* These could be empty; continue */
2565       case OP_CRMINSTAR:
2566       case OP_CRQUERY:
2567       case OP_CRMINQUERY:
2568       case OP_CRPOSSTAR:
2569       case OP_CRPOSQUERY:
2570       break;
2571
2572       default:                   /* Non-repeat => class must match */
2573       case OP_CRPLUS:            /* These repeats aren't empty */
2574       case OP_CRMINPLUS:
2575       case OP_CRPOSPLUS:
2576       return FALSE;
2577
2578       case OP_CRRANGE:
2579       case OP_CRMINRANGE:
2580       case OP_CRPOSRANGE:
2581       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2582       break;
2583       }
2584     break;
2585
2586     /* Opcodes that must match a character */
2587
2588     case OP_ANY:
2589     case OP_ALLANY:
2590     case OP_ANYBYTE:
2591
2592     case OP_PROP:
2593     case OP_NOTPROP:
2594     case OP_ANYNL:
2595
2596     case OP_NOT_HSPACE:
2597     case OP_HSPACE:
2598     case OP_NOT_VSPACE:
2599     case OP_VSPACE:
2600     case OP_EXTUNI:
2601
2602     case OP_NOT_DIGIT:
2603     case OP_DIGIT:
2604     case OP_NOT_WHITESPACE:
2605     case OP_WHITESPACE:
2606     case OP_NOT_WORDCHAR:
2607     case OP_WORDCHAR:
2608
2609     case OP_CHAR:
2610     case OP_CHARI:
2611     case OP_NOT:
2612     case OP_NOTI:
2613
2614     case OP_PLUS:
2615     case OP_PLUSI:
2616     case OP_MINPLUS:
2617     case OP_MINPLUSI:
2618
2619     case OP_NOTPLUS:
2620     case OP_NOTPLUSI:
2621     case OP_NOTMINPLUS:
2622     case OP_NOTMINPLUSI:
2623
2624     case OP_POSPLUS:
2625     case OP_POSPLUSI:
2626     case OP_NOTPOSPLUS:
2627     case OP_NOTPOSPLUSI:
2628
2629     case OP_EXACT:
2630     case OP_EXACTI:
2631     case OP_NOTEXACT:
2632     case OP_NOTEXACTI:
2633
2634     case OP_TYPEPLUS:
2635     case OP_TYPEMINPLUS:
2636     case OP_TYPEPOSPLUS:
2637     case OP_TYPEEXACT:
2638
2639     return FALSE;
2640
2641     /* These are going to continue, as they may be empty, but we have to
2642     fudge the length for the \p and \P cases. */
2643
2644     case OP_TYPESTAR:
2645     case OP_TYPEMINSTAR:
2646     case OP_TYPEPOSSTAR:
2647     case OP_TYPEQUERY:
2648     case OP_TYPEMINQUERY:
2649     case OP_TYPEPOSQUERY:
2650     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2651     break;
2652
2653     /* Same for these */
2654
2655     case OP_TYPEUPTO:
2656     case OP_TYPEMINUPTO:
2657     case OP_TYPEPOSUPTO:
2658     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2659       code += 2;
2660     break;
2661
2662     /* End of branch */
2663
2664     case OP_KET:
2665     case OP_KETRMAX:
2666     case OP_KETRMIN:
2667     case OP_KETRPOS:
2668     case OP_ALT:
2669     return TRUE;
2670
2671     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2672     MINUPTO, and POSUPTO and their caseless and negative versions may be
2673     followed by a multibyte character. */
2674
2675 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2676     case OP_STAR:
2677     case OP_STARI:
2678     case OP_NOTSTAR:
2679     case OP_NOTSTARI:
2680
2681     case OP_MINSTAR:
2682     case OP_MINSTARI:
2683     case OP_NOTMINSTAR:
2684     case OP_NOTMINSTARI:
2685
2686     case OP_POSSTAR:
2687     case OP_POSSTARI:
2688     case OP_NOTPOSSTAR:
2689     case OP_NOTPOSSTARI:
2690
2691     case OP_QUERY:
2692     case OP_QUERYI:
2693     case OP_NOTQUERY:
2694     case OP_NOTQUERYI:
2695
2696     case OP_MINQUERY:
2697     case OP_MINQUERYI:
2698     case OP_NOTMINQUERY:
2699     case OP_NOTMINQUERYI:
2700
2701     case OP_POSQUERY:
2702     case OP_POSQUERYI:
2703     case OP_NOTPOSQUERY:
2704     case OP_NOTPOSQUERYI:
2705
2706     if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2707     break;
2708
2709     case OP_UPTO:
2710     case OP_UPTOI:
2711     case OP_NOTUPTO:
2712     case OP_NOTUPTOI:
2713
2714     case OP_MINUPTO:
2715     case OP_MINUPTOI:
2716     case OP_NOTMINUPTO:
2717     case OP_NOTMINUPTOI:
2718
2719     case OP_POSUPTO:
2720     case OP_POSUPTOI:
2721     case OP_NOTPOSUPTO:
2722     case OP_NOTPOSUPTOI:
2723
2724     if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2725     break;
2726 #endif
2727
2728     /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2729     string. */
2730
2731     case OP_MARK:
2732     case OP_PRUNE_ARG:
2733     case OP_SKIP_ARG:
2734     case OP_THEN_ARG:
2735     code += code[1];   /* The unit after the opcode gives the amount to skip */
2736     break;
2737
2738     /* None of the remaining opcodes are required to match a character. */
2739
2740     default:
2741     break;
2742     }
2743   }
2744
2745 return TRUE;   /* Reached endcode without meeting an item that must match */
2746 }
2747
2748
2749
2750 /*************************************************
2751 *    Scan compiled regex for non-emptiness       *
2752 *************************************************/
2753
2754 /* Used when checking for left recursive calls. Each branch on the chain of
2755 currently open branches is tested in turn, working outwards, for as long as
2756 the branch starts at or after the group that is the subject of the recursion.
2757 The answer is TRUE only if every branch tested could match the empty string.
2758 This function is called only during the real compile, not during the
2759 pre-compile.
2760
2761 Arguments:
2762   code        points to start of the recursion
2763   endcode     points to where to stop (current RECURSE item)
2764   bcptr       points to the chain of current (unclosed) branch starts
2765   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2766   cd          pointers to tables etc
2767
2768 Returns:      TRUE if what is matched could be empty
2769 */
2770
2771 static BOOL
2772 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2773   branch_chain *bcptr, BOOL utf, compile_data *cd)
2774 {
2775 branch_chain *bc;   /* Steps outwards along the chain of open branches */
2776
2777 /* A single branch that cannot be empty is enough to give the answer FALSE. */
2778 for (bc = bcptr; bc != NULL && bc->current_branch >= code; bc = bc->outer)
2779   if (!could_be_empty_branch(bc->current_branch, endcode, utf, cd, NULL))
2780     return FALSE;
2781 return TRUE;
2782 }
2783
2784
2785
2786 /*************************************************
2787 *        Base opcode of repeated opcodes         *
2788 *************************************************/
2789
2790 /* Reduces a repeated single character type opcode to the first opcode of its
2791 repeat family. An opcode above OP_TYPEPOSUPTO is passed back as it stands.
2792
2793 Arguments:  c opcode
2794 Returns:    base opcode for the type
2795 */
2796
2797 static pcre_uchar
2798 get_repeat_base(pcre_uchar c)
2799 {
2800 if (c > OP_TYPEPOSUPTO) return c;            /* Past the repeat opcodes */
2801 if (c >= OP_TYPESTAR)   return OP_TYPESTAR;  /* Families, highest first */
2802 if (c >= OP_NOTSTARI)   return OP_NOTSTARI;
2803 if (c >= OP_NOTSTAR)    return OP_NOTSTAR;
2804 if (c >= OP_STARI)      return OP_STARI;
2805 return OP_STAR;                              /* Everything below OP_STARI */
2806 }
2807
2808
2809
2810 #ifdef SUPPORT_UCP
2811 /*************************************************
2812 *        Check a character and a property        *
2813 *************************************************/
2814
2815 /* Used by the auto-possessification code when a property item is adjacent to
2816 a fixed character.
2817
2818 Arguments:
2819   c            the character
2820   ptype        the property type (a PT_xxx value)
2821   pdata        the data for the type
2822   negated      TRUE if it's a negated property (\P or \p{^)
2823
2824 Returns:       TRUE if auto-possessifying is OK
2825 */
2826
2827 static BOOL
2828 check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
2829   BOOL negated)
2830 {
2831 const ucd_record *prop = GET_UCD(c);
2832 const pcre_uint32 *cs;    /* Cursor into a caseless set */
2833 BOOL found;               /* Whether c has the property when not negated */
2834
2835 switch(ptype)
2836   {
2837   case PT_LAMP:
2838   found = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
2839           prop->chartype == ucp_Lt;
2840   break;
2841
2842   case PT_GC:
2843   found = pdata == PRIV(ucp_gentype)[prop->chartype];
2844   break;
2845
2846   case PT_PC:
2847   found = pdata == prop->chartype;
2848   break;
2849
2850   case PT_SC:
2851   found = pdata == prop->script;
2852   break;
2853
2854   /* The remaining types are specials. */
2855
2856   case PT_ALNUM:
2857   found = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2858           PRIV(ucp_gentype)[prop->chartype] == ucp_N;
2859   break;
2860
2861   /* Perl space and POSIX space are identical now that Perl (from 5.18)
2862   includes VT in its space class; PCRE was changed at release 8.34. */
2863
2864   case PT_SPACE:    /* Perl space */
2865   case PT_PXSPACE:  /* POSIX space */
2866   switch(c)
2867     {
2868     HSPACE_CASES:
2869     VSPACE_CASES:
2870     return negated;
2871     }
2872   found = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
2873   break;
2874
2875   case PT_WORD:
2876   found = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2877           PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2878           c == CHAR_UNDERSCORE;
2879   break;
2880
2881   case PT_CLIST:    /* Search the set picked by c's caseset field */
2882   cs = PRIV(ucd_caseless_sets) + prop->caseset;
2883   while (c >= *cs) if (c == *cs++) return negated;
2884   return !negated;  /* Met a larger entry first, so c is not in the set */
2885   default:
2886   return FALSE;
2887   }
2888 return found == negated;
2889 }
2890 #endif  /* SUPPORT_UCP */
2891
2892
2893
2894 /*************************************************
2895 *        Fill the character property list        *
2896 *************************************************/
2897
2898 /* Checks whether the code points to an opcode that can take part in auto-
2899 possessification, and if so, fills a list with its properties.
2900
2901 Arguments:
2902   code        points to start of expression
2903   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2904   fcc         points to case-flipping table
2905   list        points to output list; entries list[0] to list[7] may be written
2906               list[0] the opcode, after reduction to the basic item matched
2907               list[1] non-zero if this opcode can match an empty string
2908               list[2..7] characters ended by NOTACHAR, or a property type
2909                 and value, or the length of a class after its opcode
2910
2911 Returns:      points to the start of the next opcode if *code is accepted
2912               NULL if *code is not accepted
2913 */
2914
2915 static const pcre_uchar *
2916 get_chr_property_list(const pcre_uchar *code, BOOL utf,
2917   const pcre_uint8 *fcc, pcre_uint32 *list)
2918 {
2919 pcre_uchar c = *code;
2920 pcre_uchar base;
2921 const pcre_uchar *end;
2922 pcre_uint32 chr;
2923
2924 #ifdef SUPPORT_UCP
2925 pcre_uint32 *clist_dest;
2926 const pcre_uint32 *clist_src;
2927 #else
2928 utf = utf;  /* Suppress "unused parameter" compiler warning */
2929 #endif
2930
2931 list[0] = c;
2932 list[1] = FALSE;
2933 code++;
2934
2935 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2936   {
2937   base = get_repeat_base(c);
2938   c -= (base - OP_STAR);    /* Express the repeat as an OP_STAR family code */
2939
2940   if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2941     code += IMM2_SIZE;      /* Step over the IMM2_SIZE operand */
2942
2943   list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2944
2945   switch(base)
2946     {
2947     case OP_STAR:
2948     list[0] = OP_CHAR;
2949     break;
2950
2951     case OP_STARI:
2952     list[0] = OP_CHARI;
2953     break;
2954
2955     case OP_NOTSTAR:
2956     list[0] = OP_NOT;
2957     break;
2958
2959     case OP_NOTSTARI:
2960     list[0] = OP_NOTI;
2961     break;
2962
2963     case OP_TYPESTAR:
2964     list[0] = *code;        /* The repeated character type comes next */
2965     code++;
2966     break;
2967     }
2968   c = list[0];
2969   }
2970
2971 switch(c)
2972   {
2973   case OP_NOT_DIGIT:
2974   case OP_DIGIT:
2975   case OP_NOT_WHITESPACE:
2976   case OP_WHITESPACE:
2977   case OP_NOT_WORDCHAR:
2978   case OP_WORDCHAR:
2979   case OP_ANY:
2980   case OP_ALLANY:
2981   case OP_ANYNL:
2982   case OP_NOT_HSPACE:
2983   case OP_HSPACE:
2984   case OP_NOT_VSPACE:
2985   case OP_VSPACE:
2986   case OP_EXTUNI:
2987   case OP_EODN:
2988   case OP_EOD:
2989   case OP_DOLL:
2990   case OP_DOLLM:
2991   return code;              /* Nothing beyond list[0] and list[1] is set */
2992
2993   case OP_CHAR:
2994   case OP_NOT:
2995   GETCHARINCTEST(chr, code);
2996   list[2] = chr;
2997   list[3] = NOTACHAR;
2998   return code;
2999
3000   case OP_CHARI:
3001   case OP_NOTI:
3002   list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
3003   GETCHARINCTEST(chr, code);
3004   list[2] = chr;
3005
3006 #ifdef SUPPORT_UCP
3007   if (chr < 128 || (chr < 256 && !utf))
3008     list[3] = fcc[chr];
3009   else
3010     list[3] = UCD_OTHERCASE(chr);
3011 #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
3012   list[3] = (chr < 256) ? fcc[chr] : chr;
3013 #else
3014   list[3] = fcc[chr];
3015 #endif
3016
3017   /* The othercase might be the same value; the list then has one entry. */
3018
3019   if (chr == list[3])
3020     list[3] = NOTACHAR;
3021   else
3022     list[4] = NOTACHAR;
3023   return code;
3024
3025 #ifdef SUPPORT_UCP
3026   case OP_PROP:
3027   case OP_NOTPROP:
3028   if (code[0] != PT_CLIST)
3029     {
3030     list[2] = code[0];
3031     list[3] = code[1];
3032     return code + 2;
3033     }
3034
3035   /* Convert only if we have enough space. */
3036
3037   clist_src = PRIV(ucd_caseless_sets) + code[1];
3038   clist_dest = list + 2;
3039   code += 2;
3040
3041   do {
3042      if (clist_dest >= list + 8)
3043        {
3044        /* Not enough space. This should never happen, since all clists are
3045        shorter than 5 characters. "code" is already past the property. */
3046        list[2] = code[-2];
3047        list[3] = code[-1];
3048        return code;
3049        }
3050      *clist_dest++ = *clist_src;
3051      }
3052   while(*clist_src++ != NOTACHAR);
3053
3054   /* All characters are stored. The terminating NOTACHAR
3055   is copied from the clist itself. */
3056
3057   list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
3058   return code;
3059 #endif
3060
3061   case OP_NCLASS:
3062   case OP_CLASS:
3063 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3064   case OP_XCLASS:
3065   if (c == OP_XCLASS)
3066     end = code + GET(code, 0) - 1;   /* "code" is one past the opcode */
3067   else
3068 #endif
3069     end = code + 32 / sizeof(pcre_uchar);   /* Step over the 32-byte map */
3070
3071   switch(*end)
3072     {
3073     case OP_CRSTAR:
3074     case OP_CRMINSTAR:
3075     case OP_CRQUERY:
3076     case OP_CRMINQUERY:
3077     case OP_CRPOSSTAR:
3078     case OP_CRPOSQUERY:
3079     list[1] = TRUE;
3080     end++;
3081     break;
3082
3083     case OP_CRPLUS:
3084     case OP_CRMINPLUS:
3085     case OP_CRPOSPLUS:
3086     end++;
3087     break;
3088
3089     case OP_CRRANGE:
3090     case OP_CRMINRANGE:
3091     case OP_CRPOSRANGE:
3092     list[1] = (GET2(end, 1) == 0);   /* Can be empty if the minimum is zero */
3093     end += 1 + 2 * IMM2_SIZE;
3094     break;
3095     }
3096   list[2] = (pcre_uint32)(end - code);
3097   return end;
3098   }
3099 return NULL;    /* Opcode not accepted */
3100 }
3101
3102
3103
3104 /*************************************************
3105 *    Scan further character sets for match       *
3106 *************************************************/
3107
3108 /* Checks whether the base and the current opcode have a common character, in
3109 which case the base cannot be possessified.
3110
3111 Arguments:
3112   code        points to the byte code
3113   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
3114   cd          static compile data
3115   base_list   the data list of the base opcode
3116
3117 Returns:      TRUE if the auto-possessification is possible
3118 */
3119
3120 static BOOL
3121 compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3122   const pcre_uint32 *base_list, const pcre_uchar *base_end, int *rec_limit)
3123 {
3124 pcre_uchar c;
3125 pcre_uint32 list[8];
3126 const pcre_uint32 *chr_ptr;
3127 const pcre_uint32 *ochr_ptr;
3128 const pcre_uint32 *list_ptr;
3129 const pcre_uchar *next_code;
3130 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3131 const pcre_uchar *xclass_flags;
3132 #endif
3133 const pcre_uint8 *class_bitset;
3134 const pcre_uint8 *set1, *set2, *set_end;
3135 pcre_uint32 chr;
3136 BOOL accepted, invert_bits;
3137 BOOL entered_a_group = FALSE;
3138
3139 if (*rec_limit == 0) return FALSE;
3140 --(*rec_limit);
3141
3142 /* Note: the base_list[1] contains whether the current opcode has greedy
3143 (represented by a non-zero value) quantifier. This is a different from
3144 other character type lists, which stores here that the character iterator
3145 matches to an empty string (also represented by a non-zero value). */
3146
3147 for(;;)
3148   {
3149   /* All operations move the code pointer forward.
3150   Therefore infinite recursions are not possible. */
3151
3152   c = *code;
3153
3154   /* Skip over callouts */
3155
3156   if (c == OP_CALLOUT)
3157     {
3158     code += PRIV(OP_lengths)[c];
3159     continue;
3160     }
3161
3162   if (c == OP_ALT)
3163     {
3164     do code += GET(code, 1); while (*code == OP_ALT);
3165     c = *code;
3166     }
3167
3168   switch(c)
3169     {
3170     case OP_END:
3171     case OP_KETRPOS:
3172     /* TRUE only in greedy case. The non-greedy case could be replaced by
3173     an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3174     uses more memory, which we cannot get at this stage.) */
3175
3176     return base_list[1] != 0;
3177
3178     case OP_KET:
3179     /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3180     it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3181     cannot be converted to a possessive form. */
3182
3183     if (base_list[1] == 0) return FALSE;
3184
3185     switch(*(code - GET(code, 1)))
3186       {
3187       case OP_ASSERT:
3188       case OP_ASSERT_NOT:
3189       case OP_ASSERTBACK:
3190       case OP_ASSERTBACK_NOT:
3191       case OP_ONCE:
3192       case OP_ONCE_NC:
3193       /* Atomic sub-patterns and assertions can always auto-possessify their
3194       last iterator. However, if the group was entered as a result of checking
3195       a previous iterator, this is not possible. */
3196
3197       return !entered_a_group;
3198       }
3199
3200     code += PRIV(OP_lengths)[c];
3201     continue;
3202
3203     case OP_ONCE:
3204     case OP_ONCE_NC:
3205     case OP_BRA:
3206     case OP_CBRA:
3207     next_code = code + GET(code, 1);
3208     code += PRIV(OP_lengths)[c];
3209
3210     while (*next_code == OP_ALT)
3211       {
3212       if (!compare_opcodes(code, utf, cd, base_list, base_end, rec_limit))
3213         return FALSE;
3214       code = next_code + 1 + LINK_SIZE;
3215       next_code += GET(next_code, 1);
3216       }
3217
3218     entered_a_group = TRUE;
3219     continue;
3220
3221     case OP_BRAZERO:
3222     case OP_BRAMINZERO:
3223
3224     next_code = code + 1;
3225     if (*next_code != OP_BRA && *next_code != OP_CBRA
3226         && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3227
3228     do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3229
3230     /* The bracket content will be checked by the
3231     OP_BRA/OP_CBRA case above. */
3232     next_code += 1 + LINK_SIZE;
3233     if (!compare_opcodes(next_code, utf, cd, base_list, base_end, rec_limit))
3234       return FALSE;
3235
3236     code += PRIV(OP_lengths)[c];
3237     continue;
3238
3239     default:
3240     break;
3241     }
3242
3243   /* Check for a supported opcode, and load its properties. */
3244
3245   code = get_chr_property_list(code, utf, cd->fcc, list);
3246   if (code == NULL) return FALSE;    /* Unsupported */
3247
3248   /* If either opcode is a small character list, set pointers for comparing
3249   characters from that list with another list, or with a property. */
3250
3251   if (base_list[0] == OP_CHAR)
3252     {
3253     chr_ptr = base_list + 2;
3254     list_ptr = list;
3255     }
3256   else if (list[0] == OP_CHAR)
3257     {
3258     chr_ptr = list + 2;
3259     list_ptr = base_list;
3260     }
3261
3262   /* Character bitsets can also be compared to certain opcodes. */
3263
3264   else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3265 #ifdef COMPILE_PCRE8
3266       /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3267       || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3268 #endif
3269       )
3270     {
3271 #ifdef COMPILE_PCRE8
3272     if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3273 #else
3274     if (base_list[0] == OP_CLASS)
3275 #endif
3276       {
3277       set1 = (pcre_uint8 *)(base_end - base_list[2]);
3278       list_ptr = list;
3279       }
3280     else
3281       {
3282       set1 = (pcre_uint8 *)(code - list[2]);
3283       list_ptr = base_list;
3284       }
3285
3286     invert_bits = FALSE;
3287     switch(list_ptr[0])
3288       {
3289       case OP_CLASS:
3290       case OP_NCLASS:
3291       set2 = (pcre_uint8 *)
3292         ((list_ptr == list ? code : base_end) - list_ptr[2]);
3293       break;
3294
3295 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3296       case OP_XCLASS:
3297       xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
3298       if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
3299       if ((*xclass_flags & XCL_MAP) == 0)
3300         {
3301         /* No bits are set for characters < 256. */
3302         if (list[1] == 0) return TRUE;
3303         /* Might be an empty repeat. */
3304         continue;
3305         }
3306       set2 = (pcre_uint8 *)(xclass_flags + 1);
3307       break;
3308 #endif
3309
3310       case OP_NOT_DIGIT:
3311       invert_bits = TRUE;
3312       /* Fall through */
3313       case OP_DIGIT:
3314       set2 = (pcre_uint8 *)(cd->cbits + cbit_digit);
3315       break;
3316
3317       case OP_NOT_WHITESPACE:
3318       invert_bits = TRUE;
3319       /* Fall through */
3320       case OP_WHITESPACE:
3321       set2 = (pcre_uint8 *)(cd->cbits + cbit_space);
3322       break;
3323
3324       case OP_NOT_WORDCHAR:
3325       invert_bits = TRUE;
3326       /* Fall through */
3327       case OP_WORDCHAR:
3328       set2 = (pcre_uint8 *)(cd->cbits + cbit_word);
3329       break;
3330
3331       default:
3332       return FALSE;
3333       }
3334
3335     /* Because the sets are unaligned, we need
3336     to perform byte comparison here. */
3337     set_end = set1 + 32;
3338     if (invert_bits)
3339       {
3340       do
3341         {
3342         if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3343         }
3344       while (set1 < set_end);
3345       }
3346     else
3347       {
3348       do
3349         {
3350         if ((*set1++ & *set2++) != 0) return FALSE;
3351         }
3352       while (set1 < set_end);
3353       }
3354
3355     if (list[1] == 0) return TRUE;
3356     /* Might be an empty repeat. */
3357     continue;
3358     }
3359
3360   /* Some property combinations also acceptable. Unicode property opcodes are
3361   processed specially; the rest can be handled with a lookup table. */
3362
3363   else
3364     {
3365     pcre_uint32 leftop, rightop;
3366
3367     leftop = base_list[0];
3368     rightop = list[0];
3369
3370 #ifdef SUPPORT_UCP
3371     accepted = FALSE; /* Always set in non-unicode case. */
3372     if (leftop == OP_PROP || leftop == OP_NOTPROP)
3373       {
3374       if (rightop == OP_EOD)
3375         accepted = TRUE;
3376       else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3377         {
3378         int n;
3379         const pcre_uint8 *p;
3380         BOOL same = leftop == rightop;
3381         BOOL lisprop = leftop == OP_PROP;
3382         BOOL risprop = rightop == OP_PROP;
3383         BOOL bothprop = lisprop && risprop;
3384
3385         /* There's a table that specifies how each combination is to be
3386         processed:
3387           0   Always return FALSE (never auto-possessify)
3388           1   Character groups are distinct (possessify if both are OP_PROP)
3389           2   Check character categories in the same group (general or particular)
3390           3   Return TRUE if the two opcodes are not the same
3391           ... see comments below
3392         */
3393
3394         n = propposstab[base_list[2]][list[2]];
3395         switch(n)
3396           {
3397           case 0: break;
3398           case 1: accepted = bothprop; break;
3399           case 2: accepted = (base_list[3] == list[3]) != same; break;
3400           case 3: accepted = !same; break;
3401
3402           case 4:  /* Left general category, right particular category */
3403           accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3404           break;
3405
3406           case 5:  /* Right general category, left particular category */
3407           accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3408           break;
3409
3410           /* This code is logically tricky. Think hard before fiddling with it.
3411           The posspropstab table has four entries per row. Each row relates to
3412           one of PCRE's special properties such as ALNUM or SPACE or WORD.
3413           Only WORD actually needs all four entries, but using repeats for the
3414           others means they can all use the same code below.
3415
3416           The first two entries in each row are Unicode general categories, and
3417           apply always, because all the characters they include are part of the
3418           PCRE character set. The third and fourth entries are a general and a
3419           particular category, respectively, that include one or more relevant
3420           characters. One or the other is used, depending on whether the check
3421           is for a general or a particular category. However, in both cases the
3422           category contains more characters than the specials that are defined
3423           for the property being tested against. Therefore, it cannot be used
3424           in a NOTPROP case.
3425
3426           Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3427           Underscore is covered by ucp_P or ucp_Po. */
3428
3429           case 6:  /* Left alphanum vs right general category */
3430           case 7:  /* Left space vs right general category */
3431           case 8:  /* Left word vs right general category */
3432           p = posspropstab[n-6];
3433           accepted = risprop && lisprop ==
3434             (list[3] != p[0] &&
3435              list[3] != p[1] &&
3436             (list[3] != p[2] || !lisprop));
3437           break;
3438
3439           case 9:   /* Right alphanum vs left general category */
3440           case 10:  /* Right space vs left general category */
3441           case 11:  /* Right word vs left general category */
3442           p = posspropstab[n-9];
3443           accepted = lisprop && risprop ==
3444             (base_list[3] != p[0] &&
3445              base_list[3] != p[1] &&
3446             (base_list[3] != p[2] || !risprop));
3447           break;
3448
3449           case 12:  /* Left alphanum vs right particular category */
3450           case 13:  /* Left space vs right particular category */
3451           case 14:  /* Left word vs right particular category */
3452           p = posspropstab[n-12];
3453           accepted = risprop && lisprop ==
3454             (catposstab[p[0]][list[3]] &&
3455              catposstab[p[1]][list[3]] &&
3456             (list[3] != p[3] || !lisprop));
3457           break;
3458
3459           case 15:  /* Right alphanum vs left particular category */
3460           case 16:  /* Right space vs left particular category */
3461           case 17:  /* Right word vs left particular category */
3462           p = posspropstab[n-15];
3463           accepted = lisprop && risprop ==
3464             (catposstab[p[0]][base_list[3]] &&
3465              catposstab[p[1]][base_list[3]] &&
3466             (base_list[3] != p[3] || !risprop));
3467           break;
3468           }
3469         }
3470       }
3471
3472     else
3473 #endif  /* SUPPORT_UCP */
3474
3475     accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3476            rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3477            autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3478
3479     if (!accepted) return FALSE;
3480
3481     if (list[1] == 0) return TRUE;
3482     /* Might be an empty repeat. */
3483     continue;
3484     }
3485
3486   /* Control reaches here only if one of the items is a small character list.
3487   All characters are checked against the other side. */
3488
3489   do
3490     {
3491     chr = *chr_ptr;
3492
3493     switch(list_ptr[0])
3494       {
3495       case OP_CHAR:
3496       ochr_ptr = list_ptr + 2;
3497       do
3498         {
3499         if (chr == *ochr_ptr) return FALSE;
3500         ochr_ptr++;
3501         }
3502       while(*ochr_ptr != NOTACHAR);
3503       break;
3504
3505       case OP_NOT:
3506       ochr_ptr = list_ptr + 2;
3507       do
3508         {
3509         if (chr == *ochr_ptr)
3510           break;
3511         ochr_ptr++;
3512         }
3513       while(*ochr_ptr != NOTACHAR);
3514       if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
3515       break;
3516
3517       /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3518       set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3519
3520       case OP_DIGIT:
3521       if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3522       break;
3523
3524       case OP_NOT_DIGIT:
3525       if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3526       break;
3527
3528       case OP_WHITESPACE:
3529       if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3530       break;
3531
3532       case OP_NOT_WHITESPACE:
3533       if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3534       break;
3535
3536       case OP_WORDCHAR:
3537       if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3538       break;
3539
3540       case OP_NOT_WORDCHAR:
3541       if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3542       break;
3543
3544       case OP_HSPACE:
3545       switch(chr)
3546         {
3547         HSPACE_CASES: return FALSE;
3548         default: break;
3549         }
3550       break;
3551
3552       case OP_NOT_HSPACE:
3553       switch(chr)
3554         {
3555         HSPACE_CASES: break;
3556         default: return FALSE;
3557         }
3558       break;
3559
3560       case OP_ANYNL:
3561       case OP_VSPACE:
3562       switch(chr)
3563         {
3564         VSPACE_CASES: return FALSE;
3565         default: break;
3566         }
3567       break;
3568
3569       case OP_NOT_VSPACE:
3570       switch(chr)
3571         {
3572         VSPACE_CASES: break;
3573         default: return FALSE;
3574         }
3575       break;
3576
3577       case OP_DOLL:
3578       case OP_EODN:
3579       switch (chr)
3580         {
3581         case CHAR_CR:
3582         case CHAR_LF:
3583         case CHAR_VT:
3584         case CHAR_FF:
3585         case CHAR_NEL:
3586 #ifndef EBCDIC
3587         case 0x2028:
3588         case 0x2029:
3589 #endif  /* Not EBCDIC */
3590         return FALSE;
3591         }
3592       break;
3593
3594       case OP_EOD:    /* Can always possessify before \z */
3595       break;
3596
3597 #ifdef SUPPORT_UCP
3598       case OP_PROP:
3599       case OP_NOTPROP:
3600       if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3601             list_ptr[0] == OP_NOTPROP))
3602         return FALSE;
3603       break;
3604 #endif
3605
3606       case OP_NCLASS:
3607       if (chr > 255) return FALSE;
3608       /* Fall through */
3609
3610       case OP_CLASS:
3611       if (chr > 255) break;
3612       class_bitset = (pcre_uint8 *)
3613         ((list_ptr == list ? code : base_end) - list_ptr[2]);
3614       if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
3615       break;
3616
3617 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3618       case OP_XCLASS:
3619       if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3620           list_ptr[2] + LINK_SIZE, utf)) return FALSE;
3621       break;
3622 #endif
3623
3624       default:
3625       return FALSE;
3626       }
3627
3628     chr_ptr++;
3629     }
3630   while(*chr_ptr != NOTACHAR);
3631
3632   /* At least one character must be matched from this opcode. */
3633
3634   if (list[1] == 0) return TRUE;
3635   }
3636
3637 /* Control never reaches here. There used to be a fail-safe return FALSE; here,
3638 but some compilers complain about an unreachable statement. */
3639
3640 }
3641
3642
3643
3644 /*************************************************
3645 *    Scan compiled regex for auto-possession     *
3646 *************************************************/
3647
3648 /* Replaces single character iterations with their possessive alternatives
3649 if appropriate. This function modifies the compiled opcode!
3650
3651 Arguments:
3652   code        points to start of the byte code
3653   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
3654   cd          static compile data
3655
3656 Returns:      nothing
3657 */
3658
3659 static void
3660 auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3661 {
3662 register pcre_uchar c;
3663 const pcre_uchar *end;
3664 pcre_uchar *repeat_opcode;
3665 pcre_uint32 list[8];
3666 int rec_limit;
3667
     /* Walk the compiled code one item at a time. The loop is left only by the
     two returns below: on OP_END, or on an opcode value that lies outside the
     opcode table. */
3668 for (;;)
3669   {
3670   c = *code;
3671
3672   /* When a pattern with bad UTF-8 encoding is compiled with NO_UTF_CHECK,
3673   it may compile without complaining, but may get into a loop here if the code
3674   pointer points to a bad value. This is, of course a documented possibility,
3675   when NO_UTF_CHECK is set, so it isn't a bug, but we can detect this case and
3676   just give up on this optimization. */
3677
3678   if (c >= OP_TABLE_LENGTH) return;
3679
       /* Case 1: a repeated single item (opcode in OP_STAR..OP_TYPEPOSUPTO). */
3680   if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3681     {
         /* Normalize c relative to OP_STAR so that one switch can serve every
         repeat family. NOTE(review): this presumes get_repeat_base() yields the
         first opcode of the family that c belongs to - confirm at its definition.
         Only normalized values up to OP_MINUPTO are examined; for anything else
         end stays NULL and the item is left untouched. */
3682     c -= get_repeat_base(c) - OP_STAR;
3683     end = (c <= OP_MINUPTO) ?
3684       get_chr_property_list(code, utf, cd->fcc, list) : NULL;
         /* list[1] is assigned after the call that fills list, so this value is
         the one compare_opcodes() sees: 1 for the greedy forms, 0 for the
         minimizing ones. */
3685     list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3686
         /* rec_limit is handed to compare_opcodes() by address; presumably it is
         a budget that bounds the work done there - confirm at its definition. */
3687     rec_limit = 1000;
3688     if (end != NULL && compare_opcodes(end, utf, cd, list, end, &rec_limit))
3689       {
           /* The switch is on the normalized c, but the adjustment is applied to
           *code, which still holds the original opcode, so the offset moves it
           within its own family. Greedy and minimizing forms both end up as the
           possessive form. */
3690       switch(c)
3691         {
3692         case OP_STAR:
3693         *code += OP_POSSTAR - OP_STAR;
3694         break;
3695
3696         case OP_MINSTAR:
3697         *code += OP_POSSTAR - OP_MINSTAR;
3698         break;
3699
3700         case OP_PLUS:
3701         *code += OP_POSPLUS - OP_PLUS;
3702         break;
3703
3704         case OP_MINPLUS:
3705         *code += OP_POSPLUS - OP_MINPLUS;
3706         break;
3707
3708         case OP_QUERY:
3709         *code += OP_POSQUERY - OP_QUERY;
3710         break;
3711
3712         case OP_MINQUERY:
3713         *code += OP_POSQUERY - OP_MINQUERY;
3714         break;
3715
3716         case OP_UPTO:
3717         *code += OP_POSUPTO - OP_UPTO;
3718         break;
3719
3720         case OP_MINUPTO:
3721         *code += OP_POSUPTO - OP_MINUPTO;
3722         break;
3723         }
3724       }
         /* Reload: c was normalized above and *code may have been rewritten. The
         skipping logic below needs the opcode that is actually stored. */
3725     c = *code;
3726     }
       /* Case 2: a character class, whose repeat (if any) is a separate opcode
       placed after the class data. */
3727   else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3728     {
         /* Locate the opcode that follows the class. For OP_XCLASS the offset is
         read from the item itself with GET(code, 1); otherwise it is one code
         unit for the opcode plus 32 bytes' worth of code units. */
3729 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3730     if (c == OP_XCLASS)
3731       repeat_opcode = code + GET(code, 1);
3732     else
3733 #endif
3734       repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3735
3736     c = *repeat_opcode;
3737     if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
3738       {
3739       /* end must not be NULL. */
3740       end = get_chr_property_list(code, utf, cd->fcc, list);
3741
           /* 1 when the repeat opcode value is even. NOTE(review): presumably the
           greedy class repeats are the even-valued ones - confirm against the
           opcode definitions. */
3742       list[1] = (c & 1) == 0;
3743
3744       rec_limit = 1000;
3745       if (compare_opcodes(end, utf, cd, list, end, &rec_limit))
3746         {
             /* Here the repeat opcode is overwritten outright rather than
             adjusted by an offset. */
3747         switch (c)
3748           {
3749           case OP_CRSTAR:
3750           case OP_CRMINSTAR:
3751           *repeat_opcode = OP_CRPOSSTAR;
3752           break;
3753
3754           case OP_CRPLUS:
3755           case OP_CRMINPLUS:
3756           *repeat_opcode = OP_CRPOSPLUS;
3757           break;
3758
3759           case OP_CRQUERY:
3760           case OP_CRMINQUERY:
3761           *repeat_opcode = OP_CRPOSQUERY;
3762           break;
3763
3764           case OP_CRRANGE:
3765           case OP_CRMINRANGE:
3766           *repeat_opcode = OP_CRPOSRANGE;
3767           break;
3768           }
3769         }
3770       }
         /* c was reused for the repeat opcode; restore the class opcode so that
         the class item itself is what gets skipped below. */
3771     c = *code;
3772     }
3773
       /* Advance to the next item. This switch adds only the part of the length
       that the fixed-length table does not cover; the table value is added
       afterwards. */
3774   switch(c)
3775     {
3776     case OP_END:
3777     return;
3778
3779     case OP_TYPESTAR:
3780     case OP_TYPEMINSTAR:
3781     case OP_TYPEPLUS:
3782     case OP_TYPEMINPLUS:
3783     case OP_TYPEQUERY:
3784     case OP_TYPEMINQUERY:
3785     case OP_TYPEPOSSTAR:
3786     case OP_TYPEPOSPLUS:
3787     case OP_TYPEPOSQUERY:
         /* A property type item is followed by two further code units. */
3788     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
3789     break;
3790
3791     case OP_TYPEUPTO:
3792     case OP_TYPEMINUPTO:
3793     case OP_TYPEEXACT:
3794     case OP_TYPEPOSUPTO:
         /* As above, but the type follows a count that occupies IMM2_SIZE units. */
3795     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
3796       code += 2;
3797     break;
3798
3799 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3800     case OP_XCLASS:
3801     code += GET(code, 1);
3802     break;
3803 #endif
3804
3805     case OP_MARK:
3806     case OP_PRUNE_ARG:
3807     case OP_SKIP_ARG:
3808     case OP_THEN_ARG:
         /* code[1] is used as a length here; presumably it is the length of the
         argument string that follows - confirm against the code that emits
         these items. */
3809     code += code[1];
3810     break;
3811     }
3812
3813   /* Add in the fixed length from the table */
3814
3815   code += PRIV(OP_lengths)[c];
3816
3817   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3818   a multi-byte character. The length in the table is a minimum, so we have to
3819   arrange to skip the extra bytes. */
3820
       /* code has already been advanced by the table length, so code[-1] is the
       last code unit counted by that length. */
3821 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3822   if (utf) switch(c)
3823     {
3824     case OP_CHAR:
3825     case OP_CHARI:
3826     case OP_NOT:
3827     case OP_NOTI:
3828     case OP_STAR:
3829     case OP_MINSTAR:
3830     case OP_PLUS:
3831     case OP_MINPLUS:
3832     case OP_QUERY:
3833     case OP_MINQUERY:
3834     case OP_UPTO:
3835     case OP_MINUPTO:
3836     case OP_EXACT:
3837     case OP_POSSTAR:
3838     case OP_POSPLUS:
3839     case OP_POSQUERY:
3840     case OP_POSUPTO:
3841     case OP_STARI:
3842     case OP_MINSTARI:
3843     case OP_PLUSI:
3844     case OP_MINPLUSI:
3845     case OP_QUERYI:
3846     case OP_MINQUERYI:
3847     case OP_UPTOI:
3848     case OP_MINUPTOI:
3849     case OP_EXACTI:
3850     case OP_POSSTARI:
3851     case OP_POSPLUSI:
3852     case OP_POSQUERYI:
3853     case OP_POSUPTOI:
3854     case OP_NOTSTAR:
3855     case OP_NOTMINSTAR:
3856     case OP_NOTPLUS:
3857     case OP_NOTMINPLUS:
3858     case OP_NOTQUERY:
3859     case OP_NOTMINQUERY:
3860     case OP_NOTUPTO:
3861     case OP_NOTMINUPTO:
3862     case OP_NOTEXACT:
3863     case OP_NOTPOSSTAR:
3864     case OP_NOTPOSPLUS:
3865     case OP_NOTPOSQUERY:
3866     case OP_NOTPOSUPTO:
3867     case OP_NOTSTARI:
3868     case OP_NOTMINSTARI:
3869     case OP_NOTPLUSI:
3870     case OP_NOTMINPLUSI:
3871     case OP_NOTQUERYI:
3872     case OP_NOTMINQUERYI:
3873     case OP_NOTUPTOI:
3874     case OP_NOTMINUPTOI:
3875     case OP_NOTEXACTI:
3876     case OP_NOTPOSSTARI:
3877     case OP_NOTPOSPLUSI:
3878     case OP_NOTPOSQUERYI:
3879     case OP_NOTPOSUPTOI:
3880     if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3881     break;
3882     }
3883 #else
3884   (void)(utf);  /* Keep compiler happy by referencing function argument */
3885 #endif
3886   }
3887 }
3888
3889
3890
3891 /*************************************************
3892 *           Check for POSIX class syntax         *
3893 *************************************************/
3894
3895 /* This function is called when the sequence "[:" or "[." or "[=" is
3896 encountered in a character class. It checks whether this is followed by a
3897 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3898 reach an unescaped ']' without the special preceding character, return FALSE.
3899
3900 Originally, this function only recognized a sequence of letters between the
3901 terminators, but it seems that Perl recognizes any sequence of characters,
3902 though of course unknown POSIX names are subsequently rejected. Perl gives an
3903 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3904 didn't consider this to be a POSIX class. Likewise for [:1234:].
3905
3906 The problem in trying to be exactly like Perl is in the handling of escapes. We
3907 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3908 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3909 below handles the special cases \\ and \], but does not try to do any other
3910 escape processing. This makes it different from Perl for cases such as
3911 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
3912 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
3913 when Perl does, I think.
3914
3915 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3916 It seems that the appearance of a nested POSIX class supersedes an apparent
3917 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3918 a digit.
3919
3920 In Perl, unescaped square brackets may also appear as part of class names. For
3921 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3922 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3923 seem right at all. PCRE does not allow closing square brackets in POSIX class
3924 names.
3925
3926 Arguments:
3927   ptr      pointer to the initial [
3928   endptr   where to return the end pointer
3929
3930 Returns:   TRUE or FALSE
3931 */
3932
3933 static BOOL
3934 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3935 {
3936 pcre_uchar terminator;          /* Don't combine these lines; the Solaris cc */
3937 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
3938 for (++ptr; *ptr != CHAR_NULL; ptr++)
3939   {
3940   if (*ptr == CHAR_BACKSLASH &&
3941       (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET ||
3942        ptr[1] == CHAR_BACKSLASH))
3943     ptr++;
3944   else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
3945             *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3946   else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3947     {
3948     *endptr = ptr;
3949     return TRUE;
3950     }
3951   }
3952 return FALSE;
3953 }
3954
3955
3956
3957
3958 /*************************************************
3959 *          Check POSIX class name                *
3960 *************************************************/
3961
3962 /* This function is called to check the name given in a POSIX-style class entry
3963 such as [:alnum:].
3964
3965 Arguments:
3966   ptr        points to the first letter
3967   len        the length of the name
3968
3969 Returns:     a value representing the name, or -1 if unknown
3970 */
3971
3972 static int
3973 check_posix_name(const pcre_uchar *ptr, int len)
3974 {
3975 const char *pn = posix_names;
3976 register int yield = 0;
3977 while (posix_name_lengths[yield] != 0)
3978   {
3979   if (len == posix_name_lengths[yield] &&
3980     STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3981   pn += posix_name_lengths[yield] + 1;
3982   yield++;
3983   }
3984 return -1;
3985 }
3986
3987
3988 /*************************************************
3989 *    Adjust OP_RECURSE items in repeated group   *
3990 *************************************************/
3991
3992 /* OP_RECURSE items contain an offset from the start of the regex to the group
3993 that is referenced. This means that groups can be replicated for fixed
3994 repetition simply by copying (because the recursion is allowed to refer to
3995 earlier groups that are outside the current group). However, when a group is
3996 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3997 inserted before it, after it has been compiled. This means that any OP_RECURSE
3998 items within it that refer to the group itself or any contained groups have to
3999 have their offsets adjusted. That is one of the jobs of this function. Before it
4000 is called, the partially compiled regex must be temporarily terminated with
4001 OP_END.
4002
4003 This function has been extended to cope with forward references for recursions
4004 and subroutine calls. It must check the list of such references for the
4005 group we are dealing with. If it finds that one of the recursions in the
4006 current group is on this list, it does not adjust the value in the reference
4007 (which is a group number). After the group has been scanned, all the offsets in
4008 the forward reference list for the group are adjusted.
4009
4010 Arguments:
4011   group      points to the start of the group
4012   adjust     the amount by which the group is to be moved
4013   utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode
4014   cd         contains pointers to tables etc.
4015   save_hwm_offset   the hwm forward reference offset at the start of the group
4016
4017 Returns:     nothing
4018 */
4019
4020 static void
4021 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
4022   size_t save_hwm_offset)
4023 {
4024 int offset;
4025 pcre_uchar *hc;
4026 pcre_uchar *ptr = group;
4027
4028 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
4029   {
4030   for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4031        hc += LINK_SIZE)
4032     {
4033     offset = (int)GET(hc, 0);
4034     if (cd->start_code + offset == ptr + 1) break;
4035     }
4036
4037   /* If we have not found this recursion on the forward reference list, adjust
4038   the recursion's offset if it's after the start of this group. */
4039
4040   if (hc >= cd->hwm)
4041     {
4042     offset = (int)GET(ptr, 1);
4043     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
4044     }
4045
4046   ptr += 1 + LINK_SIZE;
4047   }
4048
4049 /* Now adjust all forward reference offsets for the group. */
4050
4051 for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4052      hc += LINK_SIZE)
4053   {
4054   offset = (int)GET(hc, 0);
4055   PUT(hc, 0, offset + adjust);
4056   }
4057 }
4058
4059
4060
4061 /*************************************************
4062 *        Insert an automatic callout point       *
4063 *************************************************/
4064
4065 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
4066 callout points before each pattern item.
4067
4068 Arguments:
4069   code           current code pointer
4070   ptr            current pattern pointer
4071   cd             pointers to tables etc
4072
4073 Returns:         new code pointer
4074 */
4075
4076 static pcre_uchar *
4077 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
4078 {
4079 *code++ = OP_CALLOUT;
4080 *code++ = 255;
4081 PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
4082 PUT(code, LINK_SIZE, 0);                       /* Default length */
4083 return code + 2 * LINK_SIZE;
4084 }
4085
4086
4087
4088 /*************************************************
4089 *         Complete a callout item                *
4090 *************************************************/
4091
4092 /* A callout item contains the length of the next item in the pattern, which
4093 we can't fill in till after we have reached the relevant point. This is used
4094 for both automatic and manual callouts.
4095
4096 Arguments:
4097   previous_callout   points to previous callout item
4098   ptr                current pattern pointer
4099   cd                 pointers to tables etc
4100
4101 Returns:             nothing
4102 */
4103
4104 static void
4105 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
4106 {
4107 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
4108 PUT(previous_callout, 2 + LINK_SIZE, length);
4109 }
4110
4111
4112
4113 #ifdef SUPPORT_UCP
4114 /*************************************************
4115 *           Get othercase range                  *
4116 *************************************************/
4117
4118 /* This function is passed the start and end of a class range, in UTF-8 mode
4119 with UCP support. It searches up the characters, looking for ranges of
4120 characters in the "other" case. Each call returns the next one, updating the
4121 start address. A character with multiple other cases is returned on its own
4122 with a special return value.
4123
4124 Arguments:
4125   cptr        points to starting character value; updated
4126   d           end value
4127   ocptr       where to put start of othercase range
4128   odptr       where to put end of othercase range
4129
4130 Yield:        -1 when no more
4131                0 when a range is returned
4132               >0 the CASESET offset for char with multiple other cases
4133                 in this case, ocptr contains the original
4134 */
4135
4136 static int
4137 get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
4138   pcre_uint32 *odptr)
4139 {
4140 pcre_uint32 c, othercase, next;
4141 unsigned int co;
4142
4143 /* Find the first character that has an other case. If it has multiple other
4144 cases, return its case offset value. */
4145
4146 for (c = *cptr; c <= d; c++)
4147   {
4148   if ((co = UCD_CASESET(c)) != 0)
4149     {
4150     *ocptr = c++;   /* Character that has the set */
4151     *cptr = c;      /* Rest of input range */
4152     return (int)co;
4153     }
4154   if ((othercase = UCD_OTHERCASE(c)) != c) break;
4155   }
4156
4157 if (c > d) return -1;  /* Reached end of range */
4158
4159 /* Found a character that has a single other case. Search for the end of the
4160 range, which is either the end of the input range, or a character that has zero
4161 or more than one other cases. */
4162
4163 *ocptr = othercase;
4164 next = othercase + 1;
4165
4166 for (++c; c <= d; c++)
4167   {
4168   if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
4169   next++;
4170   }
4171
4172 *odptr = next - 1;     /* End of othercase range */
4173 *cptr = c;             /* Rest of input range */
4174 return 0;
4175 }
4176 #endif  /* SUPPORT_UCP */
4177
4178
4179
4180 /*************************************************
4181 *        Add a character or range to a class     *
4182 *************************************************/
4183
4184 /* This function packages up the logic of adding a character or range of
4185 characters to a class. The character values in the arguments will be within the
4186 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4187 mutually recursive with the function immediately below.
4188
4189 Arguments:
4190   classbits     the bit map for characters < 256
4191   uchardptr     points to the pointer for extra data
4192   options       the options word
4193   cd            contains pointers to tables etc.
4194   start         start of range character
4195   end           end of range character
4196
4197 Returns:        the number of < 256 characters added
4198                 the pointer to extra data is updated
4199 */
4200
4201 static int
4202 add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4203   compile_data *cd, pcre_uint32 start, pcre_uint32 end)
4204 {
4205 pcre_uint32 c;
     /* Highest value of the range that goes into the bitmap: end capped at 0xff. */
4206 pcre_uint32 classbits_end = (end <= 0xff ? end : 0xff);
     /* Running result. It is incremented once per bit-setting operation, without
     checking whether that bit was already set, and it also accumulates the
     results of the nested calls below. */
4207 int n8 = 0;
4208
4209 /* If caseless matching is required, scan the range and process alternate
4210 cases. In Unicode, there are 8-bit characters that have alternate cases that
4211 are greater than 255 and vice-versa. Sometimes we can just extend the original
4212 range. */
4213
4214 if ((options & PCRE_CASELESS) != 0)
4215   {
4216 #ifdef SUPPORT_UCP
4217   if ((options & PCRE_UTF8) != 0)
4218     {
4219     int rc;
4220     pcre_uint32 oc, od;
4221
4222     options &= ~PCRE_CASELESS;   /* Remove for recursive calls */
4223     c = start;
4224
         /* Each call hands back the next other-case item and advances c; a
         negative result ends the scan. Note that start and end may be widened
         inside the loop, and the widened end is what later calls receive. */
4225     while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
4226       {
4227       /* Handle a single character that has more than one other case. */
4228
4229       if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
4230         PRIV(ucd_caseless_sets) + rc, oc);
4231
4232       /* Do nothing if the other case range is within the original range. */
4233
4234       else if (oc >= start && od <= end) continue;
4235
4236       /* Extend the original range if there is overlap, noting that if oc < c, we
4237       can't have od > end because a subrange is always shorter than the basic
4238       range. Otherwise, use a recursive call to add the additional range. */
4239
4240       else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4241       else if (od > end && oc <= end + 1)
4242         {
4243         end = od;       /* Extend upwards */
4244         if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
4245         }
4246       else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
4247       }
4248     }
       /* This else takes the for loop below as its body. When SUPPORT_UCP is not
       defined, the if/else is compiled out and the loop runs for every caseless
       range. */
4249   else
4250 #endif  /* SUPPORT_UCP */
4251
4252   /* Not UTF-mode, or no UCP */
4253
       /* Set the bit that cd->fcc maps each bitmap-range character to.
       NOTE(review): fcc is presumably the case-flipping table - confirm where
       compile_data is set up. */
4254   for (c = start; c <= classbits_end; c++)
4255     {
4256     SETBIT(classbits, cd->fcc[c]);
4257     n8++;
4258     }
4259   }
4260
4261 /* Now handle the original range. Adjust the final value according to the bit
4262 length - this means that the same lists of (e.g.) horizontal spaces can be used
4263 in all cases. */
4264
     /* When UTF support is compiled in, the cap is applied only if the UTF option
     bit is clear; otherwise it is applied unconditionally. Nothing is capped in
     the 32-bit library. */
4265 #if defined COMPILE_PCRE8
4266 #ifdef SUPPORT_UTF
4267   if ((options & PCRE_UTF8) == 0)
4268 #endif
4269   if (end > 0xff) end = 0xff;
4270
4271 #elif defined COMPILE_PCRE16
4272 #ifdef SUPPORT_UTF
4273   if ((options & PCRE_UTF16) == 0)
4274 #endif
4275   if (end > 0xffff) end = 0xffff;
4276
4277 #endif /* COMPILE_PCRE[8|16] */
4278
4279 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
4280
4281 for (c = start; c <= classbits_end; c++)
4282   {
4283   /* Regardless of start, c will always be <= 255. */
4284   SETBIT(classbits, c);
4285   n8++;
4286   }
4287
4288 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
     /* Everything up to 0xff has been dealt with by the bitmap loop above, so the
     extra data only needs to describe the part of the range from 0x100 upwards. */
4289 if (start <= 0xff) start = 0xff + 1;
4290
4291 if (end >= start)
4292   {
4293   pcre_uchar *uchardata = *uchardptr;
4294 #ifdef SUPPORT_UTF
4295   if ((options & PCRE_UTF8) != 0)  /* All UTFs use the same flag bit */
4296     {
         /* A range is written as XCL_RANGE followed by both end points, a single
         character as XCL_SINGLE followed by the character. The pointer is
         advanced by whatever PRIV(ord2utf) returns for each value written. */
4297     if (start < end)
4298       {
4299       *uchardata++ = XCL_RANGE;
4300       uchardata += PRIV(ord2utf)(start, uchardata);
4301       uchardata += PRIV(ord2utf)(end, uchardata);
4302       }
4303     else if (start == end)
4304       {
4305       *uchardata++ = XCL_SINGLE;
4306       uchardata += PRIV(ord2utf)(start, uchardata);
4307       }
4308     }
4309   else
4310 #endif  /* SUPPORT_UTF */
4311
4312   /* Without UTF support, character values are constrained by the bit length,
4313   and can only be > 256 for 16-bit and 32-bit libraries. */
4314
       /* In the 8-bit library the empty block below exists only to give the else
       above a body; in the other libraries the values are stored directly, one
       code unit each. */
4315 #ifdef COMPILE_PCRE8
4316     {}
4317 #else
4318   if (start < end)
4319     {
4320     *uchardata++ = XCL_RANGE;
4321     *uchardata++ = start;
4322     *uchardata++ = end;
4323     }
4324   else if (start == end)
4325     {
4326     *uchardata++ = XCL_SINGLE;
4327     *uchardata++ = start;
4328     }
4329 #endif
4330
4331   *uchardptr = uchardata;   /* Update extra data pointer */
4332   }
4333 #endif /* SUPPORT_UTF || !COMPILE_PCRE8 */
4334
4335 return n8;    /* Number of 8-bit characters */
4336 }
4337
4338
4339
4340
4341 /*************************************************
4342 *        Add a list of characters to a class     *
4343 *************************************************/
4344
4345 /* This function is used for adding a list of case-equivalent characters to a
4346 class, and also for adding a list of horizontal or vertical whitespace. If the
4347 list is in order (which it should be), ranges of characters are detected and
4348 handled appropriately. This function is mutually recursive with the function
4349 above.
4350
4351 Arguments:
4352   classbits     the bit map for characters < 256
4353   uchardptr     points to the pointer for extra data
4354   options       the options word
4355   cd            contains pointers to tables etc.
4356   p             points to row of 32-bit values, terminated by NOTACHAR
4357   except        character to omit; this is used when adding lists of
4358                   case-equivalent characters to avoid including the one we
4359                   already know about
4360
4361 Returns:        the number of < 256 characters added
4362                 the pointer to extra data is updated
4363 */
4364
4365 static int
4366 add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4367   compile_data *cd, const pcre_uint32 *p, unsigned int except)
4368 {
4369 int n8 = 0;
4370 while (p[0] < NOTACHAR)
4371   {
4372   int n = 0;
4373   if (p[0] != except)
4374     {
4375     while(p[n+1] == p[0] + n + 1) n++;
4376     n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
4377     }
4378   p += n + 1;
4379   }
4380 return n8;
4381 }
4382
4383
4384
4385 /*************************************************
4386 *    Add characters not in a list to a class     *
4387 *************************************************/
4388
4389 /* This function is used for adding the complement of a list of horizontal or
4390 vertical whitespace to a class. The list must be in order.
4391
4392 Arguments:
4393   classbits     the bit map for characters < 256
4394   uchardptr     points to the pointer for extra data
4395   options       the options word
4396   cd            contains pointers to tables etc.
4397   p             points to row of 32-bit values, terminated by NOTACHAR
4398
4399 Returns:        the number of < 256 characters added
4400                 the pointer to extra data is updated
4401 */
4402
4403 static int
4404 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
4405   int options, compile_data *cd, const pcre_uint32 *p)
4406 {
4407 BOOL utf = (options & PCRE_UTF8) != 0;
4408 int n8 = 0;
4409 if (p[0] > 0)
4410   n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
4411 while (p[0] < NOTACHAR)
4412   {
4413   while (p[1] == p[0] + 1) p++;
4414   n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
4415     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
4416   p++;
4417   }
4418 return n8;
4419 }
4420
4421
4422
4423 /*************************************************
4424 *           Compile one branch                   *
4425 *************************************************/
4426
4427 /* Scan the pattern, compiling it into a vector. If the options are
4428 changed during the branch, the pointer is used to change the external options
4429 bits. This function is used during the pre-compile phase when we are trying
4430 to find out the amount of memory needed, as well as during the real compile
4431 phase. The value of lengthptr distinguishes the two phases.
4432
4433 Arguments:
4434   optionsptr        pointer to the option bits
4435   codeptr           points to the pointer to the current code point
4436   ptrptr            points to the current pattern pointer
4437   errorcodeptr      points to error code variable
4438   firstcharptr      place to put the first required character
4439   firstcharflagsptr place to put the first character flags, or a negative number
4440   reqcharptr        place to put the last required character
4441   reqcharflagsptr   place to put the last required character flags, or a negative number
4442   bcptr             points to current branch chain
4443   cond_depth        conditional nesting depth
4444   cd                contains pointers to tables etc.
4445   lengthptr         NULL during the real compile phase
4446                     points to length accumulator during pre-compile phase
4447
4448 Returns:            TRUE on success
4449                     FALSE, with *errorcodeptr set non-zero on error
4450 */
4451
4452 static BOOL
4453 compile_branch(int *optionsptr, pcre_uchar **codeptr,
4454   const pcre_uchar **ptrptr, int *errorcodeptr,
4455   pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
4456   pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
4457   branch_chain *bcptr, int cond_depth,
4458   compile_data *cd, int *lengthptr)
4459 {
4460 int repeat_type, op_type;
4461 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
4462 int bravalue = 0;
4463 int greedy_default, greedy_non_default;
4464 pcre_uint32 firstchar, reqchar;
4465 pcre_int32 firstcharflags, reqcharflags;
4466 pcre_uint32 zeroreqchar, zerofirstchar;
4467 pcre_int32 zeroreqcharflags, zerofirstcharflags;
4468 pcre_int32 req_caseopt, reqvary, tempreqvary;
4469 int options = *optionsptr;               /* May change dynamically */
4470 int after_manual_callout = 0;
4471 int length_prevgroup = 0;
4472 register pcre_uint32 c;
4473 int escape;
4474 register pcre_uchar *code = *codeptr;
4475 pcre_uchar *last_code = code;
4476 pcre_uchar *orig_code = code;
4477 pcre_uchar *tempcode;
4478 BOOL inescq = FALSE;
4479 BOOL groupsetfirstchar = FALSE;
4480 const pcre_uchar *ptr = *ptrptr;
4481 const pcre_uchar *tempptr;
4482 const pcre_uchar *nestptr = NULL;
4483 pcre_uchar *previous = NULL;
4484 pcre_uchar *previous_callout = NULL;
4485 size_t item_hwm_offset = 0;
4486 pcre_uint8 classbits[32];
4487
4488 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
4489 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4490 dynamically as we process the pattern. */
4491
4492 #ifdef SUPPORT_UTF
4493 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4494 BOOL utf = (options & PCRE_UTF8) != 0;
4495 #ifndef COMPILE_PCRE32
4496 pcre_uchar utf_chars[6];
4497 #endif
4498 #else
4499 BOOL utf = FALSE;
4500 #endif
4501
4502 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4503 class_uchardata always so that it can be passed to add_to_class() always,
4504 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4505 alternative calls for the different cases. */
4506
4507 pcre_uchar *class_uchardata;
4508 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4509 BOOL xclass;
4510 pcre_uchar *class_uchardata_base;
4511 #endif
4512
4513 #ifdef PCRE_DEBUG
4514 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4515 #endif
4516
4517 /* Set up the default and non-default settings for greediness */
4518
4519 greedy_default = ((options & PCRE_UNGREEDY) != 0);
4520 greedy_non_default = greedy_default ^ 1;
4521
4522 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
4523 matching encountered yet". It gets changed to REQ_NONE if we hit something that
4524 matches a non-fixed char first char; reqchar just remains unset if we never
4525 find one.
4526
4527 When we hit a repeat whose minimum is zero, we may have to adjust these values
4528 to take the zero repeat into account. This is implemented by setting them to
zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
4530 item types that can be repeated set these backoff variables appropriately. */
4531
4532 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4533 firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4534
4535 /* The variable req_caseopt contains either the REQ_CASELESS value
4536 or zero, according to the current setting of the caseless flag. The
4537 REQ_CASELESS leaves the lower 28 bit empty. It is added into the
4538 firstchar or reqchar variables to record the case status of the
4539 value. This is used only for ASCII characters. */
4540
4541 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
4542
4543 /* Switch on next character until the end of the branch */
4544
4545 for (;; ptr++)
4546   {
4547   BOOL negate_class;
4548   BOOL should_flip_negation;
4549   BOOL possessive_quantifier;
4550   BOOL is_quantifier;
4551   BOOL is_recurse;
4552   BOOL reset_bracount;
4553   int class_has_8bitchar;
4554   int class_one_char;
4555 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4556   BOOL xclass_has_prop;
4557 #endif
4558   int newoptions;
4559   int recno;
4560   int refsign;
4561   int skipbytes;
4562   pcre_uint32 subreqchar, subfirstchar;
4563   pcre_int32 subreqcharflags, subfirstcharflags;
4564   int terminator;
4565   unsigned int mclength;
4566   unsigned int tempbracount;
4567   pcre_uint32 ec;
4568   pcre_uchar mcbuffer[8];
4569
4570   /* Get next character in the pattern */
4571
4572   c = *ptr;
4573
4574   /* If we are at the end of a nested substitution, revert to the outer level
4575   string. Nesting only happens one level deep. */
4576
4577   if (c == CHAR_NULL && nestptr != NULL)
4578     {
4579     ptr = nestptr;
4580     nestptr = NULL;
4581     c = *ptr;
4582     }
4583
4584   /* If we are in the pre-compile phase, accumulate the length used for the
4585   previous cycle of this loop. */
4586
4587   if (lengthptr != NULL)
4588     {
4589 #ifdef PCRE_DEBUG
4590     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
4591 #endif
4592     if (code > cd->start_workspace + cd->workspace_size -
4593         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
4594       {
4595       *errorcodeptr = (code >= cd->start_workspace + cd->workspace_size)?
4596         ERR52 : ERR87;
4597       goto FAILED;
4598       }
4599
4600     /* There is at least one situation where code goes backwards: this is the
4601     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4602     the class is simply eliminated. However, it is created first, so we have to
4603     allow memory for it. Therefore, don't ever reduce the length at this point.
4604     */
4605
4606     if (code < last_code) code = last_code;
4607
4608     /* Paranoid check for integer overflow */
4609
4610     if (OFLOW_MAX - *lengthptr < code - last_code)
4611       {
4612       *errorcodeptr = ERR20;
4613       goto FAILED;
4614       }
4615
4616     *lengthptr += (int)(code - last_code);
4617     DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4618       (int)(code - last_code), c, c));
4619
4620     /* If "previous" is set and it is not at the start of the work space, move
4621     it back to there, in order to avoid filling up the work space. Otherwise,
4622     if "previous" is NULL, reset the current code pointer to the start. */
4623
4624     if (previous != NULL)
4625       {
4626       if (previous > orig_code)
4627         {
4628         memmove(orig_code, previous, IN_UCHARS(code - previous));
4629         code -= previous - orig_code;
4630         previous = orig_code;
4631         }
4632       }
4633     else code = orig_code;
4634
4635     /* Remember where this code item starts so we can pick up the length
4636     next time round. */
4637
4638     last_code = code;
4639     }
4640
4641   /* In the real compile phase, just check the workspace used by the forward
4642   reference list. */
4643
4644   else if (cd->hwm > cd->start_workspace + cd->workspace_size)
4645     {
4646     *errorcodeptr = ERR52;
4647     goto FAILED;
4648     }
4649
4650   /* If in \Q...\E, check for the end; if not, we have a literal */
4651
4652   if (inescq && c != CHAR_NULL)
4653     {
4654     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4655       {
4656       inescq = FALSE;
4657       ptr++;
4658       continue;
4659       }
4660     else
4661       {
4662       if (previous_callout != NULL)
4663         {
4664         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
4665           complete_callout(previous_callout, ptr, cd);
4666         previous_callout = NULL;
4667         }
4668       if ((options & PCRE_AUTO_CALLOUT) != 0)
4669         {
4670         previous_callout = code;
4671         code = auto_callout(code, ptr, cd);
4672         }
4673       goto NORMAL_CHAR;
4674       }
4675     /* Control does not reach here. */
4676     }
4677
4678   /* In extended mode, skip white space and comments. We need a loop in order
4679   to check for more white space and more comments after a comment. */
4680
4681   if ((options & PCRE_EXTENDED) != 0)
4682     {
4683     for (;;)
4684       {
4685       while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
4686       if (c != CHAR_NUMBER_SIGN) break;
4687       ptr++;
4688       while (*ptr != CHAR_NULL)
4689         {
4690         if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
4691           {                          /* IS_NEWLINE sets cd->nllen. */
4692           ptr += cd->nllen;
4693           break;
4694           }
4695         ptr++;
4696 #ifdef SUPPORT_UTF
4697         if (utf) FORWARDCHAR(ptr);
4698 #endif
4699         }
4700       c = *ptr;     /* Either NULL or the char after a newline */
4701       }
4702     }
4703
4704   /* See if the next thing is a quantifier. */
4705
4706   is_quantifier =
4707     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4708     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4709
4710   /* Fill in length of a previous callout, except when the next thing is a
4711   quantifier or when processing a property substitution string in UCP mode. */
4712
4713   if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4714        after_manual_callout-- <= 0)
4715     {
4716     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
4717       complete_callout(previous_callout, ptr, cd);
4718     previous_callout = NULL;
4719     }
4720
4721   /* Create auto callout, except for quantifiers, or while processing property
4722   strings that are substituted for \w etc in UCP mode. */
4723
4724   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4725     {
4726     previous_callout = code;
4727     code = auto_callout(code, ptr, cd);
4728     }
4729
4730   /* Process the next pattern item. */
4731
4732   switch(c)
4733     {
4734     /* ===================================================================*/
4735     case CHAR_NULL:                /* The branch terminates at string end */
4736     case CHAR_VERTICAL_LINE:       /* or | or ) */
4737     case CHAR_RIGHT_PARENTHESIS:
4738     *firstcharptr = firstchar;
4739     *firstcharflagsptr = firstcharflags;
4740     *reqcharptr = reqchar;
4741     *reqcharflagsptr = reqcharflags;
4742     *codeptr = code;
4743     *ptrptr = ptr;
4744     if (lengthptr != NULL)
4745       {
4746       if (OFLOW_MAX - *lengthptr < code - last_code)
4747         {
4748         *errorcodeptr = ERR20;
4749         goto FAILED;
4750         }
4751       *lengthptr += (int)(code - last_code);   /* To include callout length */
4752       DPRINTF((">> end branch\n"));
4753       }
4754     return TRUE;
4755
4756
4757     /* ===================================================================*/
4758     /* Handle single-character metacharacters. In multiline mode, ^ disables
4759     the setting of any following char as a first character. */
4760
4761     case CHAR_CIRCUMFLEX_ACCENT:
4762     previous = NULL;
4763     if ((options & PCRE_MULTILINE) != 0)
4764       {
4765       if (firstcharflags == REQ_UNSET)
4766         zerofirstcharflags = firstcharflags = REQ_NONE;
4767       *code++ = OP_CIRCM;
4768       }
4769     else *code++ = OP_CIRC;
4770     break;
4771
4772     case CHAR_DOLLAR_SIGN:
4773     previous = NULL;
4774     *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4775     break;
4776
4777     /* There can never be a first char if '.' is first, whatever happens about
4778     repeats. The value of reqchar doesn't change either. */
4779
4780     case CHAR_DOT:
4781     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4782     zerofirstchar = firstchar;
4783     zerofirstcharflags = firstcharflags;
4784     zeroreqchar = reqchar;
4785     zeroreqcharflags = reqcharflags;
4786     previous = code;
4787     item_hwm_offset = cd->hwm - cd->start_workspace;
4788     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4789     break;
4790
4791
4792     /* ===================================================================*/
4793     /* Character classes. If the included characters are all < 256, we build a
4794     32-byte bitmap of the permitted characters, except in the special case
4795     where there is only one such character. For negated classes, we build the
4796     map as usual, then invert it at the end. However, we use a different opcode
4797     so that data characters > 255 can be handled correctly.
4798
4799     If the class contains characters outside the 0-255 range, a different
4800     opcode is compiled. It may optionally have a bit map for characters < 256,
    but those above are explicitly listed afterwards. A flag byte tells
4802     whether the bitmap is present, and whether this is a negated class or not.
4803
4804     In JavaScript compatibility mode, an isolated ']' causes an error. In
4805     default (Perl) mode, it is treated as a data character. */
4806
4807     case CHAR_RIGHT_SQUARE_BRACKET:
4808     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4809       {
4810       *errorcodeptr = ERR64;
4811       goto FAILED;
4812       }
4813     goto NORMAL_CHAR;
4814
4815     /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
4816     used for "start of word" and "end of word". As these are otherwise illegal
4817     sequences, we don't break anything by recognizing them. They are replaced
4818     by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
4819     erroneous and are handled by the normal code below. */
4820
4821     case CHAR_LEFT_SQUARE_BRACKET:
4822     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
4823       {
4824       nestptr = ptr + 7;
4825       ptr = sub_start_of_word - 1;
4826       continue;
4827       }
4828
4829     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
4830       {
4831       nestptr = ptr + 7;
4832       ptr = sub_end_of_word - 1;
4833       continue;
4834       }
4835
4836     /* Handle a real character class. */
4837
4838     previous = code;
4839     item_hwm_offset = cd->hwm - cd->start_workspace;
4840
4841     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4842     they are encountered at the top level, so we'll do that too. */
4843
4844     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4845          ptr[1] == CHAR_EQUALS_SIGN) &&
4846         check_posix_syntax(ptr, &tempptr))
4847       {
4848       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4849       goto FAILED;
4850       }
4851
4852     /* If the first character is '^', set the negation flag and skip it. Also,
4853     if the first few characters (either before or after ^) are \Q\E or \E we
4854     skip them too. This makes for compatibility with Perl. */
4855
4856     negate_class = FALSE;
4857     for (;;)
4858       {
4859       c = *(++ptr);
4860       if (c == CHAR_BACKSLASH)
4861         {
4862         if (ptr[1] == CHAR_E)
4863           ptr++;
4864         else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4865           ptr += 3;
4866         else
4867           break;
4868         }
4869       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4870         negate_class = TRUE;
4871       else break;
4872       }
4873
4874     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4875     an initial ']' is taken as a data character -- the code below handles
4876     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4877     [^] must match any character, so generate OP_ALLANY. */
4878
4879     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4880         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4881       {
4882       *code++ = negate_class? OP_ALLANY : OP_FAIL;
4883       if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4884       zerofirstchar = firstchar;
4885       zerofirstcharflags = firstcharflags;
4886       break;
4887       }
4888
4889     /* If a class contains a negative special such as \S, we need to flip the
4890     negation flag at the end, so that support for characters > 255 works
4891     correctly (they are all included in the class). */
4892
4893     should_flip_negation = FALSE;
4894
4895     /* Extended class (xclass) will be used when characters > 255
4896     might match. */
4897
4898 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4899     xclass = FALSE;
4900     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
4901     class_uchardata_base = class_uchardata;   /* Save the start */
4902 #endif
4903
4904     /* For optimization purposes, we track some properties of the class:
4905     class_has_8bitchar will be non-zero if the class contains at least one <
4906     256 character; class_one_char will be 1 if the class contains just one
4907     character; xclass_has_prop will be TRUE if unicode property checks
4908     are present in the class. */
4909
4910     class_has_8bitchar = 0;
4911     class_one_char = 0;
4912 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4913     xclass_has_prop = FALSE;
4914 #endif
4915
4916     /* Initialize the 32-char bit map to all zeros. We build the map in a
4917     temporary bit of memory, in case the class contains fewer than two
4918     8-bit characters because in that case the compiled code doesn't use the bit
4919     map. */
4920
4921     memset(classbits, 0, 32 * sizeof(pcre_uint8));
4922
4923     /* Process characters until ] is reached. By writing this as a "do" it
4924     means that an initial ] is taken as a data character. At the start of the
4925     loop, c contains the first byte of the character. */
4926
4927     if (c != CHAR_NULL) do
4928       {
4929       const pcre_uchar *oldptr;
4930
4931 #ifdef SUPPORT_UTF
4932       if (utf && HAS_EXTRALEN(c))
4933         {                           /* Braces are required because the */
4934         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
4935         }
4936 #endif
4937
4938 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4939       /* In the pre-compile phase, accumulate the length of any extra
4940       data and reset the pointer. This is so that very large classes that
4941       contain a zillion > 255 characters no longer overwrite the work space
4942       (which is on the stack). We have to remember that there was XCLASS data,
4943       however. */
4944
4945       if (class_uchardata > class_uchardata_base) xclass = TRUE;
4946
4947       if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4948         {
4949         *lengthptr += (int)(class_uchardata - class_uchardata_base);
4950         class_uchardata = class_uchardata_base;
4951         }
4952 #endif
4953
4954       /* Inside \Q...\E everything is literal except \E */
4955
4956       if (inescq)
4957         {
4958         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
4959           {
4960           inescq = FALSE;                   /* Reset literal state */
4961           ptr++;                            /* Skip the 'E' */
4962           continue;                         /* Carry on with next */
4963           }
4964         goto CHECK_RANGE;                   /* Could be range if \E follows */
4965         }
4966
4967       /* Handle POSIX class names. Perl allows a negation extension of the
4968       form [:^name:]. A square bracket that doesn't match the syntax is
4969       treated as a literal. We also recognize the POSIX constructions
4970       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4971       5.6 and 5.8 do. */
4972
4973       if (c == CHAR_LEFT_SQUARE_BRACKET &&
4974           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4975            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
4976         {
4977         BOOL local_negate = FALSE;
4978         int posix_class, taboffset, tabopt;
4979         register const pcre_uint8 *cbits = cd->cbits;
4980         pcre_uint8 pbits[32];
4981
4982         if (ptr[1] != CHAR_COLON)
4983           {
4984           *errorcodeptr = ERR31;
4985           goto FAILED;
4986           }
4987
4988         ptr += 2;
4989         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
4990           {
4991           local_negate = TRUE;
4992           should_flip_negation = TRUE;  /* Note negative special */
4993           ptr++;
4994           }
4995
4996         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4997         if (posix_class < 0)
4998           {
4999           *errorcodeptr = ERR30;
5000           goto FAILED;
5001           }
5002
5003         /* If matching is caseless, upper and lower are converted to
5004         alpha. This relies on the fact that the class table starts with
5005         alpha, lower, upper as the first 3 entries. */
5006
5007         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
5008           posix_class = 0;
5009
5010         /* When PCRE_UCP is set, some of the POSIX classes are converted to
5011         different escape sequences that use Unicode properties \p or \P. Others
5012         that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
5013         directly. */
5014
5015 #ifdef SUPPORT_UCP
5016         if ((options & PCRE_UCP) != 0)
5017           {
5018           unsigned int ptype = 0;
5019           int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
5020
5021           /* The posix_substitutes table specifies which POSIX classes can be
5022           converted to \p or \P items. */
5023
5024           if (posix_substitutes[pc] != NULL)
5025             {
5026             nestptr = tempptr + 1;
5027             ptr = posix_substitutes[pc] - 1;
5028             continue;
5029             }
5030
5031           /* There are three other classes that generate special property calls
5032           that are recognized only in an XCLASS. */
5033
5034           else switch(posix_class)
5035             {
5036             case PC_GRAPH:
5037             ptype = PT_PXGRAPH;
5038             /* Fall through */
5039             case PC_PRINT:
5040             if (ptype == 0) ptype = PT_PXPRINT;
5041             /* Fall through */
5042             case PC_PUNCT:
5043             if (ptype == 0) ptype = PT_PXPUNCT;
5044             *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5045             *class_uchardata++ = ptype;
5046             *class_uchardata++ = 0;
5047             xclass_has_prop = TRUE;
5048             ptr = tempptr + 1;
5049             continue;
5050
5051             /* For the other POSIX classes (ascii, xdigit) we are going to fall
5052             through to the non-UCP case and build a bit map for characters with
5053             code points less than 256. If we are in a negated POSIX class
5054             within a non-negated overall class, characters with code points
5055             greater than 255 must all match. In the special case where we have
5056             not yet generated any xclass data, and this is the final item in
5057             the overall class, we need do nothing: later on, the opcode
5058             OP_NCLASS will be used to indicate that characters greater than 255
5059             are acceptable. If we have already seen an xclass item or one may
5060             follow (we have to assume that it might if this is not the end of
5061             the class), explicitly match all wide codepoints. */
5062
5063             default:
5064             if (!negate_class && local_negate &&
5065                 (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET))
5066               {
5067               *class_uchardata++ = XCL_RANGE;
5068               class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5069               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
5070               }
5071             break;
5072             }
5073           }
5074 #endif
5075         /* In the non-UCP case, or when UCP makes no difference, we build the
5076         bit map for the POSIX class in a chunk of local store because we may be
5077         adding and subtracting from it, and we don't want to subtract bits that
5078         may be in the main map already. At the end we or the result into the
5079         bit map that is being built. */
5080
5081         posix_class *= 3;
5082
5083         /* Copy in the first table (always present) */
5084
5085         memcpy(pbits, cbits + posix_class_maps[posix_class],
5086           32 * sizeof(pcre_uint8));
5087
5088         /* If there is a second table, add or remove it as required. */
5089
5090         taboffset = posix_class_maps[posix_class + 1];
5091         tabopt = posix_class_maps[posix_class + 2];
5092
5093         if (taboffset >= 0)
5094           {
5095           if (tabopt >= 0)
5096             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
5097           else
5098             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
5099           }
5100
5101         /* Now see if we need to remove any special characters. An option
5102         value of 1 removes vertical space and 2 removes underscore. */
5103
5104         if (tabopt < 0) tabopt = -tabopt;
5105         if (tabopt == 1) pbits[1] &= ~0x3c;
5106           else if (tabopt == 2) pbits[11] &= 0x7f;
5107
5108         /* Add the POSIX table or its complement into the main table that is
5109         being built and we are done. */
5110
5111         if (local_negate)
5112           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
5113         else
5114           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
5115
5116         ptr = tempptr + 1;
5117         /* Every class contains at least one < 256 character. */
5118         class_has_8bitchar = 1;
5119         /* Every class contains at least two characters. */
5120         class_one_char = 2;
5121         continue;    /* End of POSIX syntax handling */
5122         }
5123
5124       /* Backslash may introduce a single character, or it may introduce one
5125       of the specials, which just set a flag. The sequence \b is a special
5126       case. Inside a class (and only there) it is treated as backspace. We
5127       assume that other escapes have more than one character in them, so
5128       speculatively set both class_has_8bitchar and class_one_char bigger
5129       than one. Unrecognized escapes fall through and are either treated
5130       as literal characters (by default), or are faulted if
5131       PCRE_EXTRA is set. */
5132
5133       if (c == CHAR_BACKSLASH)
5134         {
5135         escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
5136           TRUE);
5137         if (*errorcodeptr != 0) goto FAILED;
5138         if (escape == 0) c = ec;
5139         else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
5140         else if (escape == ESC_N)          /* \N is not supported in a class */
5141           {
5142           *errorcodeptr = ERR71;
5143           goto FAILED;
5144           }
5145         else if (escape == ESC_Q)            /* Handle start of quoted string */
5146           {
5147           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5148             {
5149             ptr += 2; /* avoid empty string */
5150             }
5151           else inescq = TRUE;
5152           continue;
5153           }
5154         else if (escape == ESC_E) continue;  /* Ignore orphan \E */
5155
5156         else
5157           {
5158           register const pcre_uint8 *cbits = cd->cbits;
5159           /* Every class contains at least two < 256 characters. */
5160           class_has_8bitchar++;
5161           /* Every class contains at least two characters. */
5162           class_one_char += 2;
5163
5164           switch (escape)
5165             {
5166 #ifdef SUPPORT_UCP
5167             case ESC_du:     /* These are the values given for \d etc */
5168             case ESC_DU:     /* when PCRE_UCP is set. We replace the */
5169             case ESC_wu:     /* escape sequence with an appropriate \p */
5170             case ESC_WU:     /* or \P to test Unicode properties instead */
5171             case ESC_su:     /* of the default ASCII testing. */
5172             case ESC_SU:
5173             nestptr = ptr;
5174             ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
5175             class_has_8bitchar--;                /* Undo! */
5176             continue;
5177 #endif
5178             case ESC_d:
5179             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
5180             continue;
5181
5182             case ESC_D:
5183             should_flip_negation = TRUE;
5184             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
5185             continue;
5186
5187             case ESC_w:
5188             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
5189             continue;
5190
5191             case ESC_W:
5192             should_flip_negation = TRUE;
5193             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
5194             continue;
5195
5196             /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5197             5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5198             previously set by something earlier in the character class.
5199             Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5200             we could just adjust the appropriate bit. From PCRE 8.34 we no
5201             longer treat \s and \S specially. */
5202
5203             case ESC_s:
5204             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
5205             continue;
5206
5207             case ESC_S:
5208             should_flip_negation = TRUE;
5209             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
5210             continue;
5211
5212             /* The rest apply in both UCP and non-UCP cases. */
5213
5214             case ESC_h:
5215             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5216               PRIV(hspace_list), NOTACHAR);
5217             continue;
5218
5219             case ESC_H:
5220             (void)add_not_list_to_class(classbits, &class_uchardata, options,
5221               cd, PRIV(hspace_list));
5222             continue;
5223
5224             case ESC_v:
5225             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5226               PRIV(vspace_list), NOTACHAR);
5227             continue;
5228
5229             case ESC_V:
5230             (void)add_not_list_to_class(classbits, &class_uchardata, options,
5231               cd, PRIV(vspace_list));
5232             continue;
5233
5234             case ESC_p:
5235             case ESC_P:
5236 #ifdef SUPPORT_UCP
5237               {
5238               BOOL negated;
5239               unsigned int ptype = 0, pdata = 0;
5240               if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
5241                 goto FAILED;
5242               *class_uchardata++ = ((escape == ESC_p) != negated)?
5243                 XCL_PROP : XCL_NOTPROP;
5244               *class_uchardata++ = ptype;
5245               *class_uchardata++ = pdata;
5246               xclass_has_prop = TRUE;
5247               class_has_8bitchar--;                /* Undo! */
5248               continue;
5249               }
5250 #else
5251             *errorcodeptr = ERR45;
5252             goto FAILED;
5253 #endif
5254             /* Unrecognized escapes are faulted if PCRE is running in its
5255             strict mode. By default, for compatibility with Perl, they are
5256             treated as literals. */
5257
5258             default:
5259             if ((options & PCRE_EXTRA) != 0)
5260               {
5261               *errorcodeptr = ERR7;
5262               goto FAILED;
5263               }
5264             class_has_8bitchar--;    /* Undo the speculative increase. */
5265             class_one_char -= 2;     /* Undo the speculative increase. */
5266             c = *ptr;                /* Get the final character and fall through */
5267             break;
5268             }
5269           }
5270
5271         /* Fall through if the escape just defined a single character (c >= 0).
5272         This may be greater than 256. */
5273
5274         escape = 0;
5275
5276         }   /* End of backslash handling */
5277
5278       /* A character may be followed by '-' to form a range. However, Perl does
5279       not permit ']' to be the end of the range. A '-' character at the end is
5280       treated as a literal. Perl ignores orphaned \E sequences entirely. The
5281       code for handling \Q and \E is messy. */
5282
5283       CHECK_RANGE:
5284       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5285         {
5286         inescq = FALSE;
5287         ptr += 2;
5288         }
5289       oldptr = ptr;
5290
5291       /* Remember if \r or \n were explicitly used */
5292
5293       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5294
5295       /* Check for range */
5296
5297       if (!inescq && ptr[1] == CHAR_MINUS)
5298         {
5299         pcre_uint32 d;
5300         ptr += 2;
5301         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
5302
5303         /* If we hit \Q (not followed by \E) at this point, go into escaped
5304         mode. */
5305
5306         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
5307           {
5308           ptr += 2;
5309           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
5310             { ptr += 2; continue; }
5311           inescq = TRUE;
5312           break;
5313           }
5314
5315         /* Minus (hyphen) at the end of a class is treated as a literal, so put
5316         back the pointer and jump to handle the character that preceded it. */
5317
5318         if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
5319           {
5320           ptr = oldptr;
5321           goto CLASS_SINGLE_CHARACTER;
5322           }
5323
5324         /* Otherwise, we have a potential range; pick up the next character */
5325
5326 #ifdef SUPPORT_UTF
5327         if (utf)
5328           {                           /* Braces are required because the */
5329           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
5330           }
5331         else
5332 #endif
5333         d = *ptr;  /* Not UTF-8 mode */
5334
5335         /* The second part of a range can be a single-character escape
5336         sequence, but not any of the other escapes. Perl treats a hyphen as a
5337         literal in such circumstances. However, in Perl's warning mode, a
5338         warning is given, so PCRE now faults it as it is almost certainly a
5339         mistake on the user's part. */
5340
5341         if (!inescq)
5342           {
5343           if (d == CHAR_BACKSLASH)
5344             {
5345             int descape;
5346             descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5347             if (*errorcodeptr != 0) goto FAILED;
5348
5349             /* 0 means a character was put into d; \b is backspace; any other
5350             special causes an error. */
5351
5352             if (descape != 0)
5353               {
5354               if (descape == ESC_b) d = CHAR_BS; else
5355                 {
5356                 *errorcodeptr = ERR83;
5357                 goto FAILED;
5358                 }
5359               }
5360             }
5361
5362           /* A hyphen followed by a POSIX class is treated in the same way. */
5363
5364           else if (d == CHAR_LEFT_SQUARE_BRACKET &&
5365                    (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5366                     ptr[1] == CHAR_EQUALS_SIGN) &&
5367                    check_posix_syntax(ptr, &tempptr))
5368             {
5369             *errorcodeptr = ERR83;
5370             goto FAILED;
5371             }
5372           }
5373
5374         /* Check that the two values are in the correct order. Optimize
5375         one-character ranges. */
5376
5377         if (d < c)
5378           {
5379           *errorcodeptr = ERR8;
5380           goto FAILED;
5381           }
5382         if (d == c) goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
5383
5384         /* We have found a character range, so single character optimizations
5385         cannot be done anymore. Any value greater than 1 indicates that there
5386         is more than one character. */
5387
5388         class_one_char = 2;
5389
5390         /* Remember an explicit \r or \n, and add the range to the class. */
5391
5392         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5393
5394         class_has_8bitchar +=
5395           add_to_class(classbits, &class_uchardata, options, cd, c, d);
5396
5397         continue;   /* Go get the next char in the class */
5398         }
5399
5400       /* Handle a single character - we can get here for a normal non-escape
5401       char, or after \ that introduces a single character or for an apparent
5402       range that isn't. Only the value 1 matters for class_one_char, so don't
5403       increase it if it is already 2 or more ... just in case there's a class
5404       with a zillion characters in it. */
5405
5406       CLASS_SINGLE_CHARACTER:
5407       if (class_one_char < 2) class_one_char++;
5408
5409       /* If xclass_has_prop is false and class_one_char is 1, we have the first
5410       single character in the class, and there have been no prior ranges, or
5411       XCLASS items generated by escapes. If this is the final character in the
5412       class, we can optimize by turning the item into a 1-character OP_CHAR[I]
5413       if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
5414       can cause firstchar to be set. Otherwise, there can be no first char if
5415       this item is first, whatever repeat count may follow. In the case of
5416       reqchar, save the previous value for reinstating. */
5417
5418       if (!inescq &&
5419 #ifdef SUPPORT_UCP
5420           !xclass_has_prop &&
5421 #endif
5422           class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
5423         {
5424         ptr++;
5425         zeroreqchar = reqchar;
5426         zeroreqcharflags = reqcharflags;
5427
5428         if (negate_class)
5429           {
5430 #ifdef SUPPORT_UCP
5431           int d;
5432 #endif
5433           if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5434           zerofirstchar = firstchar;
5435           zerofirstcharflags = firstcharflags;
5436
5437           /* For caseless UTF-8 mode when UCP support is available, check
5438           whether this character has more than one other case. If so, generate
5439           a special OP_NOTPROP item instead of OP_NOTI. */
5440
5441 #ifdef SUPPORT_UCP
5442           if (utf && (options & PCRE_CASELESS) != 0 &&
5443               (d = UCD_CASESET(c)) != 0)
5444             {
5445             *code++ = OP_NOTPROP;
5446             *code++ = PT_CLIST;
5447             *code++ = d;
5448             }
5449           else
5450 #endif
5451           /* Char has only one other case, or UCP not available */
5452
5453             {
5454             *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
5455 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5456             if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5457               code += PRIV(ord2utf)(c, code);
5458             else
5459 #endif
5460               *code++ = c;
5461             }
5462
5463           /* We are finished with this character class */
5464
5465           goto END_CLASS;
5466           }
5467
5468         /* For a single, positive character, get the value into mcbuffer, and
5469         then we can handle this with the normal one-character code. */
5470
5471 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5472         if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5473           mclength = PRIV(ord2utf)(c, mcbuffer);
5474         else
5475 #endif
5476           {
5477           mcbuffer[0] = c;
5478           mclength = 1;
5479           }
5480         goto ONE_CHAR;
5481         }       /* End of 1-char optimization */
5482
5483       /* There is more than one character in the class, or an XCLASS item
5484       has been generated. Add this character to the class. */
5485
5486       class_has_8bitchar +=
5487         add_to_class(classbits, &class_uchardata, options, cd, c, c);
5488       }
5489
5490     /* Loop until ']' reached. This "while" is the end of the "do" far above.
5491     If we are at the end of an internal nested string, revert to the outer
5492     string. */
5493
5494     while (((c = *(++ptr)) != CHAR_NULL ||
5495            (nestptr != NULL &&
5496              (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
5497            (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
5498
5499     /* Check for missing terminating ']' */
5500
5501     if (c == CHAR_NULL)
5502       {
5503       *errorcodeptr = ERR6;
5504       goto FAILED;
5505       }
5506
5507     /* We will need an XCLASS if data has been placed in class_uchardata. In
5508     the second phase this is a sufficient test. However, in the pre-compile
5509     phase, class_uchardata gets emptied to prevent workspace overflow, so
5510     only if the very last character in the class needs XCLASS will it contain
5511     anything at this point. For this reason, xclass gets set TRUE above when
5512     class_uchardata is emptied, and that's why this code is the way it is here
5513     instead of just doing a test on class_uchardata below. */
5514
5515 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5516     if (class_uchardata > class_uchardata_base) xclass = TRUE;
5517 #endif
5518
5519     /* If this is the first thing in the branch, there can be no first char
5520     setting, whatever the repeat count. Any reqchar setting must remain
5521     unchanged after any kind of repeat. */
5522
5523     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5524     zerofirstchar = firstchar;
5525     zerofirstcharflags = firstcharflags;
5526     zeroreqchar = reqchar;
5527     zeroreqcharflags = reqcharflags;
5528
5529     /* If there are characters with values > 255, we have to compile an
5530     extended class, with its own opcode, unless there was a negated special
5531     such as \S in the class, and PCRE_UCP is not set, because in that case all
5532     characters > 255 are in the class, so any that were explicitly given as
5533     well can be ignored. If (when there are explicit characters > 255 that must
5534     be listed) there are no characters < 256, we can omit the bitmap in the
5535     actual compiled code. */
5536
5537 #ifdef SUPPORT_UTF
5538     if (xclass && (xclass_has_prop || !should_flip_negation ||
5539         (options & PCRE_UCP) != 0))
5540 #elif !defined COMPILE_PCRE8
5541     if (xclass && (xclass_has_prop || !should_flip_negation))
5542 #endif
5543 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5544       {
5545       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
5546       *code++ = OP_XCLASS;
5547       code += LINK_SIZE;
5548       *code = negate_class? XCL_NOT:0;
5549       if (xclass_has_prop) *code |= XCL_HASPROP;
5550
5551       /* If the map is required, move up the extra data to make room for it;
5552       otherwise just move the code pointer to the end of the extra data. */
5553
5554       if (class_has_8bitchar > 0)
5555         {
5556         *code++ |= XCL_MAP;
5557         memmove(code + (32 / sizeof(pcre_uchar)), code,
5558           IN_UCHARS(class_uchardata - code));
5559         if (negate_class && !xclass_has_prop)
5560           for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5561         memcpy(code, classbits, 32);
5562         code = class_uchardata + (32 / sizeof(pcre_uchar));
5563         }
5564       else code = class_uchardata;
5565
5566       /* Now fill in the complete length of the item */
5567
5568       PUT(previous, 1, (int)(code - previous));
5569       break;   /* End of class handling */
5570       }
5571
5572     /* Even though any XCLASS list is now discarded, we must allow for
5573     its memory. */
5574
5575     if (lengthptr != NULL)
5576       *lengthptr += (int)(class_uchardata - class_uchardata_base);
5577 #endif
5578
5579     /* If there are no characters > 255, or they are all to be included or
5580     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5581     whole class was negated and whether there were negative specials such as \S
5582     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5583     negating it if necessary. */
5584
5585     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5586     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
5587       {
5588       if (negate_class)
5589         for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5590       memcpy(code, classbits, 32);
5591       }
5592     code += 32 / sizeof(pcre_uchar);
5593
5594     END_CLASS:
5595     break;
5596
5597
5598     /* ===================================================================*/
5599     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5600     has been tested above. */
5601
5602     case CHAR_LEFT_CURLY_BRACKET:
5603     if (!is_quantifier) goto NORMAL_CHAR;
5604     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5605     if (*errorcodeptr != 0) goto FAILED;
5606     goto REPEAT;
5607
5608     case CHAR_ASTERISK:
5609     repeat_min = 0;
5610     repeat_max = -1;
5611     goto REPEAT;
5612
5613     case CHAR_PLUS:
5614     repeat_min = 1;
5615     repeat_max = -1;
5616     goto REPEAT;
5617
5618     case CHAR_QUESTION_MARK:
5619     repeat_min = 0;
5620     repeat_max = 1;
5621
5622     REPEAT:
5623     if (previous == NULL)
5624       {
5625       *errorcodeptr = ERR9;
5626       goto FAILED;
5627       }
5628
5629     if (repeat_min == 0)
5630       {
5631       firstchar = zerofirstchar;    /* Adjust for zero repeat */
5632       firstcharflags = zerofirstcharflags;
5633       reqchar = zeroreqchar;        /* Ditto */
5634       reqcharflags = zeroreqcharflags;
5635       }
5636
5637     /* Remember whether this is a variable length repeat */
5638
5639     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5640
5641     op_type = 0;                    /* Default single-char op codes */
5642     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
5643
5644     /* Save start of previous item, in case we have to move it up in order to
5645     insert something before it. */
5646
5647     tempcode = previous;
5648
5649     /* Before checking for a possessive quantifier, we must skip over
5650     whitespace and comments in extended mode because Perl allows white space at
5651     this point. */
5652
5653     if ((options & PCRE_EXTENDED) != 0)
5654       {
5655       const pcre_uchar *p = ptr + 1;
5656       for (;;)
5657         {
5658         while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++;
5659         if (*p != CHAR_NUMBER_SIGN) break;
5660         p++;
5661         while (*p != CHAR_NULL)
5662           {
5663           if (IS_NEWLINE(p))         /* For non-fixed-length newline cases, */
5664             {                        /* IS_NEWLINE sets cd->nllen. */
5665             p += cd->nllen;
5666             break;
5667             }
5668           p++;
5669 #ifdef SUPPORT_UTF
5670           if (utf) FORWARDCHAR(p);
5671 #endif
5672           }           /* Loop for comment characters */
5673         }             /* Loop for multiple comments */
5674       ptr = p - 1;    /* Character before the next significant one. */
5675       }
5676
5677     /* If the next character is '+', we have a possessive quantifier. This
5678     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5679     If the next character is '?' this is a minimizing repeat, by default,
5680     but if PCRE_UNGREEDY is set, it works the other way round. We change the
5681     repeat type to the non-default. */
5682
5683     if (ptr[1] == CHAR_PLUS)
5684       {
5685       repeat_type = 0;                  /* Force greedy */
5686       possessive_quantifier = TRUE;
5687       ptr++;
5688       }
5689     else if (ptr[1] == CHAR_QUESTION_MARK)
5690       {
5691       repeat_type = greedy_non_default;
5692       ptr++;
5693       }
5694     else repeat_type = greedy_default;
5695
5696     /* If previous was a recursion call, wrap it in atomic brackets so that
5697     previous becomes the atomic group. All recursions were so wrapped in the
5698     past, but it no longer happens for non-repeated recursions. In fact, the
5699     repeated ones could be re-implemented independently so as not to need this,
5700     but for the moment we rely on the code for repeating groups. */
5701
5702     if (*previous == OP_RECURSE)
5703       {
5704       memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
5705       *previous = OP_ONCE;
5706       PUT(previous, 1, 2 + 2*LINK_SIZE);
5707       previous[2 + 2*LINK_SIZE] = OP_KET;
5708       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5709       code += 2 + 2 * LINK_SIZE;
5710       length_prevgroup = 3 + 3*LINK_SIZE;
5711
5712       /* When actually compiling, we need to check whether this was a forward
5713       reference, and if so, adjust the offset. */
5714
5715       if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
5716         {
5717         int offset = GET(cd->hwm, -LINK_SIZE);
5718         if (offset == previous + 1 - cd->start_code)
5719           PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
5720         }
5721       }
5722
5723     /* Now handle repetition for the different types of item. */
5724
5725     /* If previous was a character or negated character match, abolish the item
5726     and generate a repeat item instead. If a char item has a minimum of more
5727     than one, ensure that it is set in reqchar - it might not be if a sequence
5728     such as x{3} is the first thing in a branch because the x will have gone
5729     into firstchar instead.  */
5730
5731     if (*previous == OP_CHAR || *previous == OP_CHARI
5732         || *previous == OP_NOT || *previous == OP_NOTI)
5733       {
5734       switch (*previous)
5735         {
5736         default: /* Make compiler happy. */
5737         case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
5738         case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5739         case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
5740         case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
5741         }
5742
5743       /* Deal with UTF characters that take up more than one character. It's
5744       easier to write this out separately than try to macrify it. Use c to
5745       hold the length of the character in bytes, plus UTF_LENGTH to flag that
5746       it's a length rather than a small character. */
5747
5748 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5749       if (utf && NOT_FIRSTCHAR(code[-1]))
5750         {
5751         pcre_uchar *lastchar = code - 1;
5752         BACKCHAR(lastchar);
5753         c = (int)(code - lastchar);     /* Length of UTF-8 character */
5754         memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
5755         c |= UTF_LENGTH;                /* Flag c as a length */
5756         }
5757       else
5758 #endif /* SUPPORT_UTF */
5759
5760       /* Handle the case of a single character - either with no UTF support, or
5761       with UTF disabled, or for a single character UTF character. */
5762         {
5763         c = code[-1];
5764         if (*previous <= OP_CHARI && repeat_min > 1)
5765           {
5766           reqchar = c;
5767           reqcharflags = req_caseopt | cd->req_varyopt;
5768           }
5769         }
5770
5771       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
5772       }
5773
5774     /* If previous was a character type match (\d or similar), abolish it and
5775     create a suitable repeat item. The code is shared with single-character
5776     repeats by setting op_type to add a suitable offset into repeat_type. Note
5777     that the Unicode property types will be present only when SUPPORT_UCP is
5778     defined, but we don't wrap the little bits of code here because it just
5779     makes it horribly messy. */
5780
5781     else if (*previous < OP_EODN)
5782       {
5783       pcre_uchar *oldcode;
5784       int prop_type, prop_value;
5785       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
5786       c = *previous;
5787
5788       OUTPUT_SINGLE_REPEAT:
5789       if (*previous == OP_PROP || *previous == OP_NOTPROP)
5790         {
5791         prop_type = previous[1];
5792         prop_value = previous[2];
5793         }
5794       else prop_type = prop_value = -1;
5795
5796       oldcode = code;
5797       code = previous;                  /* Usually overwrite previous item */
5798
5799       /* If the maximum is zero then the minimum must also be zero; Perl allows
5800       this case, so we do too - by simply omitting the item altogether. */
5801
5802       if (repeat_max == 0) goto END_REPEAT;
5803
5804       /* Combine the op_type with the repeat_type */
5805
5806       repeat_type += op_type;
5807
5808       /* A minimum of zero is handled either as the special case * or ?, or as
5809       an UPTO, with the maximum given. */
5810
5811       if (repeat_min == 0)
5812         {
5813         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5814           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5815         else
5816           {
5817           *code++ = OP_UPTO + repeat_type;
5818           PUT2INC(code, 0, repeat_max);
5819           }
5820         }
5821
5822       /* A repeat minimum of 1 is optimized into some special cases. If the
5823       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5824       left in place and, if the maximum is greater than 1, we use OP_UPTO with
5825       one less than the maximum. */
5826
5827       else if (repeat_min == 1)
5828         {
5829         if (repeat_max == -1)
5830           *code++ = OP_PLUS + repeat_type;
5831         else
5832           {
5833           code = oldcode;                 /* leave previous item in place */
5834           if (repeat_max == 1) goto END_REPEAT;
5835           *code++ = OP_UPTO + repeat_type;
5836           PUT2INC(code, 0, repeat_max - 1);
5837           }
5838         }
5839
5840       /* The case {n,n} is just an EXACT, while the general case {n,m} is
5841       handled as an EXACT followed by an UPTO. */
5842
5843       else
5844         {
5845         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
5846         PUT2INC(code, 0, repeat_min);
5847
5848         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
5849         we have to insert the character for the previous code. For a repeated
5850         Unicode property match, there are two extra bytes that define the
5851         required property. In UTF-8 mode, long characters have their length in
5852         c, with the UTF_LENGTH bit as a flag. */
5853
5854         if (repeat_max < 0)
5855           {
5856 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5857           if (utf && (c & UTF_LENGTH) != 0)
5858             {
5859             memcpy(code, utf_chars, IN_UCHARS(c & 7));
5860             code += c & 7;
5861             }
5862           else
5863 #endif
5864             {
5865             *code++ = c;
5866             if (prop_type >= 0)
5867               {
5868               *code++ = prop_type;
5869               *code++ = prop_value;
5870               }
5871             }
5872           *code++ = OP_STAR + repeat_type;
5873           }
5874
5875         /* Else insert an UPTO if the max is greater than the min, again
5876         preceded by the character, for the previously inserted code. If the
5877         UPTO is just for 1 instance, we can use QUERY instead. */
5878
5879         else if (repeat_max != repeat_min)
5880           {
5881 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5882           if (utf && (c & UTF_LENGTH) != 0)
5883             {
5884             memcpy(code, utf_chars, IN_UCHARS(c & 7));
5885             code += c & 7;
5886             }
5887           else
5888 #endif
5889           *code++ = c;
5890           if (prop_type >= 0)
5891             {
5892             *code++ = prop_type;
5893             *code++ = prop_value;
5894             }
5895           repeat_max -= repeat_min;
5896
5897           if (repeat_max == 1)
5898             {
5899             *code++ = OP_QUERY + repeat_type;
5900             }
5901           else
5902             {
5903             *code++ = OP_UPTO + repeat_type;
5904             PUT2INC(code, 0, repeat_max);
5905             }
5906           }
5907         }
5908
5909       /* The character or character type itself comes last in all cases. */
5910
5911 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5912       if (utf && (c & UTF_LENGTH) != 0)
5913         {
5914         memcpy(code, utf_chars, IN_UCHARS(c & 7));
5915         code += c & 7;
5916         }
5917       else
5918 #endif
5919       *code++ = c;
5920
5921       /* For a repeated Unicode property match, there are two extra bytes that
5922       define the required property. */
5923
5924 #ifdef SUPPORT_UCP
5925       if (prop_type >= 0)
5926         {
5927         *code++ = prop_type;
5928         *code++ = prop_value;
5929         }
5930 #endif
5931       }
5932
5933     /* If previous was a character class or a back reference, we put the repeat
5934     stuff after it, but just skip the item if the repeat was {0,0}. */
5935
5936     else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
5937 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5938              *previous == OP_XCLASS ||
5939 #endif
5940              *previous == OP_REF   || *previous == OP_REFI ||
5941              *previous == OP_DNREF || *previous == OP_DNREFI)
5942       {
5943       if (repeat_max == 0)
5944         {
5945         code = previous;
5946         goto END_REPEAT;
5947         }
5948
5949       if (repeat_min == 0 && repeat_max == -1)
5950         *code++ = OP_CRSTAR + repeat_type;
5951       else if (repeat_min == 1 && repeat_max == -1)
5952         *code++ = OP_CRPLUS + repeat_type;
5953       else if (repeat_min == 0 && repeat_max == 1)
5954         *code++ = OP_CRQUERY + repeat_type;
5955       else
5956         {
5957         *code++ = OP_CRRANGE + repeat_type;
5958         PUT2INC(code, 0, repeat_min);
5959         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
5960         PUT2INC(code, 0, repeat_max);
5961         }
5962       }
5963
5964     /* If previous was a bracket group, we may have to replicate it in certain
5965     cases. Note that at this point we can encounter only the "basic" bracket
5966     opcodes such as BRA and CBRA, as this is the place where they get converted
5967     into the more special varieties such as BRAPOS and SBRA. A test for >=
5968     OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5969     ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
5970     Originally, PCRE did not allow repetition of assertions, but now it does,
5971     for Perl compatibility. */
5972
5973     else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5974       {
5975       register int i;
5976       int len = (int)(code - previous);
5977       size_t base_hwm_offset = item_hwm_offset;
5978       pcre_uchar *bralink = NULL;
5979       pcre_uchar *brazeroptr = NULL;
5980
5981       /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5982       we just ignore the repeat. */
5983
5984       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5985         goto END_REPEAT;
5986
5987       /* There is no sense in actually repeating assertions. The only potential
5988       use of repetition is in cases when the assertion is optional. Therefore,
5989       if the minimum is greater than zero, just ignore the repeat. If the
5990       maximum is not zero or one, set it to 1. */
5991
5992       if (*previous < OP_ONCE)    /* Assertion */
5993         {
5994         if (repeat_min > 0) goto END_REPEAT;
5995         if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5996         }
5997
5998       /* The case of a zero minimum is special because of the need to stick
5999       OP_BRAZERO in front of it, and because the group appears once in the
6000       data, whereas in other cases it appears the minimum number of times. For
6001       this reason, it is simplest to treat this case separately, as otherwise
6002       the code gets far too messy. There are several special subcases when the
6003       minimum is zero. */
6004
6005       if (repeat_min == 0)
6006         {
6007         /* If the maximum is also zero, we used to just omit the group from the
6008         output altogether, like this:
6009
6010         ** if (repeat_max == 0)
6011         **   {
6012         **   code = previous;
6013         **   goto END_REPEAT;
6014         **   }
6015
6016         However, that fails when a group or a subgroup within it is referenced
6017         as a subroutine from elsewhere in the pattern, so now we stick in
6018         OP_SKIPZERO in front of it so that it is skipped on execution. As we
6019         don't have a list of which groups are referenced, we cannot do this
6020         selectively.
6021
6022         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
6023         and do no more at this point. However, we do need to adjust any
6024         OP_RECURSE calls inside the group that refer to the group itself or any
6025         internal or forward referenced group, because the offset is from the
6026         start of the whole regex. Temporarily terminate the pattern while doing
6027         this. */
6028
6029         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
6030           {
6031           *code = OP_END;
6032           adjust_recurse(previous, 1, utf, cd, item_hwm_offset);
6033           memmove(previous + 1, previous, IN_UCHARS(len));
6034           code++;
6035           if (repeat_max == 0)
6036             {
6037             *previous++ = OP_SKIPZERO;
6038             goto END_REPEAT;
6039             }
6040           brazeroptr = previous;    /* Save for possessive optimizing */
6041           *previous++ = OP_BRAZERO + repeat_type;
6042           }
6043
6044         /* If the maximum is greater than 1 and limited, we have to replicate
6045         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
6046         The first one has to be handled carefully because it's the original
6047         copy, which has to be moved up. The remainder can be handled by code
6048         that is common with the non-zero minimum case below. We have to
6049         adjust the value of repeat_max, since one less copy is required. Once
6050         again, we may have to adjust any OP_RECURSE calls inside the group. */
6051
6052         else
6053           {
6054           int offset;
6055           *code = OP_END;
6056           adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, item_hwm_offset);
6057           memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
6058           code += 2 + LINK_SIZE;
6059           *previous++ = OP_BRAZERO + repeat_type;
6060           *previous++ = OP_BRA;
6061
6062           /* We chain together the bracket offset fields that have to be
6063           filled in later when the ends of the brackets are reached. */
6064
6065           offset = (bralink == NULL)? 0 : (int)(previous - bralink);
6066           bralink = previous;
6067           PUTINC(previous, 0, offset);
6068           }
6069
6070         repeat_max--;
6071         }
6072
6073       /* If the minimum is greater than zero, replicate the group as many
6074       times as necessary, and adjust the maximum to the number of subsequent
6075       copies that we need. If we set a first char from the group, and didn't
6076       set a required char, copy the latter from the former. If there are any
6077       forward reference subroutine calls in the group, there will be entries on
6078       the workspace list; replicate these with an appropriate increment. */
6079
6080       else
6081         {
6082         if (repeat_min > 1)
6083           {
6084           /* In the pre-compile phase, we don't actually do the replication. We
6085           just adjust the length as if we had. Do some paranoid checks for
6086           potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
6087           integer type when available, otherwise double. */
6088
6089           if (lengthptr != NULL)
6090             {
6091             int delta = (repeat_min - 1)*length_prevgroup;
6092             if ((INT64_OR_DOUBLE)(repeat_min - 1)*
6093                   (INT64_OR_DOUBLE)length_prevgroup >
6094                     (INT64_OR_DOUBLE)INT_MAX ||
6095                 OFLOW_MAX - *lengthptr < delta)
6096               {
6097               *errorcodeptr = ERR20;
6098               goto FAILED;
6099               }
6100             *lengthptr += delta;
6101             }
6102
6103           /* This is compiling for real. If there is a set first byte for
6104           the group, and we have not yet set a "required byte", set it. Make
6105           sure there is enough workspace for copying forward references before
6106           doing the copy. */
6107
6108           else
6109             {
6110             if (groupsetfirstchar && reqcharflags < 0)
6111               {
6112               reqchar = firstchar;
6113               reqcharflags = firstcharflags;
6114               }
6115
6116             for (i = 1; i < repeat_min; i++)
6117               {
6118               pcre_uchar *hc;
6119               size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6120               memcpy(code, previous, IN_UCHARS(len));
6121
6122               while (cd->hwm > cd->start_workspace + cd->workspace_size -
6123                      WORK_SIZE_SAFETY_MARGIN -
6124                      (this_hwm_offset - base_hwm_offset))
6125                 {
6126                 *errorcodeptr = expand_workspace(cd);
6127                 if (*errorcodeptr != 0) goto FAILED;
6128                 }
6129
6130               for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6131                    hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6132                    hc += LINK_SIZE)
6133                 {
6134                 PUT(cd->hwm, 0, GET(hc, 0) + len);
6135                 cd->hwm += LINK_SIZE;
6136                 }
6137               base_hwm_offset = this_hwm_offset;
6138               code += len;
6139               }
6140             }
6141           }
6142
6143         if (repeat_max > 0) repeat_max -= repeat_min;
6144         }
6145
6146       /* This code is common to both the zero and non-zero minimum cases. If
6147       the maximum is limited, it replicates the group in a nested fashion,
6148       remembering the bracket starts on a stack. In the case of a zero minimum,
6149       the first one was set up above. In all cases the repeat_max now specifies
6150       the number of additional copies needed. Again, we must remember to
6151       replicate entries on the forward reference list. */
6152
6153       if (repeat_max >= 0)
6154         {
6155         /* In the pre-compile phase, we don't actually do the replication. We
6156         just adjust the length as if we had. For each repetition we must add 1
6157         to the length for BRAZERO and for all but the last repetition we must
6158         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
6159         paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
6160         a 64-bit integer type when available, otherwise double. */
6161
6162         if (lengthptr != NULL && repeat_max > 0)
6163           {
6164           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
6165                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
6166           if ((INT64_OR_DOUBLE)repeat_max *
6167                 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
6168                   > (INT64_OR_DOUBLE)INT_MAX ||
6169               OFLOW_MAX - *lengthptr < delta)
6170             {
6171             *errorcodeptr = ERR20;
6172             goto FAILED;
6173             }
6174           *lengthptr += delta;
6175           }
6176
6177         /* This is compiling for real */
6178
6179         else for (i = repeat_max - 1; i >= 0; i--)
6180           {
6181           pcre_uchar *hc;
6182           size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6183
6184           *code++ = OP_BRAZERO + repeat_type;
6185
6186           /* All but the final copy start a new nesting, maintaining the
6187           chain of brackets outstanding. */
6188
6189           if (i != 0)
6190             {
6191             int offset;
6192             *code++ = OP_BRA;
6193             offset = (bralink == NULL)? 0 : (int)(code - bralink);
6194             bralink = code;
6195             PUTINC(code, 0, offset);
6196             }
6197
6198           memcpy(code, previous, IN_UCHARS(len));
6199
6200           /* Ensure there is enough workspace for forward references before
6201           copying them. */
6202
6203           while (cd->hwm > cd->start_workspace + cd->workspace_size -
6204                  WORK_SIZE_SAFETY_MARGIN -
6205                  (this_hwm_offset - base_hwm_offset))
6206             {
6207             *errorcodeptr = expand_workspace(cd);
6208             if (*errorcodeptr != 0) goto FAILED;
6209             }
6210
6211           for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6212                hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6213                hc += LINK_SIZE)
6214             {
6215             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
6216             cd->hwm += LINK_SIZE;
6217             }
6218           base_hwm_offset = this_hwm_offset;
6219           code += len;
6220           }
6221
6222         /* Now chain through the pending brackets, and fill in their length
6223         fields (which are holding the chain links pro tem). */
6224
6225         while (bralink != NULL)
6226           {
6227           int oldlinkoffset;
6228           int offset = (int)(code - bralink + 1);
6229           pcre_uchar *bra = code - offset;
6230           oldlinkoffset = GET(bra, 1);
6231           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
6232           *code++ = OP_KET;
6233           PUTINC(code, 0, offset);
6234           PUT(bra, 1, offset);
6235           }
6236         }
6237
6238       /* If the maximum is unlimited, set a repeater in the final copy. For
6239       ONCE brackets, that's all we need to do. However, possessively repeated
6240       ONCE brackets can be converted into non-capturing brackets, as the
6241       behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
6242       deal with possessive ONCEs specially.
6243
6244       Otherwise, when we are doing the actual compile phase, check to see
6245       whether this group is one that could match an empty string. If so,
6246       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
6247       that runtime checking can be done. [This check is also applied to ONCE
6248       groups at runtime, but in a different way.]
6249
6250       Then, if the quantifier was possessive and the bracket is not a
6251       conditional, we convert the BRA code to the POS form, and the KET code to
6252       KETRPOS. (It turns out to be convenient at runtime to detect this kind of
6253       subpattern at both the start and at the end.) The use of special opcodes
6254       makes it possible to reduce greatly the stack usage in pcre_exec(). If
6255       the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
6256
6257       Then, if the minimum number of matches is 1 or 0, cancel the possessive
6258       flag so that the default action below, of wrapping everything inside
6259       atomic brackets, does not happen. When the minimum is greater than 1,
6260       there will be earlier copies of the group, and so we still have to wrap
6261       the whole thing. */
6262
6263       else
6264         {
6265         pcre_uchar *ketcode = code - 1 - LINK_SIZE;
6266         pcre_uchar *bracode = ketcode - GET(ketcode, 1);
6267
6268         /* Convert possessive ONCE brackets to non-capturing */
6269
6270         if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
6271             possessive_quantifier) *bracode = OP_BRA;
6272
6273         /* For non-possessive ONCE brackets, all we need to do is to
6274         set the KET. */
6275
6276         if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
6277           *ketcode = OP_KETRMAX + repeat_type;
6278
6279         /* Handle non-ONCE brackets and possessive ONCEs (which have been
6280         converted to non-capturing above). */
6281
6282         else
6283           {
6284           /* In the compile phase, check for empty string matching. */
6285
6286           if (lengthptr == NULL)
6287             {
6288             pcre_uchar *scode = bracode;
6289             do
6290               {
6291               if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
6292                 {
6293                 *bracode += OP_SBRA - OP_BRA;
6294                 break;
6295                 }
6296               scode += GET(scode, 1);
6297               }
6298             while (*scode == OP_ALT);
6299             }
6300
6301           /* A conditional group with only one branch has an implicit empty
6302           alternative branch. */
6303
6304           if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
6305             *bracode = OP_SCOND;
6306
6307           /* Handle possessive quantifiers. */
6308
6309           if (possessive_quantifier)
6310             {
6311             /* For COND brackets, we wrap the whole thing in a possessively
6312             repeated non-capturing bracket, because we have not invented POS
6313             versions of the COND opcodes. Because we are moving code along, we
6314             must ensure that any pending recursive references are updated. */
6315
6316             if (*bracode == OP_COND || *bracode == OP_SCOND)
6317               {
6318               int nlen = (int)(code - bracode);
6319               *code = OP_END;
6320               adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6321               memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
6322               code += 1 + LINK_SIZE;
6323               nlen += 1 + LINK_SIZE;
6324               *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
6325               *code++ = OP_KETRPOS;
6326               PUTINC(code, 0, nlen);
6327               PUT(bracode, 1, nlen);
6328               }
6329
6330             /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
6331
6332             else
6333               {
6334               *bracode += 1;              /* Switch to xxxPOS opcodes */
6335               *ketcode = OP_KETRPOS;
6336               }
6337
6338             /* If the minimum is zero, mark it as possessive, then unset the
6339             possessive flag when the minimum is 0 or 1. */
6340
6341             if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
6342             if (repeat_min < 2) possessive_quantifier = FALSE;
6343             }
6344
6345           /* Non-possessive quantifier */
6346
6347           else *ketcode = OP_KETRMAX + repeat_type;
6348           }
6349         }
6350       }
6351
6352     /* If previous is OP_FAIL, it was generated by an empty class [] in
6353     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
6354     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
6355     error above. We can just ignore the repeat in JS case. */
6356
6357     else if (*previous == OP_FAIL) goto END_REPEAT;
6358
6359     /* Else there's some kind of shambles */
6360
6361     else
6362       {
6363       *errorcodeptr = ERR11;
6364       goto FAILED;
6365       }
6366
6367     /* If the character following a repeat is '+', possessive_quantifier is
6368     TRUE. For some opcodes, there are special alternative opcodes for this
6369     case. For anything else, we wrap the entire repeated item inside OP_ONCE
6370     brackets. Logically, the '+' notation is just syntactic sugar, taken from
6371     Sun's Java package, but the special opcodes can optimize it.
6372
6373     Some (but not all) possessively repeated subpatterns have already been
6374     completely handled in the code just above. For them, possessive_quantifier
6375     is always FALSE at this stage. Note that the repeated item starts at
6376     tempcode, not at previous, which might be the first part of a string whose
6377     (former) last char we repeated. */
6378
6379     if (possessive_quantifier)
6380       {
6381       int len;
6382
6383       /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6384       However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6385       {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6386       remains is greater than zero, there's a further opcode that can be
6387       handled. If not, do nothing, leaving the EXACT alone. */
6388
6389       switch(*tempcode)
6390         {
6391         case OP_TYPEEXACT:
6392         tempcode += PRIV(OP_lengths)[*tempcode] +
6393           ((tempcode[1 + IMM2_SIZE] == OP_PROP
6394           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6395         break;
6396
6397         /* CHAR opcodes are used for exacts whose count is 1. */
6398
6399         case OP_CHAR:
6400         case OP_CHARI:
6401         case OP_NOT:
6402         case OP_NOTI:
6403         case OP_EXACT:
6404         case OP_EXACTI:
6405         case OP_NOTEXACT:
6406         case OP_NOTEXACTI:
6407         tempcode += PRIV(OP_lengths)[*tempcode];
6408 #ifdef SUPPORT_UTF
6409         if (utf && HAS_EXTRALEN(tempcode[-1]))
6410           tempcode += GET_EXTRALEN(tempcode[-1]);
6411 #endif
6412         break;
6413
6414         /* For the class opcodes, the repeat operator appears at the end;
6415         adjust tempcode to point to it. */
6416
6417         case OP_CLASS:
6418         case OP_NCLASS:
6419         tempcode += 1 + 32/sizeof(pcre_uchar);
6420         break;
6421
6422 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6423         case OP_XCLASS:
6424         tempcode += GET(tempcode, 1);
6425         break;
6426 #endif
6427         }
6428
6429       /* If tempcode is equal to code (which points to the end of the repeated
6430       item), it means we have skipped an EXACT item but there is no following
6431       QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6432       all other cases, tempcode will be pointing to the repeat opcode, and will
6433       be less than code, so the value of len will be greater than 0. */
6434
6435       len = (int)(code - tempcode);
6436       if (len > 0)
6437         {
6438         unsigned int repcode = *tempcode;
6439
6440         /* There is a table for possessifying opcodes, all of which are less
6441         than OP_CALLOUT. A zero entry means there is no possessified version.
6442         */
6443
6444         if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6445           *tempcode = opcode_possessify[repcode];
6446
6447         /* For opcode without a special possessified version, wrap the item in
6448         ONCE brackets. Because we are moving code along, we must ensure that any
6449         pending recursive references are updated. */
6450
6451         else
6452           {
6453           *code = OP_END;
6454           adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6455           memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6456           code += 1 + LINK_SIZE;
6457           len += 1 + LINK_SIZE;
6458           tempcode[0] = OP_ONCE;
6459           *code++ = OP_KET;
6460           PUTINC(code, 0, len);
6461           PUT(tempcode, 1, len);
6462           }
6463         }
6464
6465 #ifdef NEVER
6466       if (len > 0) switch (*tempcode)
6467         {
6468         case OP_STAR:  *tempcode = OP_POSSTAR; break;
6469         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
6470         case OP_QUERY: *tempcode = OP_POSQUERY; break;
6471         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
6472
6473         case OP_STARI:  *tempcode = OP_POSSTARI; break;
6474         case OP_PLUSI:  *tempcode = OP_POSPLUSI; break;
6475         case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
6476         case OP_UPTOI:  *tempcode = OP_POSUPTOI; break;
6477
6478         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
6479         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
6480         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
6481         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
6482
6483         case OP_NOTSTARI:  *tempcode = OP_NOTPOSSTARI; break;
6484         case OP_NOTPLUSI:  *tempcode = OP_NOTPOSPLUSI; break;
6485         case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
6486         case OP_NOTUPTOI:  *tempcode = OP_NOTPOSUPTOI; break;
6487
6488         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
6489         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
6490         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6491         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
6492
6493         case OP_CRSTAR:   *tempcode = OP_CRPOSSTAR; break;
6494         case OP_CRPLUS:   *tempcode = OP_CRPOSPLUS; break;
6495         case OP_CRQUERY:  *tempcode = OP_CRPOSQUERY; break;
6496         case OP_CRRANGE:  *tempcode = OP_CRPOSRANGE; break;
6497
6498         /* Because we are moving code along, we must ensure that any
6499         pending recursive references are updated. */
6500
6501         default:
6502         *code = OP_END;
6503         adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6504         memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6505         code += 1 + LINK_SIZE;
6506         len += 1 + LINK_SIZE;
6507         tempcode[0] = OP_ONCE;
6508         *code++ = OP_KET;
6509         PUTINC(code, 0, len);
6510         PUT(tempcode, 1, len);
6511         break;
6512         }
6513 #endif
6514       }
6515
6516     /* In all cases we no longer have a previous item. We also set the
6517     "follows varying string" flag for subsequently encountered reqchars if
6518     it isn't already set and we have just passed a varying length item. */
6519
6520     END_REPEAT:
6521     previous = NULL;
6522     cd->req_varyopt |= reqvary;
6523     break;
6524
6525
6526     /* ===================================================================*/
6527     /* Start of nested parenthesized sub-expression, or comment or lookahead or
6528     lookbehind or option setting or condition or all the other extended
6529     parenthesis forms.  */
6530
6531     case CHAR_LEFT_PARENTHESIS:
6532     ptr++;
6533
6534     /* First deal with comments. Putting this code right at the start ensures
6535     that comments have no bad side effects. */
6536
6537     if (ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
6538       {
6539       ptr += 2;
6540       while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6541       if (*ptr == CHAR_NULL)
6542         {
6543         *errorcodeptr = ERR18;
6544         goto FAILED;
6545         }
6546       continue;
6547       }
6548
6549     /* Now deal with various "verbs" that can be introduced by '*'. */
6550
6551     if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
6552          || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
6553       {
6554       int i, namelen;
6555       int arglen = 0;
6556       const char *vn = verbnames;
6557       const pcre_uchar *name = ptr + 1;
6558       const pcre_uchar *arg = NULL;
6559       previous = NULL;
6560       ptr++;
6561       while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
6562       namelen = (int)(ptr - name);
6563
6564       /* It appears that Perl allows any characters whatsoever, other than
6565       a closing parenthesis, to appear in arguments, so we no longer insist on
6566       letters, digits, and underscores. */
6567
6568       if (*ptr == CHAR_COLON)
6569         {
6570         arg = ++ptr;
6571         while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6572         arglen = (int)(ptr - arg);
6573         if ((unsigned int)arglen > MAX_MARK)
6574           {
6575           *errorcodeptr = ERR75;
6576           goto FAILED;
6577           }
6578         }
6579
6580       if (*ptr != CHAR_RIGHT_PARENTHESIS)
6581         {
6582         *errorcodeptr = ERR60;
6583         goto FAILED;
6584         }
6585
6586       /* Scan the table of verb names */
6587
6588       for (i = 0; i < verbcount; i++)
6589         {
6590         if (namelen == verbs[i].len &&
6591             STRNCMP_UC_C8(name, vn, namelen) == 0)
6592           {
6593           int setverb;
6594
6595           /* Check for open captures before ACCEPT and convert it to
6596           ASSERT_ACCEPT if in an assertion. */
6597
6598           if (verbs[i].op == OP_ACCEPT)
6599             {
6600             open_capitem *oc;
6601             if (arglen != 0)
6602               {
6603               *errorcodeptr = ERR59;
6604               goto FAILED;
6605               }
6606             cd->had_accept = TRUE;
6607             for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6608               {
6609               if (lengthptr != NULL)
6610                 {
6611 #ifdef COMPILE_PCRE8
6612                 *lengthptr += 1 + IMM2_SIZE;
6613 #elif defined COMPILE_PCRE16
6614                 *lengthptr += 2 + IMM2_SIZE;
6615 #elif defined COMPILE_PCRE32
6616                 *lengthptr += 4 + IMM2_SIZE;
6617 #endif
6618                 }
6619               else
6620                 {
6621                 *code++ = OP_CLOSE;
6622                 PUT2INC(code, 0, oc->number);
6623                 }
6624               }
6625             setverb = *code++ =
6626               (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6627
6628             /* Do not set firstchar after *ACCEPT */
6629             if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6630             }
6631
6632           /* Handle other cases with/without an argument */
6633
6634           else if (arglen == 0)
6635             {
6636             if (verbs[i].op < 0)   /* Argument is mandatory */
6637               {
6638               *errorcodeptr = ERR66;
6639               goto FAILED;
6640               }
6641             setverb = *code++ = verbs[i].op;
6642             }
6643
6644           else
6645             {
6646             if (verbs[i].op_arg < 0)   /* Argument is forbidden */
6647               {
6648               *errorcodeptr = ERR59;
6649               goto FAILED;
6650               }
6651             setverb = *code++ = verbs[i].op_arg;
6652             if (lengthptr != NULL)    /* In pass 1 just add in the length */
6653               {                       /* to avoid potential workspace */
6654               *lengthptr += arglen;   /* overflow. */
6655               *code++ = 0;
6656               }
6657             else
6658               {
6659               *code++ = arglen;
6660               memcpy(code, arg, IN_UCHARS(arglen));
6661               code += arglen;
6662               }
6663             *code++ = 0;
6664             }
6665
6666           switch (setverb)
6667             {
6668             case OP_THEN:
6669             case OP_THEN_ARG:
6670             cd->external_flags |= PCRE_HASTHEN;
6671             break;
6672
6673             case OP_PRUNE:
6674             case OP_PRUNE_ARG:
6675             case OP_SKIP:
6676             case OP_SKIP_ARG:
6677             cd->had_pruneorskip = TRUE;
6678             break;
6679             }
6680
6681           break;  /* Found verb, exit loop */
6682           }
6683
6684         vn += verbs[i].len + 1;
6685         }
6686
6687       if (i < verbcount) continue;    /* Successfully handled a verb */
6688       *errorcodeptr = ERR60;          /* Verb not recognized */
6689       goto FAILED;
6690       }
6691
6692     /* Initialize for "real" parentheses */
6693
6694     newoptions = options;
6695     skipbytes = 0;
6696     bravalue = OP_CBRA;
6697     item_hwm_offset = cd->hwm - cd->start_workspace;
6698     reset_bracount = FALSE;
6699
6700     /* Deal with the extended parentheses; all are introduced by '?', and the
6701     appearance of any of them means that this is not a capturing group. */
6702
6703     if (*ptr == CHAR_QUESTION_MARK)
6704       {
6705       int i, set, unset, namelen;
6706       int *optset;
6707       const pcre_uchar *name;
6708       pcre_uchar *slot;
6709
6710       switch (*(++ptr))
6711         {
6712         /* ------------------------------------------------------------ */
6713         case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
6714         reset_bracount = TRUE;
6715         cd->dupgroups = TRUE;     /* Record (?| encountered */
6716         /* Fall through */
6717
6718         /* ------------------------------------------------------------ */
6719         case CHAR_COLON:          /* Non-capturing bracket */
6720         bravalue = OP_BRA;
6721         ptr++;
6722         break;
6723
6724
6725         /* ------------------------------------------------------------ */
6726         case CHAR_LEFT_PARENTHESIS:
6727         bravalue = OP_COND;       /* Conditional group */
6728         tempptr = ptr;
6729
6730         /* A condition can be an assertion, a number (referring to a numbered
6731         group's having been set), a name (referring to a named group), or 'R',
6732         referring to recursion. R<digits> and R&name are also permitted for
6733         recursion tests.
6734
6735         There are ways of testing a named group: (?(name)) is used by Python;
6736         Perl 5.10 onwards uses (?(<name>) or (?('name')).
6737
6738         There is one unfortunate ambiguity, caused by history. 'R' can be the
6739         recursive thing or the name 'R' (and similarly for 'R' followed by
6740         digits). We look for a name first; if not found, we try the other case.
6741
6742         For compatibility with auto-callouts, we allow a callout to be
6743         specified before a condition that is an assertion. First, check for the
6744         syntax of a callout; if found, adjust the temporary pointer that is
6745         used to check for an assertion condition. That's all that is needed! */
6746
6747         if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6748           {
6749           for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6750           if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6751             tempptr += i + 1;
6752           }
6753
6754         /* For conditions that are assertions, check the syntax, and then exit
6755         the switch. This will take control down to where bracketed groups,
6756         including assertions, are processed. */
6757
6758         if (tempptr[1] == CHAR_QUESTION_MARK &&
6759               (tempptr[2] == CHAR_EQUALS_SIGN ||
6760                tempptr[2] == CHAR_EXCLAMATION_MARK ||
6761                  (tempptr[2] == CHAR_LESS_THAN_SIGN &&
6762                    (tempptr[3] == CHAR_EQUALS_SIGN ||
6763                     tempptr[3] == CHAR_EXCLAMATION_MARK))))
6764           {
6765           cd->iscondassert = TRUE;
6766           break;
6767           }
6768
6769         /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6770         need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6771
6772         code[1+LINK_SIZE] = OP_CREF;
6773         skipbytes = 1+IMM2_SIZE;
6774         refsign = -1;     /* => not a number */
6775         namelen = -1;     /* => not a name; must set to avoid warning */
6776         name = NULL;      /* Always set to avoid warning */
6777         recno = 0;        /* Always set to avoid warning */
6778
6779         /* Check for a test for recursion in a named group. */
6780
6781         ptr++;
6782         if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
6783           {
6784           terminator = -1;
6785           ptr += 2;
6786           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
6787           }
6788
6789         /* Check for a test for a named group's having been set, using the Perl
6790         syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6791         syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
6792
6793         else if (*ptr == CHAR_LESS_THAN_SIGN)
6794           {
6795           terminator = CHAR_GREATER_THAN_SIGN;
6796           ptr++;
6797           }
6798         else if (*ptr == CHAR_APOSTROPHE)
6799           {
6800           terminator = CHAR_APOSTROPHE;
6801           ptr++;
6802           }
6803         else
6804           {
6805           terminator = CHAR_NULL;
6806           if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
6807             else if (IS_DIGIT(*ptr)) refsign = 0;
6808           }
6809
6810         /* Handle a number */
6811
6812         if (refsign >= 0)
6813           {
6814           while (IS_DIGIT(*ptr))
6815             {
6816             if (recno > INT_MAX / 10 - 1)  /* Integer overflow */
6817               {
6818               while (IS_DIGIT(*ptr)) ptr++;
6819               *errorcodeptr = ERR61;
6820               goto FAILED;
6821               }
6822             recno = recno * 10 + (int)(*ptr - CHAR_0);
6823             ptr++;
6824             }
6825           }
6826
6827         /* Otherwise we expect to read a name; anything else is an error. When
6828         a name is one of a number of duplicates, a different opcode is used and
6829         it needs more memory. Unfortunately we cannot tell whether a name is a
6830         duplicate in the first pass, so we have to allow for more memory. */
6831
6832         else
6833           {
6834           if (IS_DIGIT(*ptr))
6835             {
6836             *errorcodeptr = ERR84;
6837             goto FAILED;
6838             }
6839           if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_word) == 0)
6840             {
6841             *errorcodeptr = ERR28;   /* Assertion expected */
6842             goto FAILED;
6843             }
6844           name = ptr++;
6845           while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6846             {
6847             ptr++;
6848             }
6849           namelen = (int)(ptr - name);
6850           if (lengthptr != NULL) skipbytes += IMM2_SIZE;
6851           }
6852
6853         /* Check the terminator */
6854
6855         if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6856             *ptr++ != CHAR_RIGHT_PARENTHESIS)
6857           {
6858           ptr--;                  /* Error offset */
6859           *errorcodeptr = ERR26;  /* Malformed number or name */
6860           goto FAILED;
6861           }
6862
6863         /* Do no further checking in the pre-compile phase. */
6864
6865         if (lengthptr != NULL) break;
6866
6867         /* In the real compile we do the work of looking for the actual
6868         reference. If refsign is not negative, it means we have a number in
6869         recno. */
6870
6871         if (refsign >= 0)
6872           {
6873           if (recno <= 0)
6874             {
6875             *errorcodeptr = ERR35;
6876             goto FAILED;
6877             }
6878           if (refsign != 0) recno = (refsign == CHAR_MINUS)?
6879             cd->bracount - recno + 1 : recno + cd->bracount;
6880           if (recno <= 0 || recno > cd->final_bracount)
6881             {
6882             *errorcodeptr = ERR15;
6883             goto FAILED;
6884             }
6885           PUT2(code, 2+LINK_SIZE, recno);
6886           if (recno > cd->top_backref) cd->top_backref = recno;
6887           break;
6888           }
6889
6890         /* Otherwise look for the name. */
6891
6892         slot = cd->name_table;
6893         for (i = 0; i < cd->names_found; i++)
6894           {
6895           if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
6896           slot += cd->name_entry_size;
6897           }
6898
6899         /* Found the named subpattern. If the name is duplicated, add one to
6900         the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6901         appropriate data values. Otherwise, just insert the unique subpattern
6902         number. */
6903
6904         if (i < cd->names_found)
6905           {
6906           int offset = i++;
6907           int count = 1;
6908           recno = GET2(slot, 0);   /* Number from first found */
6909           if (recno > cd->top_backref) cd->top_backref = recno;
6910           for (; i < cd->names_found; i++)
6911             {
6912             slot += cd->name_entry_size;
6913             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0 ||
6914               (slot+IMM2_SIZE)[namelen] != 0) break;
6915             count++;
6916             }
6917
6918           if (count > 1)
6919             {
6920             PUT2(code, 2+LINK_SIZE, offset);
6921             PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6922             skipbytes += IMM2_SIZE;
6923             code[1+LINK_SIZE]++;
6924             }
6925           else  /* Not a duplicated name */
6926             {
6927             PUT2(code, 2+LINK_SIZE, recno);
6928             }
6929           }
6930
6931         /* If terminator == CHAR_NULL it means that the name followed directly
6932         after the opening parenthesis [e.g. (?(abc)...] and in this case there
6933         are some further alternatives to try. For the cases where terminator !=
6934         CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
6935         we have now checked all the possibilities, so give an error. */
6936
6937         else if (terminator != CHAR_NULL)
6938           {
6939           *errorcodeptr = ERR15;
6940           goto FAILED;
6941           }
6942
6943         /* Check for (?(R) for recursion. Allow digits after R to specify a
6944         specific group number. */
6945
6946         else if (*name == CHAR_R)
6947           {
6948           recno = 0;
6949           for (i = 1; i < namelen; i++)
6950             {
6951             if (!IS_DIGIT(name[i]))
6952               {
6953               *errorcodeptr = ERR15;
6954               goto FAILED;
6955               }
6956             if (recno > INT_MAX / 10 - 1)   /* Integer overflow */
6957               {
6958               *errorcodeptr = ERR61;
6959               goto FAILED;
6960               }
6961             recno = recno * 10 + name[i] - CHAR_0;
6962             }
6963           if (recno == 0) recno = RREF_ANY;
6964           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
6965           PUT2(code, 2+LINK_SIZE, recno);
6966           }
6967
6968         /* Similarly, check for the (?(DEFINE) "condition", which is always
6969         false. */
6970
6971         else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
6972           {
6973           code[1+LINK_SIZE] = OP_DEF;
6974           skipbytes = 1;
6975           }
6976
6977         /* Reference to an unidentified subpattern. */
6978
6979         else
6980           {
6981           *errorcodeptr = ERR15;
6982           goto FAILED;
6983           }
6984         break;
6985
6986
6987         /* ------------------------------------------------------------ */
6988         case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
6989         bravalue = OP_ASSERT;
6990         cd->assert_depth += 1;
6991         ptr++;
6992         break;
6993
6994         /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6995         thing to do, but Perl allows all assertions to be quantified, and when
6996         they contain capturing parentheses there may be a potential use for
6997         this feature. Not that that applies to a quantified (?!) but we allow
6998         it for uniformity. */
6999
7000         /* ------------------------------------------------------------ */
7001         case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
7002         ptr++;
7003         if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
7004              ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
7005             (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
7006           {
7007           *code++ = OP_FAIL;
7008           previous = NULL;
7009           continue;
7010           }
7011         bravalue = OP_ASSERT_NOT;
7012         cd->assert_depth += 1;
7013         break;
7014
7015
7016         /* ------------------------------------------------------------ */
7017         case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
7018         switch (ptr[1])
7019           {
7020           case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
7021           bravalue = OP_ASSERTBACK;
7022           cd->assert_depth += 1;
7023           ptr += 2;
7024           break;
7025
7026           case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
7027           bravalue = OP_ASSERTBACK_NOT;
7028           cd->assert_depth += 1;
7029           ptr += 2;
7030           break;
7031
7032           default:                /* Could be name define, else bad */
7033           if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
7034             goto DEFINE_NAME;
7035           ptr++;                  /* Correct offset for error */
7036           *errorcodeptr = ERR24;
7037           goto FAILED;
7038           }
7039         break;
7040
7041
7042         /* ------------------------------------------------------------ */
7043         case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
7044         bravalue = OP_ONCE;
7045         ptr++;
7046         break;
7047
7048
7049         /* ------------------------------------------------------------ */
7050         case CHAR_C:                 /* Callout - may be followed by digits; */
7051         previous_callout = code;     /* Save for later completion */
7052         after_manual_callout = 1;    /* Skip one item before completing */
7053         *code++ = OP_CALLOUT;
7054           {
7055           int n = 0;
7056           ptr++;
7057           while(IS_DIGIT(*ptr))
7058             n = n * 10 + *ptr++ - CHAR_0;
7059           if (*ptr != CHAR_RIGHT_PARENTHESIS)
7060             {
7061             *errorcodeptr = ERR39;
7062             goto FAILED;
7063             }
7064           if (n > 255)
7065             {
7066             *errorcodeptr = ERR38;
7067             goto FAILED;
7068             }
7069           *code++ = n;
7070           PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
7071           PUT(code, LINK_SIZE, 0);                          /* Default length */
7072           code += 2 * LINK_SIZE;
7073           }
7074         previous = NULL;
7075         continue;
7076
7077
7078         /* ------------------------------------------------------------ */
7079         case CHAR_P:              /* Python-style named subpattern handling */
7080         if (*(++ptr) == CHAR_EQUALS_SIGN ||
7081             *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
7082           {
7083           is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
7084           terminator = CHAR_RIGHT_PARENTHESIS;
7085           goto NAMED_REF_OR_RECURSE;
7086           }
7087         else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
7088           {
7089           *errorcodeptr = ERR41;
7090           goto FAILED;
7091           }
7092         /* Fall through to handle (?P< as (?< is handled */
7093
7094
7095         /* ------------------------------------------------------------ */
7096         DEFINE_NAME:    /* Come here from (?< handling */
7097         case CHAR_APOSTROPHE:
7098         terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
7099           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7100         name = ++ptr;
7101         if (IS_DIGIT(*ptr))
7102           {
7103           *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7104           goto FAILED;
7105           }
7106         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7107         namelen = (int)(ptr - name);
7108
7109         /* In the pre-compile phase, do a syntax check, remember the longest
7110         name, and then remember the group in a vector, expanding it if
7111         necessary. Duplicates for the same number are skipped; other duplicates
7112         are checked for validity. In the actual compile, there is nothing to
7113         do. */
7114
7115         if (lengthptr != NULL)
7116           {
7117           named_group *ng;
7118           pcre_uint32 number = cd->bracount + 1;
7119
7120           if (*ptr != (pcre_uchar)terminator)
7121             {
7122             *errorcodeptr = ERR42;
7123             goto FAILED;
7124             }
7125
7126           if (cd->names_found >= MAX_NAME_COUNT)
7127             {
7128             *errorcodeptr = ERR49;
7129             goto FAILED;
7130             }
7131
7132           if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
7133             {
7134             cd->name_entry_size = namelen + IMM2_SIZE + 1;
7135             if (namelen > MAX_NAME_SIZE)
7136               {
7137               *errorcodeptr = ERR48;
7138               goto FAILED;
7139               }
7140             }
7141
7142           /* Scan the list to check for duplicates. For duplicate names, if the
7143           number is the same, break the loop, which causes the name to be
7144           discarded; otherwise, if DUPNAMES is not set, give an error.
7145           If it is set, allow the name with a different number, but continue
7146           scanning in case this is a duplicate with the same number. For
7147           non-duplicate names, give an error if the number is duplicated. */
7148
7149           ng = cd->named_groups;
7150           for (i = 0; i < cd->names_found; i++, ng++)
7151             {
7152             if (namelen == ng->length &&
7153                 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7154               {
7155               if (ng->number == number) break;
7156               if ((options & PCRE_DUPNAMES) == 0)
7157                 {
7158                 *errorcodeptr = ERR43;
7159                 goto FAILED;
7160                 }
7161               cd->dupnames = TRUE;  /* Duplicate names exist */
7162               }
7163             else if (ng->number == number)
7164               {
7165               *errorcodeptr = ERR65;
7166               goto FAILED;
7167               }
7168             }
7169
7170           if (i >= cd->names_found)     /* Not a duplicate with same number */
7171             {
7172             /* Increase the list size if necessary */
7173
7174             if (cd->names_found >= cd->named_group_list_size)
7175               {
7176               int newsize = cd->named_group_list_size * 2;
7177               named_group *newspace = (PUBL(malloc))
7178                 (newsize * sizeof(named_group));
7179
7180               if (newspace == NULL)
7181                 {
7182                 *errorcodeptr = ERR21;
7183                 goto FAILED;
7184                 }
7185
7186               memcpy(newspace, cd->named_groups,
7187                 cd->named_group_list_size * sizeof(named_group));
7188               if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
7189                 (PUBL(free))((void *)cd->named_groups);
7190               cd->named_groups = newspace;
7191               cd->named_group_list_size = newsize;
7192               }
7193
7194             cd->named_groups[cd->names_found].name = name;
7195             cd->named_groups[cd->names_found].length = namelen;
7196             cd->named_groups[cd->names_found].number = number;
7197             cd->names_found++;
7198             }
7199           }
7200
7201         ptr++;                    /* Move past > or ' in both passes. */
7202         goto NUMBERED_GROUP;
7203
7204
7205         /* ------------------------------------------------------------ */
7206         case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
7207         terminator = CHAR_RIGHT_PARENTHESIS;
7208         is_recurse = TRUE;
7209         /* Fall through */
7210
7211         /* We come here from the Python syntax above that handles both
7212         references (?P=name) and recursion (?P>name), as well as falling
7213         through from the Perl recursion syntax (?&name). We also come here from
7214         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
7215         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
7216
7217         NAMED_REF_OR_RECURSE:
7218         name = ++ptr;
7219         if (IS_DIGIT(*ptr))
7220           {
7221           *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7222           goto FAILED;
7223           }
7224         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7225         namelen = (int)(ptr - name);
7226
7227         /* In the pre-compile phase, do a syntax check. We used to just set
7228         a dummy reference number, because it was not used in the first pass.
7229         However, with the change of recursive back references to be atomic,
7230         we have to look for the number so that this state can be identified, as
7231         otherwise the incorrect length is computed. If it's not a backwards
7232         reference, the dummy number will do. */
7233
7234         if (lengthptr != NULL)
7235           {
7236           named_group *ng;
7237           recno = 0;
7238
7239           if (namelen == 0)
7240             {
7241             *errorcodeptr = ERR62;
7242             goto FAILED;
7243             }
7244           if (*ptr != (pcre_uchar)terminator)
7245             {
7246             *errorcodeptr = ERR42;
7247             goto FAILED;
7248             }
7249           if (namelen > MAX_NAME_SIZE)
7250             {
7251             *errorcodeptr = ERR48;
7252             goto FAILED;
7253             }
7254
7255           /* Count named back references. */
7256
7257           if (!is_recurse) cd->namedrefcount++;
7258
7259           /* We have to allow for a named reference to a duplicated name (this
7260           cannot be determined until the second pass). This needs an extra
7261           16-bit data item. */
7262
7263           *lengthptr += IMM2_SIZE;
7264
7265           /* If this is a forward reference and we are within a (?|...) group,
7266           the reference may end up as the number of a group which we are
7267           currently inside, that is, it could be a recursive reference. In the
7268           real compile this will be picked up and the reference wrapped with
7269           OP_ONCE to make it atomic, so we must allow space for this case. */
7270
7271           /* In fact, this can happen for a non-forward reference because
7272           another group with the same number might be created later. This
7273           issue is fixed "properly" in PCRE2. As PCRE1 is now in maintenance
7274           only mode, we finesse the bug by allowing more memory always. */
7275
7276           *lengthptr += 2 + 2*LINK_SIZE;
7277
7278           /* It is even worse than that. The current reference may be to an
7279           existing named group with a different number (so apparently not
7280           recursive) but which later on is also attached to a group with the
7281           current number. This can only happen if (?| has been previously
7282           encountered. In that case, we allow yet more memory, just in case.
7283           (Again, this is fixed "properly" in PCRE2.) */
7284
7285           if (cd->dupgroups) *lengthptr += 4 + 4*LINK_SIZE;
7286
7287           /* Otherwise, check for recursion here. The name table does not exist
7288           in the first pass; instead we must scan the list of names encountered
7289           so far in order to get the number. If the name is not found, leave
7290           the value of recno as 0 for a forward reference. */
7291
7292           /* This patch (removing "else") fixes a problem when a reference is
7293           to multiple identically named nested groups from within the nest.
7294           Once again, it is not the "proper" fix, and it results in an
7295           over-allocation of memory. */
7296
7297           /* else */
7298             {
7299             ng = cd->named_groups;
7300             for (i = 0; i < cd->names_found; i++, ng++)
7301               {
7302               if (namelen == ng->length &&
7303                   STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7304                 {
7305                 open_capitem *oc;
7306                 recno = ng->number;
7307                 if (is_recurse) break;
7308                 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7309                   {
7310                   if (oc->number == recno)
7311                     {
7312                     oc->flag = TRUE;
7313                     break;
7314                     }
7315                   }
7316                 }
7317               }
7318             }
7319           }
7320
7321         /* In the real compile, search the name table. We check the name
7322         first, and then check that we have reached the end of the name in the
7323         table. That way, if the name is longer than any in the table, the
7324         comparison will fail without reading beyond the table entry. */
7325
7326         else
7327           {
7328           slot = cd->name_table;
7329           for (i = 0; i < cd->names_found; i++)
7330             {
7331             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
7332                 slot[IMM2_SIZE+namelen] == 0)
7333               break;
7334             slot += cd->name_entry_size;
7335             }
7336
7337           if (i < cd->names_found)
7338             {
7339             recno = GET2(slot, 0);
7340             }
7341           else
7342             {
7343             *errorcodeptr = ERR15;
7344             goto FAILED;
7345             }
7346           }
7347
7348         /* In both phases, for recursions, we can now go to the code that
7349         handles numerical recursion. */
7350
7351         if (is_recurse) goto HANDLE_RECURSION;
7352
7353         /* In the second pass we must see if the name is duplicated. If so, we
7354         generate a different opcode. */
7355
7356         if (lengthptr == NULL && cd->dupnames)
7357           {
7358           int count = 1;
7359           unsigned int index = i;
7360           pcre_uchar *cslot = slot + cd->name_entry_size;
7361
7362           for (i++; i < cd->names_found; i++)
7363             {
7364             if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
7365             count++;
7366             cslot += cd->name_entry_size;
7367             }
7368
7369           if (count > 1)
7370             {
7371             if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7372             previous = code;
7373             item_hwm_offset = cd->hwm - cd->start_workspace;
7374             *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7375             PUT2INC(code, 0, index);
7376             PUT2INC(code, 0, count);
7377
7378             /* Process each potentially referenced group. */
7379
7380             for (; slot < cslot; slot += cd->name_entry_size)
7381               {
7382               open_capitem *oc;
7383               recno = GET2(slot, 0);
7384               cd->backref_map |= (recno < 32)? (1 << recno) : 1;
7385               if (recno > cd->top_backref) cd->top_backref = recno;
7386
7387               /* Check to see if this back reference is recursive, that is, it
7388               is inside the group that it references. A flag is set so that the
7389               group can be made atomic. */
7390
7391               for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7392                 {
7393                 if (oc->number == recno)
7394                   {
7395                   oc->flag = TRUE;
7396                   break;
7397                   }
7398                 }
7399               }
7400
7401             continue;  /* End of back ref handling */
7402             }
7403           }
7404
7405         /* First pass, or a non-duplicated name. */
7406
7407         goto HANDLE_REFERENCE;
7408
7409
7410         /* ------------------------------------------------------------ */
7411         case CHAR_R:              /* Recursion, same as (?0) */
7412         recno = 0;
7413         if (*(++ptr) != CHAR_RIGHT_PARENTHESIS)
7414           {
7415           *errorcodeptr = ERR29;
7416           goto FAILED;
7417           }
7418         goto HANDLE_RECURSION;
7419
7420
7421         /* ------------------------------------------------------------ */
7422         case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
7423         case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
7424         case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
7425           {
7426           const pcre_uchar *called;
7427           terminator = CHAR_RIGHT_PARENTHESIS;
7428
7429           /* Come here from the \g<...> and \g'...' code (Oniguruma
7430           compatibility). However, the syntax has been checked to ensure that
7431           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
7432           be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
7433           ever be taken. */
7434
7435           HANDLE_NUMERICAL_RECURSION:
7436
7437           if ((refsign = *ptr) == CHAR_PLUS)
7438             {
7439             ptr++;
7440             if (!IS_DIGIT(*ptr))
7441               {
7442               *errorcodeptr = ERR63;
7443               goto FAILED;
7444               }
7445             }
7446           else if (refsign == CHAR_MINUS)
7447             {
7448             if (!IS_DIGIT(ptr[1]))
7449               goto OTHER_CHAR_AFTER_QUERY;
7450             ptr++;
7451             }
7452
7453           recno = 0;
7454           while(IS_DIGIT(*ptr))
7455             {
7456             if (recno > INT_MAX / 10 - 1) /* Integer overflow */
7457               {
7458               while (IS_DIGIT(*ptr)) ptr++;
7459               *errorcodeptr = ERR61;
7460               goto FAILED;
7461               }
7462             recno = recno * 10 + *ptr++ - CHAR_0;
7463             }
7464
7465           if (*ptr != (pcre_uchar)terminator)
7466             {
7467             *errorcodeptr = ERR29;
7468             goto FAILED;
7469             }
7470
7471           if (refsign == CHAR_MINUS)
7472             {
7473             if (recno == 0)
7474               {
7475               *errorcodeptr = ERR58;
7476               goto FAILED;
7477               }
7478             recno = cd->bracount - recno + 1;
7479             if (recno <= 0)
7480               {
7481               *errorcodeptr = ERR15;
7482               goto FAILED;
7483               }
7484             }
7485           else if (refsign == CHAR_PLUS)
7486             {
7487             if (recno == 0)
7488               {
7489               *errorcodeptr = ERR58;
7490               goto FAILED;
7491               }
7492             recno += cd->bracount;
7493             }
7494
7495           /* Come here from code above that handles a named recursion */
7496
7497           HANDLE_RECURSION:
7498
7499           previous = code;
7500           item_hwm_offset = cd->hwm - cd->start_workspace;
7501           called = cd->start_code;
7502
7503           /* When we are actually compiling, find the bracket that is being
7504           referenced. Temporarily end the regex in case it doesn't exist before
7505           this point. If we end up with a forward reference, first check that
7506           the bracket does occur later so we can give the error (and position)
7507           now. Then remember this forward reference in the workspace so it can
7508           be filled in at the end. */
7509
7510           if (lengthptr == NULL)
7511             {
7512             *code = OP_END;
7513             if (recno != 0)
7514               called = PRIV(find_bracket)(cd->start_code, utf, recno);
7515
7516             /* Forward reference */
7517
7518             if (called == NULL)
7519               {
7520               if (recno > cd->final_bracount)
7521                 {
7522                 *errorcodeptr = ERR15;
7523                 goto FAILED;
7524                 }
7525
7526               /* Fudge the value of "called" so that when it is inserted as an
7527               offset below, what is actually inserted is the reference number
7528               of the group. Then remember the forward reference. */
7529
7530               called = cd->start_code + recno;
7531               if (cd->hwm >= cd->start_workspace + cd->workspace_size -
7532                   WORK_SIZE_SAFETY_MARGIN)
7533                 {
7534                 *errorcodeptr = expand_workspace(cd);
7535                 if (*errorcodeptr != 0) goto FAILED;
7536                 }
7537               PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
7538               }
7539
7540             /* If not a forward reference, and the subpattern is still open,
7541             this is a recursive call. We check to see if this is a left
7542             recursion that could loop for ever, and diagnose that case. We
7543             must not, however, do this check if we are in a conditional
7544             subpattern because the condition might be testing for recursion in
7545             a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
7546             Forever loops are also detected at runtime, so those that occur in
7547             conditional subpatterns will be picked up then. */
7548
7549             else if (GET(called, 1) == 0 && cond_depth <= 0 &&
7550                      could_be_empty(called, code, bcptr, utf, cd))
7551               {
7552               *errorcodeptr = ERR40;
7553               goto FAILED;
7554               }
7555             }
7556
7557           /* Insert the recursion/subroutine item. It does not have a set first
7558           character (relevant if it is repeated, because it will then be
7559           wrapped with ONCE brackets). */
7560
7561           *code = OP_RECURSE;
7562           PUT(code, 1, (int)(called - cd->start_code));
7563           code += 1 + LINK_SIZE;
7564           groupsetfirstchar = FALSE;
7565           }
7566
7567         /* Can't determine a first byte now */
7568
7569         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7570         continue;
7571
7572
7573         /* ------------------------------------------------------------ */
7574         default:              /* Other characters: check option setting */
7575         OTHER_CHAR_AFTER_QUERY:
7576         set = unset = 0;
7577         optset = &set;
7578
7579         while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
7580           {
7581           switch (*ptr++)
7582             {
7583             case CHAR_MINUS: optset = &unset; break;
7584
7585             case CHAR_J:    /* Record that it changed in the external options */
7586             *optset |= PCRE_DUPNAMES;
7587             cd->external_flags |= PCRE_JCHANGED;
7588             break;
7589
7590             case CHAR_i: *optset |= PCRE_CASELESS; break;
7591             case CHAR_m: *optset |= PCRE_MULTILINE; break;
7592             case CHAR_s: *optset |= PCRE_DOTALL; break;
7593             case CHAR_x: *optset |= PCRE_EXTENDED; break;
7594             case CHAR_U: *optset |= PCRE_UNGREEDY; break;
7595             case CHAR_X: *optset |= PCRE_EXTRA; break;
7596
7597             default:  *errorcodeptr = ERR12;
7598                       ptr--;    /* Correct the offset */
7599                       goto FAILED;
7600             }
7601           }
7602
7603         /* Set up the changed option bits, but don't change anything yet. */
7604
7605         newoptions = (options | set) & (~unset);
7606
7607         /* If the options ended with ')' this is not the start of a nested
7608         group with option changes, so the options change at this level. If this
7609         item is right at the start of the pattern, the options can be
7610         abstracted and made external in the pre-compile phase, and ignored in
7611         the compile phase. This can be helpful when matching -- for instance in
7612         caseless checking of required bytes.
7613
7614         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
7615         definitely *not* at the start of the pattern because something has been
7616         compiled. In the pre-compile phase, however, the code pointer can have
7617         that value after the start, because it gets reset as code is discarded
7618         during the pre-compile. However, this can happen only at top level - if
7619         we are within parentheses, the starting BRA will still be present. At
7620         any parenthesis level, the length value can be used to test if anything
7621         has been compiled at that level. Thus, a test for both these conditions
7622         is necessary to ensure we correctly detect the start of the pattern in
7623         both phases.
7624
7625         If we are not at the pattern start, reset the greedy defaults and the
7626         case value for firstchar and reqchar. */
7627
7628         if (*ptr == CHAR_RIGHT_PARENTHESIS)
7629           {
7630           if (code == cd->start_code + 1 + LINK_SIZE &&
7631                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
7632             {
7633             cd->external_options = newoptions;
7634             }
7635           else
7636             {
7637             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
7638             greedy_non_default = greedy_default ^ 1;
7639             req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
7640             }
7641
7642           /* Change options at this level, and pass them back for use
7643           in subsequent branches. */
7644
7645           *optionsptr = options = newoptions;
7646           previous = NULL;       /* This item can't be repeated */
7647           continue;              /* It is complete */
7648           }
7649
7650         /* If the options ended with ':' we are heading into a nested group
7651         with possible change of options. Such groups are non-capturing and are
7652         not assertions of any kind. All we need to do is skip over the ':';
7653         the newoptions value is handled below. */
7654
7655         bravalue = OP_BRA;
7656         ptr++;
7657         }     /* End of switch for character following (? */
7658       }       /* End of (? handling */
7659
7660     /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
7661     is set, all unadorned brackets become non-capturing and behave like (?:...)
7662     brackets. */
7663
7664     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
7665       {
7666       bravalue = OP_BRA;
7667       }
7668
7669     /* Else we have a capturing group. */
7670
7671     else
7672       {
7673       NUMBERED_GROUP:
7674       cd->bracount += 1;
7675       PUT2(code, 1+LINK_SIZE, cd->bracount);
7676       skipbytes = IMM2_SIZE;
7677       }
7678
7679     /* Process nested bracketed regex. First check for parentheses nested too
7680     deeply. */
7681
7682     if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT)
7683       {
7684       *errorcodeptr = ERR82;
7685       goto FAILED;
7686       }
7687
7688     /* All assertions used not to be repeatable, but this was changed for Perl
7689     compatibility. All kinds can now be repeated except for assertions that are
7690     conditions (Perl also forbids these to be repeated). We copy code into a
7691     non-register variable (tempcode) in order to be able to pass its address
7692     because some compilers complain otherwise. At the start of a conditional
7693     group whose condition is an assertion, cd->iscondassert is set. We unset it
7694     here so as to allow assertions later in the group to be quantified. */
7695
7696     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT &&
7697         cd->iscondassert)
7698       {
7699       previous = NULL;
7700       cd->iscondassert = FALSE;
7701       }
7702     else
7703       {
7704       previous = code;
7705       item_hwm_offset = cd->hwm - cd->start_workspace;
7706       }
7707
7708     *code = bravalue;
7709     tempcode = code;
7710     tempreqvary = cd->req_varyopt;        /* Save value before bracket */
7711     tempbracount = cd->bracount;          /* Save value before bracket */
7712     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
7713
7714     if (!compile_regex(
7715          newoptions,                      /* The complete new option state */
7716          &tempcode,                       /* Where to put code (updated) */
7717          &ptr,                            /* Input pointer (updated) */
7718          errorcodeptr,                    /* Where to put an error message */
7719          (bravalue == OP_ASSERTBACK ||
7720           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
7721          reset_bracount,                  /* True if (?| group */
7722          skipbytes,                       /* Skip over bracket number */
7723          cond_depth +
7724            ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
7725          &subfirstchar,                   /* For possible first char */
7726          &subfirstcharflags,
7727          &subreqchar,                     /* For possible last char */
7728          &subreqcharflags,
7729          bcptr,                           /* Current branch chain */
7730          cd,                              /* Tables block */
7731          (lengthptr == NULL)? NULL :      /* Actual compile phase */
7732            &length_prevgroup              /* Pre-compile phase */
7733          ))
7734       goto FAILED;
7735
7736     cd->parens_depth -= 1;
7737
7738     /* If this was an atomic group and there are no capturing groups within it,
7739     generate OP_ONCE_NC instead of OP_ONCE. */
7740
7741     if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
7742       *code = OP_ONCE_NC;
7743
7744     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
7745       cd->assert_depth -= 1;
7746
7747     /* At the end of compiling, code is still pointing to the start of the
7748     group, while tempcode has been updated to point past the end of the group.
7749     The pattern pointer (ptr) is on the bracket.
7750
7751     If this is a conditional bracket, check that there are no more than
7752     two branches in the group, or just one if it's a DEFINE group. We do this
7753     in the real compile phase, not in the pre-pass, where the whole group may
7754     not be available. */
7755
7756     if (bravalue == OP_COND && lengthptr == NULL)
7757       {
7758       pcre_uchar *tc = code;
7759       int condcount = 0;
7760
7761       do {
7762          condcount++;
7763          tc += GET(tc,1);
7764          }
7765       while (*tc != OP_KET);
7766
7767       /* A DEFINE group is never obeyed inline (the "condition" is always
7768       false). It must have only one branch. */
7769
7770       if (code[LINK_SIZE+1] == OP_DEF)
7771         {
7772         if (condcount > 1)
7773           {
7774           *errorcodeptr = ERR54;
7775           goto FAILED;
7776           }
7777         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
7778         }
7779
7780       /* A "normal" conditional group. If there is just one branch, we must not
7781       make use of its firstchar or reqchar, because this is equivalent to an
7782       empty second branch. */
7783
7784       else
7785         {
7786         if (condcount > 2)
7787           {
7788           *errorcodeptr = ERR27;
7789           goto FAILED;
7790           }
7791         if (condcount == 1) subfirstcharflags = subreqcharflags = REQ_NONE;
7792         }
7793       }
7794
7795     /* Error if hit end of pattern */
7796
7797     if (*ptr != CHAR_RIGHT_PARENTHESIS)
7798       {
7799       *errorcodeptr = ERR14;
7800       goto FAILED;
7801       }
7802
7803     /* In the pre-compile phase, update the length by the length of the group,
7804     less the brackets at either end. Then reduce the compiled code to just a
7805     set of non-capturing brackets so that it doesn't use much memory if it is
7806     duplicated by a quantifier.*/
7807
7808     if (lengthptr != NULL)
7809       {
7810       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
7811         {
7812         *errorcodeptr = ERR20;
7813         goto FAILED;
7814         }
7815       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
7816       code++;   /* This already contains bravalue */
7817       PUTINC(code, 0, 1 + LINK_SIZE);
7818       *code++ = OP_KET;
7819       PUTINC(code, 0, 1 + LINK_SIZE);
7820       break;    /* No need to waste time with special character handling */
7821       }
7822
7823     /* Otherwise update the main code pointer to the end of the group. */
7824
7825     code = tempcode;
7826
7827     /* For a DEFINE group, required and first character settings are not
7828     relevant. */
7829
7830     if (bravalue == OP_DEF) break;
7831
7832     /* Handle updating of the required and first characters for other types of
7833     group. Update for normal brackets of all kinds, and conditions with two
7834     branches (see code above). If the bracket is followed by a quantifier with
7835     zero repeat, we have to back off. Hence the definition of zeroreqchar and
7836     zerofirstchar outside the main loop so that they can be accessed for the
7837     back off. */
7838
7839     zeroreqchar = reqchar;
7840     zeroreqcharflags = reqcharflags;
7841     zerofirstchar = firstchar;
7842     zerofirstcharflags = firstcharflags;
7843     groupsetfirstchar = FALSE;
7844
7845     if (bravalue >= OP_ONCE)
7846       {
7847       /* If we have not yet set a firstchar in this branch, take it from the
7848       subpattern, remembering that it was set here so that a repeat of more
7849       than one can replicate it as reqchar if necessary. If the subpattern has
7850       no firstchar, set "none" for the whole branch. In both cases, a zero
7851       repeat forces firstchar to "none". */
7852
7853       if (firstcharflags == REQ_UNSET)
7854         {
7855         if (subfirstcharflags >= 0)
7856           {
7857           firstchar = subfirstchar;
7858           firstcharflags = subfirstcharflags;
7859           groupsetfirstchar = TRUE;
7860           }
7861         else firstcharflags = REQ_NONE;
7862         zerofirstcharflags = REQ_NONE;
7863         }
7864
7865       /* If firstchar was previously set, convert the subpattern's firstchar
7866       into reqchar if there wasn't one, using the vary flag that was in
7867       existence beforehand. */
7868
7869       else if (subfirstcharflags >= 0 && subreqcharflags < 0)
7870         {
7871         subreqchar = subfirstchar;
7872         subreqcharflags = subfirstcharflags | tempreqvary;
7873         }
7874
7875       /* If the subpattern set a required byte (or set a first byte that isn't
7876       really the first byte - see above), set it. */
7877
7878       if (subreqcharflags >= 0)
7879         {
7880         reqchar = subreqchar;
7881         reqcharflags = subreqcharflags;
7882         }
7883       }
7884
7885     /* For a forward assertion, we take the reqchar, if set. This can be
7886     helpful if the pattern that follows the assertion doesn't set a different
7887     char. For example, it's useful for /(?=abcde).+/. We can't set firstchar
7888     for an assertion, however because it leads to incorrect effect for patterns
7889     such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead
7890     of a firstchar. This is overcome by a scan at the end if there's no
7891     firstchar, looking for an asserted first char. */
7892
7893     else if (bravalue == OP_ASSERT && subreqcharflags >= 0)
7894       {
7895       reqchar = subreqchar;
7896       reqcharflags = subreqcharflags;
7897       }
7898     break;     /* End of processing '(' */
7899
7900
7901     /* ===================================================================*/
7902     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
7903     are arranged to be the negation of the corresponding OP_values in the
7904     default case when PCRE_UCP is not set. For the back references, the values
7905     are negative the reference number. Only back references and those types
7906     that consume a character may be repeated. We can test for values between
7907     ESC_b and ESC_Z for the latter; this may have to change if any new ones are
7908     ever created. */
7909
7910     case CHAR_BACKSLASH:
7911     tempptr = ptr;
7912     escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, FALSE);
7913     if (*errorcodeptr != 0) goto FAILED;
7914
7915     if (escape == 0)                  /* The escape coded a single character */
7916       c = ec;
7917     else
7918       {
7919       if (escape == ESC_Q)            /* Handle start of quoted string */
7920         {
7921         if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
7922           ptr += 2;               /* avoid empty string */
7923             else inescq = TRUE;
7924         continue;
7925         }
7926
7927       if (escape == ESC_E) continue;  /* Perl ignores an orphan \E */
7928
7929       /* For metasequences that actually match a character, we disable the
7930       setting of a first character if it hasn't already been set. */
7931
7932       if (firstcharflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
7933         firstcharflags = REQ_NONE;
7934
7935       /* Set values to reset to if this is followed by a zero repeat. */
7936
7937       zerofirstchar = firstchar;
7938       zerofirstcharflags = firstcharflags;
7939       zeroreqchar = reqchar;
7940       zeroreqcharflags = reqcharflags;
7941
7942       /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
7943       is a subroutine call by number (Oniguruma syntax). In fact, the value
7944       ESC_g is returned only for these cases. So we don't need to check for <
7945       or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
7946       -n, and for the Perl syntax \g{name} the result is ESC_k (as
7947       that is a synonym for a named back reference). */
7948
7949       if (escape == ESC_g)
7950         {
7951         const pcre_uchar *p;
7952         pcre_uint32 cf;
7953
7954         item_hwm_offset = cd->hwm - cd->start_workspace;   /* Normally this is set when '(' is read */
7955         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7956           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7957
7958         /* These two statements stop the compiler from warning about possibly
7959         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
7960         fact, because we do the check for a number below, the paths that
7961         would actually be in error are never taken. */
7962
7963         skipbytes = 0;
7964         reset_bracount = FALSE;
7965
7966         /* If it's not a signed or unsigned number, treat it as a name. */
7967
7968         cf = ptr[1];
7969         if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
7970           {
7971           is_recurse = TRUE;
7972           goto NAMED_REF_OR_RECURSE;
7973           }
7974
7975         /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
7976         or a digit. */
7977
7978         p = ptr + 2;
7979         while (IS_DIGIT(*p)) p++;
7980         if (*p != (pcre_uchar)terminator)
7981           {
7982           *errorcodeptr = ERR57;
7983           goto FAILED;
7984           }
7985         ptr++;
7986         goto HANDLE_NUMERICAL_RECURSION;
7987         }
7988
7989       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
7990       We also support \k{name} (.NET syntax).  */
7991
7992       if (escape == ESC_k)
7993         {
7994         if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
7995           ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
7996           {
7997           *errorcodeptr = ERR69;
7998           goto FAILED;
7999           }
8000         is_recurse = FALSE;
8001         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
8002           CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
8003           CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
8004         goto NAMED_REF_OR_RECURSE;
8005         }
8006
8007       /* Back references are handled specially; must disable firstchar if
8008       not set to cope with cases like (?=(\w+))\1: which would otherwise set
8009       ':' later. */
8010
8011       if (escape < 0)
8012         {
8013         open_capitem *oc;
8014         recno = -escape;
8015
8016         /* Come here from named backref handling when the reference is to a
8017         single group (i.e. not to a duplicated name). */
8018
8019         HANDLE_REFERENCE:
8020         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
8021         previous = code;
8022         item_hwm_offset = cd->hwm - cd->start_workspace;
8023         *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
8024         PUT2INC(code, 0, recno);
8025         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
8026         if (recno > cd->top_backref) cd->top_backref = recno;
8027
8028         /* Check to see if this back reference is recursive, that is, it
8029         is inside the group that it references. A flag is set so that the
8030         group can be made atomic. */
8031
8032         for (oc = cd->open_caps; oc != NULL; oc = oc->next)
8033           {
8034           if (oc->number == recno)
8035             {
8036             oc->flag = TRUE;
8037             break;
8038             }
8039           }
8040         }
8041
8042       /* So are Unicode property matches, if supported. */
8043
8044 #ifdef SUPPORT_UCP
8045       else if (escape == ESC_P || escape == ESC_p)
8046         {
8047         BOOL negated;
8048         unsigned int ptype = 0, pdata = 0;
8049         if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
8050           goto FAILED;
8051         previous = code;
8052         item_hwm_offset = cd->hwm - cd->start_workspace;
8053         *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
8054         *code++ = ptype;
8055         *code++ = pdata;
8056         }
8057 #else
8058
8059       /* If Unicode properties are not supported, \X, \P, and \p are not
8060       allowed. */
8061
8062       else if (escape == ESC_X || escape == ESC_P || escape == ESC_p)
8063         {
8064         *errorcodeptr = ERR45;
8065         goto FAILED;
8066         }
8067 #endif
8068
8069       /* For the rest (including \X when Unicode properties are supported), we
8070       can obtain the OP value by negating the escape value in the default
8071       situation when PCRE_UCP is not set. When it *is* set, we substitute
8072       Unicode property tests. Note that \b and \B do a one-character
8073       lookbehind, and \A also behaves as if it does. */
8074
8075       else
8076         {
8077         if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) &&
8078              cd->max_lookbehind == 0)
8079           cd->max_lookbehind = 1;
8080 #ifdef SUPPORT_UCP
8081         if (escape >= ESC_DU && escape <= ESC_wu)
8082           {
8083           nestptr = ptr + 1;                   /* Where to resume */
8084           ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
8085           }
8086         else
8087 #endif
8088         /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
8089         so that it works in DFA mode and in lookbehinds. */
8090
8091           {
8092           previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
8093           item_hwm_offset = cd->hwm - cd->start_workspace;
8094           *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
8095           }
8096         }
8097       continue;
8098       }
8099
8100     /* We have a data character whose value is in c. In UTF-8 mode it may have
8101     a value > 127. We set its representation in the length/buffer, and then
8102     handle it as a data character. */
8103
8104 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
8105     if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
8106       mclength = PRIV(ord2utf)(c, mcbuffer);
8107     else
8108 #endif
8109
8110      {
8111      mcbuffer[0] = c;
8112      mclength = 1;
8113      }
8114     goto ONE_CHAR;
8115
8116
8117     /* ===================================================================*/
8118     /* Handle a literal character. It is guaranteed not to be whitespace or #
8119     when the extended flag is set. If we are in a UTF mode, it may be a
8120     multi-unit literal character. */
8121
8122     default:
8123     NORMAL_CHAR:
8124     mclength = 1;
8125     mcbuffer[0] = c;
8126
8127 #ifdef SUPPORT_UTF
8128     if (utf && HAS_EXTRALEN(c))
8129       ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
8130 #endif
8131
8132     /* At this point we have the character's bytes in mcbuffer, and the length
8133     in mclength. When not in UTF-8 mode, the length is always 1. */
8134
8135     ONE_CHAR:
8136     previous = code;
8137     item_hwm_offset = cd->hwm - cd->start_workspace;
8138
8139     /* For caseless UTF-8 mode when UCP support is available, check whether
8140     this character has more than one other case. If so, generate a special
8141     OP_PROP item instead of OP_CHARI. */
8142
8143 #ifdef SUPPORT_UCP
8144     if (utf && (options & PCRE_CASELESS) != 0)
8145       {
8146       GETCHAR(c, mcbuffer);
8147       if ((c = UCD_CASESET(c)) != 0)
8148         {
8149         *code++ = OP_PROP;
8150         *code++ = PT_CLIST;
8151         *code++ = c;
8152         if (firstcharflags == REQ_UNSET)
8153           firstcharflags = zerofirstcharflags = REQ_NONE;
8154         break;
8155         }
8156       }
8157 #endif
8158
8159     /* Caseful matches, or not one of the multicase characters. */
8160
8161     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8162     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
8163
8164     /* Remember if \r or \n were seen */
8165
8166     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8167       cd->external_flags |= PCRE_HASCRORLF;
8168
8169     /* Set the first and required bytes appropriately. If no previous first
8170     byte, set it from this character, but revert to none on a zero repeat.
8171     Otherwise, leave the firstchar value alone, and don't change it on a zero
8172     repeat. */
8173
8174     if (firstcharflags == REQ_UNSET)
8175       {
8176       zerofirstcharflags = REQ_NONE;
8177       zeroreqchar = reqchar;
8178       zeroreqcharflags = reqcharflags;
8179
8180       /* If the character is more than one byte long, we can set firstchar
8181       only if it is not to be matched caselessly. */
8182
8183       if (mclength == 1 || req_caseopt == 0)
8184         {
8185         firstchar = mcbuffer[0] | req_caseopt;
8186         firstchar = mcbuffer[0];
8187         firstcharflags = req_caseopt;
8188
8189         if (mclength != 1)
8190           {
8191           reqchar = code[-1];
8192           reqcharflags = cd->req_varyopt;
8193           }
8194         }
8195       else firstcharflags = reqcharflags = REQ_NONE;
8196       }
8197
8198     /* firstchar was previously set; we can set reqchar only if the length is
8199     1 or the matching is caseful. */
8200
8201     else
8202       {
8203       zerofirstchar = firstchar;
8204       zerofirstcharflags = firstcharflags;
8205       zeroreqchar = reqchar;
8206       zeroreqcharflags = reqcharflags;
8207       if (mclength == 1 || req_caseopt == 0)
8208         {
8209         reqchar = code[-1];
8210         reqcharflags = req_caseopt | cd->req_varyopt;
8211         }
8212       }
8213
8214     break;            /* End of literal character handling */
8215     }
8216   }                   /* end of big loop */
8217
8218
8219 /* Control never reaches here by falling through, only by a goto for all the
8220 error states. Pass back the position in the pattern so that it can be displayed
8221 to the user for diagnosing the error. */
8222
8223 FAILED:
8224 *ptrptr = ptr;
8225 return FALSE;
8226 }
8227
8228
8229
8230 /*************************************************
8231 *     Compile sequence of alternatives           *
8232 *************************************************/
8233
8234 /* On entry, ptr is pointing past the bracket character, but on return it
8235 points to the closing bracket, or vertical bar, or end of string. The code
8236 variable is pointing at the byte into which the BRA operator has been stored.
8237 This function is used during the pre-compile phase when we are trying to find
8238 out the amount of memory needed, as well as during the real compile phase. The
8239 value of lengthptr distinguishes the two phases.
8240
8241 Arguments:
8242   options           option bits, including any changes for this subpattern
8243   codeptr           -> the address of the current code pointer
8244   ptrptr            -> the address of the current pattern pointer
8245   errorcodeptr      -> pointer to error code variable
8246   lookbehind        TRUE if this is a lookbehind assertion
8247   reset_bracount    TRUE to reset the count for each branch
8248   skipbytes         skip this many bytes at start (for brackets and OP_COND)
8249   cond_depth        depth of nesting for conditional subpatterns
8250   firstcharptr      place to put the first required character
8251   firstcharflagsptr place to put the first character flags, or a negative number
8252   reqcharptr        place to put the last required character
8253   reqcharflagsptr   place to put the last required character flags, or a negative number
8254   bcptr             pointer to the chain of currently open branches
8255   cd                points to the data block with tables pointers etc.
8256   lengthptr         NULL during the real compile phase
8257                     points to length accumulator during pre-compile phase
8258
8259 Returns:            TRUE on success
8260 */
8261
8262 static BOOL
8263 compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
8264   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
8265   int cond_depth,
8266   pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
8267   pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
8268   branch_chain *bcptr, compile_data *cd, int *lengthptr)
8269 {
8270 const pcre_uchar *ptr = *ptrptr;
8271 pcre_uchar *code = *codeptr;
8272 pcre_uchar *last_branch = code;
8273 pcre_uchar *start_bracket = code;
8274 pcre_uchar *reverse_count = NULL;
8275 open_capitem capitem;
8276 int capnumber = 0;
8277 pcre_uint32 firstchar, reqchar;
8278 pcre_int32 firstcharflags, reqcharflags;
8279 pcre_uint32 branchfirstchar, branchreqchar;
8280 pcre_int32 branchfirstcharflags, branchreqcharflags;
8281 int length;
8282 unsigned int orig_bracount;
8283 unsigned int max_bracount;
8284 branch_chain bc;
8285 size_t save_hwm_offset;
8286
8287 /* If set, call the external function that checks for stack availability. */
8288
8289 if (PUBL(stack_guard) != NULL && PUBL(stack_guard)())
8290   {
8291   *errorcodeptr= ERR85;
8292   return FALSE;
8293   }
8294
8295 /* Miscellaneous initialization */
8296
8297 bc.outer = bcptr;
8298 bc.current_branch = code;
8299
8300 firstchar = reqchar = 0;
8301 firstcharflags = reqcharflags = REQ_UNSET;
8302
8303 save_hwm_offset = cd->hwm - cd->start_workspace;
8304
8305 /* Accumulate the length for use in the pre-compile phase. Start with the
8306 length of the BRA and KET and any extra bytes that are required at the
8307 beginning. We accumulate in a local variable to save frequent testing of
8308 lengthptr for NULL. We cannot do this by looking at the value of code at the
8309 start and end of each alternative, because compiled items are discarded during
8310 the pre-compile phase so that the work space is not exceeded. */
8311
8312 length = 2 + 2*LINK_SIZE + skipbytes;
8313
8314 /* WARNING: If the above line is changed for any reason, you must also change
8315 the code that abstracts option settings at the start of the pattern and makes
8316 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
8317 pre-compile phase to find out whether anything has yet been compiled or not. */
8318
8319 /* If this is a capturing subpattern, add to the chain of open capturing items
8320 so that we can detect them if (*ACCEPT) is encountered. This is also used to
8321 detect groups that contain recursive back references to themselves. Note that
8322 only OP_CBRA need be tested here; changing this opcode to one of its variants,
8323 e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
8324
8325 if (*code == OP_CBRA)
8326   {
8327   capnumber = GET2(code, 1 + LINK_SIZE);
8328   capitem.number = capnumber;
8329   capitem.next = cd->open_caps;
8330   capitem.flag = FALSE;
8331   cd->open_caps = &capitem;
8332   }
8333
8334 /* Offset is set zero to mark that this bracket is still open */
8335
8336 PUT(code, 1, 0);
8337 code += 1 + LINK_SIZE + skipbytes;
8338
8339 /* Loop for each alternative branch */
8340
8341 orig_bracount = max_bracount = cd->bracount;
8342 for (;;)
8343   {
8344   /* For a (?| group, reset the capturing bracket count so that each branch
8345   uses the same numbers. */
8346
8347   if (reset_bracount) cd->bracount = orig_bracount;
8348
8349   /* Set up dummy OP_REVERSE if lookbehind assertion */
8350
8351   if (lookbehind)
8352     {
8353     *code++ = OP_REVERSE;
8354     reverse_count = code;
8355     PUTINC(code, 0, 0);
8356     length += 1 + LINK_SIZE;
8357     }
8358
8359   /* Now compile the branch; in the pre-compile phase its length gets added
8360   into the length. */
8361
8362   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
8363         &branchfirstcharflags, &branchreqchar, &branchreqcharflags, &bc,
8364         cond_depth, cd, (lengthptr == NULL)? NULL : &length))
8365     {
8366     *ptrptr = ptr;
8367     return FALSE;
8368     }
8369
8370   /* Keep the highest bracket count in case (?| was used and some branch
8371   has fewer than the rest. */
8372
8373   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
8374
8375   /* In the real compile phase, there is some post-processing to be done. */
8376
8377   if (lengthptr == NULL)
8378     {
8379     /* If this is the first branch, the firstchar and reqchar values for the
8380     branch become the values for the regex. */
8381
8382     if (*last_branch != OP_ALT)
8383       {
8384       firstchar = branchfirstchar;
8385       firstcharflags = branchfirstcharflags;
8386       reqchar = branchreqchar;
8387       reqcharflags = branchreqcharflags;
8388       }
8389
8390     /* If this is not the first branch, the first char and reqchar have to
8391     match the values from all the previous branches, except that if the
8392     previous value for reqchar didn't have REQ_VARY set, it can still match,
8393     and we set REQ_VARY for the regex. */
8394
8395     else
8396       {
8397       /* If we previously had a firstchar, but it doesn't match the new branch,
8398       we have to abandon the firstchar for the regex, but if there was
8399       previously no reqchar, it takes on the value of the old firstchar. */
8400
8401       if (firstcharflags >= 0 &&
8402           (firstcharflags != branchfirstcharflags || firstchar != branchfirstchar))
8403         {
8404         if (reqcharflags < 0)
8405           {
8406           reqchar = firstchar;
8407           reqcharflags = firstcharflags;
8408           }
8409         firstcharflags = REQ_NONE;
8410         }
8411
8412       /* If we (now or from before) have no firstchar, a firstchar from the
8413       branch becomes a reqchar if there isn't a branch reqchar. */
8414
8415       if (firstcharflags < 0 && branchfirstcharflags >= 0 && branchreqcharflags < 0)
8416         {
8417         branchreqchar = branchfirstchar;
8418         branchreqcharflags = branchfirstcharflags;
8419         }
8420
8421       /* Now ensure that the reqchars match */
8422
8423       if (((reqcharflags & ~REQ_VARY) != (branchreqcharflags & ~REQ_VARY)) ||
8424           reqchar != branchreqchar)
8425         reqcharflags = REQ_NONE;
8426       else
8427         {
8428         reqchar = branchreqchar;
8429         reqcharflags |= branchreqcharflags; /* To "or" REQ_VARY */
8430         }
8431       }
8432
8433     /* If lookbehind, check that this branch matches a fixed-length string, and
8434     put the length into the OP_REVERSE item. Temporarily mark the end of the
8435     branch with OP_END. If the branch contains OP_RECURSE, the result is -3
8436     because there may be forward references that we can't check here. Set a
8437     flag to cause another lookbehind check at the end. Why not do it all at the
8438     end? Because common, erroneous checks are picked up here and the offset of
8439     the problem can be shown. */
8440
8441     if (lookbehind)
8442       {
8443       int fixed_length;
8444       *code = OP_END;
8445       fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
8446         FALSE, cd, NULL);
8447       DPRINTF(("fixed length = %d\n", fixed_length));
8448       if (fixed_length == -3)
8449         {
8450         cd->check_lookbehind = TRUE;
8451         }
8452       else if (fixed_length < 0)
8453         {
8454         *errorcodeptr = (fixed_length == -2)? ERR36 :
8455                         (fixed_length == -4)? ERR70: ERR25;
8456         *ptrptr = ptr;
8457         return FALSE;
8458         }
8459       else
8460         {
8461         if (fixed_length > cd->max_lookbehind)
8462           cd->max_lookbehind = fixed_length;
8463         PUT(reverse_count, 0, fixed_length);
8464         }
8465       }
8466     }
8467
8468   /* Reached end of expression, either ')' or end of pattern. In the real
8469   compile phase, go back through the alternative branches and reverse the chain
8470   of offsets, with the field in the BRA item now becoming an offset to the
8471   first alternative. If there are no alternatives, it points to the end of the
8472   group. The length in the terminating ket is always the length of the whole
8473   bracketed item. Return leaving the pointer at the terminating char. */
8474
8475   if (*ptr != CHAR_VERTICAL_LINE)
8476     {
8477     if (lengthptr == NULL)
8478       {
8479       int branch_length = (int)(code - last_branch);
8480       do
8481         {
8482         int prev_length = GET(last_branch, 1);
8483         PUT(last_branch, 1, branch_length);
8484         branch_length = prev_length;
8485         last_branch -= branch_length;
8486         }
8487       while (branch_length > 0);
8488       }
8489
8490     /* Fill in the ket */
8491
8492     *code = OP_KET;
8493     PUT(code, 1, (int)(code - start_bracket));
8494     code += 1 + LINK_SIZE;
8495
8496     /* If it was a capturing subpattern, check to see if it contained any
8497     recursive back references. If so, we must wrap it in atomic brackets.
8498     Because we are moving code along, we must ensure that any pending recursive
8499     references are updated. In any event, remove the block from the chain. */
8500
8501     if (capnumber > 0)
8502       {
8503       if (cd->open_caps->flag)
8504         {
8505         *code = OP_END;
8506         adjust_recurse(start_bracket, 1 + LINK_SIZE,
8507           (options & PCRE_UTF8) != 0, cd, save_hwm_offset);
8508         memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
8509           IN_UCHARS(code - start_bracket));
8510         *start_bracket = OP_ONCE;
8511         code += 1 + LINK_SIZE;
8512         PUT(start_bracket, 1, (int)(code - start_bracket));
8513         *code = OP_KET;
8514         PUT(code, 1, (int)(code - start_bracket));
8515         code += 1 + LINK_SIZE;
8516         length += 2 + 2*LINK_SIZE;
8517         }
8518       cd->open_caps = cd->open_caps->next;
8519       }
8520
8521     /* Retain the highest bracket number, in case resetting was used. */
8522
8523     cd->bracount = max_bracount;
8524
8525     /* Set values to pass back */
8526
8527     *codeptr = code;
8528     *ptrptr = ptr;
8529     *firstcharptr = firstchar;
8530     *firstcharflagsptr = firstcharflags;
8531     *reqcharptr = reqchar;
8532     *reqcharflagsptr = reqcharflags;
8533     if (lengthptr != NULL)
8534       {
8535       if (OFLOW_MAX - *lengthptr < length)
8536         {
8537         *errorcodeptr = ERR20;
8538         return FALSE;
8539         }
8540       *lengthptr += length;
8541       }
8542     return TRUE;
8543     }
8544
8545   /* Another branch follows. In the pre-compile phase, we can move the code
8546   pointer back to where it was for the start of the first branch. (That is,
8547   pretend that each branch is the only one.)
8548
8549   In the real compile phase, insert an ALT node. Its length field points back
8550   to the previous branch while the bracket remains open. At the end the chain
8551   is reversed. It's done like this so that the start of the bracket has a
8552   zero offset until it is closed, making it possible to detect recursion. */
8553
8554   if (lengthptr != NULL)
8555     {
8556     code = *codeptr + 1 + LINK_SIZE + skipbytes;
8557     length += 1 + LINK_SIZE;
8558     }
8559   else
8560     {
8561     *code = OP_ALT;
8562     PUT(code, 1, (int)(code - last_branch));
8563     bc.current_branch = last_branch = code;
8564     code += 1 + LINK_SIZE;
8565     }
8566
8567   ptr++;
8568   }
8569 /* Control never reaches here */
8570 }
8571
8572
8573
8574
8575 /*************************************************
8576 *          Check for anchored expression         *
8577 *************************************************/
8578
8579 /* Try to find out if this is an anchored regular expression. Consider each
8580 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8581 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8582 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8583 be found, because ^ generates OP_CIRCM in that mode.
8584
8585 We can also consider a regex to be anchored if OP_SOM starts all its branches.
8586 This is the code for \G, which means "match at start of match position, taking
8587 into account the match offset".
8588
8589 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8590 because that will try the rest of the pattern at all possible matching points,
8591 so there is no point trying again.... er ....
8592
8593 .... except when the .* appears inside capturing parentheses, and there is a
8594 subsequent back reference to those parentheses. We haven't enough information
8595 to catch that case precisely.
8596
8597 At first, the best we could do was to detect when .* was in capturing brackets
8598 and the highest back reference was greater than or equal to that level.
8599 However, by keeping a bitmap of the first 31 back references, we can catch some
8600 of the more common cases more precisely.
8601
8602 ... A second exception is when the .* appears inside an atomic group, because
8603 this prevents the number of characters it matches from being adjusted.
8604
8605 Arguments:
8606   code           points to start of expression (the bracket)
8607   bracket_map    a bitmap of which brackets we are inside while testing; this
8608                   handles up to substring 31; after that we just have to take
8609                   the less precise approach
8610   cd             points to the compile data block
8611   atomcount      atomic group level
8612
8613 Returns:     TRUE or FALSE
8614 */
8615
8616 static BOOL
8617 is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
8618   compile_data *cd, int atomcount)
8619 {
8620 do {
8621    const pcre_uchar *scode = first_significant_code(
8622      code + PRIV(OP_lengths)[*code], FALSE);
8623    register int op = *scode;
8624
8625    /* Non-capturing brackets */
8626
8627    if (op == OP_BRA  || op == OP_BRAPOS ||
8628        op == OP_SBRA || op == OP_SBRAPOS)
8629      {
8630      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8631      }
8632
8633    /* Capturing brackets */
8634
8635    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8636             op == OP_SCBRA || op == OP_SCBRAPOS)
8637      {
8638      int n = GET2(scode, 1+LINK_SIZE);
8639      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8640      if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE;
8641      }
8642
8643    /* Positive forward assertions and conditions */
8644
8645    else if (op == OP_ASSERT || op == OP_COND)
8646      {
8647      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8648      }
8649
8650    /* Atomic groups */
8651
8652    else if (op == OP_ONCE || op == OP_ONCE_NC)
8653      {
8654      if (!is_anchored(scode, bracket_map, cd, atomcount + 1))
8655        return FALSE;
8656      }
8657
8658    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8659    it isn't in brackets that are or may be referenced or inside an atomic
8660    group. */
8661
8662    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8663              op == OP_TYPEPOSSTAR))
8664      {
8665      if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 ||
8666          atomcount > 0 || cd->had_pruneorskip)
8667        return FALSE;
8668      }
8669
8670    /* Check for explicit anchoring */
8671
8672    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8673
8674    code += GET(code, 1);
8675    }
8676 while (*code == OP_ALT);   /* Loop for each alternative */
8677 return TRUE;
8678 }
8679
8680
8681
8682 /*************************************************
8683 *         Check for starting with ^ or .*        *
8684 *************************************************/
8685
8686 /* This is called to find out if every branch starts with ^ or .* so that
8687 "first char" processing can be done to speed things up in multiline
8688 matching and for non-DOTALL patterns that start with .* (which must start at
8689 the beginning or after \n). As in the case of is_anchored() (see above), we
8690 have to take account of back references to capturing brackets that contain .*
8691 because in that case we can't make the assumption. Also, the appearance of .*
8692 inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
8693 count, because once again the assumption no longer holds.
8694
8695 Arguments:
8696   code           points to start of expression (the bracket)
8697   bracket_map    a bitmap of which brackets we are inside while testing; this
8698                   handles up to substring 31; after that we just have to take
8699                   the less precise approach
8700   cd             points to the compile data
8701   atomcount      atomic group level
8702
8703 Returns:         TRUE or FALSE
8704 */
8705
8706 static BOOL
8707 is_startline(const pcre_uchar *code, unsigned int bracket_map,
8708   compile_data *cd, int atomcount)
8709 {
8710 do {
8711    const pcre_uchar *scode = first_significant_code(
8712      code + PRIV(OP_lengths)[*code], FALSE);
8713    register int op = *scode;
8714
8715    /* If we are at the start of a conditional assertion group, *both* the
8716    conditional assertion *and* what follows the condition must satisfy the test
8717    for start of line. Other kinds of condition fail. Note that there may be an
8718    auto-callout at the start of a condition. */
8719
8720    if (op == OP_COND)
8721      {
8722      scode += 1 + LINK_SIZE;
8723      if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8724      switch (*scode)
8725        {
8726        case OP_CREF:
8727        case OP_DNCREF:
8728        case OP_RREF:
8729        case OP_DNRREF:
8730        case OP_DEF:
8731        case OP_FAIL:
8732        return FALSE;
8733
8734        default:     /* Assertion */
8735        if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8736        do scode += GET(scode, 1); while (*scode == OP_ALT);
8737        scode += 1 + LINK_SIZE;
8738        break;
8739        }
8740      scode = first_significant_code(scode, FALSE);
8741      op = *scode;
8742      }
8743
8744    /* Non-capturing brackets */
8745
8746    if (op == OP_BRA  || op == OP_BRAPOS ||
8747        op == OP_SBRA || op == OP_SBRAPOS)
8748      {
8749      if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8750      }
8751
8752    /* Capturing brackets */
8753
8754    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8755             op == OP_SCBRA || op == OP_SCBRAPOS)
8756      {
8757      int n = GET2(scode, 1+LINK_SIZE);
8758      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8759      if (!is_startline(scode, new_map, cd, atomcount)) return FALSE;
8760      }
8761
8762    /* Positive forward assertions */
8763
8764    else if (op == OP_ASSERT)
8765      {
8766      if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8767      }
8768
8769    /* Atomic brackets */
8770
8771    else if (op == OP_ONCE || op == OP_ONCE_NC)
8772      {
8773      if (!is_startline(scode, bracket_map, cd, atomcount + 1)) return FALSE;
8774      }
8775
8776    /* .* means "start at start or after \n" if it isn't in atomic brackets or
8777    brackets that may be referenced, as long as the pattern does not contain
8778    *PRUNE or *SKIP, because these break the feature. Consider, for example,
8779    /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
8780    start of a line. */
8781
8782    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8783      {
8784      if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
8785          atomcount > 0 || cd->had_pruneorskip)
8786        return FALSE;
8787      }
8788
8789    /* Check for explicit circumflex; anything else gives a FALSE result. Note
8790    in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC
8791    because the number of characters matched by .* cannot be adjusted inside
8792    them. */
8793
8794    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8795
8796    /* Move on to the next alternative */
8797
8798    code += GET(code, 1);
8799    }
8800 while (*code == OP_ALT);  /* Loop for each alternative */
8801 return TRUE;
8802 }
8803
8804
8805
8806 /*************************************************
8807 *       Check for asserted fixed first char      *
8808 *************************************************/
8809
8810 /* During compilation, the "first char" settings from forward assertions are
8811 discarded, because they can cause conflicts with actual literals that follow.
8812 However, if we end up without a first char setting for an unanchored pattern,
8813 it is worth scanning the regex to see if there is an initial asserted first
8814 char. If all branches start with the same asserted char, or with a
8815 non-conditional bracket all of whose alternatives start with the same asserted
8816 char (recurse ad lib), then we return that char, with the flags set to zero or
8817 REQ_CASELESS; otherwise return zero with REQ_NONE in the flags.
8818
8819 Arguments:
8820   code       points to start of expression (the bracket)
8821   flags      points to the first char flags, or to REQ_NONE
8822   inassert   TRUE if in an assertion
8823
8824 Returns:     the fixed first char, or 0 with REQ_NONE in flags
8825 */
8826
8827 static pcre_uint32
8828 find_firstassertedchar(const pcre_uchar *code, pcre_int32 *flags,
8829   BOOL inassert)
8830 {
8831 register pcre_uint32 c = 0;
8832 int cflags = REQ_NONE;
8833
8834 *flags = REQ_NONE;
8835 do {
8836    pcre_uint32 d;
8837    int dflags;
8838    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8839              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8840    const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl,
8841      TRUE);
8842    register pcre_uchar op = *scode;
8843
8844    switch(op)
8845      {
8846      default:
8847      return 0;
8848
8849      case OP_BRA:
8850      case OP_BRAPOS:
8851      case OP_CBRA:
8852      case OP_SCBRA:
8853      case OP_CBRAPOS:
8854      case OP_SCBRAPOS:
8855      case OP_ASSERT:
8856      case OP_ONCE:
8857      case OP_ONCE_NC:
8858      d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);
8859      if (dflags < 0)
8860        return 0;
8861      if (cflags < 0) { c = d; cflags = dflags; } else if (c != d || cflags != dflags) return 0;
8862      break;
8863
8864      case OP_EXACT:
8865      scode += IMM2_SIZE;
8866      /* Fall through */
8867
8868      case OP_CHAR:
8869      case OP_PLUS:
8870      case OP_MINPLUS:
8871      case OP_POSPLUS:
8872      if (!inassert) return 0;
8873      if (cflags < 0) { c = scode[1]; cflags = 0; }
8874        else if (c != scode[1]) return 0;
8875      break;
8876
8877      case OP_EXACTI:
8878      scode += IMM2_SIZE;
8879      /* Fall through */
8880
8881      case OP_CHARI:
8882      case OP_PLUSI:
8883      case OP_MINPLUSI:
8884      case OP_POSPLUSI:
8885      if (!inassert) return 0;
8886      if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8887        else if (c != scode[1]) return 0;
8888      break;
8889      }
8890
8891    code += GET(code, 1);
8892    }
8893 while (*code == OP_ALT);
8894
8895 *flags = cflags;
8896 return c;
8897 }
8898
8899
8900
8901 /*************************************************
8902 *     Add an entry to the name/number table      *
8903 *************************************************/
8904
8905 /* This function is called between compiling passes to add an entry to the
8906 name/number table, maintaining alphabetical order. Checking for permitted
8907 and forbidden duplicates has already been done.
8908
8909 Arguments:
8910   cd           the compile data block
8911   name         the name to add
8912   length       the length of the name
8913   groupno      the group number
8914
8915 Returns:       nothing
8916 */
8917
8918 static void
8919 add_name(compile_data *cd, const pcre_uchar *name, int length,
8920   unsigned int groupno)
8921 {
8922 int i;
8923 pcre_uchar *slot = cd->name_table;
8924
8925 for (i = 0; i < cd->names_found; i++)
8926   {
8927   int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(length));
8928   if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8929     crc = -1; /* Current name is a substring */
8930
8931   /* Make space in the table and break the loop for an earlier name. For a
8932   duplicate or later name, carry on. We do this for duplicates so that in the
8933   simple case (when ?(| is not used) they are in order of their numbers. In all
8934   cases they are in the order in which they appear in the pattern. */
8935
8936   if (crc < 0)
8937     {
8938     memmove(slot + cd->name_entry_size, slot,
8939       IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
8940     break;
8941     }
8942
8943   /* Continue the loop for a later or duplicate name */
8944
8945   slot += cd->name_entry_size;
8946   }
8947
8948 PUT2(slot, 0, groupno);
8949 memcpy(slot + IMM2_SIZE, name, IN_UCHARS(length));
8950 slot[IMM2_SIZE + length] = 0;
8951 cd->names_found++;
8952 }
8953
8954
8955
8956 /*************************************************
8957 *        Compile a Regular Expression            *
8958 *************************************************/
8959
8960 /* This function takes a string and returns a pointer to a block of store
8961 holding a compiled version of the expression. The original API for this
8962 function had no error code return variable; it is retained for backwards
8963 compatibility. The new function is given a new name.
8964
8965 Arguments:
8966   pattern       the regular expression
8967   options       various option bits
8968   errorcodeptr  pointer to error code variable (pcre_compile2() only)
8969                   can be NULL if you don't want a code value
8970   errorptr      pointer to pointer to error text
8971   erroroffset   ptr offset in pattern where error was detected
8972   tables        pointer to character tables or NULL
8973
8974 Returns:        pointer to compiled data block, or NULL on error,
8975                 with errorptr and erroroffset set
8976 */
8977
/* The original entry point has no error code variable, so it simply hands
everything on to the newer function with NULL in that position. At most one of
the variants below is compiled, selected by the code unit width that the
library is being built for. */

#if defined(COMPILE_PCRE8)
PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile(const char *pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}
#elif defined(COMPILE_PCRE16)
PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}
#elif defined(COMPILE_PCRE32)
PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
pcre32_compile(PCRE_SPTR32 pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre32_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}
#endif
9000
9001
9002 #if defined COMPILE_PCRE8
9003 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
9004 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
9005   const char **errorptr, int *erroroffset, const unsigned char *tables)
9006 #elif defined COMPILE_PCRE16
9007 PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
9008 pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
9009   const char **errorptr, int *erroroffset, const unsigned char *tables)
9010 #elif defined COMPILE_PCRE32
9011 PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
9012 pcre32_compile2(PCRE_SPTR32 pattern, int options, int *errorcodeptr,
9013   const char **errorptr, int *erroroffset, const unsigned char *tables)
9014 #endif
9015 {
9016 REAL_PCRE *re;
9017 int length = 1;  /* For final END opcode */
9018 pcre_int32 firstcharflags, reqcharflags;
9019 pcre_uint32 firstchar, reqchar;
9020 pcre_uint32 limit_match = PCRE_UINT32_MAX;
9021 pcre_uint32 limit_recursion = PCRE_UINT32_MAX;
9022 int newline;
9023 int errorcode = 0;
9024 int skipatstart = 0;
9025 BOOL utf;
9026 BOOL never_utf = FALSE;
9027 size_t size;
9028 pcre_uchar *code;
9029 const pcre_uchar *codestart;
9030 const pcre_uchar *ptr;
9031 compile_data compile_block;
9032 compile_data *cd = &compile_block;
9033
9034 /* This space is used for "compiling" into during the first phase, when we are
9035 computing the amount of memory that is needed. Compiled items are thrown away
9036 as soon as possible, so that a fairly large buffer should be sufficient for
9037 this purpose. The same space is used in the second phase for remembering where
9038 to fill in forward references to subpatterns. That may overflow, in which case
9039 new memory is obtained from malloc(). */
9040
9041 pcre_uchar cworkspace[COMPILE_WORK_SIZE];
9042
9043 /* This vector is used for remembering name groups during the pre-compile. In a
9044 similar way to cworkspace, it can be expanded using malloc() if necessary. */
9045
9046 named_group named_groups[NAMED_GROUP_LIST_SIZE];
9047
9048 /* Set this early so that early errors get offset 0. */
9049
9050 ptr = (const pcre_uchar *)pattern;
9051
9052 /* We can't pass back an error message if errorptr is NULL; I guess the best we
9053 can do is just return NULL, but we can set a code value if there is a code
9054 pointer. */
9055
9056 if (errorptr == NULL)
9057   {
9058   if (errorcodeptr != NULL) *errorcodeptr = 99;
9059   return NULL;
9060   }
9061
9062 *errorptr = NULL;
9063 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
9064
9065 /* However, we can give a message for this error */
9066
9067 if (erroroffset == NULL)
9068   {
9069   errorcode = ERR16;
9070   goto PCRE_EARLY_ERROR_RETURN2;
9071   }
9072
9073 *erroroffset = 0;
9074
9075 /* Set up pointers to the individual character tables */
9076
9077 if (tables == NULL) tables = PRIV(default_tables);
9078 cd->lcc = tables + lcc_offset;
9079 cd->fcc = tables + fcc_offset;
9080 cd->cbits = tables + cbits_offset;
9081 cd->ctypes = tables + ctypes_offset;
9082
9083 /* Check that all undefined public option bits are zero */
9084
9085 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
9086   {
9087   errorcode = ERR17;
9088   goto PCRE_EARLY_ERROR_RETURN;
9089   }
9090
9091 /* If PCRE_NEVER_UTF is set, remember it. */
9092
9093 if ((options & PCRE_NEVER_UTF) != 0) never_utf = TRUE;
9094
9095 /* Check for global one-time settings at the start of the pattern, and remember
9096 the offset for later. */
9097
9098 cd->external_flags = 0;   /* Initialize here for LIMIT_MATCH/RECURSION */
9099
9100 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9101        ptr[skipatstart+1] == CHAR_ASTERISK)
9102   {
9103   int newnl = 0;
9104   int newbsr = 0;
9105
9106 /* For completeness and backward compatibility, (*UTFn) is supported in the
9107 relevant libraries, but (*UTF) is generic and always supported. Note that
9108 PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. */
9109
9110 #ifdef COMPILE_PCRE8
9111   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF8_RIGHTPAR, 5) == 0)
9112     { skipatstart += 7; options |= PCRE_UTF8; continue; }
9113 #endif
9114 #ifdef COMPILE_PCRE16
9115   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF16_RIGHTPAR, 6) == 0)
9116     { skipatstart += 8; options |= PCRE_UTF16; continue; }
9117 #endif
9118 #ifdef COMPILE_PCRE32
9119   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF32_RIGHTPAR, 6) == 0)
9120     { skipatstart += 8; options |= PCRE_UTF32; continue; }
9121 #endif
9122
9123   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 4) == 0)
9124     { skipatstart += 6; options |= PCRE_UTF8; continue; }
9125   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
9126     { skipatstart += 6; options |= PCRE_UCP; continue; }
9127   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_AUTO_POSSESS_RIGHTPAR, 16) == 0)
9128     { skipatstart += 18; options |= PCRE_NO_AUTO_POSSESS; continue; }
9129   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
9130     { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
9131
9132   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_MATCH_EQ, 12) == 0)
9133     {
9134     pcre_uint32 c = 0;
9135     int p = skipatstart + 14;
9136     while (isdigit(ptr[p]))
9137       {
9138       if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow */
9139       c = c*10 + ptr[p++] - CHAR_0;
9140       }
9141     if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9142     if (c < limit_match)
9143       {
9144       limit_match = c;
9145       cd->external_flags |= PCRE_MLSET;
9146       }
9147     skipatstart = p;
9148     continue;
9149     }
9150
9151   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_RECURSION_EQ, 16) == 0)
9152     {
9153     pcre_uint32 c = 0;
9154     int p = skipatstart + 18;
9155     while (isdigit(ptr[p]))
9156       {
9157       if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow check */
9158       c = c*10 + ptr[p++] - CHAR_0;
9159       }
9160     if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9161     if (c < limit_recursion)
9162       {
9163       limit_recursion = c;
9164       cd->external_flags |= PCRE_RLSET;
9165       }
9166     skipatstart = p;
9167     continue;
9168     }
9169
9170   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0)
9171     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
9172   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3)  == 0)
9173     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
9174   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5)  == 0)
9175     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
9176   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0)
9177     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
9178   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0)
9179     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
9180
9181   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
9182     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
9183   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
9184     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
9185
9186   if (newnl != 0)
9187     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
9188   else if (newbsr != 0)
9189     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
9190   else break;
9191   }
9192
9193 /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
9194 utf = (options & PCRE_UTF8) != 0;
9195 if (utf && never_utf)
9196   {
9197   errorcode = ERR78;
9198   goto PCRE_EARLY_ERROR_RETURN2;
9199   }
9200
9201 /* Can't support UTF unless PCRE has been compiled to include the code. The
9202 return of an error code from PRIV(valid_utf)() is a new feature, introduced in
9203 release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
9204 not used here. */
9205
9206 #ifdef SUPPORT_UTF
9207 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
9208      (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
9209   {
9210 #if defined COMPILE_PCRE8
9211   errorcode = ERR44;
9212 #elif defined COMPILE_PCRE16
9213   errorcode = ERR74;
9214 #elif defined COMPILE_PCRE32
9215   errorcode = ERR77;
9216 #endif
9217   goto PCRE_EARLY_ERROR_RETURN2;
9218   }
9219 #else
9220 if (utf)
9221   {
9222   errorcode = ERR32;
9223   goto PCRE_EARLY_ERROR_RETURN;
9224   }
9225 #endif
9226
9227 /* Can't support UCP unless PCRE has been compiled to include the code. */
9228
9229 #ifndef SUPPORT_UCP
9230 if ((options & PCRE_UCP) != 0)
9231   {
9232   errorcode = ERR67;
9233   goto PCRE_EARLY_ERROR_RETURN;
9234   }
9235 #endif
9236
9237 /* Check validity of \R options. */
9238
9239 if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
9240      (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
9241   {
9242   errorcode = ERR56;
9243   goto PCRE_EARLY_ERROR_RETURN;
9244   }
9245
9246 /* Handle different types of newline. The three bits give seven cases. The
9247 current code allows for fixed one- or two-byte sequences, plus "any" and
9248 "anycrlf". */
9249
9250 switch (options & PCRE_NEWLINE_BITS)
9251   {
9252   case 0: newline = NEWLINE; break;   /* Build-time default */
9253   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
9254   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
9255   case PCRE_NEWLINE_CR+
9256        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
9257   case PCRE_NEWLINE_ANY: newline = -1; break;
9258   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
9259   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
9260   }
9261
9262 if (newline == -2)
9263   {
9264   cd->nltype = NLTYPE_ANYCRLF;
9265   }
9266 else if (newline < 0)
9267   {
9268   cd->nltype = NLTYPE_ANY;
9269   }
9270 else
9271   {
9272   cd->nltype = NLTYPE_FIXED;
9273   if (newline > 255)
9274     {
9275     cd->nllen = 2;
9276     cd->nl[0] = (newline >> 8) & 255;
9277     cd->nl[1] = newline & 255;
9278     }
9279   else
9280     {
9281     cd->nllen = 1;
9282     cd->nl[0] = newline;
9283     }
9284   }
9285
9286 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9287 references to help in deciding whether (.*) can be treated as anchored or not.
9288 */
9289
9290 cd->top_backref = 0;
9291 cd->backref_map = 0;
9292
9293 /* Reflect pattern for debugging output */
9294
9295 DPRINTF(("------------------------------------------------------------------\n"));
9296 #ifdef PCRE_DEBUG
9297 print_puchar(stdout, (PCRE_PUCHAR)pattern);
9298 #endif
9299 DPRINTF(("\n"));
9300
9301 /* Pretend to compile the pattern while actually just accumulating the length
9302 of memory required. This behaviour is triggered by passing a non-NULL final
9303 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
9304 to compile parts of the pattern into; the compiled code is discarded when it is
9305 no longer needed, so hopefully this workspace will never overflow, though there
9306 is a test for its doing so. */
9307
9308 cd->bracount = cd->final_bracount = 0;
9309 cd->names_found = 0;
9310 cd->name_entry_size = 0;
9311 cd->name_table = NULL;
9312 cd->dupnames = FALSE;
9313 cd->dupgroups = FALSE;
9314 cd->namedrefcount = 0;
9315 cd->start_code = cworkspace;
9316 cd->hwm = cworkspace;
9317 cd->iscondassert = FALSE;
9318 cd->start_workspace = cworkspace;
9319 cd->workspace_size = COMPILE_WORK_SIZE;
9320 cd->named_groups = named_groups;
9321 cd->named_group_list_size = NAMED_GROUP_LIST_SIZE;
9322 cd->start_pattern = (const pcre_uchar *)pattern;
9323 cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
9324 cd->req_varyopt = 0;
9325 cd->parens_depth = 0;
9326 cd->assert_depth = 0;
9327 cd->max_lookbehind = 0;
9328 cd->external_options = options;
9329 cd->open_caps = NULL;
9330
9331 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
9332 don't need to look at the result of the function here. The initial options have
9333 been put into the cd block so that they can be changed if an option setting is
9334 found within the regex right at the beginning. Bringing initial option settings
9335 outside can help speed up starting point checks. */
9336
9337 ptr += skipatstart;
9338 code = cworkspace;
9339 *code = OP_BRA;
9340
9341 (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
9342   FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,
9343   cd, &length);
9344 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
9345
9346 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
9347   (int)(cd->hwm - cworkspace)));
9348
9349 if (length > MAX_PATTERN_SIZE)
9350   {
9351   errorcode = ERR20;
9352   goto PCRE_EARLY_ERROR_RETURN;
9353   }
9354
9355 /* Compute the size of the data block for storing the compiled pattern. Integer
9356 overflow should no longer be possible because nowadays we limit the maximum
9357 value of cd->names_found and cd->name_entry_size. */
9358
9359 size = sizeof(REAL_PCRE) +
9360   (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
9361
9362 /* Get the memory. */
9363
9364 re = (REAL_PCRE *)(PUBL(malloc))(size);
9365 if (re == NULL)
9366   {
9367   errorcode = ERR21;
9368   goto PCRE_EARLY_ERROR_RETURN;
9369   }
9370
9371 /* Put in the magic number, and save the sizes, initial options, internal
9372 flags, and character table pointer. NULL is used for the default character
9373 tables. The nullpad field is at the end; it's there to help in the case when a
9374 regex compiled on a system with 4-byte pointers is run on another with 8-byte
9375 pointers. */
9376
9377 re->magic_number = MAGIC_NUMBER;
9378 re->size = (int)size;
9379 re->options = cd->external_options;
9380 re->flags = cd->external_flags;
9381 re->limit_match = limit_match;
9382 re->limit_recursion = limit_recursion;
9383 re->first_char = 0;
9384 re->req_char = 0;
9385 re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
9386 re->name_entry_size = cd->name_entry_size;
9387 re->name_count = cd->names_found;
9388 re->ref_count = 0;
9389 re->tables = (tables == PRIV(default_tables))? NULL : tables;
9390 re->nullpad = NULL;
9391 #ifdef COMPILE_PCRE32
9392 re->dummy = 0;
9393 #else
9394 re->dummy1 = re->dummy2 = re->dummy3 = 0;
9395 #endif
9396
9397 /* The starting points of the name/number translation table and of the code are
9398 passed around in the compile data block. The start/end pattern and initial
9399 options are already set from the pre-compile phase, as is the name_entry_size
9400 field. Reset the bracket count and the names_found field. Also reset the hwm
9401 field; this time it's used for remembering forward references to subpatterns.
9402 */
9403
9404 cd->final_bracount = cd->bracount;  /* Save for checking forward references */
9405 cd->parens_depth = 0;
9406 cd->assert_depth = 0;
9407 cd->bracount = 0;
9408 cd->max_lookbehind = 0;
9409 cd->name_table = (pcre_uchar *)re + re->name_table_offset;
9410 codestart = cd->name_table + re->name_entry_size * re->name_count;
9411 cd->start_code = codestart;
9412 cd->hwm = (pcre_uchar *)(cd->start_workspace);
9413 cd->iscondassert = FALSE;
9414 cd->req_varyopt = 0;
9415 cd->had_accept = FALSE;
9416 cd->had_pruneorskip = FALSE;
9417 cd->check_lookbehind = FALSE;
9418 cd->open_caps = NULL;
9419
9420 /* If any named groups were found, create the name/number table from the list
9421 created in the first pass. */
9422
9423 if (cd->names_found > 0)
9424   {
9425   int i = cd->names_found;
9426   named_group *ng = cd->named_groups;
9427   cd->names_found = 0;
9428   for (; i > 0; i--, ng++)
9429     add_name(cd, ng->name, ng->length, ng->number);
9430   if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
9431     (PUBL(free))((void *)cd->named_groups);
9432   }
9433
9434 /* Set up a starting, non-extracting bracket, then compile the expression. On
9435 error, errorcode will be set non-zero, so we don't need to look at the result
9436 of the function here. */
9437
9438 ptr = (const pcre_uchar *)pattern + skipatstart;
9439 code = (pcre_uchar *)codestart;
9440 *code = OP_BRA;
9441 (void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
9442   &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL, cd, NULL);
9443 re->top_bracket = cd->bracount;
9444 re->top_backref = cd->top_backref;
9445 re->max_lookbehind = cd->max_lookbehind;
9446 re->flags = cd->external_flags | PCRE_MODE;
9447
9448 if (cd->had_accept)
9449   {
9450   reqchar = 0;              /* Must disable after (*ACCEPT) */
9451   reqcharflags = REQ_NONE;
9452   }
9453
9454 /* If not reached end of pattern on success, there's an excess bracket. */
9455
9456 if (errorcode == 0 && *ptr != CHAR_NULL) errorcode = ERR22;
9457
9458 /* Fill in the terminating state and check for disastrous overflow, but
9459 if debugging, leave the test till after things are printed out. */
9460
9461 *code++ = OP_END;
9462
9463 #ifndef PCRE_DEBUG
9464 if (code - codestart > length) errorcode = ERR23;
9465 #endif
9466
9467 #ifdef SUPPORT_VALGRIND
9468 /* If the estimated length exceeds the really used length, mark the extra
9469 allocated memory as unaddressable, so that any out-of-bound reads can be
9470 detected. */
9471 VALGRIND_MAKE_MEM_NOACCESS(code, (length - (code - codestart)) * sizeof(pcre_uchar));
9472 #endif
9473
9474 /* Fill in any forward references that are required. There may be repeated
9475 references; optimize for them, as searching a large regex takes time. */
9476
9477 if (cd->hwm > cd->start_workspace)
9478   {
9479   int prev_recno = -1;
9480   const pcre_uchar *groupptr = NULL;
9481   while (errorcode == 0 && cd->hwm > cd->start_workspace)
9482     {
9483     int offset, recno;
9484     cd->hwm -= LINK_SIZE;
9485     offset = GET(cd->hwm, 0);
9486
9487     /* Check that the hwm handling hasn't gone wrong. This whole area is
9488     rewritten in PCRE2 because there are some obscure cases. */
9489
9490     if (offset == 0 || codestart[offset-1] != OP_RECURSE)
9491       {
9492       errorcode = ERR10;
9493       break;
9494       }
9495
9496     recno = GET(codestart, offset);
9497     if (recno != prev_recno)
9498       {
9499       groupptr = PRIV(find_bracket)(codestart, utf, recno);
9500       prev_recno = recno;
9501       }
9502     if (groupptr == NULL) errorcode = ERR53;
9503       else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
9504     }
9505   }
9506
9507 /* If the workspace had to be expanded, free the new memory. Set the pointer to
9508 NULL to indicate that forward references have been filled in. */
9509
9510 if (cd->workspace_size > COMPILE_WORK_SIZE)
9511   (PUBL(free))((void *)cd->start_workspace);
9512 cd->start_workspace = NULL;
9513
9514 /* Give an error if there's a back reference to a non-existent capturing
9515 subpattern. */
9516
9517 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
9518
9519 /* Unless disabled, check whether any single character iterators can be
9520 auto-possessified. The function overwrites the appropriate opcode values, so
9521 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
9522 used in this code because at least one compiler gives a warning about loss of
9523 "const" attribute if the cast (pcre_uchar *)codestart is used directly in the
9524 function call. */
9525
9526 if (errorcode == 0 && (options & PCRE_NO_AUTO_POSSESS) == 0)
9527   {
9528   pcre_uchar *temp = (pcre_uchar *)codestart;
9529   auto_possessify(temp, utf, cd);
9530   }
9531
9532 /* If there were any lookbehind assertions that contained OP_RECURSE
9533 (recursions or subroutine calls), a flag is set for them to be checked here,
9534 because they may contain forward references. Actual recursions cannot be fixed
9535 length, but subroutine calls can. It is done like this so that those without
9536 OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
9537 exceptional ones forgo this. We scan the pattern to check that they are fixed
9538 length, and set their lengths. */
9539
9540 if (errorcode == 0 && cd->check_lookbehind)
9541   {
9542   pcre_uchar *cc = (pcre_uchar *)codestart;
9543
9544   /* Loop, searching for OP_REVERSE items, and process those that do not have
9545   their length set. (Actually, it will also re-process any that have a length
9546   of zero, but that is a pathological case, and it does no harm.) When we find
9547   one, we temporarily terminate the branch it is in while we scan it. */
9548
9549   for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
9550        cc != NULL;
9551        cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
9552     {
9553     if (GET(cc, 1) == 0)
9554       {
9555       int fixed_length;
9556       pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
9557       int end_op = *be;
9558       *be = OP_END;
9559       fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
9560         cd, NULL);
9561       *be = end_op;
9562       DPRINTF(("fixed length = %d\n", fixed_length));
9563       if (fixed_length < 0)
9564         {
9565         errorcode = (fixed_length == -2)? ERR36 :
9566                     (fixed_length == -4)? ERR70 : ERR25;
9567         break;
9568         }
9569       if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
9570       PUT(cc, 1, fixed_length);
9571       }
9572     cc += 1 + LINK_SIZE;
9573     }
9574   }
9575
9576 /* Failed to compile, or error while post-processing */
9577
9578 if (errorcode != 0)
9579   {
9580   (PUBL(free))(re);
9581   PCRE_EARLY_ERROR_RETURN:
9582   *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
9583   PCRE_EARLY_ERROR_RETURN2:
9584   *errorptr = find_error_text(errorcode);
9585   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
9586   return NULL;
9587   }
9588
9589 /* If the anchored option was not passed, set the flag if we can determine that
9590 the pattern is anchored by virtue of ^ characters or \A or anything else, such
9591 as starting with non-atomic .* when DOTALL is set and there are no occurrences
9592 of *PRUNE or *SKIP.
9593
9594 Otherwise, if we know what the first byte has to be, save it, because that
9595 speeds up unanchored matches no end. If not, see if we can set the
9596 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
9597 start with ^, and also when all branches start with non-atomic .* for
9598 non-DOTALL matches when *PRUNE and *SKIP are not present. */
9599
9600 if ((re->options & PCRE_ANCHORED) == 0)
9601   {
9602   if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED;
9603   else
9604     {
9605     if (firstcharflags < 0)
9606       firstchar = find_firstassertedchar(codestart, &firstcharflags, FALSE);
9607     if (firstcharflags >= 0)   /* Remove caseless flag for non-caseable chars */
9608       {
9609 #if defined COMPILE_PCRE8
9610       re->first_char = firstchar & 0xff;
9611 #elif defined COMPILE_PCRE16
9612       re->first_char = firstchar & 0xffff;
9613 #elif defined COMPILE_PCRE32
9614       re->first_char = firstchar;
9615 #endif
9616       if ((firstcharflags & REQ_CASELESS) != 0)
9617         {
9618 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9619         /* We ignore non-ASCII first chars in 8 bit mode. */
9620         if (utf)
9621           {
9622           if (re->first_char < 128)
9623             {
9624             if (cd->fcc[re->first_char] != re->first_char)
9625               re->flags |= PCRE_FCH_CASELESS;
9626             }
9627           else if (UCD_OTHERCASE(re->first_char) != re->first_char)
9628             re->flags |= PCRE_FCH_CASELESS;
9629           }
9630         else
9631 #endif
9632         if (MAX_255(re->first_char)
9633             && cd->fcc[re->first_char] != re->first_char)
9634           re->flags |= PCRE_FCH_CASELESS;
9635         }
9636
9637       re->flags |= PCRE_FIRSTSET;
9638       }
9639
9640     else if (is_startline(codestart, 0, cd, 0)) re->flags |= PCRE_STARTLINE;
9641     }
9642   }
9643
9644 /* For an anchored pattern, we use the "required byte" only if it follows a
9645 variable length item in the regex. Remove the caseless flag for non-caseable
9646 bytes. */
9647
9648 if (reqcharflags >= 0 &&
9649      ((re->options & PCRE_ANCHORED) == 0 || (reqcharflags & REQ_VARY) != 0))
9650   {
9651 #if defined COMPILE_PCRE8
9652   re->req_char = reqchar & 0xff;
9653 #elif defined COMPILE_PCRE16
9654   re->req_char = reqchar & 0xffff;
9655 #elif defined COMPILE_PCRE32
9656   re->req_char = reqchar;
9657 #endif
9658   if ((reqcharflags & REQ_CASELESS) != 0)
9659     {
9660 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9661     /* We ignore non-ASCII required chars in 8 bit mode. */
9662     if (utf)
9663       {
9664       if (re->req_char < 128)
9665         {
9666         if (cd->fcc[re->req_char] != re->req_char)
9667           re->flags |= PCRE_RCH_CASELESS;
9668         }
9669       else if (UCD_OTHERCASE(re->req_char) != re->req_char)
9670         re->flags |= PCRE_RCH_CASELESS;
9671       }
9672     else
9673 #endif
9674     if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
9675       re->flags |= PCRE_RCH_CASELESS;
9676     }
9677
9678   re->flags |= PCRE_REQCHSET;
9679   }
9680
9681 /* Print out the compiled data if debugging is enabled. This is never the
9682 case when building a production library. */
9683
9684 #ifdef PCRE_DEBUG
9685 printf("Length = %d top_bracket = %d top_backref = %d\n",
9686   length, re->top_bracket, re->top_backref);
9687
9688 printf("Options=%08x\n", re->options);
9689
9690 if ((re->flags & PCRE_FIRSTSET) != 0)
9691   {
9692   pcre_uchar ch = re->first_char;
9693   const char *caseless =
9694     ((re->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)";
9695   if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless);
9696     else printf("First char = \\x%02x%s\n", ch, caseless);
9697   }
9698
9699 if ((re->flags & PCRE_REQCHSET) != 0)
9700   {
9701   pcre_uchar ch = re->req_char;
9702   const char *caseless =
9703     ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)";
9704   if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless);
9705     else printf("Req char = \\x%02x%s\n", ch, caseless);
9706   }
9707
9708 #if defined COMPILE_PCRE8
9709 pcre_printint((pcre *)re, stdout, TRUE);
9710 #elif defined COMPILE_PCRE16
9711 pcre16_printint((pcre *)re, stdout, TRUE);
9712 #elif defined COMPILE_PCRE32
9713 pcre32_printint((pcre *)re, stdout, TRUE);
9714 #endif
9715
9716 /* This check is done here in the debugging case so that the code that
9717 was compiled can be seen. */
9718
9719 if (code - codestart > length)
9720   {
9721   (PUBL(free))(re);
9722   *errorptr = find_error_text(ERR23);
9723   *erroroffset = ptr - (pcre_uchar *)pattern;
9724   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
9725   return NULL;
9726   }
9727 #endif   /* PCRE_DEBUG */
9728
9729 /* Check for a pattern that can match an empty string, so that this information
9730 can be provided to applications. */
9731
9732 do
9733   {
9734   if (could_be_empty_branch(codestart, code, utf, cd, NULL))
9735     {
9736     re->flags |= PCRE_MATCH_EMPTY;
9737     break;
9738     }
9739   codestart += GET(codestart, 1);
9740   }
9741 while (*codestart == OP_ALT);
9742
9743 #if defined COMPILE_PCRE8
9744 return (pcre *)re;
9745 #elif defined COMPILE_PCRE16
9746 return (pcre16 *)re;
9747 #elif defined COMPILE_PCRE32
9748 return (pcre32 *)re;
9749 #endif
9750 }
9751
9752 /* End of pcre_compile.c */