pcre_compile.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /* PCRE is a library of functions to support regular expressions whose syntax
   6 and semantics are as close as possible to those of the Perl 5 language.
   7
   8                        Written by Philip Hazel
   9            Copyright (c) 1997-2014 University of Cambridge
  10
  11 -----------------------------------------------------------------------------
  12 Redistribution and use in source and binary forms, with or without
  13 modification, are permitted provided that the following conditions are met:
  14
  15     * Redistributions of source code must retain the above copyright notice,
  16       this list of conditions and the following disclaimer.
  17
  18     * Redistributions in binary form must reproduce the above copyright
  19       notice, this list of conditions and the following disclaimer in the
  20       documentation and/or other materials provided with the distribution.
  21
  22     * Neither the name of the University of Cambridge nor the names of its
  23       contributors may be used to endorse or promote products derived from
  24       this software without specific prior written permission.
  25
  26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  36 POSSIBILITY OF SUCH DAMAGE.
  37 -----------------------------------------------------------------------------
  38 */
  39
  40
  41 /* This module contains the external function pcre_compile(), along with
  42 supporting internal functions that are not used by other modules. */
  43
  44
  45 #ifdef HAVE_CONFIG_H
  46 #include "config.h"
  47 #endif
  48
  49 #define NLBLOCK cd             /* Block containing newline information */
  50 #define PSSTART start_pattern  /* Field containing pattern start */
  51 #define PSEND   end_pattern    /* Field containing pattern end */
     /* These three names are defined ahead of the pcre_internal.h include that
        follows; presumably macros in that header expand them - confirm there. */
  52
  53 #include "pcre_internal.h"
  54
  55
  56 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
  57 is also used by pcretest. PCRE_DEBUG is not defined when building a production
  58 library. We do not need to select pcre16_printint.c specially, because the
  59 COMPILE_PCREx macro will already be appropriately set. */
  60
  61 #ifdef PCRE_DEBUG
  62 /* pcre_printint.c should not include any headers */
  63 #define PCRE_INCLUDED
  64 #include "pcre_printint.c"
  65 #undef PCRE_INCLUDED
  66 #endif
  67
  68
  69 /* Macro for setting individual bits in class bitmaps. */
  70
  /* Set bit number b in the byte-array bitmap a: byte index is b/8 and the bit
     within that byte is b&7. Both arguments and the whole expansion are
     parenthesized so that the macro is safe with expression arguments (for
     example a pointer sum) and inside a larger expression. Note that b is
     still evaluated twice, so it must not have side effects. */
  #define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)&7)))
  72
  73 /* Maximum length value to check against when making sure that the integer that
  74 holds the compiled pattern length does not overflow. We make it a bit less than
  75 INT_MAX to allow for adding in group terminating bytes, so that we don't have
  76 to check them every time. */
  77
  78 #define OFLOW_MAX (INT_MAX - 20)  /* INT_MAX less a fixed headroom of 20 */
  79
  80 /* Definitions to allow mutual recursion */
  81
  82 static int
  83   add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
  84     const pcre_uint32 *, unsigned int);  /* file-local prototype only; the body is not in this part of the file */
  85
  86 static BOOL
  87   compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
  88     pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
  89     compile_data *, int *);  /* file-local prototype only; parameter names are omitted, see the definition for their meaning */
  90
  91
  92
  93 /*************************************************
  94 *      Code parameters and static tables         *
  95 *************************************************/
  96
  97 /* This value specifies the size of stack workspace that is used during the
  98 first pre-compile phase that determines how much memory is required. The regex
  99 is partly compiled into this space, but the compiled parts are discarded as
 100 soon as they can be, so that hopefully there will never be an overrun. The code
 101 does, however, check for an overrun. The largest amount I've seen used is 218,
 102 so this number is very generous.
 103
 104 The same workspace is used during the second, actual compile phase for
 105 remembering forward references to groups so that they can be filled in at the
 106 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
 107 is 4 there is plenty of room for most patterns. However, the memory can get
 108 filled up by repetitions of forward references, for example patterns like
 109 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
 110 that the workspace is expanded using malloc() in this situation. The value
 111 below is therefore a minimum, and we put a maximum on it for safety. The
 112 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
 113 kicks in at the same number of forward references in all cases. */
 114
 115 #define COMPILE_WORK_SIZE (2048*LINK_SIZE)  /* minimum: room for 2048 entries of LINK_SIZE each */
 116 #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)  /* upper limit: 100 times the minimum */
 117
 118 /* This value determines the size of the initial vector that is used for
 119 remembering named groups during the pre-compile. It is allocated on the stack,
 120 but if it is too small, it is expanded using malloc(), in a similar way to the
 121 workspace. The value is the number of slots in the list. */
 122
 123 #define NAMED_GROUP_LIST_SIZE  20  /* a count of list slots, not a byte count */
 124
 125 /* The overrun tests check for a slightly smaller size so that they detect the
 126 overrun before it actually does run off the end of the data block. */
 127
 128 #define WORK_SIZE_SAFETY_MARGIN (100)  /* NOTE(review): presumably in the same units as COMPILE_WORK_SIZE - confirm at the overrun tests */
 129
 130 /* Private flags added to firstchar and reqchar. */
 131
 132 #define REQ_CASELESS    (1 << 0)        /* Indicates caselessness */
 133 #define REQ_VARY        (1 << 1)        /* Reqchar followed non-literal item */
     /* The two flags above use distinct bits (0 and 1), so both can be set in
        the same flags value. */
 134 /* Negative values for the firstchar and reqchar flags */
 135 #define REQ_UNSET       (-2)            /* NOTE(review): presumably "nothing determined yet" - confirm at use sites */
 136 #define REQ_NONE        (-1)            /* NOTE(review): presumably "known to have no such char" - confirm at use sites */
 137
 138 /* Repeated character flags. */
 139
 140 #define UTF_LENGTH     0x10000000l      /* The char contains its length. */
     /* The constant ends in a lowercase letter 'l' (a long suffix), not the
        digit one; the value is the single bit 28. */
 141
 142 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
 143 are simple data values; negative values are for special things like \d and so
 144 on. Zero means further processing is needed (for things like \x), or the escape
 145 is invalid. */
 146
 147 #ifndef EBCDIC
 148
 149 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
 150 in UTF-8 mode. */
 151
 152 static const short int escapes[] = {
     /* Two entries per row. The first entry is for '0' and the last for 'z';
        the trailing comments name the characters each row stands for. */
 153      0,                       0,                          /* '0' '1' */
 154      0,                       0,                          /* '2' '3' */
 155      0,                       0,                          /* '4' '5' */
 156      0,                       0,                          /* '6' '7' */
 157      0,                       0,                          /* '8' '9' */
 158      CHAR_COLON,              CHAR_SEMICOLON,             /* ':' ';' */
 159      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,           /* '<' '=' */
 160      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,         /* '>' '?' */
 161      CHAR_COMMERCIAL_AT,      -ESC_A,                     /* '@' 'A' */
 162      -ESC_B,                  -ESC_C,                     /* 'B' 'C' */
 163      -ESC_D,                  -ESC_E,                     /* 'D' 'E' */
 164      0,                       -ESC_G,                     /* 'F' 'G' */
 165      -ESC_H,                  0,                          /* 'H' 'I' */
 166      0,                       -ESC_K,                     /* 'J' 'K' */
 167      0,                       0,                          /* 'L' 'M' */
 168      -ESC_N,                  0,                          /* 'N' 'O' */
 169      -ESC_P,                  -ESC_Q,                     /* 'P' 'Q' */
 170      -ESC_R,                  -ESC_S,                     /* 'R' 'S' */
 171      0,                       0,                          /* 'T' 'U' */
 172      -ESC_V,                  -ESC_W,                     /* 'V' 'W' */
 173      -ESC_X,                  0,                          /* 'X' 'Y' */
 174      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,   /* 'Z' '[' */
 175      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,  /* backslash ']' */
 176      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,            /* '^' '_' */
 177      CHAR_GRAVE_ACCENT,       ESC_a,                      /* '`' 'a' */
 178      -ESC_b,                  0,                          /* 'b' 'c' */
 179      -ESC_d,                  ESC_e,                      /* 'd' 'e' */
 180      ESC_f,                   0,                          /* 'f' 'g' */
 181      -ESC_h,                  0,                          /* 'h' 'i' */
 182      0,                       -ESC_k,                     /* 'j' 'k' */
 183      0,                       0,                          /* 'l' 'm' */
 184      ESC_n,                   0,                          /* 'n' 'o' */
 185      -ESC_p,                  0,                          /* 'p' 'q' */
 186      ESC_r,                   -ESC_s,                     /* 'r' 's' */
 187      ESC_tee,                 0,                          /* 't' 'u' */
 188      -ESC_v,                  -ESC_w,                     /* 'v' 'w' */
 189      0,                       0,                          /* 'x' 'y' */
 190      -ESC_z                                               /* 'z' */
 191 };
 192
 193 #else
 194
 195 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
 196
 197 static const short int escapes[] = {
     /* Eight entries per row; the comment at the start of each row gives, in
        hexadecimal, the code of the row's first entry (the table starts at
        0x48 and runs through 0xFF). */
 198 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
 199 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
 200 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
 201 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
 202 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
 203 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
 204 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
 205 /*  80 */     0, ESC_a, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
 206 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
 207 /*  90 */     0,     0, -ESC_k,       0,      0, ESC_n,      0, -ESC_p,
 208 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
 209 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
 210 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
 211 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
 212 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
 213 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
 214 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
 215 /*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
 216 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
 217 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
 218 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
 219 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
 220 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
 221 };
 222
 223 /* We also need a table of characters that may follow \c in an EBCDIC
 224 environment for characters 0-31. */
 225
 226 static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
     /* The literal holds 32 characters ('@', 'A'-'Z', then '[', backslash,
        ']', '^', '_'), one for each of the values 0-31, followed by the
        terminating NUL.
        NOTE(review): the array is not declared const; if nothing writes to
        it, it could be - confirm against its use sites. */
 227
 228 #endif
 229
 230
 231 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
 232 searched linearly. Put all the names into a single string, in order to reduce
 233 the number of relocations when a shared library is dynamically linked. The
 234 string is built from string macros so that it works in UTF-8 mode on EBCDIC
 235 platforms. */
 236
 237 typedef struct verbitem {
     /* Describes one verb name; the verbs[] table below holds one of these for
        each name stored in verbnames[]. */
 238   int   len;                 /* Length of verb name */
 239   int   op;                  /* Op when no arg, or -1 if arg mandatory */
 240   int   op_arg;              /* Op when arg present, or -1 if not allowed */
 241 } verbitem;
 242
 243 static const char verbnames[] =
     /* Nine names concatenated into one string, in the same order as the rows
        of verbs[]. NOTE(review): each STRING_xxx0 macro presumably ends with
        an embedded NUL, and the literal's own terminator ends the last name -
        confirm in the header that defines these macros. */
 244   "\0"                       /* Empty name is a shorthand for MARK */
 245   STRING_MARK0
 246   STRING_ACCEPT0
 247   STRING_COMMIT0
 248   STRING_F0
 249   STRING_FAIL0
 250   STRING_PRUNE0
 251   STRING_SKIP0
 252   STRING_THEN;
 253
 254 static const verbitem verbs[] = {
     /* Columns: name length, op without argument, op with argument. Rows are
        in the same order as the names in verbnames[]. */
 255   { 0, -1,        OP_MARK },             /* (empty name) - argument required */
 256   { 4, -1,        OP_MARK },             /* MARK   - argument required */
 257   { 6, OP_ACCEPT, -1 },                  /* ACCEPT - no argument allowed */
 258   { 6, OP_COMMIT, -1 },                  /* COMMIT - no argument allowed */
 259   { 1, OP_FAIL,   -1 },                  /* F      - no argument allowed */
 260   { 4, OP_FAIL,   -1 },                  /* FAIL   - no argument allowed */
 261   { 5, OP_PRUNE,  OP_PRUNE_ARG },        /* PRUNE  - argument optional */
 262   { 4, OP_SKIP,   OP_SKIP_ARG  },        /* SKIP   - argument optional */
 263   { 4, OP_THEN,   OP_THEN_ARG  }         /* THEN   - argument optional */
 264 };
 265
 266 static const int verbcount = sizeof(verbs)/sizeof(verbitem);  /* number of rows in verbs[] */
 267
 268
 269 /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
 270 another regex library. */
 271
 272 static const pcre_uchar sub_start_of_word[] = {
     /* Spells the NUL-terminated pattern text \b(?=\w) */
 273   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
 274   CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
 275
 276 static const pcre_uchar sub_end_of_word[] = {
     /* Spells the NUL-terminated pattern text \b(?<=\w) */
 277   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
 278   CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
 279   CHAR_RIGHT_PARENTHESIS, '\0' };
 280
 281
 282 /* Tables of names of POSIX character classes and their lengths. The names are
 283 now all in a single string, to reduce the number of relocations when a shared
 284 library is dynamically loaded. The list of lengths is terminated by a zero
 285 length entry. The first three must be alpha, lower, upper, as this is assumed
 286 for handling case independence. The indices for graph, print, and punct are
 287 needed, so identify them. */
 288
 289 static const char posix_names[] =
     /* Fourteen names in one string. The order here fixes the class index used
        by posix_name_lengths[], posix_class_maps[] and the PC_xxx constants.
        NOTE(review): each STRING_xxx0 macro presumably ends with an embedded
        NUL - confirm in the header that defines them. */
 290   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
 291   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
 292   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
 293   STRING_word0  STRING_xdigit;
 294
 295 static const pcre_uint8 posix_name_lengths[] = {
     /* Twelve 5-letter names (alpha ... space), then word (4), xdigit (6), and
        a final 0 that terminates the list. */
 296   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
 297
 298 #define PC_GRAPH  8   /* position of "graph" in posix_names, counting from 0 */
 299 #define PC_PRINT  9   /* position of "print" */
 300 #define PC_PUNCT 10   /* position of "punct" */
 301
 302
 303 /* Table of class bit maps for each POSIX class. Each class is formed from a
 304 base map, with an optional addition or removal of another map. Then, for some
 305 classes, there is some additional tweaking: for [:blank:] the vertical space
 306 characters are removed, and for [:alpha:] and [:alnum:] the underscore
 307 character is removed. The triples in the table consist of the base map offset,
 308 second map offset or -1 if no second map, and a non-negative value for map
 309 addition or a negative value for map subtraction (if there are two maps). The
 310 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
 311 remove vertical space characters, 2 => remove underscore. */
 312
 313 static const int posix_class_maps[] = {
     /* Three ints per class, fourteen classes, in the same order as
        posix_names. Only alpha and ascii name a second map: alpha subtracts
        it (negative third value), ascii adds it. */
 314   cbit_word,  cbit_digit, -2,             /* alpha */
 315   cbit_lower, -1,          0,             /* lower */
 316   cbit_upper, -1,          0,             /* upper */
 317   cbit_word,  -1,          2,             /* alnum - word without underscore */
 318   cbit_print, cbit_cntrl,  0,             /* ascii */
 319   cbit_space, -1,          1,             /* blank - a GNU extension */
 320   cbit_cntrl, -1,          0,             /* cntrl */
 321   cbit_digit, -1,          0,             /* digit */
 322   cbit_graph, -1,          0,             /* graph */
 323   cbit_print, -1,          0,             /* print */
 324   cbit_punct, -1,          0,             /* punct */
 325   cbit_space, -1,          0,             /* space */
 326   cbit_word,  -1,          0,             /* word - a Perl extension */
 327   cbit_xdigit,-1,          0              /* xdigit */
 328 };
 329
 330 /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
 331 Unicode property escapes. */
 332
 333 #ifdef SUPPORT_UCP
 334 static const pcre_uchar string_PNd[]  = {   /* spells \P{Nd} */
 335   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
 336   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 337 static const pcre_uchar string_pNd[]  = {   /* spells \p{Nd} */
 338   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
 339   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 340 static const pcre_uchar string_PXsp[] = {   /* spells \P{Xsp} */
 341   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
 342   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 343 static const pcre_uchar string_pXsp[] = {   /* spells \p{Xsp} */
 344   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
 345   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 346 static const pcre_uchar string_PXwd[] = {   /* spells \P{Xwd} */
 347   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
 348   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 349 static const pcre_uchar string_pXwd[] = {   /* spells \p{Xwd} */
 350   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
 351   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 352
 353 static const pcre_uchar *substitutes[] = {
     /* Six entries in three pairs; within each pair the negated (upper-case)
        escape comes first.
        NOTE(review): the pointed-to strings are const but the pointer array
        itself is not - it could be, if nothing assigns to it. */
 354   string_PNd,           /* \D */
 355   string_pNd,           /* \d */
 356   string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
 357   string_pXsp,          /* \s */   /* space and POSIX space are the same. */
 358   string_PXwd,          /* \W */
 359   string_pXwd           /* \w */
 360 };
 361
 362 /* The POSIX class substitutes must be in the order of the POSIX class names,
 363 defined above, and there are both positive and negative cases. NULL means no
 364 general substitute of a Unicode property escape (\p or \P). However, for some
 365 POSIX classes (e.g. graph, print, punct) a special property code is compiled
 366 directly. */
 367
 368 static const pcre_uchar string_pL[] =   {   /* spells \p{L} */
 369   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
 370   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 371 static const pcre_uchar string_pLl[] =  {   /* spells \p{Ll} */
 372   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
 373   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 374 static const pcre_uchar string_pLu[] =  {   /* spells \p{Lu} */
 375   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
 376   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 377 static const pcre_uchar string_pXan[] = {   /* spells \p{Xan} */
 378   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
 379   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 380 static const pcre_uchar string_h[] =    {   /* spells \h */
 381   CHAR_BACKSLASH, CHAR_h, '\0' };
 382 static const pcre_uchar string_pXps[] = {   /* spells \p{Xps} */
 383   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
 384   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 385 static const pcre_uchar string_PL[] =   {   /* spells \P{L} */
 386   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
 387   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 388 static const pcre_uchar string_PLl[] =  {   /* spells \P{Ll} */
 389   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
 390   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 391 static const pcre_uchar string_PLu[] =  {   /* spells \P{Lu} */
 392   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
 393   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 394 static const pcre_uchar string_PXan[] = {   /* spells \P{Xan} */
 395   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
 396   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 397 static const pcre_uchar string_H[] =    {   /* spells \H */
 398   CHAR_BACKSLASH, CHAR_H, '\0' };
 399 static const pcre_uchar string_PXps[] = {   /* spells \P{Xps} */
 400   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
 401   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 402
 403 static const pcre_uchar *posix_substitutes[] = {
     /* Twenty-eight entries: fourteen positive classes followed by the same
        fourteen negated, so the negated form of a class sits fourteen entries
        after its positive form. */
 404   string_pL,            /* alpha */
 405   string_pLl,           /* lower */
 406   string_pLu,           /* upper */
 407   string_pXan,          /* alnum */
 408   NULL,                 /* ascii */
 409   string_h,             /* blank */
 410   NULL,                 /* cntrl */
 411   string_pNd,           /* digit */
 412   NULL,                 /* graph */
 413   NULL,                 /* print */
 414   NULL,                 /* punct */
 415   string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
 416   string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
 417   NULL,                 /* xdigit */
 418   /* Negated cases */
 419   string_PL,            /* ^alpha */
 420   string_PLl,           /* ^lower */
 421   string_PLu,           /* ^upper */
 422   string_PXan,          /* ^alnum */
 423   NULL,                 /* ^ascii */
 424   string_H,             /* ^blank */
 425   NULL,                 /* ^cntrl */
 426   string_PNd,           /* ^digit */
 427   NULL,                 /* ^graph */
 428   NULL,                 /* ^print */
 429   NULL,                 /* ^punct */
 430   string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
 431   string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
 432   NULL                  /* ^xdigit */
 433 };
 434 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))  /* number of entries in posix_substitutes[] */
 435 #endif
 436
 437 #define STRING(a)  # a           /* turns its argument into a string literal, without expanding it */
 438 #define XSTRING(s) STRING(s)     /* extra level so that a macro argument is expanded before being stringified */
 439
 440 /* The texts of compile-time error messages. These are "char *" because they
 441 are passed to the outside world. Do not ever re-use any error number, because
 442 they are documented. Always add a new error instead. Messages marked DEAD below
 443 are no longer used. This used to be a table of strings, but in order to reduce
 444 the number of relocations needed when a shared library is loaded dynamically,
 445 it is now one long string. We cannot use a table of offsets, because the
 446 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
 447 simply count through to the one we want - this isn't a performance issue
 448 because these strings are used only when there is a compilation error.
 449
 450 Each substring ends with \0 to insert a null character. This includes the final
 451 substring, so that the whole string ends with \0\0, which can be detected when
 452 counting through. */
 453
 454 static const char error_texts[] =
     /* Messages are found by counting NUL terminators from the start, so the
        order below is significant. The numeric marker comments give the index
        of the message that follows them. Exactly one of the two "\c must be
        followed by" messages is compiled in, so later indices are the same
        with and without EBCDIC. */
 455   "no error\0"
 456   "\\ at end of pattern\0"
 457   "\\c at end of pattern\0"
 458   "unrecognized character follows \\\0"
 459   "numbers out of order in {} quantifier\0"
 460   /* 5 */
 461   "number too big in {} quantifier\0"
 462   "missing terminating ] for character class\0"
 463   "invalid escape sequence in character class\0"
 464   "range out of order in character class\0"
 465   "nothing to repeat\0"
 466   /* 10 */
 467   "internal error: invalid forward reference offset\0"
 468   "internal error: unexpected repeat\0"
 469   "unrecognized character after (? or (?-\0"
 470   "POSIX named classes are supported only within a class\0"
 471   "missing )\0"
 472   /* 15 */
 473   "reference to non-existent subpattern\0"
 474   "erroffset passed as NULL\0"
 475   "unknown option bit(s) set\0"
 476   "missing ) after comment\0"
 477   "parentheses nested too deeply\0"  /** DEAD **/
 478   /* 20 */
 479   "regular expression is too large\0"
 480   "failed to get memory\0"
 481   "unmatched parentheses\0"
 482   "internal error: code overflow\0"
 483   "unrecognized character after (?<\0"
 484   /* 25 */
 485   "lookbehind assertion is not fixed length\0"
 486   "malformed number or name after (?(\0"
 487   "conditional group contains more than two branches\0"
 488   "assertion expected after (?(\0"
 489   "(?R or (?[+-]digits must be followed by )\0"
 490   /* 30 */
 491   "unknown POSIX class name\0"
 492   "POSIX collating elements are not supported\0"
 493   "this version of PCRE is compiled without UTF support\0"
 494   "spare error\0"  /** DEAD **/
 495   "character value in \\x{} or \\o{} is too large\0"
 496   /* 35 */
 497   "invalid condition (?(0)\0"
 498   "\\C not allowed in lookbehind assertion\0"
 499   "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
 500   "number after (?C is > 255\0"
 501   "closing ) for (?C expected\0"
 502   /* 40 */
 503   "recursive call could loop indefinitely\0"
 504   "unrecognized character after (?P\0"
 505   "syntax error in subpattern name (missing terminator)\0"
 506   "two named subpatterns have the same name\0"
 507   "invalid UTF-8 string\0"
 508   /* 45 */
 509   "support for \\P, \\p, and \\X has not been compiled\0"
 510   "malformed \\P or \\p sequence\0"
 511   "unknown property name after \\P or \\p\0"
 512   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
 513   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
 514   /* 50 */
 515   "repeated subpattern is too long\0"    /** DEAD **/
 516   "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
 517   "internal error: overran compiling workspace\0"
 518   "internal error: previously-checked referenced subpattern not found\0"
 519   "DEFINE group contains more than one branch\0"
 520   /* 55 */
 521   "repeating a DEFINE group is not allowed\0"  /** DEAD **/
 522   "inconsistent NEWLINE options\0"
 523   "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
 524   "a numbered reference must not be zero\0"
 525   "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
 526   /* 60 */
 527   "(*VERB) not recognized or malformed\0"
 528   "number is too big\0"
 529   "subpattern name expected\0"
 530   "digit expected after (?+\0"
 531   "] is an invalid data character in JavaScript compatibility mode\0"
 532   /* 65 */
 533   "different names for subpatterns of the same number are not allowed\0"
 534   "(*MARK) must have an argument\0"
 535   "this version of PCRE is not compiled with Unicode property support\0"
 536 #ifndef EBCDIC
 537   "\\c must be followed by an ASCII character\0"
 538 #else
 539   "\\c must be followed by a letter or one of [\\]^_?\0"
 540 #endif
 541   "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
 542   /* 70 */
 543   "internal error: unknown opcode in find_fixedlength()\0"
 544   "\\N is not supported in a class\0"
 545   "too many forward references\0"
 546   "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
 547   "invalid UTF-16 string\0"
 548   /* 75 */
 549   "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
 550   "character value in \\u.... sequence is too large\0"
 551   "invalid UTF-32 string\0"
 552   "setting UTF is disabled by the application\0"
 553   "non-hex character in \\x{} (closing brace missing?)\0"
 554   /* 80 */
 555   "non-octal character in \\o{} (closing brace missing?)\0"
 556   "missing opening brace after \\o\0"
 557   "parentheses are too deeply nested\0"
 558   "invalid range in character class\0"
 559   "group name must start with a non-digit\0"
 560   /* 85 */
 561   "parentheses are too deeply nested (stack check)\0"
 562   "digits missing in \\x{} or \\o{}\0"
 563   ;
 564
 565 /* Table to identify digits and hex digits. This is used when compiling
 566 patterns. Note that the tables in chartables are dependent on the locale, and
 567 may mark arbitrary characters as digits - but the PCRE compiling code expects
  568 to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
 569 a private table here. It costs 256 bytes, but it is a lot faster than doing
 570 character value tests (at least in some simple cases I timed), and in some
 571 applications one wants PCRE to compile efficiently as well as match
 572 efficiently.
 573
 574 For convenience, we use the same bit definitions as in chartables:
 575
 576   0x04   decimal digit
 577   0x08   hexadecimal digit
 578
 579 Then we can use ctype_digit and ctype_xdigit in the code. */
 580
 581 /* Using a simple comparison for decimal numbers rather than a memory read
 582 is much faster, and the resulting code is simpler (the compiler turns it
 583 into a subtraction and unsigned comparison). */
 584
 585 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)  /* x appears twice in the expansion, so pass an argument without side effects */
 586
 587 #ifndef EBCDIC
 588
 589 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
 590 UTF-8 mode. */
 591
 592 static const pcre_uint8 digitab[] =
     /* 256 entries, one per byte value, eight per row. 0x0c (0x04 | 0x08)
        marks '0'-'9' as both decimal and hexadecimal digits; 0x08 marks
        'A'-'F' and 'a'-'f' as hexadecimal digits only; everything else is 0. */
 593   {
 594   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
 595   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
 596   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
 597   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
 598   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
 599   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
 600   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
 601   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
 602   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
 603   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
 604   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
 605   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
 606   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
 607   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
 608   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
 609   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
 610   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
 611   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
 612   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
 613   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
 614   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
 615   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
 616   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
 617   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
 618   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
 619   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
 620   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
 621   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
 622   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
 623   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
 624   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
 625   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
 626
 627 #else
 628
 629 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
 630
 631 static const pcre_uint8 digitab[] =
     /* 256 entries, one per byte value, eight per row; the right-hand column
        of the row comments is the hexadecimal code of every second row. The
        digits sit in the 0xF0 row and the hex letters in the 0x80 and 0xC0
        rows. 0x0c marks decimal-and-hex digits, 0x08 hex digits only. */
 632   {
 633   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
 634   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
 635   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
 636   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
 637   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
 638   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
 639   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
 640   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
 641   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
 642   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
 643   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
 644   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
 645   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
 646   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
 647   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
 648   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
 649   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
 650   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
 651   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
 652   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
 653   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
 654   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
 655   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
 656   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
 657   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
 658   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
 659   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
 660   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
 661   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
 662   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
 663   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
 664   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
 665
 666 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
     /* 256 entries, one per byte value, eight per row, compiled only in the
        EBCDIC branch of the surrounding conditional.
        NOTE(review): the meaning of the individual bits (0x01, 0x02, 0x04,
        0x08, 0x10, 0x80) is not defined in this part of the file; presumably
        they follow the ctype flags of the main character tables - confirm
        before changing any entry. */
 667   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
 668   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
 669   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
 670   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
 671   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
 672   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
 673   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
 674   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
 675   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
 676   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
 677   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
 678   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
 679   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
 680   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
 681   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
 682   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
 683   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
 684   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
 685   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
 686   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
 687   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
 688   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
 689   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
 690   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
 691   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
 692   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
 693   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
 694   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
 695   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
 696   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
 697   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
 698   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
 699 #endif
 700
 701
 702 /* This table is used to check whether auto-possessification is possible
 703 between adjacent character-type opcodes. The left-hand (repeated) opcode is
 704 used to select the row, and the right-hand opcode is use to select the column.
 705 A value of 1 means that auto-possessification is OK. For example, the second
 706 value in the first row means that \D+\d can be turned into \D++\d.
 707
 708 The Unicode property types (\P and \p) have to be present to fill out the table
 709 because of what their opcode values are, but the table values should always be
 710 zero because property types are handled separately in the code. The last four
 711 columns apply to items that cannot be repeated, so there is no need to have
 712 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
 713 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
 714
 715 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
 716 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
 717
 718 static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
 719 /* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
 720   { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
 721   { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
 722   { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
 723   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
 724   { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
 725   { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
 726   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
 727   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
 728   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
 729   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
 730   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
 731   { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
 732   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
 733   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
 734   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
 735   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
 736   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
 737 };
 738
 739
 740 /* This table is used to check whether auto-possessification is possible
 741 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
 742 left-hand (repeated) opcode is used to select the row, and the right-hand
 743 opcode is used to select the column. The values are as follows:
 744
 745   0   Always return FALSE (never auto-possessify)
 746   1   Character groups are distinct (possessify if both are OP_PROP)
 747   2   Check character categories in the same group (general or particular)
 748   3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
 749
 750   4   Check left general category vs right particular category
 751   5   Check right general category vs left particular category
 752
 753   6   Left alphanum vs right general category
 754   7   Left space vs right general category
 755   8   Left word vs right general category
 756
 757   9   Right alphanum vs left general category
 758  10   Right space vs left general category
 759  11   Right word vs left general category
 760
 761  12   Left alphanum vs right particular category
 762  13   Left space vs right particular category
 763  14   Left word vs right particular category
 764
 765  15   Right alphanum vs left particular category
 766  16   Right space vs left particular category
 767  17   Right word vs left particular category
 768 */
 769
 770 static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
 771 /* ANY LAMP GC  PC  SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
 772   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_ANY */
 773   { 0,  3,  0,  0,  0,    3,    1,      1,   0,    0,   0 },  /* PT_LAMP */
 774   { 0,  0,  2,  4,  0,    9,   10,     10,  11,    0,   0 },  /* PT_GC */
 775   { 0,  0,  5,  2,  0,   15,   16,     16,  17,    0,   0 },  /* PT_PC */
 776   { 0,  0,  0,  0,  2,    0,    0,      0,   0,    0,   0 },  /* PT_SC */
 777   { 0,  3,  6, 12,  0,    3,    1,      1,   0,    0,   0 },  /* PT_ALNUM */
 778   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_SPACE */
 779   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_PXSPACE */
 780   { 0,  0,  8, 14,  0,    0,    1,      1,   3,    0,   0 },  /* PT_WORD */
 781   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_CLIST */
 782   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   3 }   /* PT_UCNC */
 783 };
 784
 785 /* This table is used to check whether auto-possessification is possible
 786 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
 787 specifies a general category and the other specifies a particular category. The
 788 row is selected by the general category and the column by the particular
 789 category. The value is 1 if the particular category is not part of the general
 790 category. */
 791
 792 static const pcre_uint8 catposstab[7][30] = {
 793 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
 794   { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
 795   { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
 796   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
 797   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
 798   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
 799   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
 800   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
 801 };
 802
 803 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
 804 a general or particular category. The properties in each row are those
 805 that apply to the character set in question. Duplication means that a little
 806 unnecessary work is done when checking, but this keeps things much simpler
 807 because they can all use the same code. For more details see the comment where
 808 this table is used.
 809
 810 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
 811 "space", but from Perl 5.18 it's included, so both categories are treated the
 812 same here. */
 813
 814 static const pcre_uint8 posspropstab[3][4] = {
 815   { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
 816   { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
 817   { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
 818 };
 819
 820 /* This table is used when converting repeating opcodes into possessified
 821 versions as a result of an explicit possessive quantifier such as ++. A zero
 822 value means there is no possessified version - in those cases the item in
 823 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
 824 because all relevant opcodes are less than that. */
 825
 826 static const pcre_uint8 opcode_possessify[] = {
 827   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
 828   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
 829
 830   0,                       /* NOTI */
 831   OP_POSSTAR, 0,           /* STAR, MINSTAR */
 832   OP_POSPLUS, 0,           /* PLUS, MINPLUS */
 833   OP_POSQUERY, 0,          /* QUERY, MINQUERY */
 834   OP_POSUPTO, 0,           /* UPTO, MINUPTO */
 835   0,                       /* EXACT */
 836   0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
 837
 838   OP_POSSTARI, 0,          /* STARI, MINSTARI */
 839   OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
 840   OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
 841   OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
 842   0,                       /* EXACTI */
 843   0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
 844
 845   OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
 846   OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
 847   OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
 848   OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
 849   0,                       /* NOTEXACT */
 850   0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
 851
 852   OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
 853   OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
 854   OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
 855   OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
 856   0,                       /* NOTEXACTI */
 857   0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
 858
 859   OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
 860   OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
 861   OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
 862   OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
 863   0,                       /* TYPEEXACT */
 864   0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
 865
 866   OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
 867   OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
 868   OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
 869   OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
 870   0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
 871
 872   0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
 873   0, 0,                    /* REF, REFI */
 874   0, 0,                    /* DNREF, DNREFI */
 875   0, 0                     /* RECURSE, CALLOUT */
 876 };
 877
 878
 879
 880 /*************************************************
 881 *            Find an error text                  *
 882 *************************************************/
 883
 884 /* The error texts are now all in one long string, to save on relocations. As
 885 some of the text is of unknown length, we can't use a table of offsets.
 886 Instead, just count through the strings. This is not a performance issue
 887 because it happens only when there has been a compilation error.
 888
 889 Argument:   the error number
 890 Returns:    pointer to the error string
 891 */
 892
 893 static const char *
 894 find_error_text(int n)
 895 {
 896 const char *s = error_texts;
 897 for (; n > 0; n--)
 898   {
 899   while (*s++ != CHAR_NULL) {};
 900   if (*s == CHAR_NULL) return "Error text not found (please report)";
 901   }
 902 return s;
 903 }
 904
 905
 906
 907 /*************************************************
 908 *           Expand the workspace                 *
 909 *************************************************/
 910
 911 /* This function is called during the second compiling phase, if the number of
 912 forward references fills the existing workspace, which is originally a block on
 913 the stack. A larger block is obtained from malloc() unless the ultimate limit
 914 has been reached or the increase will be rather small.
 915
 916 Argument: pointer to the compile data block
 917 Returns:  0 if all went well, else an error number
 918 */
 919
 920 static int
 921 expand_workspace(compile_data *cd)
 922 {
 923 pcre_uchar *newspace;
 924 int newsize = cd->workspace_size * 2;
 925
 926 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
 927 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
 928     newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
 929  return ERR72;
 930
 931 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
 932 if (newspace == NULL) return ERR21;
 933 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
 934 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
 935 if (cd->workspace_size > COMPILE_WORK_SIZE)
 936   (PUBL(free))((void *)cd->start_workspace);
 937 cd->start_workspace = newspace;
 938 cd->workspace_size = newsize;
 939 return 0;
 940 }
 941
 942
 943
 944 /*************************************************
 945 *            Check for counted repeat            *
 946 *************************************************/
 947
 948 /* This function is called when a '{' is encountered in a place where it might
 949 start a quantifier. It looks ahead to see if it really is a quantifier or not.
 950 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
 951 where the ddds are digits.
 952
 953 Arguments:
 954   p         pointer to the first char after '{'
 955
 956 Returns:    TRUE or FALSE
 957 */
 958
 959 static BOOL
 960 is_counted_repeat(const pcre_uchar *p)
 961 {
 962 if (!IS_DIGIT(*p)) return FALSE;
 963 p++;
 964 while (IS_DIGIT(*p)) p++;
 965 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
 966
 967 if (*p++ != CHAR_COMMA) return FALSE;
 968 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
 969
 970 if (!IS_DIGIT(*p)) return FALSE;
 971 p++;
 972 while (IS_DIGIT(*p)) p++;
 973
 974 return (*p == CHAR_RIGHT_CURLY_BRACKET);
 975 }
 976
 977
 978
 979 /*************************************************
 980 *            Handle escapes                      *
 981 *************************************************/
 982
 983 /* This function is called when a \ has been encountered. It either returns a
 984 positive value for a simple escape such as \n, or 0 for a data character which
 985 will be placed in chptr. A backreference to group n is returned as negative n.
 986 When UTF-8 is enabled, a positive value greater than 255 may be returned in
 987 chptr. On entry, ptr is pointing at the \. On exit, it is on the final
 988 character of the escape sequence.
 989
 990 Arguments:
 991   ptrptr         points to the pattern position pointer
 992   chptr          points to a returned data character
 993   errorcodeptr   points to the errorcode variable
 994   bracount       number of previous extracting brackets
 995   options        the options bits
 996   isclass        TRUE if inside a character class
 997
 998 Returns:         zero => a data character
 999                  positive => a special escape sequence
1000                  negative => a back reference
1001                  on error, errorcodeptr is set
1002 */
1003
1004 static int
1005 check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
1006   int bracount, int options, BOOL isclass)
1007 {
1008 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
1009 BOOL utf = (options & PCRE_UTF8) != 0;
1010 const pcre_uchar *ptr = *ptrptr + 1;
1011 pcre_uint32 c;
1012 int escape = 0;
1013 int i;
1014
1015 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
1016 ptr--;                            /* Set pointer back to the last byte */
1017
1018 /* If backslash is at the end of the pattern, it's an error. */
1019
1020 if (c == CHAR_NULL) *errorcodeptr = ERR1;
1021
1022 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
1023 in a table. A non-zero result is something that can be returned immediately.
1024 Otherwise further processing may be required. */
1025
1026 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1027 /* Not alphanumeric */
1028 else if (c < CHAR_0 || c > CHAR_z) {}
1029 else if ((i = escapes[c - CHAR_0]) != 0)
1030   { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1031
1032 #else           /* EBCDIC coding */
1033 /* Not alphanumeric */
1034 else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
1035 else if ((i = escapes[c - 0x48]) != 0)  { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1036 #endif
1037
1038 /* Escapes that need further processing, or are illegal. */
1039
1040 else
1041   {
1042   const pcre_uchar *oldptr;
1043   BOOL braced, negated, overflow;
1044   int s;
1045
1046   switch (c)
1047     {
1048     /* A number of Perl escapes are not handled by PCRE. We give an explicit
1049     error. */
1050
1051     case CHAR_l:
1052     case CHAR_L:
1053     *errorcodeptr = ERR37;
1054     break;
1055
1056     case CHAR_u:
1057     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1058       {
1059       /* In JavaScript, \u must be followed by four hexadecimal numbers.
1060       Otherwise it is a lowercase u letter. */
1061       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1062         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
1063         && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
1064         && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
1065         {
1066         c = 0;
1067         for (i = 0; i < 4; ++i)
1068           {
1069           register pcre_uint32 cc = *(++ptr);
1070 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1071           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1072           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1073 #else           /* EBCDIC coding */
1074           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1075           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1076 #endif
1077           }
1078
1079 #if defined COMPILE_PCRE8
1080         if (c > (utf ? 0x10ffffU : 0xffU))
1081 #elif defined COMPILE_PCRE16
1082         if (c > (utf ? 0x10ffffU : 0xffffU))
1083 #elif defined COMPILE_PCRE32
1084         if (utf && c > 0x10ffffU)
1085 #endif
1086           {
1087           *errorcodeptr = ERR76;
1088           }
1089         else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1090         }
1091       }
1092     else
1093       *errorcodeptr = ERR37;
1094     break;
1095
1096     case CHAR_U:
1097     /* In JavaScript, \U is an uppercase U letter. */
1098     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
1099     break;
1100
1101     /* In a character class, \g is just a literal "g". Outside a character
1102     class, \g must be followed by one of a number of specific things:
1103
1104     (1) A number, either plain or braced. If positive, it is an absolute
1105     backreference. If negative, it is a relative backreference. This is a Perl
1106     5.10 feature.
1107
1108     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1109     is part of Perl's movement towards a unified syntax for back references. As
1110     this is synonymous with \k{name}, we fudge it up by pretending it really
1111     was \k.
1112
1113     (3) For Oniguruma compatibility we also support \g followed by a name or a
1114     number either in angle brackets or in single quotes. However, these are
1115     (possibly recursive) subroutine calls, _not_ backreferences. Just return
1116     the ESC_g code (cf \k). */
1117
1118     case CHAR_g:
1119     if (isclass) break;
1120     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
1121       {
1122       escape = ESC_g;
1123       break;
1124       }
1125
1126     /* Handle the Perl-compatible cases */
1127
1128     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1129       {
1130       const pcre_uchar *p;
1131       for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1132         if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1133       if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1134         {
1135         escape = ESC_k;
1136         break;
1137         }
1138       braced = TRUE;
1139       ptr++;
1140       }
1141     else braced = FALSE;
1142
1143     if (ptr[1] == CHAR_MINUS)
1144       {
1145       negated = TRUE;
1146       ptr++;
1147       }
1148     else negated = FALSE;
1149
1150     /* The integer range is limited by the machine's int representation. */
1151     s = 0;
1152     overflow = FALSE;
1153     while (IS_DIGIT(ptr[1]))
1154       {
1155       if (s > INT_MAX / 10 - 1) /* Integer overflow */
1156         {
1157         overflow = TRUE;
1158         break;
1159         }
1160       s = s * 10 + (int)(*(++ptr) - CHAR_0);
1161       }
1162     if (overflow) /* Integer overflow */
1163       {
1164       while (IS_DIGIT(ptr[1]))
1165         ptr++;
1166       *errorcodeptr = ERR61;
1167       break;
1168       }
1169
1170     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
1171       {
1172       *errorcodeptr = ERR57;
1173       break;
1174       }
1175
1176     if (s == 0)
1177       {
1178       *errorcodeptr = ERR58;
1179       break;
1180       }
1181
1182     if (negated)
1183       {
1184       if (s > bracount)
1185         {
1186         *errorcodeptr = ERR15;
1187         break;
1188         }
1189       s = bracount - (s - 1);
1190       }
1191
1192     escape = -s;
1193     break;
1194
1195     /* The handling of escape sequences consisting of a string of digits
1196     starting with one that is not zero is not straightforward. Perl has changed
1197     over the years. Nowadays \g{} for backreferences and \o{} for octal are
1198     recommended to avoid the ambiguities in the old syntax.
1199
1200     Outside a character class, the digits are read as a decimal number. If the
1201     number is less than 8 (used to be 10), or if there are that many previous
1202     extracting left brackets, then it is a back reference. Otherwise, up to
1203     three octal digits are read to form an escaped byte. Thus \123 is likely to
1204     be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1205     the octal value is greater than 377, the least significant 8 bits are
1206     taken. \8 and \9 are treated as the literal characters 8 and 9.
1207
1208     Inside a character class, \ followed by a digit is always either a literal
1209     8 or 9 or an octal number. */
1210
1211     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1212     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1213
1214     if (!isclass)
1215       {
1216       oldptr = ptr;
1217       /* The integer range is limited by the machine's int representation. */
1218       s = (int)(c -CHAR_0);
1219       overflow = FALSE;
1220       while (IS_DIGIT(ptr[1]))
1221         {
1222         if (s > INT_MAX / 10 - 1) /* Integer overflow */
1223           {
1224           overflow = TRUE;
1225           break;
1226           }
1227         s = s * 10 + (int)(*(++ptr) - CHAR_0);
1228         }
1229       if (overflow) /* Integer overflow */
1230         {
1231         while (IS_DIGIT(ptr[1]))
1232           ptr++;
1233         *errorcodeptr = ERR61;
1234         break;
1235         }
1236       if (s < 8 || s <= bracount)  /* Check for back reference */
1237         {
1238         escape = -s;
1239         break;
1240         }
1241       ptr = oldptr;      /* Put the pointer back and fall through */
1242       }
1243
1244     /* Handle a digit following \ when the number is not a back reference. If
1245     the first digit is 8 or 9, Perl used to generate a binary zero byte and
1246     then treat the digit as a following literal. At least by Perl 5.18 this
1247     changed so as not to insert the binary zero. */
1248
1249     if ((c = *ptr) >= CHAR_8) break;
1250
1251     /* Fall through with a digit less than 8 */
1252
1253     /* \0 always starts an octal number, but we may drop through to here with a
1254     larger first octal digit. The original code used just to take the least
1255     significant 8 bits of octal numbers (I think this is what early Perls used
1256     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1257     but no more than 3 octal digits. */
1258
1259     case CHAR_0:
1260     c -= CHAR_0;
1261     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1262         c = c * 8 + *(++ptr) - CHAR_0;
1263 #ifdef COMPILE_PCRE8
1264     if (!utf && c > 0xff) *errorcodeptr = ERR51;
1265 #endif
1266     break;
1267
1268     /* \o is a relatively new Perl feature, supporting a more general way of
1269     specifying character codes in octal. The only supported form is \o{ddd}. */
1270
1271     case CHAR_o:
1272     if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1273     if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
1274       {
1275       ptr += 2;
1276       c = 0;
1277       overflow = FALSE;
1278       while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1279         {
1280         register pcre_uint32 cc = *ptr++;
1281         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1282 #ifdef COMPILE_PCRE32
1283         if (c >= 0x20000000l) { overflow = TRUE; break; }
1284 #endif
1285         c = (c << 3) + cc - CHAR_0 ;
1286 #if defined COMPILE_PCRE8
1287         if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1288 #elif defined COMPILE_PCRE16
1289         if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1290 #elif defined COMPILE_PCRE32
1291         if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1292 #endif
1293         }
1294       if (overflow)
1295         {
1296         while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1297         *errorcodeptr = ERR34;
1298         }
1299       else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1300         {
1301         if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1302         }
1303       else *errorcodeptr = ERR80;
1304       }
1305     break;
1306
1307     /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1308     numbers. Otherwise it is a lowercase x letter. */
1309
1310     case CHAR_x:
1311     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1312       {
1313       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1314         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1315         {
1316         c = 0;
1317         for (i = 0; i < 2; ++i)
1318           {
1319           register pcre_uint32 cc = *(++ptr);
1320 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1321           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1322           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1323 #else           /* EBCDIC coding */
1324           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1325           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1326 #endif
1327           }
1328         }
1329       }    /* End JavaScript handling */
1330
1331     /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1332     greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1333     digits. If not, { used to be treated as a data character. However, Perl
1334     seems to read hex digits up to the first non-such, and ignore the rest, so
1335     that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1336     now gives an error. */
1337
1338     else
1339       {
1340       if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1341         {
1342         ptr += 2;
1343         if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1344           {
1345           *errorcodeptr = ERR86;
1346           break;
1347           }
1348         c = 0;
1349         overflow = FALSE;
1350         while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1351           {
1352           register pcre_uint32 cc = *ptr++;
1353           if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1354
1355 #ifdef COMPILE_PCRE32
1356           if (c >= 0x10000000l) { overflow = TRUE; break; }
1357 #endif
1358
1359 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1360           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1361           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1362 #else           /* EBCDIC coding */
1363           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1364           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1365 #endif
1366
1367 #if defined COMPILE_PCRE8
1368           if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1369 #elif defined COMPILE_PCRE16
1370           if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1371 #elif defined COMPILE_PCRE32
1372           if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1373 #endif
1374           }
1375
1376         if (overflow)
1377           {
1378           while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1379           *errorcodeptr = ERR34;
1380           }
1381
1382         else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1383           {
1384           if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1385           }
1386
1387         /* If the sequence of hex digits does not end with '}', give an error.
1388         We used just to recognize this construct and fall through to the normal
1389         \x handling, but nowadays Perl gives an error, which seems much more
1390         sensible, so we do too. */
1391
1392         else *errorcodeptr = ERR79;
1393         }   /* End of \x{} processing */
1394
1395       /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1396
1397       else
1398         {
1399         c = 0;
1400         while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1401           {
1402           pcre_uint32 cc;                          /* Some compilers don't like */
1403           cc = *(++ptr);                           /* ++ in initializers */
1404 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1405           if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
1406           c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1407 #else           /* EBCDIC coding */
1408           if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
1409           c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1410 #endif
1411           }
1412         }     /* End of \xdd handling */
1413       }       /* End of Perl-style \x handling */
1414     break;
1415
1416     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1417     An error is given if the byte following \c is not an ASCII character. This
1418     coding is ASCII-specific, but then the whole concept of \cx is
1419     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1420
1421     case CHAR_c:
1422     c = *(++ptr);
1423     if (c == CHAR_NULL)
1424       {
1425       *errorcodeptr = ERR2;
1426       break;
1427       }
1428 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
1429     if (c > 127)  /* Excludes all non-ASCII in either mode */
1430       {
1431       *errorcodeptr = ERR68;
1432       break;
1433       }
1434     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1435     c ^= 0x40;
1436 #else             /* EBCDIC coding */
1437     if (c >= CHAR_a && c <= CHAR_z) c += 64;
1438     if (c == CHAR_QUESTION_MARK)
1439       c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
1440     else
1441       {
1442       for (i = 0; i < 32; i++)
1443         {
1444         if (c == ebcdic_escape_c[i]) break;
1445         }
1446       if (i < 32) c = i; else *errorcodeptr = ERR68;
1447       }
1448 #endif
1449     break;
1450
1451     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1452     other alphanumeric following \ is an error if PCRE_EXTRA was set;
1453     otherwise, for Perl compatibility, it is a literal. This code looks a bit
1454     odd, but there used to be some cases other than the default, and there may
1455     be again in future, so I haven't "optimized" it. */
1456
1457     default:
1458     if ((options & PCRE_EXTRA) != 0) switch(c)
1459       {
1460       default:
1461       *errorcodeptr = ERR3;
1462       break;
1463       }
1464     break;
1465     }
1466   }
1467
1468 /* Perl supports \N{name} for character names, as well as plain \N for "not
1469 newline". PCRE does not support \N{name}. However, it does support
1470 quantification such as \N{2,3}. */
1471
1472 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1473      !is_counted_repeat(ptr+2))
1474   *errorcodeptr = ERR37;
1475
1476 /* If PCRE_UCP is set, we change the values for \d etc. */
1477
1478 if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1479   escape += (ESC_DU - ESC_D);
1480
1481 /* Set the pointer to the final character before returning. */
1482
1483 *ptrptr = ptr;
1484 *chptr = c;
1485 return escape;
1486 }
1487
1488
1489
1490 #ifdef SUPPORT_UCP
1491 /*************************************************
1492 *               Handle \P and \p                 *
1493 *************************************************/
1494
1495 /* This function is called after \P or \p has been encountered, provided that
1496 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1497 pointing at the P or p. On exit, it is pointing at the final character of the
1498 escape sequence.
1499
1500 Argument:
1501   ptrptr         points to the pattern position pointer
1502   negptr         points to a boolean that is set TRUE for negation else FALSE
1503   ptypeptr       points to an unsigned int that is set to the type value
1504   pdataptr       points to an unsigned int that is set to the detailed property value
1505   errorcodeptr   points to the error code variable
1506
1507 Returns:         TRUE if the type value was found, or FALSE for an invalid type
1508 */
1509
1510 static BOOL
1511 get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
1512   unsigned int *pdataptr, int *errorcodeptr)
1513 {
1514 pcre_uchar c;
1515 int i, bot, top;
1516 const pcre_uchar *ptr = *ptrptr;
1517 pcre_uchar name[32];
1518
1519 c = *(++ptr);
1520 if (c == CHAR_NULL) goto ERROR_RETURN;
1521
1522 *negptr = FALSE;
1523
1524 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1525 negation. */
1526
1527 if (c == CHAR_LEFT_CURLY_BRACKET)
1528   {
1529   if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1530     {
1531     *negptr = TRUE;
1532     ptr++;
1533     }
1534   for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1535     {
1536     c = *(++ptr);
1537     if (c == CHAR_NULL) goto ERROR_RETURN;
1538     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1539     name[i] = c;
1540     }
1541   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1542   name[i] = 0;
1543   }
1544
1545 /* Otherwise there is just one following character */
1546
1547 else
1548   {
1549   name[0] = c;
1550   name[1] = 0;
1551   }
1552
1553 *ptrptr = ptr;
1554
1555 /* Search for a recognized property name using binary chop */
1556
1557 bot = 0;
1558 top = PRIV(utt_size);
1559
1560 while (bot < top)
1561   {
1562   int r;
1563   i = (bot + top) >> 1;
1564   r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1565   if (r == 0)
1566     {
1567     *ptypeptr = PRIV(utt)[i].type;
1568     *pdataptr = PRIV(utt)[i].value;
1569     return TRUE;
1570     }
1571   if (r > 0) bot = i + 1; else top = i;
1572   }
1573
1574 *errorcodeptr = ERR47;
1575 *ptrptr = ptr;
1576 return FALSE;
1577
1578 ERROR_RETURN:
1579 *errorcodeptr = ERR46;
1580 *ptrptr = ptr;
1581 return FALSE;
1582 }
1583 #endif
1584
1585
1586
1587 /*************************************************
1588 *         Read repeat counts                     *
1589 *************************************************/
1590
1591 /* Read an item of the form {n,m} and return the values. This is called only
1592 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1593 so the syntax is guaranteed to be correct, but we need to check the values.
1594
1595 Arguments:
1596   p              pointer to first char after '{'
1597   minp           pointer to int for min
1598   maxp           pointer to int for max
1599                  returned as -1 if no max
1600   errorcodeptr   points to error code variable
1601
1602 Returns:         pointer to '}' on success;
1603                  current ptr on error, with errorcodeptr set non-zero
1604 */
1605
1606 static const pcre_uchar *
1607 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1608 {
1609 int min = 0;
1610 int max = -1;
1611
1612 while (IS_DIGIT(*p))
1613   {
1614   min = min * 10 + (int)(*p++ - CHAR_0);
1615   if (min > 65535)
1616     {
1617     *errorcodeptr = ERR5;
1618     return p;
1619     }
1620   }
1621
1622 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1623   {
1624   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1625     {
1626     max = 0;
1627     while(IS_DIGIT(*p))
1628       {
1629       max = max * 10 + (int)(*p++ - CHAR_0);
1630       if (max > 65535)
1631         {
1632         *errorcodeptr = ERR5;
1633         return p;
1634         }
1635       }
1636     if (max < min)
1637       {
1638       *errorcodeptr = ERR4;
1639       return p;
1640       }
1641     }
1642   }
1643
1644 *minp = min;
1645 *maxp = max;
1646 return p;
1647 }
1648
1649
1650
1651 /*************************************************
1652 *      Find first significant op code            *
1653 *************************************************/
1654
1655 /* This is called by several functions that scan a compiled expression looking
1656 for a fixed first character, or an anchoring op code etc. It skips over things
1657 that do not influence this. For some calls, it makes sense to skip negative
1658 forward and all backward assertions, and also the \b assertion; for others it
1659 does not.
1660
1661 Arguments:
1662   code         pointer to the start of the group
1663   skipassert   TRUE if certain assertions are to be skipped
1664
1665 Returns:       pointer to the first significant opcode
1666 */
1667
1668 static const pcre_uchar*
1669 first_significant_code(const pcre_uchar *code, BOOL skipassert)
1670 {
1671 for (;;)
1672   {
1673   switch ((int)*code)
1674     {
1675     case OP_ASSERT_NOT:
1676     case OP_ASSERTBACK:
1677     case OP_ASSERTBACK_NOT:
1678     if (!skipassert) return code;
1679     do code += GET(code, 1); while (*code == OP_ALT);
1680     code += PRIV(OP_lengths)[*code];
1681     break;
1682
1683     case OP_WORD_BOUNDARY:
1684     case OP_NOT_WORD_BOUNDARY:
1685     if (!skipassert) return code;
1686     /* Fall through */
1687
1688     case OP_CALLOUT:
1689     case OP_CREF:
1690     case OP_DNCREF:
1691     case OP_RREF:
1692     case OP_DNRREF:
1693     case OP_DEF:
1694     code += PRIV(OP_lengths)[*code];
1695     break;
1696
1697     default:
1698     return code;
1699     }
1700   }
1701 /* Control never reaches here */
1702 }
1703
1704
1705
1706 /*************************************************
1707 *        Find the fixed length of a branch       *
1708 *************************************************/
1709
1710 /* Scan a branch and compute the fixed length of subject that will match it,
1711 if the length is fixed. This is needed for dealing with backward assertions.
1712 In UTF8 mode, the result is in characters rather than bytes. The branch is
1713 temporarily terminated with OP_END when this function is called.
1714
1715 This function is called when a backward assertion is encountered, so that if it
1716 fails, the error message can point to the correct place in the pattern.
1717 However, we cannot do this when the assertion contains subroutine calls,
1718 because they can be forward references. We solve this by remembering this case
1719 and doing the check at the end; a flag specifies which mode we are running in.
1720
1721 Arguments:
1722   code     points to the start of the pattern (the bracket)
1723   utf      TRUE in UTF-8 / UTF-16 / UTF-32 mode
1724   atend    TRUE if called when the pattern is complete
1725   cd       the "compile data" structure
1726   recurses    chain of recurse_check to catch mutual recursion
1727
1728 Returns:   the fixed length,
1729              or -1 if there is no fixed length,
1730              or -2 if \C was encountered (in UTF-8 mode only)
1731              or -3 if an OP_RECURSE item was encountered and atend is FALSE
1732              or -4 if an unknown opcode was encountered (internal error)
1733 */
1734
1735 static int
1736 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd,
1737   recurse_check *recurses)
1738 {
1739 int length = -1;
1740 recurse_check this_recurse;
1741 register int branchlength = 0;
1742 register pcre_uchar *cc = code + 1 + LINK_SIZE;
1743
1744 /* Scan along the opcodes for this branch. If we get to the end of the
1745 branch, check the length against that of the other branches. */
1746
1747 for (;;)
1748   {
1749   int d;
1750   pcre_uchar *ce, *cs;
1751   register pcre_uchar op = *cc;
1752
1753   switch (op)
1754     {
1755     /* We only need to continue for OP_CBRA (normal capturing bracket) and
1756     OP_BRA (normal non-capturing bracket) because the other variants of these
1757     opcodes are all concerned with unlimited repeated groups, which of course
1758     are not of fixed length. */
1759
1760     case OP_CBRA:
1761     case OP_BRA:
1762     case OP_ONCE:
1763     case OP_ONCE_NC:
1764     case OP_COND:
1765     d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd,
1766       recurses);
1767     if (d < 0) return d;
1768     branchlength += d;
1769     do cc += GET(cc, 1); while (*cc == OP_ALT);
1770     cc += 1 + LINK_SIZE;
1771     break;
1772
1773     /* Reached end of a branch; if it's a ket it is the end of a nested call.
1774     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1775     an ALT. If it is END it's the end of the outer call. All can be handled by
1776     the same code. Note that we must not include the OP_KETRxxx opcodes here,
1777     because they all imply an unlimited repeat. */
1778
1779     case OP_ALT:
1780     case OP_KET:
1781     case OP_END:
1782     case OP_ACCEPT:
1783     case OP_ASSERT_ACCEPT:
1784     if (length < 0) length = branchlength;
1785       else if (length != branchlength) return -1;
1786     if (*cc != OP_ALT) return length;
1787     cc += 1 + LINK_SIZE;
1788     branchlength = 0;
1789     break;
1790
1791     /* A true recursion implies not fixed length, but a subroutine call may
1792     be OK. If the subroutine is a forward reference, we can't deal with
1793     it until the end of the pattern, so return -3. */
1794
1795     case OP_RECURSE:
1796     if (!atend) return -3;
1797     cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1798     do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1799     if (cc > cs && cc < ce) return -1;                    /* Recursion */
1800     else   /* Check for mutual recursion */
1801       {
1802       recurse_check *r = recurses;
1803       for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
1804       if (r != NULL) return -1;   /* Mutual recursion */
1805       }
1806     this_recurse.prev = recurses;
1807     this_recurse.group = cs;
1808     d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd, &this_recurse);
1809     if (d < 0) return d;
1810     branchlength += d;
1811     cc += 1 + LINK_SIZE;
1812     break;
1813
1814     /* Skip over assertive subpatterns */
1815
1816     case OP_ASSERT:
1817     case OP_ASSERT_NOT:
1818     case OP_ASSERTBACK:
1819     case OP_ASSERTBACK_NOT:
1820     do cc += GET(cc, 1); while (*cc == OP_ALT);
1821     cc += 1 + LINK_SIZE;
1822     break;
1823
1824     /* Skip over things that don't match chars */
1825
1826     case OP_MARK:
1827     case OP_PRUNE_ARG:
1828     case OP_SKIP_ARG:
1829     case OP_THEN_ARG:
1830     cc += cc[1] + PRIV(OP_lengths)[*cc];
1831     break;
1832
1833     case OP_CALLOUT:
1834     case OP_CIRC:
1835     case OP_CIRCM:
1836     case OP_CLOSE:
1837     case OP_COMMIT:
1838     case OP_CREF:
1839     case OP_DEF:
1840     case OP_DNCREF:
1841     case OP_DNRREF:
1842     case OP_DOLL:
1843     case OP_DOLLM:
1844     case OP_EOD:
1845     case OP_EODN:
1846     case OP_FAIL:
1847     case OP_NOT_WORD_BOUNDARY:
1848     case OP_PRUNE:
1849     case OP_REVERSE:
1850     case OP_RREF:
1851     case OP_SET_SOM:
1852     case OP_SKIP:
1853     case OP_SOD:
1854     case OP_SOM:
1855     case OP_THEN:
1856     case OP_WORD_BOUNDARY:
1857     cc += PRIV(OP_lengths)[*cc];
1858     break;
1859
1860     /* Handle literal characters */
1861
1862     case OP_CHAR:
1863     case OP_CHARI:
1864     case OP_NOT:
1865     case OP_NOTI:
1866     branchlength++;
1867     cc += 2;
1868 #ifdef SUPPORT_UTF
1869     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1870 #endif
1871     break;
1872
1873     /* Handle exact repetitions. The count is already in characters, but we
1874     need to skip over a multibyte character in UTF8 mode.  */
1875
1876     case OP_EXACT:
1877     case OP_EXACTI:
1878     case OP_NOTEXACT:
1879     case OP_NOTEXACTI:
1880     branchlength += (int)GET2(cc,1);
1881     cc += 2 + IMM2_SIZE;
1882 #ifdef SUPPORT_UTF
1883     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1884 #endif
1885     break;
1886
1887     case OP_TYPEEXACT:
1888     branchlength += GET2(cc,1);
1889     if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1890       cc += 2;
1891     cc += 1 + IMM2_SIZE + 1;
1892     break;
1893
1894     /* Handle single-char matchers */
1895
1896     case OP_PROP:
1897     case OP_NOTPROP:
1898     cc += 2;
1899     /* Fall through */
1900
1901     case OP_HSPACE:
1902     case OP_VSPACE:
1903     case OP_NOT_HSPACE:
1904     case OP_NOT_VSPACE:
1905     case OP_NOT_DIGIT:
1906     case OP_DIGIT:
1907     case OP_NOT_WHITESPACE:
1908     case OP_WHITESPACE:
1909     case OP_NOT_WORDCHAR:
1910     case OP_WORDCHAR:
1911     case OP_ANY:
1912     case OP_ALLANY:
1913     branchlength++;
1914     cc++;
1915     break;
1916
1917     /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1918     otherwise \C is coded as OP_ALLANY. */
1919
1920     case OP_ANYBYTE:
1921     return -2;
1922
1923     /* Check a class for variable quantification */
1924
1925     case OP_CLASS:
1926     case OP_NCLASS:
1927 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1928     case OP_XCLASS:
1929     /* The original code caused an unsigned overflow in 64 bit systems,
1930     so now we use a conditional statement. */
1931     if (op == OP_XCLASS)
1932       cc += GET(cc, 1);
1933     else
1934       cc += PRIV(OP_lengths)[OP_CLASS];
1935 #else
1936     cc += PRIV(OP_lengths)[OP_CLASS];
1937 #endif
1938
1939     switch (*cc)
1940       {
1941       case OP_CRSTAR:
1942       case OP_CRMINSTAR:
1943       case OP_CRPLUS:
1944       case OP_CRMINPLUS:
1945       case OP_CRQUERY:
1946       case OP_CRMINQUERY:
1947       case OP_CRPOSSTAR:
1948       case OP_CRPOSPLUS:
1949       case OP_CRPOSQUERY:
1950       return -1;
1951
1952       case OP_CRRANGE:
1953       case OP_CRMINRANGE:
1954       case OP_CRPOSRANGE:
1955       if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1956       branchlength += (int)GET2(cc,1);
1957       cc += 1 + 2 * IMM2_SIZE;
1958       break;
1959
1960       default:
1961       branchlength++;
1962       }
1963     break;
1964
1965     /* Anything else is variable length */
1966
1967     case OP_ANYNL:
1968     case OP_BRAMINZERO:
1969     case OP_BRAPOS:
1970     case OP_BRAPOSZERO:
1971     case OP_BRAZERO:
1972     case OP_CBRAPOS:
1973     case OP_EXTUNI:
1974     case OP_KETRMAX:
1975     case OP_KETRMIN:
1976     case OP_KETRPOS:
1977     case OP_MINPLUS:
1978     case OP_MINPLUSI:
1979     case OP_MINQUERY:
1980     case OP_MINQUERYI:
1981     case OP_MINSTAR:
1982     case OP_MINSTARI:
1983     case OP_MINUPTO:
1984     case OP_MINUPTOI:
1985     case OP_NOTMINPLUS:
1986     case OP_NOTMINPLUSI:
1987     case OP_NOTMINQUERY:
1988     case OP_NOTMINQUERYI:
1989     case OP_NOTMINSTAR:
1990     case OP_NOTMINSTARI:
1991     case OP_NOTMINUPTO:
1992     case OP_NOTMINUPTOI:
1993     case OP_NOTPLUS:
1994     case OP_NOTPLUSI:
1995     case OP_NOTPOSPLUS:
1996     case OP_NOTPOSPLUSI:
1997     case OP_NOTPOSQUERY:
1998     case OP_NOTPOSQUERYI:
1999     case OP_NOTPOSSTAR:
2000     case OP_NOTPOSSTARI:
2001     case OP_NOTPOSUPTO:
2002     case OP_NOTPOSUPTOI:
2003     case OP_NOTQUERY:
2004     case OP_NOTQUERYI:
2005     case OP_NOTSTAR:
2006     case OP_NOTSTARI:
2007     case OP_NOTUPTO:
2008     case OP_NOTUPTOI:
2009     case OP_PLUS:
2010     case OP_PLUSI:
2011     case OP_POSPLUS:
2012     case OP_POSPLUSI:
2013     case OP_POSQUERY:
2014     case OP_POSQUERYI:
2015     case OP_POSSTAR:
2016     case OP_POSSTARI:
2017     case OP_POSUPTO:
2018     case OP_POSUPTOI:
2019     case OP_QUERY:
2020     case OP_QUERYI:
2021     case OP_REF:
2022     case OP_REFI:
2023     case OP_DNREF:
2024     case OP_DNREFI:
2025     case OP_SBRA:
2026     case OP_SBRAPOS:
2027     case OP_SCBRA:
2028     case OP_SCBRAPOS:
2029     case OP_SCOND:
2030     case OP_SKIPZERO:
2031     case OP_STAR:
2032     case OP_STARI:
2033     case OP_TYPEMINPLUS:
2034     case OP_TYPEMINQUERY:
2035     case OP_TYPEMINSTAR:
2036     case OP_TYPEMINUPTO:
2037     case OP_TYPEPLUS:
2038     case OP_TYPEPOSPLUS:
2039     case OP_TYPEPOSQUERY:
2040     case OP_TYPEPOSSTAR:
2041     case OP_TYPEPOSUPTO:
2042     case OP_TYPEQUERY:
2043     case OP_TYPESTAR:
2044     case OP_TYPEUPTO:
2045     case OP_UPTO:
2046     case OP_UPTOI:
2047     return -1;
2048
2049     /* Catch unrecognized opcodes so that when new ones are added they
2050     are not forgotten, as has happened in the past. */
2051
2052     default:
2053     return -4;
2054     }
2055   }
2056 /* Control never gets here */
2057 }
2058
2059
2060
2061 /*************************************************
2062 *    Scan compiled regex for specific bracket    *
2063 *************************************************/
2064
2065 /* This little function scans through a compiled pattern until it finds a
2066 capturing bracket with the given number, or, if the number is negative, an
2067 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2068 so that it can be called from pcre_study() when finding the minimum matching
2069 length.
2070
2071 Arguments:
2072   code        points to start of expression
2073   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2074   number      the required bracket number or negative to find a lookbehind
2075
2076 Returns:      pointer to the opcode for the bracket, or NULL if not found
2077 */
2078
2079 const pcre_uchar *
2080 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2081 {
2082 for (;;)
2083   {
2084   register pcre_uchar c = *code;
2085
2086   if (c == OP_END) return NULL;
2087
2088   /* XCLASS is used for classes that cannot be represented just by a bit
2089   map. This includes negated single high-valued characters. The length in
2090   the table is zero; the actual length is stored in the compiled code. */
2091
2092   if (c == OP_XCLASS) code += GET(code, 1);
2093
2094   /* Handle recursion */
2095
2096   else if (c == OP_REVERSE)
2097     {
2098     if (number < 0) return (pcre_uchar *)code;
2099     code += PRIV(OP_lengths)[c];
2100     }
2101
2102   /* Handle capturing bracket */
2103
2104   else if (c == OP_CBRA || c == OP_SCBRA ||
2105            c == OP_CBRAPOS || c == OP_SCBRAPOS)
2106     {
2107     int n = (int)GET2(code, 1+LINK_SIZE);
2108     if (n == number) return (pcre_uchar *)code;
2109     code += PRIV(OP_lengths)[c];
2110     }
2111
2112   /* Otherwise, we can get the item's length from the table, except that for
2113   repeated character types, we have to test for \p and \P, which have an extra
2114   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2115   must add in its length. */
2116
2117   else
2118     {
2119     switch(c)
2120       {
2121       case OP_TYPESTAR:
2122       case OP_TYPEMINSTAR:
2123       case OP_TYPEPLUS:
2124       case OP_TYPEMINPLUS:
2125       case OP_TYPEQUERY:
2126       case OP_TYPEMINQUERY:
2127       case OP_TYPEPOSSTAR:
2128       case OP_TYPEPOSPLUS:
2129       case OP_TYPEPOSQUERY:
2130       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2131       break;
2132
2133       case OP_TYPEUPTO:
2134       case OP_TYPEMINUPTO:
2135       case OP_TYPEEXACT:
2136       case OP_TYPEPOSUPTO:
2137       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2138         code += 2;
2139       break;
2140
2141       case OP_MARK:
2142       case OP_PRUNE_ARG:
2143       case OP_SKIP_ARG:
2144       case OP_THEN_ARG:
2145       code += code[1];
2146       break;
2147       }
2148
2149     /* Add in the fixed length from the table */
2150
2151     code += PRIV(OP_lengths)[c];
2152
2153   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2154   a multi-byte character. The length in the table is a minimum, so we have to
2155   arrange to skip the extra bytes. */
2156
2157 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2158     if (utf) switch(c)
2159       {
2160       case OP_CHAR:
2161       case OP_CHARI:
2162       case OP_NOT:
2163       case OP_NOTI:
2164       case OP_EXACT:
2165       case OP_EXACTI:
2166       case OP_NOTEXACT:
2167       case OP_NOTEXACTI:
2168       case OP_UPTO:
2169       case OP_UPTOI:
2170       case OP_NOTUPTO:
2171       case OP_NOTUPTOI:
2172       case OP_MINUPTO:
2173       case OP_MINUPTOI:
2174       case OP_NOTMINUPTO:
2175       case OP_NOTMINUPTOI:
2176       case OP_POSUPTO:
2177       case OP_POSUPTOI:
2178       case OP_NOTPOSUPTO:
2179       case OP_NOTPOSUPTOI:
2180       case OP_STAR:
2181       case OP_STARI:
2182       case OP_NOTSTAR:
2183       case OP_NOTSTARI:
2184       case OP_MINSTAR:
2185       case OP_MINSTARI:
2186       case OP_NOTMINSTAR:
2187       case OP_NOTMINSTARI:
2188       case OP_POSSTAR:
2189       case OP_POSSTARI:
2190       case OP_NOTPOSSTAR:
2191       case OP_NOTPOSSTARI:
2192       case OP_PLUS:
2193       case OP_PLUSI:
2194       case OP_NOTPLUS:
2195       case OP_NOTPLUSI:
2196       case OP_MINPLUS:
2197       case OP_MINPLUSI:
2198       case OP_NOTMINPLUS:
2199       case OP_NOTMINPLUSI:
2200       case OP_POSPLUS:
2201       case OP_POSPLUSI:
2202       case OP_NOTPOSPLUS:
2203       case OP_NOTPOSPLUSI:
2204       case OP_QUERY:
2205       case OP_QUERYI:
2206       case OP_NOTQUERY:
2207       case OP_NOTQUERYI:
2208       case OP_MINQUERY:
2209       case OP_MINQUERYI:
2210       case OP_NOTMINQUERY:
2211       case OP_NOTMINQUERYI:
2212       case OP_POSQUERY:
2213       case OP_POSQUERYI:
2214       case OP_NOTPOSQUERY:
2215       case OP_NOTPOSQUERYI:
2216       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2217       break;
2218       }
2219 #else
2220     (void)(utf);  /* Keep compiler happy by referencing function argument */
2221 #endif
2222     }
2223   }
2224 }
2225
2226
2227
2228 /*************************************************
2229 *   Scan compiled regex for recursion reference  *
2230 *************************************************/
2231
2232 /* This little function scans through a compiled pattern until it finds an
2233 instance of OP_RECURSE.
2234
2235 Arguments:
2236   code        points to start of expression
2237   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2238
2239 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2240 */
2241
2242 static const pcre_uchar *
2243 find_recurse(const pcre_uchar *code, BOOL utf)
2244 {
2245 for (;;)
2246   {
2247   register pcre_uchar c = *code;
2248   if (c == OP_END) return NULL;
2249   if (c == OP_RECURSE) return code;
2250
2251   /* XCLASS is used for classes that cannot be represented just by a bit
2252   map. This includes negated single high-valued characters. The length in
2253   the table is zero; the actual length is stored in the compiled code. */
2254
2255   if (c == OP_XCLASS) code += GET(code, 1);
2256
2257   /* Otherwise, we can get the item's length from the table, except that for
2258   repeated character types, we have to test for \p and \P, which have an extra
2259   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2260   must add in its length. */
2261
2262   else
2263     {
2264     switch(c)
2265       {
2266       case OP_TYPESTAR:
2267       case OP_TYPEMINSTAR:
2268       case OP_TYPEPLUS:
2269       case OP_TYPEMINPLUS:
2270       case OP_TYPEQUERY:
2271       case OP_TYPEMINQUERY:
2272       case OP_TYPEPOSSTAR:
2273       case OP_TYPEPOSPLUS:
2274       case OP_TYPEPOSQUERY:
2275       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2276       break;
2277
2278       case OP_TYPEPOSUPTO:
2279       case OP_TYPEUPTO:
2280       case OP_TYPEMINUPTO:
2281       case OP_TYPEEXACT:
2282       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2283         code += 2;
2284       break;
2285
2286       case OP_MARK:
2287       case OP_PRUNE_ARG:
2288       case OP_SKIP_ARG:
2289       case OP_THEN_ARG:
2290       code += code[1];
2291       break;
2292       }
2293
2294     /* Add in the fixed length from the table */
2295
2296     code += PRIV(OP_lengths)[c];
2297
2298     /* In UTF-8 mode, opcodes that are followed by a character may be followed
2299     by a multi-byte character. The length in the table is a minimum, so we have
2300     to arrange to skip the extra bytes. */
2301
2302 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2303     if (utf) switch(c)
2304       {
2305       case OP_CHAR:
2306       case OP_CHARI:
2307       case OP_NOT:
2308       case OP_NOTI:
2309       case OP_EXACT:
2310       case OP_EXACTI:
2311       case OP_NOTEXACT:
2312       case OP_NOTEXACTI:
2313       case OP_UPTO:
2314       case OP_UPTOI:
2315       case OP_NOTUPTO:
2316       case OP_NOTUPTOI:
2317       case OP_MINUPTO:
2318       case OP_MINUPTOI:
2319       case OP_NOTMINUPTO:
2320       case OP_NOTMINUPTOI:
2321       case OP_POSUPTO:
2322       case OP_POSUPTOI:
2323       case OP_NOTPOSUPTO:
2324       case OP_NOTPOSUPTOI:
2325       case OP_STAR:
2326       case OP_STARI:
2327       case OP_NOTSTAR:
2328       case OP_NOTSTARI:
2329       case OP_MINSTAR:
2330       case OP_MINSTARI:
2331       case OP_NOTMINSTAR:
2332       case OP_NOTMINSTARI:
2333       case OP_POSSTAR:
2334       case OP_POSSTARI:
2335       case OP_NOTPOSSTAR:
2336       case OP_NOTPOSSTARI:
2337       case OP_PLUS:
2338       case OP_PLUSI:
2339       case OP_NOTPLUS:
2340       case OP_NOTPLUSI:
2341       case OP_MINPLUS:
2342       case OP_MINPLUSI:
2343       case OP_NOTMINPLUS:
2344       case OP_NOTMINPLUSI:
2345       case OP_POSPLUS:
2346       case OP_POSPLUSI:
2347       case OP_NOTPOSPLUS:
2348       case OP_NOTPOSPLUSI:
2349       case OP_QUERY:
2350       case OP_QUERYI:
2351       case OP_NOTQUERY:
2352       case OP_NOTQUERYI:
2353       case OP_MINQUERY:
2354       case OP_MINQUERYI:
2355       case OP_NOTMINQUERY:
2356       case OP_NOTMINQUERYI:
2357       case OP_POSQUERY:
2358       case OP_POSQUERYI:
2359       case OP_NOTPOSQUERY:
2360       case OP_NOTPOSQUERYI:
2361       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2362       break;
2363       }
2364 #else
2365     (void)(utf);  /* Keep compiler happy by referencing function argument */
2366 #endif
2367     }
2368   }
2369 }
2370
2371
2372
2373 /*************************************************
2374 *    Scan compiled branch for non-emptiness      *
2375 *************************************************/
2376
2377 /* This function scans through a branch of a compiled pattern to see whether it
2378 can match the empty string or not. It is called from could_be_empty()
2379 below and from compile_branch() when checking for an unlimited repeat of a
2380 group that can match nothing. Note that first_significant_code() skips over
2381 backward and negative forward assertions when its final argument is TRUE. If we
2382 hit an unclosed bracket, we return "empty" - this means we've struck an inner
2383 bracket whose current branch will already have been scanned.
2384
2385 Arguments:
2386   code        points to start of search
2387   endcode     points to where to stop
2388   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2389   cd          contains pointers to tables etc.
2390   recurses    chain of recurse_check to catch mutual recursion
2391
2392 Returns:      TRUE if what is matched could be empty
2393 */
2394
2395 static BOOL
2396 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2397   BOOL utf, compile_data *cd, recurse_check *recurses)
2398 {
2399 register pcre_uchar c;
2400 recurse_check this_recurse;
2401
2402 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2403      code < endcode;
2404      code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2405   {
2406   const pcre_uchar *ccode;
2407
2408   c = *code;
2409
2410   /* Skip over forward assertions; the other assertions are skipped by
2411   first_significant_code() with a TRUE final argument. */
2412
2413   if (c == OP_ASSERT)
2414     {
2415     do code += GET(code, 1); while (*code == OP_ALT);
2416     c = *code;
2417     continue;
2418     }
2419
2420   /* For a recursion/subroutine call, if its end has been reached, which
2421   implies a backward reference subroutine call, we can scan it. If it's a
2422   forward reference subroutine call, we can't. To detect forward reference
2423   we have to scan up the list that is kept in the workspace. This function is
2424   called only when doing the real compile, not during the pre-compile that
2425   measures the size of the compiled pattern. */
2426
2427   if (c == OP_RECURSE)
2428     {
2429     const pcre_uchar *scode = cd->start_code + GET(code, 1);
2430     const pcre_uchar *endgroup = scode;
2431     BOOL empty_branch;
2432
2433     /* Test for forward reference or uncompleted reference. This is disabled
2434     when called to scan a completed pattern by setting cd->start_workspace to
2435     NULL. */
2436
2437     if (cd->start_workspace != NULL)
2438       {
2439       const pcre_uchar *tcode;
2440       for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2441         if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2442       if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2443       }
2444
2445     /* If the reference is to a completed group, we need to detect whether this
2446     is a recursive call, as otherwise there will be an infinite loop. If it is
2447     a recursion, just skip over it. Simple recursions are easily detected. For
2448     mutual recursions we keep a chain on the stack. */
2449
2450     do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2451     if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
2452     else
2453       {
2454       recurse_check *r = recurses;
2455       for (r = recurses; r != NULL; r = r->prev)
2456         if (r->group == scode) break;
2457       if (r != NULL) continue;   /* Mutual recursion */
2458       }
2459
2460     /* Completed reference; scan the referenced group, remembering it on the
2461     stack chain to detect mutual recursions. */
2462
2463     empty_branch = FALSE;
2464     this_recurse.prev = recurses;
2465     this_recurse.group = scode;
2466
2467     do
2468       {
2469       if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2470         {
2471         empty_branch = TRUE;
2472         break;
2473         }
2474       scode += GET(scode, 1);
2475       }
2476     while (*scode == OP_ALT);
2477
2478     if (!empty_branch) return FALSE;  /* All branches are non-empty */
2479     continue;
2480     }
2481
2482   /* Groups with zero repeats can of course be empty; skip them. */
2483
2484   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2485       c == OP_BRAPOSZERO)
2486     {
2487     code += PRIV(OP_lengths)[c];
2488     do code += GET(code, 1); while (*code == OP_ALT);
2489     c = *code;
2490     continue;
2491     }
2492
2493   /* A nested group that is already marked as "could be empty" can just be
2494   skipped. */
2495
2496   if (c == OP_SBRA  || c == OP_SBRAPOS ||
2497       c == OP_SCBRA || c == OP_SCBRAPOS)
2498     {
2499     do code += GET(code, 1); while (*code == OP_ALT);
2500     c = *code;
2501     continue;
2502     }
2503
2504   /* For other groups, scan the branches. */
2505
2506   if (c == OP_BRA  || c == OP_BRAPOS ||
2507       c == OP_CBRA || c == OP_CBRAPOS ||
2508       c == OP_ONCE || c == OP_ONCE_NC ||
2509       c == OP_COND || c == OP_SCOND)
2510     {
2511     BOOL empty_branch;
2512     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
2513
2514     /* If a conditional group has only one branch, there is a second, implied,
2515     empty branch, so just skip over the conditional, because it could be empty.
2516     Otherwise, scan the individual branches of the group. */
2517
2518     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2519       code += GET(code, 1);
2520     else
2521       {
2522       empty_branch = FALSE;
2523       do
2524         {
2525         if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd,
2526           recurses)) empty_branch = TRUE;
2527         code += GET(code, 1);
2528         }
2529       while (*code == OP_ALT);
2530       if (!empty_branch) return FALSE;   /* All branches are non-empty */
2531       }
2532
2533     c = *code;
2534     continue;
2535     }
2536
2537   /* Handle the other opcodes */
2538
2539   switch (c)
2540     {
2541     /* Check for quantifiers after a class. XCLASS is used for classes that
2542     cannot be represented just by a bit map. This includes negated single
2543     high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2544     actual length is stored in the compiled code, so we must update "code"
2545     here. */
2546
2547 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2548     case OP_XCLASS:
2549     ccode = code += GET(code, 1);
2550     goto CHECK_CLASS_REPEAT;
2551 #endif
2552
2553     case OP_CLASS:
2554     case OP_NCLASS:
2555     ccode = code + PRIV(OP_lengths)[OP_CLASS];
2556
2557 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2558     CHECK_CLASS_REPEAT:
2559 #endif
2560
2561     switch (*ccode)
2562       {
2563       case OP_CRSTAR:            /* These could be empty; continue */
2564       case OP_CRMINSTAR:
2565       case OP_CRQUERY:
2566       case OP_CRMINQUERY:
2567       case OP_CRPOSSTAR:
2568       case OP_CRPOSQUERY:
2569       break;
2570
2571       default:                   /* Non-repeat => class must match */
2572       case OP_CRPLUS:            /* These repeats aren't empty */
2573       case OP_CRMINPLUS:
2574       case OP_CRPOSPLUS:
2575       return FALSE;
2576
2577       case OP_CRRANGE:
2578       case OP_CRMINRANGE:
2579       case OP_CRPOSRANGE:
2580       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2581       break;
2582       }
2583     break;
2584
2585     /* Opcodes that must match a character */
2586
2587     case OP_ANY:
2588     case OP_ALLANY:
2589     case OP_ANYBYTE:
2590
2591     case OP_PROP:
2592     case OP_NOTPROP:
2593     case OP_ANYNL:
2594
2595     case OP_NOT_HSPACE:
2596     case OP_HSPACE:
2597     case OP_NOT_VSPACE:
2598     case OP_VSPACE:
2599     case OP_EXTUNI:
2600
2601     case OP_NOT_DIGIT:
2602     case OP_DIGIT:
2603     case OP_NOT_WHITESPACE:
2604     case OP_WHITESPACE:
2605     case OP_NOT_WORDCHAR:
2606     case OP_WORDCHAR:
2607
2608     case OP_CHAR:
2609     case OP_CHARI:
2610     case OP_NOT:
2611     case OP_NOTI:
2612
2613     case OP_PLUS:
2614     case OP_PLUSI:
2615     case OP_MINPLUS:
2616     case OP_MINPLUSI:
2617
2618     case OP_NOTPLUS:
2619     case OP_NOTPLUSI:
2620     case OP_NOTMINPLUS:
2621     case OP_NOTMINPLUSI:
2622
2623     case OP_POSPLUS:
2624     case OP_POSPLUSI:
2625     case OP_NOTPOSPLUS:
2626     case OP_NOTPOSPLUSI:
2627
2628     case OP_EXACT:
2629     case OP_EXACTI:
2630     case OP_NOTEXACT:
2631     case OP_NOTEXACTI:
2632
2633     case OP_TYPEPLUS:
2634     case OP_TYPEMINPLUS:
2635     case OP_TYPEPOSPLUS:
2636     case OP_TYPEEXACT:
2637
2638     return FALSE;
2639
2640     /* These are going to continue, as they may be empty, but we have to
2641     fudge the length for the \p and \P cases. */
2642
2643     case OP_TYPESTAR:
2644     case OP_TYPEMINSTAR:
2645     case OP_TYPEPOSSTAR:
2646     case OP_TYPEQUERY:
2647     case OP_TYPEMINQUERY:
2648     case OP_TYPEPOSQUERY:
2649     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2650     break;
2651
2652     /* Same for these */
2653
2654     case OP_TYPEUPTO:
2655     case OP_TYPEMINUPTO:
2656     case OP_TYPEPOSUPTO:
2657     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2658       code += 2;
2659     break;
2660
2661     /* End of branch */
2662
2663     case OP_KET:
2664     case OP_KETRMAX:
2665     case OP_KETRMIN:
2666     case OP_KETRPOS:
2667     case OP_ALT:
2668     return TRUE;
2669
2670     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2671     MINUPTO, and POSUPTO and their caseless and negative versions may be
2672     followed by a multibyte character. */
2673
2674 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2675     case OP_STAR:
2676     case OP_STARI:
2677     case OP_NOTSTAR:
2678     case OP_NOTSTARI:
2679
2680     case OP_MINSTAR:
2681     case OP_MINSTARI:
2682     case OP_NOTMINSTAR:
2683     case OP_NOTMINSTARI:
2684
2685     case OP_POSSTAR:
2686     case OP_POSSTARI:
2687     case OP_NOTPOSSTAR:
2688     case OP_NOTPOSSTARI:
2689
2690     case OP_QUERY:
2691     case OP_QUERYI:
2692     case OP_NOTQUERY:
2693     case OP_NOTQUERYI:
2694
2695     case OP_MINQUERY:
2696     case OP_MINQUERYI:
2697     case OP_NOTMINQUERY:
2698     case OP_NOTMINQUERYI:
2699
2700     case OP_POSQUERY:
2701     case OP_POSQUERYI:
2702     case OP_NOTPOSQUERY:
2703     case OP_NOTPOSQUERYI:
2704
2705     if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2706     break;
2707
2708     case OP_UPTO:
2709     case OP_UPTOI:
2710     case OP_NOTUPTO:
2711     case OP_NOTUPTOI:
2712
2713     case OP_MINUPTO:
2714     case OP_MINUPTOI:
2715     case OP_NOTMINUPTO:
2716     case OP_NOTMINUPTOI:
2717
2718     case OP_POSUPTO:
2719     case OP_POSUPTOI:
2720     case OP_NOTPOSUPTO:
2721     case OP_NOTPOSUPTOI:
2722
2723     if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2724     break;
2725 #endif
2726
2727     /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2728     string. */
2729
2730     case OP_MARK:
2731     case OP_PRUNE_ARG:
2732     case OP_SKIP_ARG:
2733     case OP_THEN_ARG:
2734     code += code[1];
2735     break;
2736
2737     /* None of the remaining opcodes are required to match a character. */
2738
2739     default:
2740     break;
2741     }
2742   }
2743
2744 return TRUE;
2745 }
2746
2747
2748
2749 /*************************************************
2750 *    Scan compiled regex for non-emptiness       *
2751 *************************************************/
2752
2753 /* This function is called to check for left recursive calls. We want to check
2754 the current branch of the current pattern to see if it could match the empty
2755 string. If it could, we must look outwards for branches at other levels,
2756 stopping when we pass beyond the bracket which is the subject of the recursion.
2757 This function is called only during the real compile, not during the
2758 pre-compile.
2759
2760 Arguments:
2761   code        points to start of the recursion
2762   endcode     points to where to stop (current RECURSE item)
2763   bcptr       points to the chain of current (unclosed) branch starts
2764   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2765   cd          pointers to tables etc
2766
2767 Returns:      TRUE if what is matched could be empty
2768 */
2769
2770 static BOOL
2771 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2772   branch_chain *bcptr, BOOL utf, compile_data *cd)
2773 {
2774 while (bcptr != NULL && bcptr->current_branch >= code)
2775   {
2776   if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2777     return FALSE;
2778   bcptr = bcptr->outer;
2779   }
2780 return TRUE;
2781 }
2782
2783
2784
2785 /*************************************************
2786 *        Base opcode of repeated opcodes         *
2787 *************************************************/
2788
2789 /* Returns the base opcode for repeated single character type opcodes. If the
2790 opcode is not a repeated character type, it returns with the original value.
2791
2792 Arguments:  c opcode
2793 Returns:    base opcode for the type
2794 */
2795
2796 static pcre_uchar
2797 get_repeat_base(pcre_uchar c)
2798 {
2799 return (c > OP_TYPEPOSUPTO)? c :
2800        (c >= OP_TYPESTAR)?   OP_TYPESTAR :
2801        (c >= OP_NOTSTARI)?   OP_NOTSTARI :
2802        (c >= OP_NOTSTAR)?    OP_NOTSTAR :
2803        (c >= OP_STARI)?      OP_STARI :
2804                              OP_STAR;
2805 }
2806
2807
2808
2809 #ifdef SUPPORT_UCP
2810 /*************************************************
2811 *        Check a character and a property        *
2812 *************************************************/
2813
2814 /* This function is called by check_auto_possessive() when a property item
2815 is adjacent to a fixed character.
2816
2817 Arguments:
2818   c            the character
2819   ptype        the property type
2820   pdata        the data for the type
2821   negated      TRUE if it's a negated property (\P or \p{^)
2822
2823 Returns:       TRUE if auto-possessifying is OK
2824 */
2825
2826 static BOOL
2827 check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
2828   BOOL negated)
2829 {
2830 const pcre_uint32 *p;
2831 const ucd_record *prop = GET_UCD(c);
2832
2833 switch(ptype)
2834   {
2835   case PT_LAMP:
2836   return (prop->chartype == ucp_Lu ||
2837           prop->chartype == ucp_Ll ||
2838           prop->chartype == ucp_Lt) == negated;
2839
2840   case PT_GC:
2841   return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2842
2843   case PT_PC:
2844   return (pdata == prop->chartype) == negated;
2845
2846   case PT_SC:
2847   return (pdata == prop->script) == negated;
2848
2849   /* These are specials */
2850
2851   case PT_ALNUM:
2852   return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2853           PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2854
2855   /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2856   means that Perl space and POSIX space are now identical. PCRE was changed
2857   at release 8.34. */
2858
2859   case PT_SPACE:    /* Perl space */
2860   case PT_PXSPACE:  /* POSIX space */
2861   switch(c)
2862     {
2863     HSPACE_CASES:
2864     VSPACE_CASES:
2865     return negated;
2866
2867     default:
2868     return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
2869     }
2870   break;  /* Control never reaches here */
2871
2872   case PT_WORD:
2873   return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2874           PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2875           c == CHAR_UNDERSCORE) == negated;
2876
2877   case PT_CLIST:
2878   p = PRIV(ucd_caseless_sets) + prop->caseset;
2879   for (;;)
2880     {
2881     if (c < *p) return !negated;
2882     if (c == *p++) return negated;
2883     }
2884   break;  /* Control never reaches here */
2885   }
2886
2887 return FALSE;
2888 }
2889 #endif  /* SUPPORT_UCP */
2890
2891
2892
2893 /*************************************************
2894 *        Fill the character property list        *
2895 *************************************************/
2896
2897 /* Checks whether the code points to an opcode that can take part in auto-
2898 possessification, and if so, fills a list with its properties.
2899
2900 Arguments:
2901   code        points to start of expression
2902   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2903   fcc         points to case-flipping table
2904   list        points to output list
2905               list[0] will be filled with the opcode
2906               list[1] will be non-zero if this opcode
2907                 can match an empty character string
2908               list[2..7] depends on the opcode
2909
2910 Returns:      points to the start of the next opcode if *code is accepted
2911               NULL if *code is not accepted
2912 */
2913
2914 static const pcre_uchar *
2915 get_chr_property_list(const pcre_uchar *code, BOOL utf,
2916   const pcre_uint8 *fcc, pcre_uint32 *list)
2917 {
2918 pcre_uchar c = *code;
2919 pcre_uchar base;
2920 const pcre_uchar *end;
2921 pcre_uint32 chr;
2922
2923 #ifdef SUPPORT_UCP
2924 pcre_uint32 *clist_dest;
2925 const pcre_uint32 *clist_src;
2926 #else
2927 utf = utf;  /* Suppress "unused parameter" compiler warning */
2928 #endif
2929
2930 list[0] = c;
2931 list[1] = FALSE;
2932 code++;
2933
2934 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2935   {
2936   base = get_repeat_base(c);
2937   c -= (base - OP_STAR);
2938
2939   if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2940     code += IMM2_SIZE;
2941
2942   list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2943
2944   switch(base)
2945     {
2946     case OP_STAR:
2947     list[0] = OP_CHAR;
2948     break;
2949
2950     case OP_STARI:
2951     list[0] = OP_CHARI;
2952     break;
2953
2954     case OP_NOTSTAR:
2955     list[0] = OP_NOT;
2956     break;
2957
2958     case OP_NOTSTARI:
2959     list[0] = OP_NOTI;
2960     break;
2961
2962     case OP_TYPESTAR:
2963     list[0] = *code;
2964     code++;
2965     break;
2966     }
2967   c = list[0];
2968   }
2969
2970 switch(c)
2971   {
2972   case OP_NOT_DIGIT:
2973   case OP_DIGIT:
2974   case OP_NOT_WHITESPACE:
2975   case OP_WHITESPACE:
2976   case OP_NOT_WORDCHAR:
2977   case OP_WORDCHAR:
2978   case OP_ANY:
2979   case OP_ALLANY:
2980   case OP_ANYNL:
2981   case OP_NOT_HSPACE:
2982   case OP_HSPACE:
2983   case OP_NOT_VSPACE:
2984   case OP_VSPACE:
2985   case OP_EXTUNI:
2986   case OP_EODN:
2987   case OP_EOD:
2988   case OP_DOLL:
2989   case OP_DOLLM:
2990   return code;
2991
2992   case OP_CHAR:
2993   case OP_NOT:
2994   GETCHARINCTEST(chr, code);
2995   list[2] = chr;
2996   list[3] = NOTACHAR;
2997   return code;
2998
2999   case OP_CHARI:
3000   case OP_NOTI:
3001   list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
3002   GETCHARINCTEST(chr, code);
3003   list[2] = chr;
3004
3005 #ifdef SUPPORT_UCP
3006   if (chr < 128 || (chr < 256 && !utf))
3007     list[3] = fcc[chr];
3008   else
3009     list[3] = UCD_OTHERCASE(chr);
3010 #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
3011   list[3] = (chr < 256) ? fcc[chr] : chr;
3012 #else
3013   list[3] = fcc[chr];
3014 #endif
3015
3016   /* The othercase might be the same value. */
3017
3018   if (chr == list[3])
3019     list[3] = NOTACHAR;
3020   else
3021     list[4] = NOTACHAR;
3022   return code;
3023
3024 #ifdef SUPPORT_UCP
3025   case OP_PROP:
3026   case OP_NOTPROP:
3027   if (code[0] != PT_CLIST)
3028     {
3029     list[2] = code[0];
3030     list[3] = code[1];
3031     return code + 2;
3032     }
3033
3034   /* Convert only if we have enough space. */
3035
3036   clist_src = PRIV(ucd_caseless_sets) + code[1];
3037   clist_dest = list + 2;
3038   code += 2;
3039
3040   do {
3041      if (clist_dest >= list + 8)
3042        {
3043        /* Early return if there is not enough space. This should never
3044        happen, since all clists are shorter than 5 character now. */
3045        list[2] = code[0];
3046        list[3] = code[1];
3047        return code;
3048        }
3049      *clist_dest++ = *clist_src;
3050      }
3051   while(*clist_src++ != NOTACHAR);
3052
3053   /* All characters are stored. The terminating NOTACHAR
3054   is copied form the clist itself. */
3055
3056   list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
3057   return code;
3058 #endif
3059
3060   case OP_NCLASS:
3061   case OP_CLASS:
3062 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3063   case OP_XCLASS:
3064   if (c == OP_XCLASS)
3065     end = code + GET(code, 0) - 1;
3066   else
3067 #endif
3068     end = code + 32 / sizeof(pcre_uchar);
3069
3070   switch(*end)
3071     {
3072     case OP_CRSTAR:
3073     case OP_CRMINSTAR:
3074     case OP_CRQUERY:
3075     case OP_CRMINQUERY:
3076     case OP_CRPOSSTAR:
3077     case OP_CRPOSQUERY:
3078     list[1] = TRUE;
3079     end++;
3080     break;
3081
3082     case OP_CRPLUS:
3083     case OP_CRMINPLUS:
3084     case OP_CRPOSPLUS:
3085     end++;
3086     break;
3087
3088     case OP_CRRANGE:
3089     case OP_CRMINRANGE:
3090     case OP_CRPOSRANGE:
3091     list[1] = (GET2(end, 1) == 0);
3092     end += 1 + 2 * IMM2_SIZE;
3093     break;
3094     }
3095   list[2] = (pcre_uint32)(end - code);
3096   return end;
3097   }
3098 return NULL;    /* Opcode not accepted */
3099 }
3100
3101
3102
3103 /*************************************************
3104 *    Scan further character sets for match       *
3105 *************************************************/
3106
3107 /* Checks whether the base and the current opcode have a common character, in
3108 which case the base cannot be possessified.
3109
3110 Arguments:
3111   code        points to the byte code
3112   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
3113   cd          static compile data
3114   base_list   the data list of the base opcode
3115
3116 Returns:      TRUE if the auto-possessification is possible
3117 */
3118
3119 static BOOL
3120 compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3121   const pcre_uint32 *base_list, const pcre_uchar *base_end, int *rec_limit)
3122 {
3123 pcre_uchar c;
3124 pcre_uint32 list[8];
3125 const pcre_uint32 *chr_ptr;
3126 const pcre_uint32 *ochr_ptr;
3127 const pcre_uint32 *list_ptr;
3128 const pcre_uchar *next_code;
3129 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3130 const pcre_uchar *xclass_flags;
3131 #endif
3132 const pcre_uint8 *class_bitset;
3133 const pcre_uint8 *set1, *set2, *set_end;
3134 pcre_uint32 chr;
3135 BOOL accepted, invert_bits;
3136 BOOL entered_a_group = FALSE;
3137
3138 if (*rec_limit == 0) return FALSE;
3139 --(*rec_limit);
3140
3141 /* Note: the base_list[1] contains whether the current opcode has greedy
3142 (represented by a non-zero value) quantifier. This is a different from
3143 other character type lists, which stores here that the character iterator
3144 matches to an empty string (also represented by a non-zero value). */
3145
3146 for(;;)
3147   {
3148   /* All operations move the code pointer forward.
3149   Therefore infinite recursions are not possible. */
3150
3151   c = *code;
3152
3153   /* Skip over callouts */
3154
3155   if (c == OP_CALLOUT)
3156     {
3157     code += PRIV(OP_lengths)[c];
3158     continue;
3159     }
3160
3161   if (c == OP_ALT)
3162     {
3163     do code += GET(code, 1); while (*code == OP_ALT);
3164     c = *code;
3165     }
3166
3167   switch(c)
3168     {
3169     case OP_END:
3170     case OP_KETRPOS:
3171     /* TRUE only in greedy case. The non-greedy case could be replaced by
3172     an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3173     uses more memory, which we cannot get at this stage.) */
3174
3175     return base_list[1] != 0;
3176
3177     case OP_KET:
3178     /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3179     it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3180     cannot be converted to a possessive form. */
3181
3182     if (base_list[1] == 0) return FALSE;
3183
3184     switch(*(code - GET(code, 1)))
3185       {
3186       case OP_ASSERT:
3187       case OP_ASSERT_NOT:
3188       case OP_ASSERTBACK:
3189       case OP_ASSERTBACK_NOT:
3190       case OP_ONCE:
3191       case OP_ONCE_NC:
3192       /* Atomic sub-patterns and assertions can always auto-possessify their
3193       last iterator. However, if the group was entered as a result of checking
3194       a previous iterator, this is not possible. */
3195
3196       return !entered_a_group;
3197       }
3198
3199     code += PRIV(OP_lengths)[c];
3200     continue;
3201
3202     case OP_ONCE:
3203     case OP_ONCE_NC:
3204     case OP_BRA:
3205     case OP_CBRA:
3206     next_code = code + GET(code, 1);
3207     code += PRIV(OP_lengths)[c];
3208
3209     while (*next_code == OP_ALT)
3210       {
3211       if (!compare_opcodes(code, utf, cd, base_list, base_end, rec_limit))
3212         return FALSE;
3213       code = next_code + 1 + LINK_SIZE;
3214       next_code += GET(next_code, 1);
3215       }
3216
3217     entered_a_group = TRUE;
3218     continue;
3219
3220     case OP_BRAZERO:
3221     case OP_BRAMINZERO:
3222
3223     next_code = code + 1;
3224     if (*next_code != OP_BRA && *next_code != OP_CBRA
3225         && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3226
3227     do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3228
3229     /* The bracket content will be checked by the
3230     OP_BRA/OP_CBRA case above. */
3231     next_code += 1 + LINK_SIZE;
3232     if (!compare_opcodes(next_code, utf, cd, base_list, base_end, rec_limit))
3233       return FALSE;
3234
3235     code += PRIV(OP_lengths)[c];
3236     continue;
3237
3238     default:
3239     break;
3240     }
3241
3242   /* Check for a supported opcode, and load its properties. */
3243
3244   code = get_chr_property_list(code, utf, cd->fcc, list);
3245   if (code == NULL) return FALSE;    /* Unsupported */
3246
3247   /* If either opcode is a small character list, set pointers for comparing
3248   characters from that list with another list, or with a property. */
3249
3250   if (base_list[0] == OP_CHAR)
3251     {
3252     chr_ptr = base_list + 2;
3253     list_ptr = list;
3254     }
3255   else if (list[0] == OP_CHAR)
3256     {
3257     chr_ptr = list + 2;
3258     list_ptr = base_list;
3259     }
3260
3261   /* Character bitsets can also be compared to certain opcodes. */
3262
3263   else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3264 #ifdef COMPILE_PCRE8
3265       /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3266       || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3267 #endif
3268       )
3269     {
3270 #ifdef COMPILE_PCRE8
3271     if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3272 #else
3273     if (base_list[0] == OP_CLASS)
3274 #endif
3275       {
3276       set1 = (pcre_uint8 *)(base_end - base_list[2]);
3277       list_ptr = list;
3278       }
3279     else
3280       {
3281       set1 = (pcre_uint8 *)(code - list[2]);
3282       list_ptr = base_list;
3283       }
3284
3285     invert_bits = FALSE;
3286     switch(list_ptr[0])
3287       {
3288       case OP_CLASS:
3289       case OP_NCLASS:
3290       set2 = (pcre_uint8 *)
3291         ((list_ptr == list ? code : base_end) - list_ptr[2]);
3292       break;
3293
3294 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3295       case OP_XCLASS:
3296       xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
3297       if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
3298       if ((*xclass_flags & XCL_MAP) == 0)
3299         {
3300         /* No bits are set for characters < 256. */
3301         if (list[1] == 0) return TRUE;
3302         /* Might be an empty repeat. */
3303         continue;
3304         }
3305       set2 = (pcre_uint8 *)(xclass_flags + 1);
3306       break;
3307 #endif
3308
3309       case OP_NOT_DIGIT:
3310       invert_bits = TRUE;
3311       /* Fall through */
3312       case OP_DIGIT:
3313       set2 = (pcre_uint8 *)(cd->cbits + cbit_digit);
3314       break;
3315
3316       case OP_NOT_WHITESPACE:
3317       invert_bits = TRUE;
3318       /* Fall through */
3319       case OP_WHITESPACE:
3320       set2 = (pcre_uint8 *)(cd->cbits + cbit_space);
3321       break;
3322
3323       case OP_NOT_WORDCHAR:
3324       invert_bits = TRUE;
3325       /* Fall through */
3326       case OP_WORDCHAR:
3327       set2 = (pcre_uint8 *)(cd->cbits + cbit_word);
3328       break;
3329
3330       default:
3331       return FALSE;
3332       }
3333
3334     /* Because the sets are unaligned, we need
3335     to perform byte comparison here. */
3336     set_end = set1 + 32;
3337     if (invert_bits)
3338       {
3339       do
3340         {
3341         if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3342         }
3343       while (set1 < set_end);
3344       }
3345     else
3346       {
3347       do
3348         {
3349         if ((*set1++ & *set2++) != 0) return FALSE;
3350         }
3351       while (set1 < set_end);
3352       }
3353
3354     if (list[1] == 0) return TRUE;
3355     /* Might be an empty repeat. */
3356     continue;
3357     }
3358
3359   /* Some property combinations also acceptable. Unicode property opcodes are
3360   processed specially; the rest can be handled with a lookup table. */
3361
3362   else
3363     {
3364     pcre_uint32 leftop, rightop;
3365
3366     leftop = base_list[0];
3367     rightop = list[0];
3368
3369 #ifdef SUPPORT_UCP
3370     accepted = FALSE; /* Always set in non-unicode case. */
3371     if (leftop == OP_PROP || leftop == OP_NOTPROP)
3372       {
3373       if (rightop == OP_EOD)
3374         accepted = TRUE;
3375       else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3376         {
3377         int n;
3378         const pcre_uint8 *p;
3379         BOOL same = leftop == rightop;
3380         BOOL lisprop = leftop == OP_PROP;
3381         BOOL risprop = rightop == OP_PROP;
3382         BOOL bothprop = lisprop && risprop;
3383
3384         /* There's a table that specifies how each combination is to be
3385         processed:
3386           0   Always return FALSE (never auto-possessify)
3387           1   Character groups are distinct (possessify if both are OP_PROP)
3388           2   Check character categories in the same group (general or particular)
3389           3   Return TRUE if the two opcodes are not the same
3390           ... see comments below
3391         */
3392
3393         n = propposstab[base_list[2]][list[2]];
3394         switch(n)
3395           {
3396           case 0: break;
3397           case 1: accepted = bothprop; break;
3398           case 2: accepted = (base_list[3] == list[3]) != same; break;
3399           case 3: accepted = !same; break;
3400
3401           case 4:  /* Left general category, right particular category */
3402           accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3403           break;
3404
3405           case 5:  /* Right general category, left particular category */
3406           accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3407           break;
3408
3409           /* This code is logically tricky. Think hard before fiddling with it.
3410           The posspropstab table has four entries per row. Each row relates to
3411           one of PCRE's special properties such as ALNUM or SPACE or WORD.
3412           Only WORD actually needs all four entries, but using repeats for the
3413           others means they can all use the same code below.
3414
3415           The first two entries in each row are Unicode general categories, and
3416           apply always, because all the characters they include are part of the
3417           PCRE character set. The third and fourth entries are a general and a
3418           particular category, respectively, that include one or more relevant
3419           characters. One or the other is used, depending on whether the check
3420           is for a general or a particular category. However, in both cases the
3421           category contains more characters than the specials that are defined
3422           for the property being tested against. Therefore, it cannot be used
3423           in a NOTPROP case.
3424
3425           Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3426           Underscore is covered by ucp_P or ucp_Po. */
3427
3428           case 6:  /* Left alphanum vs right general category */
3429           case 7:  /* Left space vs right general category */
3430           case 8:  /* Left word vs right general category */
3431           p = posspropstab[n-6];
3432           accepted = risprop && lisprop ==
3433             (list[3] != p[0] &&
3434              list[3] != p[1] &&
3435             (list[3] != p[2] || !lisprop));
3436           break;
3437
3438           case 9:   /* Right alphanum vs left general category */
3439           case 10:  /* Right space vs left general category */
3440           case 11:  /* Right word vs left general category */
3441           p = posspropstab[n-9];
3442           accepted = lisprop && risprop ==
3443             (base_list[3] != p[0] &&
3444              base_list[3] != p[1] &&
3445             (base_list[3] != p[2] || !risprop));
3446           break;
3447
3448           case 12:  /* Left alphanum vs right particular category */
3449           case 13:  /* Left space vs right particular category */
3450           case 14:  /* Left word vs right particular category */
3451           p = posspropstab[n-12];
3452           accepted = risprop && lisprop ==
3453             (catposstab[p[0]][list[3]] &&
3454              catposstab[p[1]][list[3]] &&
3455             (list[3] != p[3] || !lisprop));
3456           break;
3457
3458           case 15:  /* Right alphanum vs left particular category */
3459           case 16:  /* Right space vs left particular category */
3460           case 17:  /* Right word vs left particular category */
3461           p = posspropstab[n-15];
3462           accepted = lisprop && risprop ==
3463             (catposstab[p[0]][base_list[3]] &&
3464              catposstab[p[1]][base_list[3]] &&
3465             (base_list[3] != p[3] || !risprop));
3466           break;
3467           }
3468         }
3469       }
3470
3471     else
3472 #endif  /* SUPPORT_UCP */
3473
3474     accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3475            rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3476            autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3477
3478     if (!accepted) return FALSE;
3479
3480     if (list[1] == 0) return TRUE;
3481     /* Might be an empty repeat. */
3482     continue;
3483     }
3484
3485   /* Control reaches here only if one of the items is a small character list.
3486   All characters are checked against the other side. */
3487
3488   do
3489     {
3490     chr = *chr_ptr;
3491
3492     switch(list_ptr[0])
3493       {
3494       case OP_CHAR:
3495       ochr_ptr = list_ptr + 2;
3496       do
3497         {
3498         if (chr == *ochr_ptr) return FALSE;
3499         ochr_ptr++;
3500         }
3501       while(*ochr_ptr != NOTACHAR);
3502       break;
3503
3504       case OP_NOT:
3505       ochr_ptr = list_ptr + 2;
3506       do
3507         {
3508         if (chr == *ochr_ptr)
3509           break;
3510         ochr_ptr++;
3511         }
3512       while(*ochr_ptr != NOTACHAR);
3513       if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
3514       break;
3515
3516       /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3517       set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3518
3519       case OP_DIGIT:
3520       if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3521       break;
3522
3523       case OP_NOT_DIGIT:
3524       if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3525       break;
3526
3527       case OP_WHITESPACE:
3528       if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3529       break;
3530
3531       case OP_NOT_WHITESPACE:
3532       if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3533       break;
3534
3535       case OP_WORDCHAR:
3536       if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3537       break;
3538
3539       case OP_NOT_WORDCHAR:
3540       if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3541       break;
3542
3543       case OP_HSPACE:
3544       switch(chr)
3545         {
3546         HSPACE_CASES: return FALSE;
3547         default: break;
3548         }
3549       break;
3550
3551       case OP_NOT_HSPACE:
3552       switch(chr)
3553         {
3554         HSPACE_CASES: break;
3555         default: return FALSE;
3556         }
3557       break;
3558
3559       case OP_ANYNL:
3560       case OP_VSPACE:
3561       switch(chr)
3562         {
3563         VSPACE_CASES: return FALSE;
3564         default: break;
3565         }
3566       break;
3567
3568       case OP_NOT_VSPACE:
3569       switch(chr)
3570         {
3571         VSPACE_CASES: break;
3572         default: return FALSE;
3573         }
3574       break;
3575
3576       case OP_DOLL:
3577       case OP_EODN:
3578       switch (chr)
3579         {
3580         case CHAR_CR:
3581         case CHAR_LF:
3582         case CHAR_VT:
3583         case CHAR_FF:
3584         case CHAR_NEL:
3585 #ifndef EBCDIC
3586         case 0x2028:
3587         case 0x2029:
3588 #endif  /* Not EBCDIC */
3589         return FALSE;
3590         }
3591       break;
3592
3593       case OP_EOD:    /* Can always possessify before \z */
3594       break;
3595
3596 #ifdef SUPPORT_UCP
3597       case OP_PROP:
3598       case OP_NOTPROP:
3599       if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3600             list_ptr[0] == OP_NOTPROP))
3601         return FALSE;
3602       break;
3603 #endif
3604
3605       case OP_NCLASS:
3606       if (chr > 255) return FALSE;
3607       /* Fall through */
3608
3609       case OP_CLASS:
3610       if (chr > 255) break;
3611       class_bitset = (pcre_uint8 *)
3612         ((list_ptr == list ? code : base_end) - list_ptr[2]);
3613       if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
3614       break;
3615
3616 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3617       case OP_XCLASS:
3618       if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3619           list_ptr[2] + LINK_SIZE, utf)) return FALSE;
3620       break;
3621 #endif
3622
3623       default:
3624       return FALSE;
3625       }
3626
3627     chr_ptr++;
3628     }
3629   while(*chr_ptr != NOTACHAR);
3630
3631   /* At least one character must be matched from this opcode. */
3632
3633   if (list[1] == 0) return TRUE;
3634   }
3635
3636 /* Control never reaches here. There used to be a fail-safe return FALSE; here,
3637 but some compilers complain about an unreachable statement. */
3638
3639 }
3640
3641
3642
3643 /*************************************************
3644 *    Scan compiled regex for auto-possession     *
3645 *************************************************/
3646
3647 /* Replaces single character iterations with their possessive alternatives
3648 if appropriate. This function modifies the compiled opcode!
3649
3650 Arguments:
3651   code        points to start of the byte code
3652   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
3653   cd          static compile data
3654
3655 Returns:      nothing
3656 */
3657
     /* Walk the compiled code one item at a time. Whenever a single-item repeat
     (or a repeated character class) is found, ask compare_opcodes() whether the
     repeat can be made possessive and, if so, overwrite the repeat opcode in
     place. The walk ends at OP_END, or early if an opcode value lies outside
     the opcode table. */
3658 static void
3659 auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3660 {
3661 register pcre_uchar c;               /* Current opcode (rebased in the repeat branch) */
3662 const pcre_uchar *end;               /* Result of get_chr_property_list() */
3663 pcre_uchar *repeat_opcode;           /* Location of the repeat opcode after a class */
3664 pcre_uint32 list[8];                 /* Filled in by get_chr_property_list() */
3665 int rec_limit;                       /* Reset to 1000 before each compare_opcodes() call */
3666
3667 for (;;)
3668   {
3669   c = *code;
3670
3671   /* When a pattern with bad UTF-8 encoding is compiled with NO_UTF_CHECK,
3672   it may compile without complaining, but may get into a loop here if the code
3673   pointer points to a bad value. This is, of course a documented possibility,
3674   when NO_UTF_CHECK is set, so it isn't a bug, but we can detect this case and
3675   just give up on this optimization. */
3676
3677   if (c >= OP_TABLE_LENGTH) return;
3678
       /* Single-item repeats. The opcode is rebased using get_repeat_base() so
       that the tests below can be written in terms of the OP_STAR family only;
       *code itself still holds the original opcode. Only rebased values up to
       OP_MINUPTO yield a property list; for anything else end is NULL and the
       item is left alone. */

3679   if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3680     {
3681     c -= get_repeat_base(c) - OP_STAR;
3682     end = (c <= OP_MINUPTO) ?
3683       get_chr_property_list(code, utf, cd->fcc, list) : NULL;
         /* list[1] is TRUE for the greedy forms and FALSE for the minimizing ones. */
3684     list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3685
3686     rec_limit = 1000;
3687     if (end != NULL && compare_opcodes(end, utf, cd, list, end, &rec_limit))
3688       {
           /* The switch is on the rebased opcode, but the adjustment is added to
           the stored opcode as a difference, so it is relative to whatever
           opcode was originally at *code. Greedy and minimizing forms both map
           to the same possessive form. */
3689       switch(c)
3690         {
3691         case OP_STAR:
3692         *code += OP_POSSTAR - OP_STAR;
3693         break;
3694
3695         case OP_MINSTAR:
3696         *code += OP_POSSTAR - OP_MINSTAR;
3697         break;
3698
3699         case OP_PLUS:
3700         *code += OP_POSPLUS - OP_PLUS;
3701         break;
3702
3703         case OP_MINPLUS:
3704         *code += OP_POSPLUS - OP_MINPLUS;
3705         break;
3706
3707         case OP_QUERY:
3708         *code += OP_POSQUERY - OP_QUERY;
3709         break;
3710
3711         case OP_MINQUERY:
3712         *code += OP_POSQUERY - OP_MINQUERY;
3713         break;
3714
3715         case OP_UPTO:
3716         *code += OP_POSUPTO - OP_UPTO;
3717         break;
3718
3719         case OP_MINUPTO:
3720         *code += OP_POSUPTO - OP_MINUPTO;
3721         break;
3722         }
3723       }
         /* Reload the (possibly rewritten) stored opcode; the length handling
         below works on the real opcode, not the rebased one. */
3724     c = *code;
3725     }
3726   else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3727     {
         /* Character classes: any repeat opcode follows the class item. For
         OP_XCLASS the item length is read from the item itself; otherwise the
         item is the opcode plus 32 bytes of class data, expressed in pcre_uchar
         units. */
3728 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3729     if (c == OP_XCLASS)
3730       repeat_opcode = code + GET(code, 1);
3731     else
3732 #endif
3733       repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3734
3735     c = *repeat_opcode;
3736     if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
3737       {
3738       /* end must not be NULL. */
3739       end = get_chr_property_list(code, utf, cd->fcc, list);
3740
           /* TRUE when the repeat opcode value is even - presumably the greedy
           forms; TODO confirm against the opcode numbering. */
3741       list[1] = (c & 1) == 0;
3742
3743       rec_limit = 1000;
3744       if (compare_opcodes(end, utf, cd, list, end, &rec_limit))
3745         {
             /* Here the possessive opcode is stored directly rather than added
             as a difference. */
3746         switch (c)
3747           {
3748           case OP_CRSTAR:
3749           case OP_CRMINSTAR:
3750           *repeat_opcode = OP_CRPOSSTAR;
3751           break;
3752
3753           case OP_CRPLUS:
3754           case OP_CRMINPLUS:
3755           *repeat_opcode = OP_CRPOSPLUS;
3756           break;
3757
3758           case OP_CRQUERY:
3759           case OP_CRMINQUERY:
3760           *repeat_opcode = OP_CRPOSQUERY;
3761           break;
3762
3763           case OP_CRRANGE:
3764           case OP_CRMINRANGE:
3765           *repeat_opcode = OP_CRPOSRANGE;
3766           break;
3767           }
3768         }
3769       }
         /* Restore c to the class opcode so that the class item itself is
         skipped below; the repeat opcode is visited on a later iteration. */
3770     c = *code;
3771     }
3772
       /* Advance past the current item. Items whose size is not fully given by
       the fixed-length table have the extra amount added first. */

3773   switch(c)
3774     {
3775     case OP_END:
3776     return;
3777
3778     case OP_TYPESTAR:
3779     case OP_TYPEMINSTAR:
3780     case OP_TYPEPLUS:
3781     case OP_TYPEMINPLUS:
3782     case OP_TYPEQUERY:
3783     case OP_TYPEMINQUERY:
3784     case OP_TYPEPOSSTAR:
3785     case OP_TYPEPOSPLUS:
3786     case OP_TYPEPOSQUERY:
         /* A repeated OP_PROP/OP_NOTPROP occupies two more code units. */
3787     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
3788     break;
3789
3790     case OP_TYPEUPTO:
3791     case OP_TYPEMINUPTO:
3792     case OP_TYPEEXACT:
3793     case OP_TYPEPOSUPTO:
         /* As above, but the type follows an IMM2_SIZE count. */
3794     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
3795       code += 2;
3796     break;
3797
3798 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3799     case OP_XCLASS:
3800     code += GET(code, 1);
3801     break;
3802 #endif
3803
3804     case OP_MARK:
3805     case OP_PRUNE_ARG:
3806     case OP_SKIP_ARG:
3807     case OP_THEN_ARG:
         /* The code unit after the opcode is added as extra length. */
3808     code += code[1];
3809     break;
3810     }
3811
3812   /* Add in the fixed length from the table */
3813
3814   code += PRIV(OP_lengths)[c];
3815
3816   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3817   a multi-byte character. The length in the table is a minimum, so we have to
3818   arrange to skip the extra bytes. */
3819
3820 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3821   if (utf) switch(c)
3822     {
3823     case OP_CHAR:
3824     case OP_CHARI:
3825     case OP_NOT:
3826     case OP_NOTI:
3827     case OP_STAR:
3828     case OP_MINSTAR:
3829     case OP_PLUS:
3830     case OP_MINPLUS:
3831     case OP_QUERY:
3832     case OP_MINQUERY:
3833     case OP_UPTO:
3834     case OP_MINUPTO:
3835     case OP_EXACT:
3836     case OP_POSSTAR:
3837     case OP_POSPLUS:
3838     case OP_POSQUERY:
3839     case OP_POSUPTO:
3840     case OP_STARI:
3841     case OP_MINSTARI:
3842     case OP_PLUSI:
3843     case OP_MINPLUSI:
3844     case OP_QUERYI:
3845     case OP_MINQUERYI:
3846     case OP_UPTOI:
3847     case OP_MINUPTOI:
3848     case OP_EXACTI:
3849     case OP_POSSTARI:
3850     case OP_POSPLUSI:
3851     case OP_POSQUERYI:
3852     case OP_POSUPTOI:
3853     case OP_NOTSTAR:
3854     case OP_NOTMINSTAR:
3855     case OP_NOTPLUS:
3856     case OP_NOTMINPLUS:
3857     case OP_NOTQUERY:
3858     case OP_NOTMINQUERY:
3859     case OP_NOTUPTO:
3860     case OP_NOTMINUPTO:
3861     case OP_NOTEXACT:
3862     case OP_NOTPOSSTAR:
3863     case OP_NOTPOSPLUS:
3864     case OP_NOTPOSQUERY:
3865     case OP_NOTPOSUPTO:
3866     case OP_NOTSTARI:
3867     case OP_NOTMINSTARI:
3868     case OP_NOTPLUSI:
3869     case OP_NOTMINPLUSI:
3870     case OP_NOTQUERYI:
3871     case OP_NOTMINQUERYI:
3872     case OP_NOTUPTOI:
3873     case OP_NOTMINUPTOI:
3874     case OP_NOTEXACTI:
3875     case OP_NOTPOSSTARI:
3876     case OP_NOTPOSPLUSI:
3877     case OP_NOTPOSQUERYI:
3878     case OP_NOTPOSUPTOI:
         /* code has already been advanced by the table length, so code[-1] is
         the last code unit counted so far; it is the one tested for extra
         units. */
3879     if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3880     break;
3881     }
3882 #else
3883   (void)(utf);  /* Keep compiler happy by referencing function argument */
3884 #endif
3885   }
3886 }
3887
3888
3889
3890 /*************************************************
3891 *           Check for POSIX class syntax         *
3892 *************************************************/
3893
3894 /* This function is called when the sequence "[:" or "[." or "[=" is
3895 encountered in a character class. It checks whether this is followed by a
3896 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3897 reach an unescaped ']' without the special preceding character, return FALSE.
3898
3899 Originally, this function only recognized a sequence of letters between the
3900 terminators, but it seems that Perl recognizes any sequence of characters,
3901 though of course unknown POSIX names are subsequently rejected. Perl gives an
3902 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3903 didn't consider this to be a POSIX class. Likewise for [:1234:].
3904
3905 The problem in trying to be exactly like Perl is in the handling of escapes. We
3906 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3907 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3908 below handles the special cases \\ and \], but does not try to do any other
3909 escape processing. This makes it different from Perl for cases such as
3910 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
3911 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
3912 when Perl does, I think.
3913
3914 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3915 It seems that the appearance of a nested POSIX class supersedes an apparent
3916 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3917 a digit.
3918
3919 In Perl, unescaped square brackets may also appear as part of class names. For
3920 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3921 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3922 seem right at all. PCRE does not allow closing square brackets in POSIX class
3923 names.
3924
3925 Arguments:
3926   ptr      pointer to the initial [
3927   endptr   where to return the end pointer
3928
3929 Returns:   TRUE or FALSE
3930 */
3931
3932 static BOOL
3933 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3934 {
3935 pcre_uchar terminator;          /* Don't combine these lines; the Solaris cc */
3936 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
3937 for (++ptr; *ptr != CHAR_NULL; ptr++)
3938   {
3939   if (*ptr == CHAR_BACKSLASH &&
3940       (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET ||
3941        ptr[1] == CHAR_BACKSLASH))
3942     ptr++;
3943   else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
3944             *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3945   else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3946     {
3947     *endptr = ptr;
3948     return TRUE;
3949     }
3950   }
3951 return FALSE;
3952 }
3953
3954
3955
3956
3957 /*************************************************
3958 *          Check POSIX class name                *
3959 *************************************************/
3960
3961 /* This function is called to check the name given in a POSIX-style class entry
3962 such as [:alnum:].
3963
3964 Arguments:
3965   ptr        points to the first letter
3966   len        the length of the name
3967
3968 Returns:     a value representing the name, or -1 if unknown
3969 */
3970
3971 static int
3972 check_posix_name(const pcre_uchar *ptr, int len)
3973 {
3974 const char *pn = posix_names;
3975 register int yield = 0;
3976 while (posix_name_lengths[yield] != 0)
3977   {
3978   if (len == posix_name_lengths[yield] &&
3979     STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3980   pn += posix_name_lengths[yield] + 1;
3981   yield++;
3982   }
3983 return -1;
3984 }
3985
3986
3987 /*************************************************
3988 *    Adjust OP_RECURSE items in repeated group   *
3989 *************************************************/
3990
3991 /* OP_RECURSE items contain an offset from the start of the regex to the group
3992 that is referenced. This means that groups can be replicated for fixed
3993 repetition simply by copying (because the recursion is allowed to refer to
3994 earlier groups that are outside the current group). However, when a group is
3995 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3996 inserted before it, after it has been compiled. This means that any OP_RECURSE
3997 items within it that refer to the group itself or any contained groups have to
3998 have their offsets adjusted. That is one of the jobs of this function. Before it
3999 is called, the partially compiled regex must be temporarily terminated with
4000 OP_END.
4001
4002 This function has been extended to cope with forward references for recursions
4003 and subroutine calls. It must check the list of such references for the
4004 group we are dealing with. If it finds that one of the recursions in the
4005 current group is on this list, it does not adjust the value in the reference
4006 (which is a group number). After the group has been scanned, all the offsets in
4007 the forward reference list for the group are adjusted.
4008
4009 Arguments:
4010   group      points to the start of the group
4011   adjust     the amount by which the group is to be moved
4012   utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode
4013   cd         contains pointers to tables etc.
4014   save_hwm_offset   the hwm forward reference offset at the start of the group
4015
4016 Returns:     nothing
4017 */
4018
4019 static void
4020 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
4021   size_t save_hwm_offset)
4022 {
4023 int offset;
4024 pcre_uchar *hc;
4025 pcre_uchar *ptr = group;
4026
4027 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
4028   {
4029   for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4030        hc += LINK_SIZE)
4031     {
4032     offset = (int)GET(hc, 0);
4033     if (cd->start_code + offset == ptr + 1) break;
4034     }
4035
4036   /* If we have not found this recursion on the forward reference list, adjust
4037   the recursion's offset if it's after the start of this group. */
4038
4039   if (hc >= cd->hwm)
4040     {
4041     offset = (int)GET(ptr, 1);
4042     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
4043     }
4044
4045   ptr += 1 + LINK_SIZE;
4046   }
4047
4048 /* Now adjust all forward reference offsets for the group. */
4049
4050 for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4051      hc += LINK_SIZE)
4052   {
4053   offset = (int)GET(hc, 0);
4054   PUT(hc, 0, offset + adjust);
4055   }
4056 }
4057
4058
4059
4060 /*************************************************
4061 *        Insert an automatic callout point       *
4062 *************************************************/
4063
4064 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
4065 callout points before each pattern item.
4066
4067 Arguments:
4068   code           current code pointer
4069   ptr            current pattern pointer
4070   cd             pointers to tables etc
4071
4072 Returns:         new code pointer
4073 */
4074
4075 static pcre_uchar *
4076 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
4077 {
4078 *code++ = OP_CALLOUT;
4079 *code++ = 255;
4080 PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
4081 PUT(code, LINK_SIZE, 0);                       /* Default length */
4082 return code + 2 * LINK_SIZE;
4083 }
4084
4085
4086
4087 /*************************************************
4088 *         Complete a callout item                *
4089 *************************************************/
4090
4091 /* A callout item contains the length of the next item in the pattern, which
4092 we can't fill in till after we have reached the relevant point. This is used
4093 for both automatic and manual callouts.
4094
4095 Arguments:
4096   previous_callout   points to previous callout item
4097   ptr                current pattern pointer
4098   cd                 pointers to tables etc
4099
4100 Returns:             nothing
4101 */
4102
4103 static void
4104 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
4105 {
4106 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
4107 PUT(previous_callout, 2 + LINK_SIZE, length);
4108 }
4109
4110
4111
4112 #ifdef SUPPORT_UCP
4113 /*************************************************
4114 *           Get othercase range                  *
4115 *************************************************/
4116
4117 /* This function is passed the start and end of a class range, in UTF-8 mode
4118 with UCP support. It searches up the characters, looking for ranges of
4119 characters in the "other" case. Each call returns the next one, updating the
4120 start address. A character with multiple other cases is returned on its own
4121 with a special return value.
4122
4123 Arguments:
4124   cptr        points to starting character value; updated
4125   d           end value
4126   ocptr       where to put start of othercase range
4127   odptr       where to put end of othercase range
4128
4129 Yield:        -1 when no more
4130                0 when a range is returned
4131               >0 the CASESET offset for char with multiple other cases
4132                 in this case, ocptr contains the original
4133 */
4134
4135 static int
4136 get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
4137   pcre_uint32 *odptr)
4138 {
4139 pcre_uint32 c, othercase, next;
4140 unsigned int co;
4141
4142 /* Find the first character that has an other case. If it has multiple other
4143 cases, return its case offset value. */
4144
4145 for (c = *cptr; c <= d; c++)
4146   {
4147   if ((co = UCD_CASESET(c)) != 0)
4148     {
4149     *ocptr = c++;   /* Character that has the set */
4150     *cptr = c;      /* Rest of input range */
4151     return (int)co;
4152     }
4153   if ((othercase = UCD_OTHERCASE(c)) != c) break;
4154   }
4155
4156 if (c > d) return -1;  /* Reached end of range */
4157
4158 /* Found a character that has a single other case. Search for the end of the
4159 range, which is either the end of the input range, or a character that has zero
4160 or more than one other cases. */
4161
4162 *ocptr = othercase;
4163 next = othercase + 1;
4164
4165 for (++c; c <= d; c++)
4166   {
4167   if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
4168   next++;
4169   }
4170
4171 *odptr = next - 1;     /* End of othercase range */
4172 *cptr = c;             /* Rest of input range */
4173 return 0;
4174 }
4175 #endif  /* SUPPORT_UCP */
4176
4177
4178
4179 /*************************************************
4180 *        Add a character or range to a class     *
4181 *************************************************/
4182
4183 /* This function packages up the logic of adding a character or range of
4184 characters to a class. The character values in the arguments will be within the
4185 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4186 mutually recursive with the function immediately below.
4187
4188 Arguments:
4189   classbits     the bit map for characters < 256
4190   uchardptr     points to the pointer for extra data
4191   options       the options word
4192   cd            contains pointers to tables etc.
4193   start         start of range character
4194   end           end of range character
4195
4196 Returns:        the number of < 256 characters added
4197                 the pointer to extra data is updated
4198 */
4199
     /* Add the characters start..end to a class: values below 256 are set in
     the classbits bit map, and (when the build supports it) values of 256 and
     above are appended as XCL_SINGLE / XCL_RANGE items at *uchardptr. With
     PCRE_CASELESS set, other-case characters are added as well. The return
     value counts SETBIT operations, including those done by nested calls. */
4200 static int
4201 add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4202   compile_data *cd, pcre_uint32 start, pcre_uint32 end)
4203 {
4204 pcre_uint32 c;
     /* Upper bound for the bit map loops: end, capped at 0xff. */
4205 pcre_uint32 classbits_end = (end <= 0xff ? end : 0xff);
4206 int n8 = 0;
4207
4208 /* If caseless matching is required, scan the range and process alternate
4209 cases. In Unicode, there are 8-bit characters that have alternate cases that
4210 are greater than 255 and vice-versa. Sometimes we can just extend the original
4211 range. */
4212
4213 if ((options & PCRE_CASELESS) != 0)
4214   {
4215 #ifdef SUPPORT_UCP
4216   if ((options & PCRE_UTF8) != 0)
4217     {
4218     int rc;
4219     pcre_uint32 oc, od;
4220
4221     options &= ~PCRE_CASELESS;   /* Remove for recursive calls */
4222     c = start;
4223
         /* Each call returns the next other-case range (rc == 0, in oc..od), or
         a single character with a caseless set (rc > 0, character in oc), or a
         negative value when the input range is exhausted. */
4224     while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
4225       {
4226       /* Handle a single character that has more than one other case. */
4227
4228       if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
4229         PRIV(ucd_caseless_sets) + rc, oc);
4230
4231       /* Do nothing if the other case range is within the original range. */
4232
4233       else if (oc >= start && od <= end) continue;
4234
4235       /* Extend the original range if there is overlap, noting that if oc < c, we
4236       can't have od > end because a subrange is always shorter than the basic
4237       range. Otherwise, use a recursive call to add the additional range. */
4238
4239       else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4240       else if (od > end && oc <= end + 1)
4241         {
4242         end = od;       /* Extend upwards */
             /* Keep the bit map bound in step with the enlarged range. */
4243         if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
4244         }
4245       else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
4246       }
4247     }
4248   else
4249 #endif  /* SUPPORT_UCP */
4250
4251   /* Not UTF-mode, or no UCP */
4252
       /* With SUPPORT_UCP this loop is the body of the else above; without it,
       the loop runs unconditionally inside the caseless block. For each
       character below 256 in the range it sets the bit for its cd->fcc[]
       entry (presumably the flipped-case character - confirm against the
       table definition). */
4253   for (c = start; c <= classbits_end; c++)
4254     {
4255     SETBIT(classbits, cd->fcc[c]);
4256     n8++;
4257     }
4258   }
4259
4260 /* Now handle the original range. Adjust the final value according to the bit
4261 length - this means that the same lists of (e.g.) horizontal spaces can be used
4262 in all cases. */
4263
4264 #if defined COMPILE_PCRE8
4265 #ifdef SUPPORT_UTF
4266   if ((options & PCRE_UTF8) == 0)
4267 #endif
4268   if (end > 0xff) end = 0xff;
4269
4270 #elif defined COMPILE_PCRE16
4271 #ifdef SUPPORT_UTF
4272   if ((options & PCRE_UTF16) == 0)
4273 #endif
4274   if (end > 0xffff) end = 0xffff;
4275
4276 #endif /* COMPILE_PCRE[8|16] */
4277
4278 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
4279
4280 for (c = start; c <= classbits_end; c++)
4281   {
4282   /* Regardless of start, c will always be <= 255. */
4283   SETBIT(classbits, c);
4284   n8++;
4285   }
4286
4287 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
     /* Whatever lay below 256 has been dealt with above, so the extra data
     only needs the part of the range from 256 upwards. */
4288 if (start <= 0xff) start = 0xff + 1;
4289
4290 if (end >= start)
4291   {
4292   pcre_uchar *uchardata = *uchardptr;
4293 #ifdef SUPPORT_UTF
4294   if ((options & PCRE_UTF8) != 0)  /* All UTFs use the same flag bit */
4295     {
         /* In UTF mode the character values are written via PRIV(ord2utf), whose
         return value is added to the output pointer. */
4296     if (start < end)
4297       {
4298       *uchardata++ = XCL_RANGE;
4299       uchardata += PRIV(ord2utf)(start, uchardata);
4300       uchardata += PRIV(ord2utf)(end, uchardata);
4301       }
4302     else if (start == end)
4303       {
4304       *uchardata++ = XCL_SINGLE;
4305       uchardata += PRIV(ord2utf)(start, uchardata);
4306       }
4307     }
4308   else
4309 #endif  /* SUPPORT_UTF */
4310
4311   /* Without UTF support, character values are constrained by the bit length,
4312   and can only be > 256 for 16-bit and 32-bit libraries. */
4313
4314 #ifdef COMPILE_PCRE8
         /* Empty block: gives the else above a body in the 8-bit library. */
4315     {}
4316 #else
4317   if (start < end)
4318     {
4319     *uchardata++ = XCL_RANGE;
4320     *uchardata++ = start;
4321     *uchardata++ = end;
4322     }
4323   else if (start == end)
4324     {
4325     *uchardata++ = XCL_SINGLE;
4326     *uchardata++ = start;
4327     }
4328 #endif
4329
4330   *uchardptr = uchardata;   /* Update extra data pointer */
4331   }
4332 #endif /* SUPPORT_UTF || !COMPILE_PCRE8 */
4333
4334 return n8;    /* Number of 8-bit characters */
4335 }
4336
4337
4338
4339
4340 /*************************************************
4341 *        Add a list of characters to a class     *
4342 *************************************************/
4343
4344 /* This function is used for adding a list of case-equivalent characters to a
4345 class, and also for adding a list of horizontal or vertical whitespace. If the
4346 list is in order (which it should be), ranges of characters are detected and
4347 handled appropriately. This function is mutually recursive with the function
4348 above.
4349
4350 Arguments:
4351   classbits     the bit map for characters < 256
4352   uchardptr     points to the pointer for extra data
4353   options       the options word
4354   cd            contains pointers to tables etc.
4355   p             points to row of 32-bit values, terminated by NOTACHAR
4356   except        character to omit; this is used when adding lists of
4357                   case-equivalent characters to avoid including the one we
4358                   already know about
4359
4360 Returns:        the number of < 256 characters added
4361                 the pointer to extra data is updated
4362 */
4363
4364 static int
4365 add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4366   compile_data *cd, const pcre_uint32 *p, unsigned int except)
4367 {
4368 int n8 = 0;
4369 while (p[0] < NOTACHAR)
4370   {
4371   int n = 0;
4372   if (p[0] != except)
4373     {
4374     while(p[n+1] == p[0] + n + 1) n++;
4375     n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
4376     }
4377   p += n + 1;
4378   }
4379 return n8;
4380 }
4381
4382
4383
4384 /*************************************************
4385 *    Add characters not in a list to a class     *
4386 *************************************************/
4387
4388 /* This function is used for adding the complement of a list of horizontal or
4389 vertical whitespace to a class. The list must be in order.
4390
4391 Arguments:
4392   classbits     the bit map for characters < 256
4393   uchardptr     points to the pointer for extra data
4394   options       the options word
4395   cd            contains pointers to tables etc.
4396   p             points to row of 32-bit values, terminated by NOTACHAR
4397
4398 Returns:        the number of < 256 characters added
4399                 the pointer to extra data is updated
4400 */
4401
4402 static int
4403 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
4404   int options, compile_data *cd, const pcre_uint32 *p)
4405 {
4406 BOOL utf = (options & PCRE_UTF8) != 0;
4407 int n8 = 0;
4408 if (p[0] > 0)
4409   n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
4410 while (p[0] < NOTACHAR)
4411   {
4412   while (p[1] == p[0] + 1) p++;
4413   n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
4414     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
4415   p++;
4416   }
4417 return n8;
4418 }
4419
4420
4421
4422 /*************************************************
4423 *           Compile one branch                   *
4424 *************************************************/
4425
4426 /* Scan the pattern, compiling it into a vector. If the options are
4427 changed during the branch, the pointer is used to change the external options
4428 bits. This function is used during the pre-compile phase when we are trying
4429 to find out the amount of memory needed, as well as during the real compile
4430 phase. The value of lengthptr distinguishes the two phases.
4431
4432 Arguments:
4433   optionsptr        pointer to the option bits
4434   codeptr           points to the pointer to the current code point
4435   ptrptr            points to the current pattern pointer
4436   errorcodeptr      points to error code variable
4437   firstcharptr      place to put the first required character
4438   firstcharflagsptr place to put the first character flags, or a negative number
4439   reqcharptr        place to put the last required character
4440   reqcharflagsptr   place to put the last required character flags, or a negative number
4441   bcptr             points to current branch chain
4442   cond_depth        conditional nesting depth
4443   cd                contains pointers to tables etc.
4444   lengthptr         NULL during the real compile phase
4445                     points to length accumulator during pre-compile phase
4446
4447 Returns:            TRUE on success
4448                     FALSE, with *errorcodeptr set non-zero on error
4449 */
4450
4451 static BOOL
4452 compile_branch(int *optionsptr, pcre_uchar **codeptr,
4453   const pcre_uchar **ptrptr, int *errorcodeptr,
4454   pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
4455   pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
4456   branch_chain *bcptr, int cond_depth,
4457   compile_data *cd, int *lengthptr)
4458 {
4459 int repeat_type, op_type;
4460 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
4461 int bravalue = 0;
4462 int greedy_default, greedy_non_default;
4463 pcre_uint32 firstchar, reqchar;
4464 pcre_int32 firstcharflags, reqcharflags;
4465 pcre_uint32 zeroreqchar, zerofirstchar;
4466 pcre_int32 zeroreqcharflags, zerofirstcharflags;
4467 pcre_int32 req_caseopt, reqvary, tempreqvary;
4468 int options = *optionsptr;               /* May change dynamically */
4469 int after_manual_callout = 0;
4470 int length_prevgroup = 0;
4471 register pcre_uint32 c;
4472 int escape;
4473 register pcre_uchar *code = *codeptr;
4474 pcre_uchar *last_code = code;
4475 pcre_uchar *orig_code = code;
4476 pcre_uchar *tempcode;
4477 BOOL inescq = FALSE;
4478 BOOL groupsetfirstchar = FALSE;
4479 const pcre_uchar *ptr = *ptrptr;
4480 const pcre_uchar *tempptr;
4481 const pcre_uchar *nestptr = NULL;
4482 pcre_uchar *previous = NULL;
4483 pcre_uchar *previous_callout = NULL;
4484 size_t item_hwm_offset = 0;
4485 pcre_uint8 classbits[32];
4486
4487 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
4488 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4489 dynamically as we process the pattern. */
4490
4491 #ifdef SUPPORT_UTF
4492 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4493 BOOL utf = (options & PCRE_UTF8) != 0;
4494 #ifndef COMPILE_PCRE32
4495 pcre_uchar utf_chars[6];
4496 #endif
4497 #else
4498 BOOL utf = FALSE;
4499 #endif
4500
4501 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4502 class_uchardata always so that it can be passed to add_to_class() always,
4503 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4504 alternative calls for the different cases. */
4505
4506 pcre_uchar *class_uchardata;
4507 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4508 BOOL xclass;
4509 pcre_uchar *class_uchardata_base;
4510 #endif
4511
4512 #ifdef PCRE_DEBUG
4513 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4514 #endif
4515
4516 /* Set up the default and non-default settings for greediness */
4517
4518 greedy_default = ((options & PCRE_UNGREEDY) != 0);
4519 greedy_non_default = greedy_default ^ 1;
4520
4521 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
4522 matching encountered yet". It gets changed to REQ_NONE if we hit something that
4523 matches a non-fixed char first char; reqchar just remains unset if we never
4524 find one.
4525
4526 When we hit a repeat whose minimum is zero, we may have to adjust these values
4527 to take the zero repeat into account. This is implemented by setting them to
4528 zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
4529 item types that can be repeated set these backoff variables appropriately. */
4530
4531 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4532 firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4533
4534 /* The variable req_caseopt contains either the REQ_CASELESS value
4535 or zero, according to the current setting of the caseless flag. The
4536 REQ_CASELESS leaves the lower 28 bits empty. It is added into the
4537 firstchar or reqchar variables to record the case status of the
4538 value. This is used only for ASCII characters. */
4539
4540 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
4541
4542 /* Switch on next character until the end of the branch */
4543
4544 for (;; ptr++)
4545   {
4546   BOOL negate_class;
4547   BOOL should_flip_negation;
4548   BOOL possessive_quantifier;
4549   BOOL is_quantifier;
4550   BOOL is_recurse;
4551   BOOL reset_bracount;
4552   int class_has_8bitchar;
4553   int class_one_char;
4554 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4555   BOOL xclass_has_prop;
4556 #endif
4557   int newoptions;
4558   int recno;
4559   int refsign;
4560   int skipbytes;
4561   pcre_uint32 subreqchar, subfirstchar;
4562   pcre_int32 subreqcharflags, subfirstcharflags;
4563   int terminator;
4564   unsigned int mclength;
4565   unsigned int tempbracount;
4566   pcre_uint32 ec;
4567   pcre_uchar mcbuffer[8];
4568
4569   /* Get next character in the pattern */
4570
4571   c = *ptr;
4572
4573   /* If we are at the end of a nested substitution, revert to the outer level
4574   string. Nesting only happens one level deep. */
4575
4576   if (c == CHAR_NULL && nestptr != NULL)
4577     {
4578     ptr = nestptr;
4579     nestptr = NULL;
4580     c = *ptr;
4581     }
4582
4583   /* If we are in the pre-compile phase, accumulate the length used for the
4584   previous cycle of this loop. */
4585
4586   if (lengthptr != NULL)
4587     {
4588 #ifdef PCRE_DEBUG
4589     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
4590 #endif
4591     if (code > cd->start_workspace + cd->workspace_size -
4592         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
4593       {
4594       *errorcodeptr = ERR52;
4595       goto FAILED;
4596       }
4597
4598     /* There is at least one situation where code goes backwards: this is the
4599     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4600     the class is simply eliminated. However, it is created first, so we have to
4601     allow memory for it. Therefore, don't ever reduce the length at this point.
4602     */
4603
4604     if (code < last_code) code = last_code;
4605
4606     /* Paranoid check for integer overflow */
4607
4608     if (OFLOW_MAX - *lengthptr < code - last_code)
4609       {
4610       *errorcodeptr = ERR20;
4611       goto FAILED;
4612       }
4613
4614     *lengthptr += (int)(code - last_code);
4615     DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4616       (int)(code - last_code), c, c));
4617
4618     /* If "previous" is set and it is not at the start of the work space, move
4619     it back to there, in order to avoid filling up the work space. Otherwise,
4620     if "previous" is NULL, reset the current code pointer to the start. */
4621
4622     if (previous != NULL)
4623       {
4624       if (previous > orig_code)
4625         {
4626         memmove(orig_code, previous, IN_UCHARS(code - previous));
4627         code -= previous - orig_code;
4628         previous = orig_code;
4629         }
4630       }
4631     else code = orig_code;
4632
4633     /* Remember where this code item starts so we can pick up the length
4634     next time round. */
4635
4636     last_code = code;
4637     }
4638
4639   /* In the real compile phase, just check the workspace used by the forward
4640   reference list. */
4641
4642   else if (cd->hwm > cd->start_workspace + cd->workspace_size)
4643     {
4644     *errorcodeptr = ERR52;
4645     goto FAILED;
4646     }
4647
4648   /* If in \Q...\E, check for the end; if not, we have a literal */
4649
4650   if (inescq && c != CHAR_NULL)
4651     {
4652     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4653       {
4654       inescq = FALSE;
4655       ptr++;
4656       continue;
4657       }
4658     else
4659       {
4660       if (previous_callout != NULL)
4661         {
4662         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
4663           complete_callout(previous_callout, ptr, cd);
4664         previous_callout = NULL;
4665         }
4666       if ((options & PCRE_AUTO_CALLOUT) != 0)
4667         {
4668         previous_callout = code;
4669         code = auto_callout(code, ptr, cd);
4670         }
4671       goto NORMAL_CHAR;
4672       }
4673     /* Control does not reach here. */
4674     }
4675
4676   /* In extended mode, skip white space and comments. We need a loop in order
4677   to check for more white space and more comments after a comment. */
4678
4679   if ((options & PCRE_EXTENDED) != 0)
4680     {
4681     for (;;)
4682       {
4683       while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
4684       if (c != CHAR_NUMBER_SIGN) break;
4685       ptr++;
4686       while (*ptr != CHAR_NULL)
4687         {
4688         if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
4689           {                          /* IS_NEWLINE sets cd->nllen. */
4690           ptr += cd->nllen;
4691           break;
4692           }
4693         ptr++;
4694 #ifdef SUPPORT_UTF
4695         if (utf) FORWARDCHAR(ptr);
4696 #endif
4697         }
4698       c = *ptr;     /* Either NULL or the char after a newline */
4699       }
4700     }
4701
4702   /* See if the next thing is a quantifier. */
4703
4704   is_quantifier =
4705     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4706     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4707
4708   /* Fill in length of a previous callout, except when the next thing is a
4709   quantifier or when processing a property substitution string in UCP mode. */
4710
4711   if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4712        after_manual_callout-- <= 0)
4713     {
4714     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
4715       complete_callout(previous_callout, ptr, cd);
4716     previous_callout = NULL;
4717     }
4718
4719   /* Create auto callout, except for quantifiers, or while processing property
4720   strings that are substituted for \w etc in UCP mode. */
4721
4722   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4723     {
4724     previous_callout = code;
4725     code = auto_callout(code, ptr, cd);
4726     }
4727
4728   /* Process the next pattern item. */
4729
4730   switch(c)
4731     {
4732     /* ===================================================================*/
4733     case CHAR_NULL:                /* The branch terminates at string end */
4734     case CHAR_VERTICAL_LINE:       /* or | or ) */
4735     case CHAR_RIGHT_PARENTHESIS:
4736     *firstcharptr = firstchar;
4737     *firstcharflagsptr = firstcharflags;
4738     *reqcharptr = reqchar;
4739     *reqcharflagsptr = reqcharflags;
4740     *codeptr = code;
4741     *ptrptr = ptr;
4742     if (lengthptr != NULL)
4743       {
4744       if (OFLOW_MAX - *lengthptr < code - last_code)
4745         {
4746         *errorcodeptr = ERR20;
4747         goto FAILED;
4748         }
4749       *lengthptr += (int)(code - last_code);   /* To include callout length */
4750       DPRINTF((">> end branch\n"));
4751       }
4752     return TRUE;
4753
4754
4755     /* ===================================================================*/
4756     /* Handle single-character metacharacters. In multiline mode, ^ disables
4757     the setting of any following char as a first character. */
4758
4759     case CHAR_CIRCUMFLEX_ACCENT:
4760     previous = NULL;
4761     if ((options & PCRE_MULTILINE) != 0)
4762       {
4763       if (firstcharflags == REQ_UNSET)
4764         zerofirstcharflags = firstcharflags = REQ_NONE;
4765       *code++ = OP_CIRCM;
4766       }
4767     else *code++ = OP_CIRC;
4768     break;
4769
4770     case CHAR_DOLLAR_SIGN:
4771     previous = NULL;
4772     *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4773     break;
4774
4775     /* There can never be a first char if '.' is first, whatever happens about
4776     repeats. The value of reqchar doesn't change either. */
4777
4778     case CHAR_DOT:
4779     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4780     zerofirstchar = firstchar;
4781     zerofirstcharflags = firstcharflags;
4782     zeroreqchar = reqchar;
4783     zeroreqcharflags = reqcharflags;
4784     previous = code;
4785     item_hwm_offset = cd->hwm - cd->start_workspace;
4786     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4787     break;
4788
4789
4790     /* ===================================================================*/
4791     /* Character classes. If the included characters are all < 256, we build a
4792     32-byte bitmap of the permitted characters, except in the special case
4793     where there is only one such character. For negated classes, we build the
4794     map as usual, then invert it at the end. However, we use a different opcode
4795     so that data characters > 255 can be handled correctly.
4796
4797     If the class contains characters outside the 0-255 range, a different
4798     opcode is compiled. It may optionally have a bit map for characters < 256,
4799     but those above are explicitly listed afterwards. A flag byte tells
4800     whether the bitmap is present, and whether this is a negated class or not.
4801
4802     In JavaScript compatibility mode, an isolated ']' causes an error. In
4803     default (Perl) mode, it is treated as a data character. */
4804
4805     case CHAR_RIGHT_SQUARE_BRACKET:
4806     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4807       {
4808       *errorcodeptr = ERR64;
4809       goto FAILED;
4810       }
4811     goto NORMAL_CHAR;
4812
4813     /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
4814     used for "start of word" and "end of word". As these are otherwise illegal
4815     sequences, we don't break anything by recognizing them. They are replaced
4816     by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
4817     erroneous and are handled by the normal code below. */
4818
4819     case CHAR_LEFT_SQUARE_BRACKET:
4820     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
4821       {
4822       nestptr = ptr + 7;
4823       ptr = sub_start_of_word - 1;
4824       continue;
4825       }
4826
4827     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
4828       {
4829       nestptr = ptr + 7;
4830       ptr = sub_end_of_word - 1;
4831       continue;
4832       }
4833
4834     /* Handle a real character class. */
4835
4836     previous = code;
4837     item_hwm_offset = cd->hwm - cd->start_workspace;
4838
4839     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4840     they are encountered at the top level, so we'll do that too. */
4841
4842     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4843          ptr[1] == CHAR_EQUALS_SIGN) &&
4844         check_posix_syntax(ptr, &tempptr))
4845       {
4846       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4847       goto FAILED;
4848       }
4849
4850     /* If the first character is '^', set the negation flag and skip it. Also,
4851     if the first few characters (either before or after ^) are \Q\E or \E we
4852     skip them too. This makes for compatibility with Perl. */
4853
4854     negate_class = FALSE;
4855     for (;;)
4856       {
4857       c = *(++ptr);
4858       if (c == CHAR_BACKSLASH)
4859         {
4860         if (ptr[1] == CHAR_E)
4861           ptr++;
4862         else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4863           ptr += 3;
4864         else
4865           break;
4866         }
4867       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4868         negate_class = TRUE;
4869       else break;
4870       }
4871
4872     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4873     an initial ']' is taken as a data character -- the code below handles
4874     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4875     [^] must match any character, so generate OP_ALLANY. */
4876
4877     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4878         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4879       {
4880       *code++ = negate_class? OP_ALLANY : OP_FAIL;
4881       if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4882       zerofirstchar = firstchar;
4883       zerofirstcharflags = firstcharflags;
4884       break;
4885       }
4886
4887     /* If a class contains a negative special such as \S, we need to flip the
4888     negation flag at the end, so that support for characters > 255 works
4889     correctly (they are all included in the class). */
4890
4891     should_flip_negation = FALSE;
4892
4893     /* Extended class (xclass) will be used when characters > 255
4894     might match. */
4895
4896 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4897     xclass = FALSE;
4898     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
4899     class_uchardata_base = class_uchardata;   /* Save the start */
4900 #endif
4901
4902     /* For optimization purposes, we track some properties of the class:
4903     class_has_8bitchar will be non-zero if the class contains at least one <
4904     256 character; class_one_char will be 1 if the class contains just one
4905     character; xclass_has_prop will be TRUE if unicode property checks
4906     are present in the class. */
4907
4908     class_has_8bitchar = 0;
4909     class_one_char = 0;
4910 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4911     xclass_has_prop = FALSE;
4912 #endif
4913
4914     /* Initialize the 32-char bit map to all zeros. We build the map in a
4915     temporary bit of memory, in case the class contains fewer than two
4916     8-bit characters because in that case the compiled code doesn't use the bit
4917     map. */
4918
4919     memset(classbits, 0, 32 * sizeof(pcre_uint8));
4920
4921     /* Process characters until ] is reached. By writing this as a "do" it
4922     means that an initial ] is taken as a data character. At the start of the
4923     loop, c contains the first byte of the character. */
4924
4925     if (c != CHAR_NULL) do
4926       {
4927       const pcre_uchar *oldptr;
4928
4929 #ifdef SUPPORT_UTF
4930       if (utf && HAS_EXTRALEN(c))
4931         {                           /* Braces are required because the */
4932         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
4933         }
4934 #endif
4935
4936 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4937       /* In the pre-compile phase, accumulate the length of any extra
4938       data and reset the pointer. This is so that very large classes that
4939       contain a zillion > 255 characters no longer overwrite the work space
4940       (which is on the stack). We have to remember that there was XCLASS data,
4941       however. */
4942
4943       if (class_uchardata > class_uchardata_base) xclass = TRUE;
4944
4945       if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4946         {
4947         *lengthptr += (int)(class_uchardata - class_uchardata_base);
4948         class_uchardata = class_uchardata_base;
4949         }
4950 #endif
4951
4952       /* Inside \Q...\E everything is literal except \E */
4953
4954       if (inescq)
4955         {
4956         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
4957           {
4958           inescq = FALSE;                   /* Reset literal state */
4959           ptr++;                            /* Skip the 'E' */
4960           continue;                         /* Carry on with next */
4961           }
4962         goto CHECK_RANGE;                   /* Could be range if \E follows */
4963         }
4964
4965       /* Handle POSIX class names. Perl allows a negation extension of the
4966       form [:^name:]. A square bracket that doesn't match the syntax is
4967       treated as a literal. We also recognize the POSIX constructions
4968       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4969       5.6 and 5.8 do. */
4970
4971       if (c == CHAR_LEFT_SQUARE_BRACKET &&
4972           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4973            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
4974         {
4975         BOOL local_negate = FALSE;
4976         int posix_class, taboffset, tabopt;
4977         register const pcre_uint8 *cbits = cd->cbits;
4978         pcre_uint8 pbits[32];
4979
4980         if (ptr[1] != CHAR_COLON)
4981           {
4982           *errorcodeptr = ERR31;
4983           goto FAILED;
4984           }
4985
4986         ptr += 2;
4987         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
4988           {
4989           local_negate = TRUE;
4990           should_flip_negation = TRUE;  /* Note negative special */
4991           ptr++;
4992           }
4993
4994         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4995         if (posix_class < 0)
4996           {
4997           *errorcodeptr = ERR30;
4998           goto FAILED;
4999           }
5000
5001         /* If matching is caseless, upper and lower are converted to
5002         alpha. This relies on the fact that the class table starts with
5003         alpha, lower, upper as the first 3 entries. */
5004
5005         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
5006           posix_class = 0;
5007
5008         /* When PCRE_UCP is set, some of the POSIX classes are converted to
5009         different escape sequences that use Unicode properties \p or \P. Others
5010         that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
5011         directly. */
5012
5013 #ifdef SUPPORT_UCP
5014         if ((options & PCRE_UCP) != 0)
5015           {
5016           unsigned int ptype = 0;
5017           int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
5018
5019           /* The posix_substitutes table specifies which POSIX classes can be
5020           converted to \p or \P items. */
5021
5022           if (posix_substitutes[pc] != NULL)
5023             {
5024             nestptr = tempptr + 1;
5025             ptr = posix_substitutes[pc] - 1;
5026             continue;
5027             }
5028
5029           /* There are three other classes that generate special property calls
5030           that are recognized only in an XCLASS. */
5031
5032           else switch(posix_class)
5033             {
5034             case PC_GRAPH:
5035             ptype = PT_PXGRAPH;
5036             /* Fall through */
5037             case PC_PRINT:
5038             if (ptype == 0) ptype = PT_PXPRINT;
5039             /* Fall through */
5040             case PC_PUNCT:
5041             if (ptype == 0) ptype = PT_PXPUNCT;
5042             *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5043             *class_uchardata++ = ptype;
5044             *class_uchardata++ = 0;
5045             xclass_has_prop = TRUE;
5046             ptr = tempptr + 1;
5047             continue;
5048
5049             /* For the other POSIX classes (ascii, xdigit) we are going to fall
5050             through to the non-UCP case and build a bit map for characters with
5051             code points less than 256. If we are in a negated POSIX class
5052             within a non-negated overall class, characters with code points
5053             greater than 255 must all match. In the special case where we have
5054             not yet generated any xclass data, and this is the final item in
5055             the overall class, we need do nothing: later on, the opcode
5056             OP_NCLASS will be used to indicate that characters greater than 255
5057             are acceptable. If we have already seen an xclass item or one may
5058             follow (we have to assume that it might if this is not the end of
5059             the class), explicitly match all wide codepoints. */
5060
5061             default:
5062             if (!negate_class && local_negate &&
5063                 (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET))
5064               {
5065               *class_uchardata++ = XCL_RANGE;
5066               class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5067               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
5068               }
5069             break;
5070             }
5071           }
5072 #endif
5073         /* In the non-UCP case, or when UCP makes no difference, we build the
5074         bit map for the POSIX class in a chunk of local store because we may be
5075         adding and subtracting from it, and we don't want to subtract bits that
5076         may be in the main map already. At the end we or the result into the
5077         bit map that is being built. */
5078
5079         posix_class *= 3;
5080
5081         /* Copy in the first table (always present) */
5082
5083         memcpy(pbits, cbits + posix_class_maps[posix_class],
5084           32 * sizeof(pcre_uint8));
5085
5086         /* If there is a second table, add or remove it as required. */
5087
5088         taboffset = posix_class_maps[posix_class + 1];
5089         tabopt = posix_class_maps[posix_class + 2];
5090
5091         if (taboffset >= 0)
5092           {
5093           if (tabopt >= 0)
5094             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
5095           else
5096             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
5097           }
5098
5099         /* Now see if we need to remove any special characters. An option
5100         value of 1 removes vertical space and 2 removes underscore. */
5101
5102         if (tabopt < 0) tabopt = -tabopt;
5103         if (tabopt == 1) pbits[1] &= ~0x3c;
5104           else if (tabopt == 2) pbits[11] &= 0x7f;
5105
5106         /* Add the POSIX table or its complement into the main table that is
5107         being built and we are done. */
5108
5109         if (local_negate)
5110           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
5111         else
5112           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
5113
5114         ptr = tempptr + 1;
5115         /* Every class contains at least one < 256 character. */
5116         class_has_8bitchar = 1;
5117         /* Every class contains at least two characters. */
5118         class_one_char = 2;
5119         continue;    /* End of POSIX syntax handling */
5120         }
5121
5122       /* Backslash may introduce a single character, or it may introduce one
5123       of the specials, which just set a flag. The sequence \b is a special
5124       case. Inside a class (and only there) it is treated as backspace. We
5125       assume that other escapes have more than one character in them, so
5126       speculatively set both class_has_8bitchar and class_one_char bigger
5127       than one. Unrecognized escapes fall through and are either treated
5128       as literal characters (by default), or are faulted if
5129       PCRE_EXTRA is set. */
5130
5131       if (c == CHAR_BACKSLASH)
5132         {
5133         escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
5134           TRUE);
5135         if (*errorcodeptr != 0) goto FAILED;
5136         if (escape == 0) c = ec;
5137         else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
5138         else if (escape == ESC_N)          /* \N is not supported in a class */
5139           {
5140           *errorcodeptr = ERR71;
5141           goto FAILED;
5142           }
5143         else if (escape == ESC_Q)            /* Handle start of quoted string */
5144           {
5145           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5146             {
5147             ptr += 2; /* avoid empty string */
5148             }
5149           else inescq = TRUE;
5150           continue;
5151           }
5152         else if (escape == ESC_E) continue;  /* Ignore orphan \E */
5153
5154         else
5155           {
5156           register const pcre_uint8 *cbits = cd->cbits;
5157           /* Every class contains at least two < 256 characters. */
5158           class_has_8bitchar++;
5159           /* Every class contains at least two characters. */
5160           class_one_char += 2;
5161
5162           switch (escape)
5163             {
5164 #ifdef SUPPORT_UCP
5165             case ESC_du:     /* These are the values given for \d etc */
5166             case ESC_DU:     /* when PCRE_UCP is set. We replace the */
5167             case ESC_wu:     /* escape sequence with an appropriate \p */
5168             case ESC_WU:     /* or \P to test Unicode properties instead */
5169             case ESC_su:     /* of the default ASCII testing. */
5170             case ESC_SU:
5171             nestptr = ptr;
5172             ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
5173             class_has_8bitchar--;                /* Undo! */
5174             continue;
5175 #endif
5176             case ESC_d:
5177             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
5178             continue;
5179
5180             case ESC_D:
5181             should_flip_negation = TRUE;
5182             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
5183             continue;
5184
5185             case ESC_w:
5186             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
5187             continue;
5188
5189             case ESC_W:
5190             should_flip_negation = TRUE;
5191             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
5192             continue;
5193
5194             /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5195             5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5196             previously set by something earlier in the character class.
5197             Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5198             we could just adjust the appropriate bit. From PCRE 8.34 we no
5199             longer treat \s and \S specially. */
5200
5201             case ESC_s:
5202             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
5203             continue;
5204
5205             case ESC_S:
5206             should_flip_negation = TRUE;
5207             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
5208             continue;
5209
5210             /* The rest apply in both UCP and non-UCP cases. */
5211
5212             case ESC_h:
5213             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5214               PRIV(hspace_list), NOTACHAR);
5215             continue;
5216
5217             case ESC_H:
5218             (void)add_not_list_to_class(classbits, &class_uchardata, options,
5219               cd, PRIV(hspace_list));
5220             continue;
5221
5222             case ESC_v:
5223             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5224               PRIV(vspace_list), NOTACHAR);
5225             continue;
5226
5227             case ESC_V:
5228             (void)add_not_list_to_class(classbits, &class_uchardata, options,
5229               cd, PRIV(vspace_list));
5230             continue;
5231
5232             case ESC_p:
5233             case ESC_P:
5234 #ifdef SUPPORT_UCP
5235               {
5236               BOOL negated;
5237               unsigned int ptype = 0, pdata = 0;
5238               if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
5239                 goto FAILED;
5240               *class_uchardata++ = ((escape == ESC_p) != negated)?
5241                 XCL_PROP : XCL_NOTPROP;
5242               *class_uchardata++ = ptype;
5243               *class_uchardata++ = pdata;
5244               xclass_has_prop = TRUE;
5245               class_has_8bitchar--;                /* Undo! */
5246               continue;
5247               }
5248 #else
5249             *errorcodeptr = ERR45;
5250             goto FAILED;
5251 #endif
5252             /* Unrecognized escapes are faulted if PCRE is running in its
5253             strict mode. By default, for compatibility with Perl, they are
5254             treated as literals. */
5255
5256             default:
5257             if ((options & PCRE_EXTRA) != 0)
5258               {
5259               *errorcodeptr = ERR7;
5260               goto FAILED;
5261               }
5262             class_has_8bitchar--;    /* Undo the speculative increase. */
5263             class_one_char -= 2;     /* Undo the speculative increase. */
5264             c = *ptr;                /* Get the final character and fall through */
5265             break;
5266             }
5267           }
5268
5269         /* Fall through if the escape just defined a single character (c >= 0).
5270         This may be greater than 256. */
5271
5272         escape = 0;
5273
5274         }   /* End of backslash handling */
5275
5276       /* A character may be followed by '-' to form a range. However, Perl does
5277       not permit ']' to be the end of the range. A '-' character at the end is
5278       treated as a literal. Perl ignores orphaned \E sequences entirely. The
5279       code for handling \Q and \E is messy. */
5280
5281       CHECK_RANGE:
5282       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5283         {
5284         inescq = FALSE;
5285         ptr += 2;
5286         }
5287       oldptr = ptr;
5288
5289       /* Remember if \r or \n were explicitly used */
5290
5291       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5292
5293       /* Check for range */
5294
5295       if (!inescq && ptr[1] == CHAR_MINUS)
5296         {
5297         pcre_uint32 d;
5298         ptr += 2;
5299         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
5300
5301         /* If we hit \Q (not followed by \E) at this point, go into escaped
5302         mode. */
5303
5304         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
5305           {
5306           ptr += 2;
5307           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
5308             { ptr += 2; continue; }
5309           inescq = TRUE;
5310           break;
5311           }
5312
5313         /* Minus (hyphen) at the end of a class is treated as a literal, so put
5314         back the pointer and jump to handle the character that preceded it. */
5315
5316         if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
5317           {
5318           ptr = oldptr;
5319           goto CLASS_SINGLE_CHARACTER;
5320           }
5321
5322         /* Otherwise, we have a potential range; pick up the next character */
5323
5324 #ifdef SUPPORT_UTF
5325         if (utf)
5326           {                           /* Braces are required because the */
5327           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
5328           }
5329         else
5330 #endif
5331         d = *ptr;  /* Not UTF-8 mode */
5332
5333         /* The second part of a range can be a single-character escape
5334         sequence, but not any of the other escapes. Perl treats a hyphen as a
5335         literal in such circumstances. However, in Perl's warning mode, a
5336         warning is given, so PCRE now faults it as it is almost certainly a
5337         mistake on the user's part. */
5338
5339         if (!inescq)
5340           {
5341           if (d == CHAR_BACKSLASH)
5342             {
5343             int descape;
5344             descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5345             if (*errorcodeptr != 0) goto FAILED;
5346
5347             /* 0 means a character was put into d; \b is backspace; any other
5348             special causes an error. */
5349
5350             if (descape != 0)
5351               {
5352               if (descape == ESC_b) d = CHAR_BS; else
5353                 {
5354                 *errorcodeptr = ERR83;
5355                 goto FAILED;
5356                 }
5357               }
5358             }
5359
5360           /* A hyphen followed by a POSIX class is treated in the same way. */
5361
5362           else if (d == CHAR_LEFT_SQUARE_BRACKET &&
5363                    (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5364                     ptr[1] == CHAR_EQUALS_SIGN) &&
5365                    check_posix_syntax(ptr, &tempptr))
5366             {
5367             *errorcodeptr = ERR83;
5368             goto FAILED;
5369             }
5370           }
5371
5372         /* Check that the two values are in the correct order. Optimize
5373         one-character ranges. */
5374
5375         if (d < c)
5376           {
5377           *errorcodeptr = ERR8;
5378           goto FAILED;
5379           }
5380         if (d == c) goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
5381
5382         /* We have found a character range, so single character optimizations
5383         cannot be done anymore. Any value greater than 1 indicates that there
5384         is more than one character. */
5385
5386         class_one_char = 2;
5387
5388         /* Remember an explicit \r or \n, and add the range to the class. */
5389
5390         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5391
5392         class_has_8bitchar +=
5393           add_to_class(classbits, &class_uchardata, options, cd, c, d);
5394
5395         continue;   /* Go get the next char in the class */
5396         }
5397
5398       /* Handle a single character - we can get here for a normal non-escape
5399       char, or after \ that introduces a single character or for an apparent
5400       range that isn't. Only the value 1 matters for class_one_char, so don't
5401       increase it if it is already 2 or more ... just in case there's a class
5402       with a zillion characters in it. */
5403
5404       CLASS_SINGLE_CHARACTER:
5405       if (class_one_char < 2) class_one_char++;
5406
5407       /* If xclass_has_prop is false and class_one_char is 1, we have the first
5408       single character in the class, and there have been no prior ranges, or
5409       XCLASS items generated by escapes. If this is the final character in the
5410       class, we can optimize by turning the item into a 1-character OP_CHAR[I]
5411       if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
5412       can cause firstchar to be set. Otherwise, there can be no first char if
5413       this item is first, whatever repeat count may follow. In the case of
5414       reqchar, save the previous value for reinstating. */
5415
5416       if (!inescq &&
5417 #ifdef SUPPORT_UCP
5418           !xclass_has_prop &&
5419 #endif
5420           class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
5421         {
5422         ptr++;
5423         zeroreqchar = reqchar;
5424         zeroreqcharflags = reqcharflags;
5425
5426         if (negate_class)
5427           {
5428 #ifdef SUPPORT_UCP
5429           int d;
5430 #endif
5431           if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5432           zerofirstchar = firstchar;
5433           zerofirstcharflags = firstcharflags;
5434
5435           /* For caseless UTF-8 mode when UCP support is available, check
5436           whether this character has more than one other case. If so, generate
5437           a special OP_NOTPROP item instead of OP_NOTI. */
5438
5439 #ifdef SUPPORT_UCP
5440           if (utf && (options & PCRE_CASELESS) != 0 &&
5441               (d = UCD_CASESET(c)) != 0)
5442             {
5443             *code++ = OP_NOTPROP;
5444             *code++ = PT_CLIST;
5445             *code++ = d;
5446             }
5447           else
5448 #endif
5449           /* Char has only one other case, or UCP not available */
5450
5451             {
5452             *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
5453 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5454             if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5455               code += PRIV(ord2utf)(c, code);
5456             else
5457 #endif
5458               *code++ = c;
5459             }
5460
5461           /* We are finished with this character class */
5462
5463           goto END_CLASS;
5464           }
5465
5466         /* For a single, positive character, get the value into mcbuffer, and
5467         then we can handle this with the normal one-character code. */
5468
5469 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5470         if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5471           mclength = PRIV(ord2utf)(c, mcbuffer);
5472         else
5473 #endif
5474           {
5475           mcbuffer[0] = c;
5476           mclength = 1;
5477           }
5478         goto ONE_CHAR;
5479         }       /* End of 1-char optimization */
5480
5481       /* There is more than one character in the class, or an XCLASS item
5482       has been generated. Add this character to the class. */
5483
5484       class_has_8bitchar +=
5485         add_to_class(classbits, &class_uchardata, options, cd, c, c);
5486       }
5487
5488     /* Loop until ']' reached. This "while" is the end of the "do" far above.
5489     If we are at the end of an internal nested string, revert to the outer
5490     string. */
5491
5492     while (((c = *(++ptr)) != CHAR_NULL ||
5493            (nestptr != NULL &&
5494              (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
5495            (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
5496
5497     /* Check for missing terminating ']' */
5498
5499     if (c == CHAR_NULL)
5500       {
5501       *errorcodeptr = ERR6;
5502       goto FAILED;
5503       }
5504
5505     /* We will need an XCLASS if data has been placed in class_uchardata. In
5506     the second phase this is a sufficient test. However, in the pre-compile
5507     phase, class_uchardata gets emptied to prevent workspace overflow, so
5508     only if the very last character in the class needs XCLASS will it contain
5509     anything at this point. For this reason, xclass gets set TRUE above when
5510     class_uchardata is emptied, and that's why this code is the way it is here
5511     instead of just doing a test on class_uchardata below. */
5512
5513 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5514     if (class_uchardata > class_uchardata_base) xclass = TRUE;
5515 #endif
5516
5517     /* If this is the first thing in the branch, there can be no first char
5518     setting, whatever the repeat count. Any reqchar setting must remain
5519     unchanged after any kind of repeat. */
5520
5521     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5522     zerofirstchar = firstchar;
5523     zerofirstcharflags = firstcharflags;
5524     zeroreqchar = reqchar;
5525     zeroreqcharflags = reqcharflags;
5526
5527     /* If there are characters with values > 255, we have to compile an
5528     extended class, with its own opcode, unless there was a negated special
5529     such as \S in the class, and PCRE_UCP is not set, because in that case all
5530     characters > 255 are in the class, so any that were explicitly given as
5531     well can be ignored. If (when there are explicit characters > 255 that must
5532     be listed) there are no characters < 256, we can omit the bitmap in the
5533     actual compiled code. */
5534
5535 #ifdef SUPPORT_UTF
5536     if (xclass && (xclass_has_prop || !should_flip_negation ||
5537         (options & PCRE_UCP) != 0))
5538 #elif !defined COMPILE_PCRE8
5539     if (xclass && (xclass_has_prop || !should_flip_negation))
5540 #endif
5541 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5542       {
5543       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
5544       *code++ = OP_XCLASS;
5545       code += LINK_SIZE;
5546       *code = negate_class? XCL_NOT:0;
5547       if (xclass_has_prop) *code |= XCL_HASPROP;
5548
5549       /* If the map is required, move up the extra data to make room for it;
5550       otherwise just move the code pointer to the end of the extra data. */
5551
5552       if (class_has_8bitchar > 0)
5553         {
5554         *code++ |= XCL_MAP;
5555         memmove(code + (32 / sizeof(pcre_uchar)), code,
5556           IN_UCHARS(class_uchardata - code));
5557         if (negate_class && !xclass_has_prop)
5558           for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5559         memcpy(code, classbits, 32);
5560         code = class_uchardata + (32 / sizeof(pcre_uchar));
5561         }
5562       else code = class_uchardata;
5563
5564       /* Now fill in the complete length of the item */
5565
5566       PUT(previous, 1, (int)(code - previous));
5567       break;   /* End of class handling */
5568       }
5569
5570     /* Even though any XCLASS list is now discarded, we must allow for
5571     its memory. */
5572
5573     if (lengthptr != NULL)
5574       *lengthptr += (int)(class_uchardata - class_uchardata_base);
5575 #endif
5576
5577     /* If there are no characters > 255, or they are all to be included or
5578     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5579     whole class was negated and whether there were negative specials such as \S
5580     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5581     negating it if necessary. */
5582
5583     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5584     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
5585       {
5586       if (negate_class)
5587         for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5588       memcpy(code, classbits, 32);
5589       }
5590     code += 32 / sizeof(pcre_uchar);
5591
5592     END_CLASS:
5593     break;
5594
5595
5596     /* ===================================================================*/
5597     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5598     has been tested above. */
5599
5600     case CHAR_LEFT_CURLY_BRACKET:
5601     if (!is_quantifier) goto NORMAL_CHAR;
5602     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5603     if (*errorcodeptr != 0) goto FAILED;
5604     goto REPEAT;
5605
5606     case CHAR_ASTERISK:
5607     repeat_min = 0;
5608     repeat_max = -1;
5609     goto REPEAT;
5610
5611     case CHAR_PLUS:
5612     repeat_min = 1;
5613     repeat_max = -1;
5614     goto REPEAT;
5615
5616     case CHAR_QUESTION_MARK:
5617     repeat_min = 0;
5618     repeat_max = 1;
5619
5620     REPEAT:
5621     if (previous == NULL)
5622       {
5623       *errorcodeptr = ERR9;
5624       goto FAILED;
5625       }
5626
5627     if (repeat_min == 0)
5628       {
5629       firstchar = zerofirstchar;    /* Adjust for zero repeat */
5630       firstcharflags = zerofirstcharflags;
5631       reqchar = zeroreqchar;        /* Ditto */
5632       reqcharflags = zeroreqcharflags;
5633       }
5634
5635     /* Remember whether this is a variable length repeat */
5636
5637     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5638
5639     op_type = 0;                    /* Default single-char op codes */
5640     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
5641
5642     /* Save start of previous item, in case we have to move it up in order to
5643     insert something before it. */
5644
5645     tempcode = previous;
5646
5647     /* Before checking for a possessive quantifier, we must skip over
5648     whitespace and comments in extended mode because Perl allows white space at
5649     this point. */
5650
5651     if ((options & PCRE_EXTENDED) != 0)
5652       {
5653       const pcre_uchar *p = ptr + 1;
5654       for (;;)
5655         {
5656         while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++;
5657         if (*p != CHAR_NUMBER_SIGN) break;
5658         p++;
5659         while (*p != CHAR_NULL)
5660           {
5661           if (IS_NEWLINE(p))         /* For non-fixed-length newline cases, */
5662             {                        /* IS_NEWLINE sets cd->nllen. */
5663             p += cd->nllen;
5664             break;
5665             }
5666           p++;
5667 #ifdef SUPPORT_UTF
5668           if (utf) FORWARDCHAR(p);
5669 #endif
5670           }           /* Loop for comment characters */
5671         }             /* Loop for multiple comments */
5672       ptr = p - 1;    /* Character before the next significant one. */
5673       }
5674
5675     /* If the next character is '+', we have a possessive quantifier. This
5676     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5677     If the next character is '?' this is a minimizing repeat, by default,
5678     but if PCRE_UNGREEDY is set, it works the other way round. We change the
5679     repeat type to the non-default. */
5680
5681     if (ptr[1] == CHAR_PLUS)
5682       {
5683       repeat_type = 0;                  /* Force greedy */
5684       possessive_quantifier = TRUE;
5685       ptr++;
5686       }
5687     else if (ptr[1] == CHAR_QUESTION_MARK)
5688       {
5689       repeat_type = greedy_non_default;
5690       ptr++;
5691       }
5692     else repeat_type = greedy_default;
5693
5694     /* If previous was a recursion call, wrap it in atomic brackets so that
5695     previous becomes the atomic group. All recursions were so wrapped in the
5696     past, but it no longer happens for non-repeated recursions. In fact, the
5697     repeated ones could be re-implemented independently so as not to need this,
5698     but for the moment we rely on the code for repeating groups. */
5699
5700     if (*previous == OP_RECURSE)
5701       {
5702       memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
5703       *previous = OP_ONCE;
5704       PUT(previous, 1, 2 + 2*LINK_SIZE);
5705       previous[2 + 2*LINK_SIZE] = OP_KET;
5706       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5707       code += 2 + 2 * LINK_SIZE;
5708       length_prevgroup = 3 + 3*LINK_SIZE;
5709
5710       /* When actually compiling, we need to check whether this was a forward
5711       reference, and if so, adjust the offset. */
5712
5713       if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
5714         {
5715         int offset = GET(cd->hwm, -LINK_SIZE);
5716         if (offset == previous + 1 - cd->start_code)
5717           PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
5718         }
5719       }
5720
5721     /* Now handle repetition for the different types of item. */
5722
5723     /* If previous was a character or negated character match, abolish the item
5724     and generate a repeat item instead. If a char item has a minimum of more
5725     than one, ensure that it is set in reqchar - it might not be if a sequence
5726     such as x{3} is the first thing in a branch because the x will have gone
5727     into firstchar instead.  */
5728
5729     if (*previous == OP_CHAR || *previous == OP_CHARI
5730         || *previous == OP_NOT || *previous == OP_NOTI)
5731       {
5732       switch (*previous)
5733         {
5734         default: /* Make compiler happy. */
5735         case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
5736         case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5737         case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
5738         case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
5739         }
5740
5741       /* Deal with UTF characters that take up more than one character. It's
5742       easier to write this out separately than try to macrify it. Use c to
5743       hold the length of the character in bytes, plus UTF_LENGTH to flag that
5744       it's a length rather than a small character. */
5745
5746 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5747       if (utf && NOT_FIRSTCHAR(code[-1]))
5748         {
5749         pcre_uchar *lastchar = code - 1;
5750         BACKCHAR(lastchar);
5751         c = (int)(code - lastchar);     /* Length of UTF-8 character */
5752         memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
5753         c |= UTF_LENGTH;                /* Flag c as a length */
5754         }
5755       else
5756 #endif /* SUPPORT_UTF */
5757
5758       /* Handle the case of a single character - either with no UTF support, or
5759       with UTF disabled, or for a single character UTF character. */
5760         {
5761         c = code[-1];
5762         if (*previous <= OP_CHARI && repeat_min > 1)
5763           {
5764           reqchar = c;
5765           reqcharflags = req_caseopt | cd->req_varyopt;
5766           }
5767         }
5768
5769       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
5770       }
5771
5772     /* If previous was a character type match (\d or similar), abolish it and
5773     create a suitable repeat item. The code is shared with single-character
5774     repeats by setting op_type to add a suitable offset into repeat_type. Note
5775     that the Unicode property types will be present only when SUPPORT_UCP is
5776     defined, but we don't wrap the little bits of code here because it just
5777     makes it horribly messy. */
5778
5779     else if (*previous < OP_EODN)
5780       {
5781       pcre_uchar *oldcode;
5782       int prop_type, prop_value;
5783       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
5784       c = *previous;
5785
5786       OUTPUT_SINGLE_REPEAT:
5787       if (*previous == OP_PROP || *previous == OP_NOTPROP)
5788         {
5789         prop_type = previous[1];
5790         prop_value = previous[2];
5791         }
5792       else prop_type = prop_value = -1;
5793
5794       oldcode = code;
5795       code = previous;                  /* Usually overwrite previous item */
5796
5797       /* If the maximum is zero then the minimum must also be zero; Perl allows
5798       this case, so we do too - by simply omitting the item altogether. */
5799
5800       if (repeat_max == 0) goto END_REPEAT;
5801
5802       /* Combine the op_type with the repeat_type */
5803
5804       repeat_type += op_type;
5805
5806       /* A minimum of zero is handled either as the special case * or ?, or as
5807       an UPTO, with the maximum given. */
5808
5809       if (repeat_min == 0)
5810         {
5811         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5812           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5813         else
5814           {
5815           *code++ = OP_UPTO + repeat_type;
5816           PUT2INC(code, 0, repeat_max);
5817           }
5818         }
5819
5820       /* A repeat minimum of 1 is optimized into some special cases. If the
5821       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5822       left in place and, if the maximum is greater than 1, we use OP_UPTO with
5823       one less than the maximum. */
5824
5825       else if (repeat_min == 1)
5826         {
5827         if (repeat_max == -1)
5828           *code++ = OP_PLUS + repeat_type;
5829         else
5830           {
5831           code = oldcode;                 /* leave previous item in place */
5832           if (repeat_max == 1) goto END_REPEAT;
5833           *code++ = OP_UPTO + repeat_type;
5834           PUT2INC(code, 0, repeat_max - 1);
5835           }
5836         }
5837
5838       /* The case {n,n} is just an EXACT, while the general case {n,m} is
5839       handled as an EXACT followed by an UPTO. */
5840
5841       else
5842         {
5843         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
5844         PUT2INC(code, 0, repeat_min);
5845
5846         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
5847         we have to insert the character for the previous code. For a repeated
5848         Unicode property match, there are two extra bytes that define the
5849         required property. In UTF-8 mode, long characters have their length in
5850         c, with the UTF_LENGTH bit as a flag. */
5851
5852         if (repeat_max < 0)
5853           {
5854 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5855           if (utf && (c & UTF_LENGTH) != 0)
5856             {
5857             memcpy(code, utf_chars, IN_UCHARS(c & 7));
5858             code += c & 7;
5859             }
5860           else
5861 #endif
5862             {
5863             *code++ = c;
5864             if (prop_type >= 0)
5865               {
5866               *code++ = prop_type;
5867               *code++ = prop_value;
5868               }
5869             }
5870           *code++ = OP_STAR + repeat_type;
5871           }
5872
5873         /* Else insert an UPTO if the max is greater than the min, again
5874         preceded by the character, for the previously inserted code. If the
5875         UPTO is just for 1 instance, we can use QUERY instead. */
5876
5877         else if (repeat_max != repeat_min)
5878           {
5879 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5880           if (utf && (c & UTF_LENGTH) != 0)
5881             {
5882             memcpy(code, utf_chars, IN_UCHARS(c & 7));
5883             code += c & 7;
5884             }
5885           else
5886 #endif
5887           *code++ = c;
5888           if (prop_type >= 0)
5889             {
5890             *code++ = prop_type;
5891             *code++ = prop_value;
5892             }
5893           repeat_max -= repeat_min;
5894
5895           if (repeat_max == 1)
5896             {
5897             *code++ = OP_QUERY + repeat_type;
5898             }
5899           else
5900             {
5901             *code++ = OP_UPTO + repeat_type;
5902             PUT2INC(code, 0, repeat_max);
5903             }
5904           }
5905         }
5906
5907       /* The character or character type itself comes last in all cases. */
5908
5909 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5910       if (utf && (c & UTF_LENGTH) != 0)
5911         {
5912         memcpy(code, utf_chars, IN_UCHARS(c & 7));
5913         code += c & 7;
5914         }
5915       else
5916 #endif
5917       *code++ = c;
5918
5919       /* For a repeated Unicode property match, there are two extra bytes that
5920       define the required property. */
5921
5922 #ifdef SUPPORT_UCP
5923       if (prop_type >= 0)
5924         {
5925         *code++ = prop_type;
5926         *code++ = prop_value;
5927         }
5928 #endif
5929       }
5930
5931     /* If previous was a character class or a back reference, we put the repeat
5932     stuff after it, but just skip the item if the repeat was {0,0}. */
5933
5934     else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
5935 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5936              *previous == OP_XCLASS ||
5937 #endif
5938              *previous == OP_REF   || *previous == OP_REFI ||
5939              *previous == OP_DNREF || *previous == OP_DNREFI)
5940       {
5941       if (repeat_max == 0)
5942         {
5943         code = previous;
5944         goto END_REPEAT;
5945         }
5946
5947       if (repeat_min == 0 && repeat_max == -1)
5948         *code++ = OP_CRSTAR + repeat_type;
5949       else if (repeat_min == 1 && repeat_max == -1)
5950         *code++ = OP_CRPLUS + repeat_type;
5951       else if (repeat_min == 0 && repeat_max == 1)
5952         *code++ = OP_CRQUERY + repeat_type;
5953       else
5954         {
5955         *code++ = OP_CRRANGE + repeat_type;
5956         PUT2INC(code, 0, repeat_min);
5957         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
5958         PUT2INC(code, 0, repeat_max);
5959         }
5960       }
5961
5962     /* If previous was a bracket group, we may have to replicate it in certain
5963     cases. Note that at this point we can encounter only the "basic" bracket
5964     opcodes such as BRA and CBRA, as this is the place where they get converted
5965     into the more special varieties such as BRAPOS and SBRA. A test for >=
5966     OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5967     ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
5968     Originally, PCRE did not allow repetition of assertions, but now it does,
5969     for Perl compatibility. */
5970
5971     else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5972       {
5973       register int i;
5974       int len = (int)(code - previous);
5975       size_t base_hwm_offset = item_hwm_offset;
5976       pcre_uchar *bralink = NULL;
5977       pcre_uchar *brazeroptr = NULL;
5978
5979       /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5980       we just ignore the repeat. */
5981
5982       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5983         goto END_REPEAT;
5984
5985       /* There is no sense in actually repeating assertions. The only potential
5986       use of repetition is in cases when the assertion is optional. Therefore,
5987       if the minimum is greater than zero, just ignore the repeat. If the
5988       maximum is not zero or one, set it to 1. */
5989
5990       if (*previous < OP_ONCE)    /* Assertion */
5991         {
5992         if (repeat_min > 0) goto END_REPEAT;
5993         if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5994         }
5995
5996       /* The case of a zero minimum is special because of the need to stick
5997       OP_BRAZERO in front of it, and because the group appears once in the
5998       data, whereas in other cases it appears the minimum number of times. For
5999       this reason, it is simplest to treat this case separately, as otherwise
6000       the code gets far too messy. There are several special subcases when the
6001       minimum is zero. */
6002
6003       if (repeat_min == 0)
6004         {
6005         /* If the maximum is also zero, we used to just omit the group from the
6006         output altogether, like this:
6007
6008         ** if (repeat_max == 0)
6009         **   {
6010         **   code = previous;
6011         **   goto END_REPEAT;
6012         **   }
6013
6014         However, that fails when a group or a subgroup within it is referenced
6015         as a subroutine from elsewhere in the pattern, so now we stick in
6016         OP_SKIPZERO in front of it so that it is skipped on execution. As we
6017         don't have a list of which groups are referenced, we cannot do this
6018         selectively.
6019
6020         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
6021         and do no more at this point. However, we do need to adjust any
6022         OP_RECURSE calls inside the group that refer to the group itself or any
6023         internal or forward referenced group, because the offset is from the
6024         start of the whole regex. Temporarily terminate the pattern while doing
6025         this. */
6026
6027         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
6028           {
6029           *code = OP_END;
6030           adjust_recurse(previous, 1, utf, cd, item_hwm_offset);
6031           memmove(previous + 1, previous, IN_UCHARS(len));
6032           code++;
6033           if (repeat_max == 0)
6034             {
6035             *previous++ = OP_SKIPZERO;
6036             goto END_REPEAT;
6037             }
6038           brazeroptr = previous;    /* Save for possessive optimizing */
6039           *previous++ = OP_BRAZERO + repeat_type;
6040           }
6041
6042         /* If the maximum is greater than 1 and limited, we have to replicate
6043         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
6044         The first one has to be handled carefully because it's the original
6045         copy, which has to be moved up. The remainder can be handled by code
6046         that is common with the non-zero minimum case below. We have to
6047           adjust the value of repeat_max, since one less copy is required. Once
6048         again, we may have to adjust any OP_RECURSE calls inside the group. */
6049
6050         else
6051           {
6052           int offset;
6053           *code = OP_END;
6054           adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, item_hwm_offset);
6055           memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
6056           code += 2 + LINK_SIZE;
6057           *previous++ = OP_BRAZERO + repeat_type;
6058           *previous++ = OP_BRA;
6059
6060           /* We chain together the bracket offset fields that have to be
6061           filled in later when the ends of the brackets are reached. */
6062
6063           offset = (bralink == NULL)? 0 : (int)(previous - bralink);
6064           bralink = previous;
6065           PUTINC(previous, 0, offset);
6066           }
6067
6068         repeat_max--;
6069         }
6070
6071       /* If the minimum is greater than zero, replicate the group as many
6072       times as necessary, and adjust the maximum to the number of subsequent
6073       copies that we need. If we set a first char from the group, and didn't
6074       set a required char, copy the latter from the former. If there are any
6075       forward reference subroutine calls in the group, there will be entries on
6076       the workspace list; replicate these with an appropriate increment. */
6077
6078       else
6079         {
6080         if (repeat_min > 1)
6081           {
6082           /* In the pre-compile phase, we don't actually do the replication. We
6083           just adjust the length as if we had. Do some paranoid checks for
6084           potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
6085           integer type when available, otherwise double. */
6086
6087           if (lengthptr != NULL)
6088             {
6089             int delta = (repeat_min - 1)*length_prevgroup;
6090             if ((INT64_OR_DOUBLE)(repeat_min - 1)*
6091                   (INT64_OR_DOUBLE)length_prevgroup >
6092                     (INT64_OR_DOUBLE)INT_MAX ||
6093                 OFLOW_MAX - *lengthptr < delta)
6094               {
6095               *errorcodeptr = ERR20;
6096               goto FAILED;
6097               }
6098             *lengthptr += delta;
6099             }
6100
6101           /* This is compiling for real. If there is a set first byte for
6102           the group, and we have not yet set a "required byte", set it. Make
6103           sure there is enough workspace for copying forward references before
6104           doing the copy. */
6105
6106           else
6107             {
6108             if (groupsetfirstchar && reqcharflags < 0)
6109               {
6110               reqchar = firstchar;
6111               reqcharflags = firstcharflags;
6112               }
6113
6114             for (i = 1; i < repeat_min; i++)
6115               {
6116               pcre_uchar *hc;
6117               size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6118               memcpy(code, previous, IN_UCHARS(len));
6119
6120               while (cd->hwm > cd->start_workspace + cd->workspace_size -
6121                      WORK_SIZE_SAFETY_MARGIN -
6122                      (this_hwm_offset - base_hwm_offset))
6123                 {
6124                 *errorcodeptr = expand_workspace(cd);
6125                 if (*errorcodeptr != 0) goto FAILED;
6126                 }
6127
6128               for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6129                    hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6130                    hc += LINK_SIZE)
6131                 {
6132                 PUT(cd->hwm, 0, GET(hc, 0) + len);
6133                 cd->hwm += LINK_SIZE;
6134                 }
6135               base_hwm_offset = this_hwm_offset;
6136               code += len;
6137               }
6138             }
6139           }
6140
6141         if (repeat_max > 0) repeat_max -= repeat_min;
6142         }
6143
6144       /* This code is common to both the zero and non-zero minimum cases. If
6145       the maximum is limited, it replicates the group in a nested fashion,
6146       remembering the bracket starts on a stack. In the case of a zero minimum,
6147       the first one was set up above. In all cases the repeat_max now specifies
6148       the number of additional copies needed. Again, we must remember to
6149       replicate entries on the forward reference list. */
6150
6151       if (repeat_max >= 0)
6152         {
6153         /* In the pre-compile phase, we don't actually do the replication. We
6154         just adjust the length as if we had. For each repetition we must add 1
6155         to the length for BRAZERO and for all but the last repetition we must
6156         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
6157         paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
6158         a 64-bit integer type when available, otherwise double. */
6159
6160         if (lengthptr != NULL && repeat_max > 0)
6161           {
6162           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
6163                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
6164           if ((INT64_OR_DOUBLE)repeat_max *
6165                 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
6166                   > (INT64_OR_DOUBLE)INT_MAX ||
6167               OFLOW_MAX - *lengthptr < delta)
6168             {
6169             *errorcodeptr = ERR20;
6170             goto FAILED;
6171             }
6172           *lengthptr += delta;
6173           }
6174
6175         /* This is compiling for real */
6176
6177         else for (i = repeat_max - 1; i >= 0; i--)
6178           {
6179           pcre_uchar *hc;
6180           size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6181
6182           *code++ = OP_BRAZERO + repeat_type;
6183
6184           /* All but the final copy start a new nesting, maintaining the
6185           chain of brackets outstanding. */
6186
6187           if (i != 0)
6188             {
6189             int offset;
6190             *code++ = OP_BRA;
6191             offset = (bralink == NULL)? 0 : (int)(code - bralink);
6192             bralink = code;
6193             PUTINC(code, 0, offset);
6194             }
6195
6196           memcpy(code, previous, IN_UCHARS(len));
6197
6198           /* Ensure there is enough workspace for forward references before
6199           copying them. */
6200
6201           while (cd->hwm > cd->start_workspace + cd->workspace_size -
6202                  WORK_SIZE_SAFETY_MARGIN -
6203                  (this_hwm_offset - base_hwm_offset))
6204             {
6205             *errorcodeptr = expand_workspace(cd);
6206             if (*errorcodeptr != 0) goto FAILED;
6207             }
6208
6209           for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6210                hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6211                hc += LINK_SIZE)
6212             {
6213             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
6214             cd->hwm += LINK_SIZE;
6215             }
6216           base_hwm_offset = this_hwm_offset;
6217           code += len;
6218           }
6219
6220         /* Now chain through the pending brackets, and fill in their length
6221         fields (which are holding the chain links pro tem). */
6222
6223         while (bralink != NULL)
6224           {
6225           int oldlinkoffset;
6226           int offset = (int)(code - bralink + 1);
6227           pcre_uchar *bra = code - offset;
6228           oldlinkoffset = GET(bra, 1);
6229           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
6230           *code++ = OP_KET;
6231           PUTINC(code, 0, offset);
6232           PUT(bra, 1, offset);
6233           }
6234         }
6235
6236       /* If the maximum is unlimited, set a repeater in the final copy. For
6237       ONCE brackets, that's all we need to do. However, possessively repeated
6238       ONCE brackets can be converted into non-capturing brackets, as the
6239       behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
6240       deal with possessive ONCEs specially.
6241
6242       Otherwise, when we are doing the actual compile phase, check to see
6243       whether this group is one that could match an empty string. If so,
6244       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
6245       that runtime checking can be done. [This check is also applied to ONCE
6246       groups at runtime, but in a different way.]
6247
6248       Then, if the quantifier was possessive and the bracket is not a
6249       conditional, we convert the BRA code to the POS form, and the KET code to
6250       KETRPOS. (It turns out to be convenient at runtime to detect this kind of
6251       subpattern at both the start and at the end.) The use of special opcodes
6252       makes it possible to reduce greatly the stack usage in pcre_exec(). If
6253       the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
6254
6255       Then, if the minimum number of matches is 1 or 0, cancel the possessive
6256       flag so that the default action below, of wrapping everything inside
6257       atomic brackets, does not happen. When the minimum is greater than 1,
6258       there will be earlier copies of the group, and so we still have to wrap
6259       the whole thing. */
6260
6261       else
6262         {
6263         pcre_uchar *ketcode = code - 1 - LINK_SIZE;
6264         pcre_uchar *bracode = ketcode - GET(ketcode, 1);
6265
6266         /* Convert possessive ONCE brackets to non-capturing */
6267
6268         if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
6269             possessive_quantifier) *bracode = OP_BRA;
6270
6271         /* For non-possessive ONCE brackets, all we need to do is to
6272         set the KET. */
6273
6274         if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
6275           *ketcode = OP_KETRMAX + repeat_type;
6276
6277         /* Handle non-ONCE brackets and possessive ONCEs (which have been
6278         converted to non-capturing above). */
6279
6280         else
6281           {
6282           /* In the compile phase, check for empty string matching. */
6283
6284           if (lengthptr == NULL)
6285             {
6286             pcre_uchar *scode = bracode;
6287             do
6288               {
6289               if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
6290                 {
6291                 *bracode += OP_SBRA - OP_BRA;
6292                 break;
6293                 }
6294               scode += GET(scode, 1);
6295               }
6296             while (*scode == OP_ALT);
6297             }
6298
6299           /* A conditional group with only one branch has an implicit empty
6300           alternative branch. */
6301
6302           if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
6303             *bracode = OP_SCOND;
6304
6305           /* Handle possessive quantifiers. */
6306
6307           if (possessive_quantifier)
6308             {
6309             /* For COND brackets, we wrap the whole thing in a possessively
6310             repeated non-capturing bracket, because we have not invented POS
6311             versions of the COND opcodes. Because we are moving code along, we
6312             must ensure that any pending recursive references are updated. */
6313
6314             if (*bracode == OP_COND || *bracode == OP_SCOND)
6315               {
6316               int nlen = (int)(code - bracode);
6317               *code = OP_END;
6318               adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6319               memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
6320               code += 1 + LINK_SIZE;
6321               nlen += 1 + LINK_SIZE;
6322               *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
6323               *code++ = OP_KETRPOS;
6324               PUTINC(code, 0, nlen);
6325               PUT(bracode, 1, nlen);
6326               }
6327
6328             /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
6329
6330             else
6331               {
6332               *bracode += 1;              /* Switch to xxxPOS opcodes */
6333               *ketcode = OP_KETRPOS;
6334               }
6335
6336             /* If the minimum is zero, mark it as possessive, then unset the
6337             possessive flag when the minimum is 0 or 1. */
6338
6339             if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
6340             if (repeat_min < 2) possessive_quantifier = FALSE;
6341             }
6342
6343           /* Non-possessive quantifier */
6344
6345           else *ketcode = OP_KETRMAX + repeat_type;
6346           }
6347         }
6348       }
6349
6350     /* If previous is OP_FAIL, it was generated by an empty class [] in
6351     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
6352     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
6353     error above. We can just ignore the repeat in JS case. */
6354
6355     else if (*previous == OP_FAIL) goto END_REPEAT;
6356
6357     /* Else there's some kind of shambles */
6358
6359     else
6360       {
6361       *errorcodeptr = ERR11;
6362       goto FAILED;
6363       }
6364
6365     /* If the character following a repeat is '+', possessive_quantifier is
6366     TRUE. For some opcodes, there are special alternative opcodes for this
6367     case. For anything else, we wrap the entire repeated item inside OP_ONCE
6368     brackets. Logically, the '+' notation is just syntactic sugar, taken from
6369     Sun's Java package, but the special opcodes can optimize it.
6370
6371     Some (but not all) possessively repeated subpatterns have already been
6372     completely handled in the code just above. For them, possessive_quantifier
6373     is always FALSE at this stage. Note that the repeated item starts at
6374     tempcode, not at previous, which might be the first part of a string whose
6375     (former) last char we repeated. */
6376
6377     if (possessive_quantifier)
6378       {
6379       int len;
6380
6381       /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6382       However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6383       {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6384       remains is greater than zero, there's a further opcode that can be
6385       handled. If not, do nothing, leaving the EXACT alone. */
6386
6387       switch(*tempcode)
6388         {
6389         case OP_TYPEEXACT:
6390         tempcode += PRIV(OP_lengths)[*tempcode] +
6391           ((tempcode[1 + IMM2_SIZE] == OP_PROP
6392           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6393         break;
6394
6395         /* CHAR opcodes are used for exacts whose count is 1. */
6396
6397         case OP_CHAR:
6398         case OP_CHARI:
6399         case OP_NOT:
6400         case OP_NOTI:
6401         case OP_EXACT:
6402         case OP_EXACTI:
6403         case OP_NOTEXACT:
6404         case OP_NOTEXACTI:
6405         tempcode += PRIV(OP_lengths)[*tempcode];
6406 #ifdef SUPPORT_UTF
6407         if (utf && HAS_EXTRALEN(tempcode[-1]))
6408           tempcode += GET_EXTRALEN(tempcode[-1]);
6409 #endif
6410         break;
6411
6412         /* For the class opcodes, the repeat operator appears at the end;
6413         adjust tempcode to point to it. */
6414
6415         case OP_CLASS:
6416         case OP_NCLASS:
6417         tempcode += 1 + 32/sizeof(pcre_uchar);
6418         break;
6419
6420 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6421         case OP_XCLASS:
6422         tempcode += GET(tempcode, 1);
6423         break;
6424 #endif
6425         }
6426
6427       /* If tempcode is equal to code (which points to the end of the repeated
6428       item), it means we have skipped an EXACT item but there is no following
6429       QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6430       all other cases, tempcode will be pointing to the repeat opcode, and will
6431       be less than code, so the value of len will be greater than 0. */
6432
6433       len = (int)(code - tempcode);
6434       if (len > 0)
6435         {
6436         unsigned int repcode = *tempcode;
6437
6438         /* There is a table for possessifying opcodes, all of which are less
6439         than OP_CALLOUT. A zero entry means there is no possessified version.
6440         */
6441
6442         if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6443           *tempcode = opcode_possessify[repcode];
6444
6445         /* For opcode without a special possessified version, wrap the item in
6446         ONCE brackets. Because we are moving code along, we must ensure that any
6447         pending recursive references are updated. */
6448
6449         else
6450           {
6451           *code = OP_END;
6452           adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6453           memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6454           code += 1 + LINK_SIZE;
6455           len += 1 + LINK_SIZE;
6456           tempcode[0] = OP_ONCE;
6457           *code++ = OP_KET;
6458           PUTINC(code, 0, len);
6459           PUT(tempcode, 1, len);
6460           }
6461         }
6462
6463 #ifdef NEVER
6464       if (len > 0) switch (*tempcode)
6465         {
6466         case OP_STAR:  *tempcode = OP_POSSTAR; break;
6467         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
6468         case OP_QUERY: *tempcode = OP_POSQUERY; break;
6469         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
6470
6471         case OP_STARI:  *tempcode = OP_POSSTARI; break;
6472         case OP_PLUSI:  *tempcode = OP_POSPLUSI; break;
6473         case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
6474         case OP_UPTOI:  *tempcode = OP_POSUPTOI; break;
6475
6476         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
6477         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
6478         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
6479         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
6480
6481         case OP_NOTSTARI:  *tempcode = OP_NOTPOSSTARI; break;
6482         case OP_NOTPLUSI:  *tempcode = OP_NOTPOSPLUSI; break;
6483         case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
6484         case OP_NOTUPTOI:  *tempcode = OP_NOTPOSUPTOI; break;
6485
6486         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
6487         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
6488         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6489         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
6490
6491         case OP_CRSTAR:   *tempcode = OP_CRPOSSTAR; break;
6492         case OP_CRPLUS:   *tempcode = OP_CRPOSPLUS; break;
6493         case OP_CRQUERY:  *tempcode = OP_CRPOSQUERY; break;
6494         case OP_CRRANGE:  *tempcode = OP_CRPOSRANGE; break;
6495
6496         /* Because we are moving code along, we must ensure that any
6497         pending recursive references are updated. */
6498
6499         default:
6500         *code = OP_END;
6501         adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6502         memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6503         code += 1 + LINK_SIZE;
6504         len += 1 + LINK_SIZE;
6505         tempcode[0] = OP_ONCE;
6506         *code++ = OP_KET;
6507         PUTINC(code, 0, len);
6508         PUT(tempcode, 1, len);
6509         break;
6510         }
6511 #endif
6512       }
6513
6514     /* In all cases we no longer have a previous item. We also set the
6515     "follows varying string" flag for subsequently encountered reqchars if
6516     it isn't already set and we have just passed a varying length item. */
6517
6518     END_REPEAT:
6519     previous = NULL;
6520     cd->req_varyopt |= reqvary;
6521     break;
6522
6523
6524     /* ===================================================================*/
6525     /* Start of nested parenthesized sub-expression, or comment or lookahead or
6526     lookbehind or option setting or condition or all the other extended
6527     parenthesis forms.  */
6528
6529     case CHAR_LEFT_PARENTHESIS:
6530     ptr++;
6531
6532     /* First deal with comments. Putting this code right at the start ensures
6533     that comments have no bad side effects. */
6534
6535     if (ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
6536       {
6537       ptr += 2;
6538       while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6539       if (*ptr == CHAR_NULL)
6540         {
6541         *errorcodeptr = ERR18;
6542         goto FAILED;
6543         }
6544       continue;
6545       }
6546
6547     /* Now deal with various "verbs" that can be introduced by '*'. */
6548
6549     if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
6550          || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
6551       {
6552       int i, namelen;
6553       int arglen = 0;
6554       const char *vn = verbnames;
6555       const pcre_uchar *name = ptr + 1;
6556       const pcre_uchar *arg = NULL;
6557       previous = NULL;
6558       ptr++;
6559       while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
6560       namelen = (int)(ptr - name);
6561
6562       /* It appears that Perl allows any characters whatsoever, other than
6563       a closing parenthesis, to appear in arguments, so we no longer insist on
6564       letters, digits, and underscores. */
6565
6566       if (*ptr == CHAR_COLON)
6567         {
6568         arg = ++ptr;
6569         while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6570         arglen = (int)(ptr - arg);
6571         if ((unsigned int)arglen > MAX_MARK)
6572           {
6573           *errorcodeptr = ERR75;
6574           goto FAILED;
6575           }
6576         }
6577
6578       if (*ptr != CHAR_RIGHT_PARENTHESIS)
6579         {
6580         *errorcodeptr = ERR60;
6581         goto FAILED;
6582         }
6583
6584       /* Scan the table of verb names */
6585
6586       for (i = 0; i < verbcount; i++)
6587         {
6588         if (namelen == verbs[i].len &&
6589             STRNCMP_UC_C8(name, vn, namelen) == 0)
6590           {
6591           int setverb;
6592
6593           /* Check for open captures before ACCEPT and convert it to
6594           ASSERT_ACCEPT if in an assertion. */
6595
6596           if (verbs[i].op == OP_ACCEPT)
6597             {
6598             open_capitem *oc;
6599             if (arglen != 0)
6600               {
6601               *errorcodeptr = ERR59;
6602               goto FAILED;
6603               }
6604             cd->had_accept = TRUE;
6605             for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6606               {
6607               *code++ = OP_CLOSE;
6608               PUT2INC(code, 0, oc->number);
6609               }
6610             setverb = *code++ =
6611               (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6612
6613             /* Do not set firstchar after *ACCEPT */
6614             if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6615             }
6616
6617           /* Handle other cases with/without an argument */
6618
6619           else if (arglen == 0)
6620             {
6621             if (verbs[i].op < 0)   /* Argument is mandatory */
6622               {
6623               *errorcodeptr = ERR66;
6624               goto FAILED;
6625               }
6626             setverb = *code++ = verbs[i].op;
6627             }
6628
6629           else
6630             {
6631             if (verbs[i].op_arg < 0)   /* Argument is forbidden */
6632               {
6633               *errorcodeptr = ERR59;
6634               goto FAILED;
6635               }
6636             setverb = *code++ = verbs[i].op_arg;
6637             if (lengthptr != NULL)    /* In pass 1 just add in the length */
6638               {                       /* to avoid potential workspace */
6639               *lengthptr += arglen;   /* overflow. */
6640               *code++ = 0;
6641               }
6642             else
6643               {
6644               *code++ = arglen;
6645               memcpy(code, arg, IN_UCHARS(arglen));
6646               code += arglen;
6647               }
6648             *code++ = 0;
6649             }
6650
6651           switch (setverb)
6652             {
6653             case OP_THEN:
6654             case OP_THEN_ARG:
6655             cd->external_flags |= PCRE_HASTHEN;
6656             break;
6657
6658             case OP_PRUNE:
6659             case OP_PRUNE_ARG:
6660             case OP_SKIP:
6661             case OP_SKIP_ARG:
6662             cd->had_pruneorskip = TRUE;
6663             break;
6664             }
6665
6666           break;  /* Found verb, exit loop */
6667           }
6668
6669         vn += verbs[i].len + 1;
6670         }
6671
6672       if (i < verbcount) continue;    /* Successfully handled a verb */
6673       *errorcodeptr = ERR60;          /* Verb not recognized */
6674       goto FAILED;
6675       }
6676
6677     /* Initialize for "real" parentheses */
6678
6679     newoptions = options;
6680     skipbytes = 0;
6681     bravalue = OP_CBRA;
6682     item_hwm_offset = cd->hwm - cd->start_workspace;
6683     reset_bracount = FALSE;
6684
6685     /* Deal with the extended parentheses; all are introduced by '?', and the
6686     appearance of any of them means that this is not a capturing group. */
6687
6688     if (*ptr == CHAR_QUESTION_MARK)
6689       {
6690       int i, set, unset, namelen;
6691       int *optset;
6692       const pcre_uchar *name;
6693       pcre_uchar *slot;
6694
6695       switch (*(++ptr))
6696         {
6697         /* ------------------------------------------------------------ */
6698         case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
6699         reset_bracount = TRUE;
6700         cd->dupgroups = TRUE;     /* Record (?| encountered */
6701         /* Fall through */
6702
6703         /* ------------------------------------------------------------ */
6704         case CHAR_COLON:          /* Non-capturing bracket */
6705         bravalue = OP_BRA;
6706         ptr++;
6707         break;
6708
6709
6710         /* ------------------------------------------------------------ */
6711         case CHAR_LEFT_PARENTHESIS:
6712         bravalue = OP_COND;       /* Conditional group */
6713         tempptr = ptr;
6714
6715         /* A condition can be an assertion, a number (referring to a numbered
6716         group's having been set), a name (referring to a named group), or 'R',
6717         referring to recursion. R<digits> and R&name are also permitted for
6718         recursion tests.
6719
6720         There are ways of testing a named group: (?(name)) is used by Python;
6721         Perl 5.10 onwards uses (?(<name>) or (?('name')).
6722
6723         There is one unfortunate ambiguity, caused by history. 'R' can be the
6724         recursive thing or the name 'R' (and similarly for 'R' followed by
6725         digits). We look for a name first; if not found, we try the other case.
6726
6727         For compatibility with auto-callouts, we allow a callout to be
6728         specified before a condition that is an assertion. First, check for the
6729         syntax of a callout; if found, adjust the temporary pointer that is
6730         used to check for an assertion condition. That's all that is needed! */
6731
6732         if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6733           {
6734           for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6735           if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6736             tempptr += i + 1;
6737           }
6738
6739         /* For conditions that are assertions, check the syntax, and then exit
6740         the switch. This will take control down to where bracketed groups,
6741         including assertions, are processed. */
6742
6743         if (tempptr[1] == CHAR_QUESTION_MARK &&
6744               (tempptr[2] == CHAR_EQUALS_SIGN ||
6745                tempptr[2] == CHAR_EXCLAMATION_MARK ||
6746                  (tempptr[2] == CHAR_LESS_THAN_SIGN &&
6747                    (tempptr[3] == CHAR_EQUALS_SIGN ||
6748                     tempptr[3] == CHAR_EXCLAMATION_MARK))))
6749           {
6750           cd->iscondassert = TRUE;
6751           break;
6752           }
6753
6754         /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6755         need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6756
6757         code[1+LINK_SIZE] = OP_CREF;
6758         skipbytes = 1+IMM2_SIZE;
6759         refsign = -1;     /* => not a number */
6760         namelen = -1;     /* => not a name; must set to avoid warning */
6761         name = NULL;      /* Always set to avoid warning */
6762         recno = 0;        /* Always set to avoid warning */
6763
6764         /* Check for a test for recursion in a named group. */
6765
6766         ptr++;
6767         if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
6768           {
6769           terminator = -1;
6770           ptr += 2;
6771           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
6772           }
6773
6774         /* Check for a test for a named group's having been set, using the Perl
6775         syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6776         syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
6777
6778         else if (*ptr == CHAR_LESS_THAN_SIGN)
6779           {
6780           terminator = CHAR_GREATER_THAN_SIGN;
6781           ptr++;
6782           }
6783         else if (*ptr == CHAR_APOSTROPHE)
6784           {
6785           terminator = CHAR_APOSTROPHE;
6786           ptr++;
6787           }
6788         else
6789           {
6790           terminator = CHAR_NULL;
6791           if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
6792             else if (IS_DIGIT(*ptr)) refsign = 0;
6793           }
6794
6795         /* Handle a number */
6796
6797         if (refsign >= 0)
6798           {
6799           while (IS_DIGIT(*ptr))
6800             {
6801             if (recno > INT_MAX / 10 - 1)  /* Integer overflow */
6802               {
6803               while (IS_DIGIT(*ptr)) ptr++;
6804               *errorcodeptr = ERR61;
6805               goto FAILED;
6806               }
6807             recno = recno * 10 + (int)(*ptr - CHAR_0);
6808             ptr++;
6809             }
6810           }
6811
6812         /* Otherwise we expect to read a name; anything else is an error. When
6813         a name is one of a number of duplicates, a different opcode is used and
6814         it needs more memory. Unfortunately we cannot tell whether a name is a
6815         duplicate in the first pass, so we have to allow for more memory. */
6816
6817         else
6818           {
6819           if (IS_DIGIT(*ptr))
6820             {
6821             *errorcodeptr = ERR84;
6822             goto FAILED;
6823             }
6824           if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_word) == 0)
6825             {
6826             *errorcodeptr = ERR28;   /* Assertion expected */
6827             goto FAILED;
6828             }
6829           name = ptr++;
6830           while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6831             {
6832             ptr++;
6833             }
6834           namelen = (int)(ptr - name);
6835           if (lengthptr != NULL) skipbytes += IMM2_SIZE;
6836           }
6837
6838         /* Check the terminator */
6839
6840         if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6841             *ptr++ != CHAR_RIGHT_PARENTHESIS)
6842           {
6843           ptr--;                  /* Error offset */
6844           *errorcodeptr = ERR26;  /* Malformed number or name */
6845           goto FAILED;
6846           }
6847
6848         /* Do no further checking in the pre-compile phase. */
6849
6850         if (lengthptr != NULL) break;
6851
6852         /* In the real compile we do the work of looking for the actual
6853         reference. If refsign is not negative, it means we have a number in
6854         recno. */
6855
6856         if (refsign >= 0)
6857           {
6858           if (recno <= 0)
6859             {
6860             *errorcodeptr = ERR35;
6861             goto FAILED;
6862             }
6863           if (refsign != 0) recno = (refsign == CHAR_MINUS)?
6864             cd->bracount - recno + 1 : recno + cd->bracount;
6865           if (recno <= 0 || recno > cd->final_bracount)
6866             {
6867             *errorcodeptr = ERR15;
6868             goto FAILED;
6869             }
6870           PUT2(code, 2+LINK_SIZE, recno);
6871           if (recno > cd->top_backref) cd->top_backref = recno;
6872           break;
6873           }
6874
6875         /* Otherwise look for the name. */
6876
6877         slot = cd->name_table;
6878         for (i = 0; i < cd->names_found; i++)
6879           {
6880           if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
6881           slot += cd->name_entry_size;
6882           }
6883
6884         /* Found the named subpattern. If the name is duplicated, add one to
6885         the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6886         appropriate data values. Otherwise, just insert the unique subpattern
6887         number. */
6888
6889         if (i < cd->names_found)
6890           {
6891           int offset = i++;
6892           int count = 1;
6893           recno = GET2(slot, 0);   /* Number from first found */
6894           if (recno > cd->top_backref) cd->top_backref = recno;
6895           for (; i < cd->names_found; i++)
6896             {
6897             slot += cd->name_entry_size;
6898             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0 ||
6899               (slot+IMM2_SIZE)[namelen] != 0) break;
6900             count++;
6901             }
6902
6903           if (count > 1)
6904             {
6905             PUT2(code, 2+LINK_SIZE, offset);
6906             PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6907             skipbytes += IMM2_SIZE;
6908             code[1+LINK_SIZE]++;
6909             }
6910           else  /* Not a duplicated name */
6911             {
6912             PUT2(code, 2+LINK_SIZE, recno);
6913             }
6914           }
6915
6916         /* If terminator == CHAR_NULL it means that the name followed directly
6917         after the opening parenthesis [e.g. (?(abc)...] and in this case there
6918         are some further alternatives to try. For the cases where terminator !=
6919         CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
6920         we have now checked all the possibilities, so give an error. */
6921
6922         else if (terminator != CHAR_NULL)
6923           {
6924           *errorcodeptr = ERR15;
6925           goto FAILED;
6926           }
6927
6928         /* Check for (?(R) for recursion. Allow digits after R to specify a
6929         specific group number. */
6930
6931         else if (*name == CHAR_R)
6932           {
6933           recno = 0;
6934           for (i = 1; i < namelen; i++)
6935             {
6936             if (!IS_DIGIT(name[i]))
6937               {
6938               *errorcodeptr = ERR15;
6939               goto FAILED;
6940               }
6941             if (recno > INT_MAX / 10 - 1)   /* Integer overflow */
6942               {
6943               *errorcodeptr = ERR61;
6944               goto FAILED;
6945               }
6946             recno = recno * 10 + name[i] - CHAR_0;
6947             }
6948           if (recno == 0) recno = RREF_ANY;
6949           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
6950           PUT2(code, 2+LINK_SIZE, recno);
6951           }
6952
6953         /* Similarly, check for the (?(DEFINE) "condition", which is always
6954         false. */
6955
6956         else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
6957           {
6958           code[1+LINK_SIZE] = OP_DEF;
6959           skipbytes = 1;
6960           }
6961
6962         /* Reference to an unidentified subpattern. */
6963
6964         else
6965           {
6966           *errorcodeptr = ERR15;
6967           goto FAILED;
6968           }
6969         break;
6970
6971
6972         /* ------------------------------------------------------------ */
6973         case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
6974         bravalue = OP_ASSERT;
6975         cd->assert_depth += 1;
6976         ptr++;
6977         break;
6978
6979         /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6980         thing to do, but Perl allows all assertions to be quantified, and when
6981         they contain capturing parentheses there may be a potential use for
6982         this feature. Not that that applies to a quantified (?!) but we allow
6983         it for uniformity. */
6984
6985         /* ------------------------------------------------------------ */
6986         case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
6987         ptr++;
6988         if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
6989              ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
6990             (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
6991           {
6992           *code++ = OP_FAIL;
6993           previous = NULL;
6994           continue;
6995           }
6996         bravalue = OP_ASSERT_NOT;
6997         cd->assert_depth += 1;
6998         break;
6999
7000
7001         /* ------------------------------------------------------------ */
7002         case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
7003         switch (ptr[1])
7004           {
7005           case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
7006           bravalue = OP_ASSERTBACK;
7007           cd->assert_depth += 1;
7008           ptr += 2;
7009           break;
7010
7011           case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
7012           bravalue = OP_ASSERTBACK_NOT;
7013           cd->assert_depth += 1;
7014           ptr += 2;
7015           break;
7016
7017           default:                /* Could be name define, else bad */
7018           if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
7019             goto DEFINE_NAME;
7020           ptr++;                  /* Correct offset for error */
7021           *errorcodeptr = ERR24;
7022           goto FAILED;
7023           }
7024         break;
7025
7026
7027         /* ------------------------------------------------------------ */
7028         case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
7029         bravalue = OP_ONCE;
7030         ptr++;
7031         break;
7032
7033
7034         /* ------------------------------------------------------------ */
7035         case CHAR_C:                 /* Callout - may be followed by digits; */
7036         previous_callout = code;     /* Save for later completion */
7037         after_manual_callout = 1;    /* Skip one item before completing */
7038         *code++ = OP_CALLOUT;
7039           {
7040           int n = 0;
7041           ptr++;
7042           while(IS_DIGIT(*ptr))
7043             n = n * 10 + *ptr++ - CHAR_0;
7044           if (*ptr != CHAR_RIGHT_PARENTHESIS)
7045             {
7046             *errorcodeptr = ERR39;
7047             goto FAILED;
7048             }
7049           if (n > 255)
7050             {
7051             *errorcodeptr = ERR38;
7052             goto FAILED;
7053             }
7054           *code++ = n;
7055           PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
7056           PUT(code, LINK_SIZE, 0);                          /* Default length */
7057           code += 2 * LINK_SIZE;
7058           }
7059         previous = NULL;
7060         continue;
7061
7062
7063         /* ------------------------------------------------------------ */
7064         case CHAR_P:              /* Python-style named subpattern handling */
7065         if (*(++ptr) == CHAR_EQUALS_SIGN ||
7066             *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
7067           {
7068           is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
7069           terminator = CHAR_RIGHT_PARENTHESIS;
7070           goto NAMED_REF_OR_RECURSE;
7071           }
7072         else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
7073           {
7074           *errorcodeptr = ERR41;
7075           goto FAILED;
7076           }
7077         /* Fall through to handle (?P< as (?< is handled */
7078
7079
7080         /* ------------------------------------------------------------ */
7081         DEFINE_NAME:    /* Come here from (?< handling */
7082         case CHAR_APOSTROPHE:
7083         terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
7084           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7085         name = ++ptr;
7086         if (IS_DIGIT(*ptr))
7087           {
7088           *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7089           goto FAILED;
7090           }
7091         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7092         namelen = (int)(ptr - name);
7093
7094         /* In the pre-compile phase, do a syntax check, remember the longest
7095         name, and then remember the group in a vector, expanding it if
7096         necessary. Duplicates for the same number are skipped; other duplicates
7097         are checked for validity. In the actual compile, there is nothing to
7098         do. */
7099
7100         if (lengthptr != NULL)
7101           {
7102           named_group *ng;
7103           pcre_uint32 number = cd->bracount + 1;
7104
7105           if (*ptr != (pcre_uchar)terminator)
7106             {
7107             *errorcodeptr = ERR42;
7108             goto FAILED;
7109             }
7110
7111           if (cd->names_found >= MAX_NAME_COUNT)
7112             {
7113             *errorcodeptr = ERR49;
7114             goto FAILED;
7115             }
7116
7117           if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
7118             {
7119             cd->name_entry_size = namelen + IMM2_SIZE + 1;
7120             if (namelen > MAX_NAME_SIZE)
7121               {
7122               *errorcodeptr = ERR48;
7123               goto FAILED;
7124               }
7125             }
7126
7127           /* Scan the list to check for duplicates. For duplicate names, if the
7128           number is the same, break the loop, which causes the name to be
7129           discarded; otherwise, if DUPNAMES is not set, give an error.
7130           If it is set, allow the name with a different number, but continue
7131           scanning in case this is a duplicate with the same number. For
7132           non-duplicate names, give an error if the number is duplicated. */
7133
7134           ng = cd->named_groups;
7135           for (i = 0; i < cd->names_found; i++, ng++)
7136             {
7137             if (namelen == ng->length &&
7138                 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7139               {
7140               if (ng->number == number) break;
7141               if ((options & PCRE_DUPNAMES) == 0)
7142                 {
7143                 *errorcodeptr = ERR43;
7144                 goto FAILED;
7145                 }
7146               cd->dupnames = TRUE;  /* Duplicate names exist */
7147               }
7148             else if (ng->number == number)
7149               {
7150               *errorcodeptr = ERR65;
7151               goto FAILED;
7152               }
7153             }
7154
7155           if (i >= cd->names_found)     /* Not a duplicate with same number */
7156             {
7157             /* Increase the list size if necessary */
7158
7159             if (cd->names_found >= cd->named_group_list_size)
7160               {
7161               int newsize = cd->named_group_list_size * 2;
7162               named_group *newspace = (PUBL(malloc))
7163                 (newsize * sizeof(named_group));
7164
7165               if (newspace == NULL)
7166                 {
7167                 *errorcodeptr = ERR21;
7168                 goto FAILED;
7169                 }
7170
7171               memcpy(newspace, cd->named_groups,
7172                 cd->named_group_list_size * sizeof(named_group));
7173               if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
7174                 (PUBL(free))((void *)cd->named_groups);
7175               cd->named_groups = newspace;
7176               cd->named_group_list_size = newsize;
7177               }
7178
7179             cd->named_groups[cd->names_found].name = name;
7180             cd->named_groups[cd->names_found].length = namelen;
7181             cd->named_groups[cd->names_found].number = number;
7182             cd->names_found++;
7183             }
7184           }
7185
7186         ptr++;                    /* Move past > or ' in both passes. */
7187         goto NUMBERED_GROUP;
7188
7189
7190         /* ------------------------------------------------------------ */
7191         case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
7192         terminator = CHAR_RIGHT_PARENTHESIS;
7193         is_recurse = TRUE;
7194         /* Fall through */
7195
7196         /* We come here from the Python syntax above that handles both
7197         references (?P=name) and recursion (?P>name), as well as falling
7198         through from the Perl recursion syntax (?&name). We also come here from
7199         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
7200         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
7201
7202         NAMED_REF_OR_RECURSE:
7203         name = ++ptr;
7204         if (IS_DIGIT(*ptr))
7205           {
7206           *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7207           goto FAILED;
7208           }
7209         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7210         namelen = (int)(ptr - name);
7211
7212         /* In the pre-compile phase, do a syntax check. We used to just set
7213         a dummy reference number, because it was not used in the first pass.
7214         However, with the change of recursive back references to be atomic,
7215         we have to look for the number so that this state can be identified, as
7216         otherwise the incorrect length is computed. If it's not a backwards
7217         reference, the dummy number will do. */
7218
7219         if (lengthptr != NULL)
7220           {
7221           named_group *ng;
7222           recno = 0;
7223
7224           if (namelen == 0)
7225             {
7226             *errorcodeptr = ERR62;
7227             goto FAILED;
7228             }
7229           if (*ptr != (pcre_uchar)terminator)
7230             {
7231             *errorcodeptr = ERR42;
7232             goto FAILED;
7233             }
7234           if (namelen > MAX_NAME_SIZE)
7235             {
7236             *errorcodeptr = ERR48;
7237             goto FAILED;
7238             }
7239
7240           /* Count named back references. */
7241
7242           if (!is_recurse) cd->namedrefcount++;
7243
7244           /* We have to allow for a named reference to a duplicated name (this
7245           cannot be determined until the second pass). This needs an extra
7246           16-bit data item. */
7247
7248           *lengthptr += IMM2_SIZE;
7249
7250           /* If this is a forward reference and we are within a (?|...) group,
7251           the reference may end up as the number of a group which we are
7252           currently inside, that is, it could be a recursive reference. In the
7253           real compile this will be picked up and the reference wrapped with
7254           OP_ONCE to make it atomic, so we must allow space in case this occurs. */
7255
7256           /* In fact, this can happen for a non-forward reference because
7257           another group with the same number might be created later. This
7258           issue is fixed "properly" in PCRE2. As PCRE1 is now in maintenance
7259           only mode, we finesse the bug by allowing more memory always. */
7260
7261           *lengthptr += 2 + 2*LINK_SIZE;
7262
7263           /* It is even worse than that. The current reference may be to an
7264           existing named group with a different number (so apparently not
7265           recursive) but which later on is also attached to a group with the
7266           current number. This can only happen if (?| has been previously
7267           encountered. In that case, we allow yet more memory, just in case.
7268           (Again, this is fixed "properly" in PCRE2.) */
7269
7270           if (cd->dupgroups) *lengthptr += 4 + 4*LINK_SIZE;
7271
7272           /* Otherwise, check for recursion here. The name table does not exist
7273           in the first pass; instead we must scan the list of names encountered
7274           so far in order to get the number. If the name is not found, leave
7275           the value of recno as 0 for a forward reference. */
7276
7277           else
7278             {
7279             ng = cd->named_groups;
7280             for (i = 0; i < cd->names_found; i++, ng++)
7281               {
7282               if (namelen == ng->length &&
7283                   STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7284                 {
7285                 open_capitem *oc;
7286                 recno = ng->number;
7287                 if (is_recurse) break;
7288                 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7289                   {
7290                   if (oc->number == recno)
7291                     {
7292                     oc->flag = TRUE;
7293                     break;
7294                     }
7295                   }
7296                 }
7297               }
7298             }
7299           }
7300
7301         /* In the real compile, search the name table. We check the name
7302         first, and then check that we have reached the end of the name in the
7303         table. That way, if the name is longer than any in the table, the
7304         comparison will fail without reading beyond the table entry. */
7305
7306         else
7307           {
7308           slot = cd->name_table;
7309           for (i = 0; i < cd->names_found; i++)
7310             {
7311             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
7312                 slot[IMM2_SIZE+namelen] == 0)
7313               break;
7314             slot += cd->name_entry_size;
7315             }
7316
7317           if (i < cd->names_found)
7318             {
7319             recno = GET2(slot, 0);
7320             }
7321           else
7322             {
7323             *errorcodeptr = ERR15;
7324             goto FAILED;
7325             }
7326           }
7327
7328         /* In both phases, for recursions, we can now go to the code that
7329         handles numerical recursion. */
7330
7331         if (is_recurse) goto HANDLE_RECURSION;
7332
7333         /* In the second pass we must see if the name is duplicated. If so, we
7334         generate a different opcode. */
7335
7336         if (lengthptr == NULL && cd->dupnames)
7337           {
7338           int count = 1;
7339           unsigned int index = i;
7340           pcre_uchar *cslot = slot + cd->name_entry_size;
7341
7342           for (i++; i < cd->names_found; i++)
7343             {
7344             if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
7345             count++;
7346             cslot += cd->name_entry_size;
7347             }
7348
7349           if (count > 1)
7350             {
7351             if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7352             previous = code;
7353             item_hwm_offset = cd->hwm - cd->start_workspace;
7354             *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7355             PUT2INC(code, 0, index);
7356             PUT2INC(code, 0, count);
7357
7358             /* Process each potentially referenced group. */
7359
7360             for (; slot < cslot; slot += cd->name_entry_size)
7361               {
7362               open_capitem *oc;
7363               recno = GET2(slot, 0);
7364               cd->backref_map |= (recno < 32)? (1 << recno) : 1;
7365               if (recno > cd->top_backref) cd->top_backref = recno;
7366
7367               /* Check to see if this back reference is recursive, that is, it
7368               is inside the group that it references. A flag is set so that the
7369               group can be made atomic. */
7370
7371               for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7372                 {
7373                 if (oc->number == recno)
7374                   {
7375                   oc->flag = TRUE;
7376                   break;
7377                   }
7378                 }
7379               }
7380
7381             continue;  /* End of back ref handling */
7382             }
7383           }
7384
7385         /* First pass, or a non-duplicated name. */
7386
7387         goto HANDLE_REFERENCE;
7388
7389
7390         /* ------------------------------------------------------------ */
7391         case CHAR_R:              /* Recursion, same as (?0) */
7392         recno = 0;
7393         if (*(++ptr) != CHAR_RIGHT_PARENTHESIS)
7394           {
7395           *errorcodeptr = ERR29;
7396           goto FAILED;
7397           }
7398         goto HANDLE_RECURSION;
7399
7400
7401         /* ------------------------------------------------------------ */
7402         case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
7403         case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
7404         case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
7405           {
7406           const pcre_uchar *called;
7407           terminator = CHAR_RIGHT_PARENTHESIS;
7408
7409           /* Come here from the \g<...> and \g'...' code (Oniguruma
7410           compatibility). However, the syntax has been checked to ensure that
7411           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
7412           be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
7413           ever be taken. */
7414
7415           HANDLE_NUMERICAL_RECURSION:
7416
7417           if ((refsign = *ptr) == CHAR_PLUS)
7418             {
7419             ptr++;
7420             if (!IS_DIGIT(*ptr))
7421               {
7422               *errorcodeptr = ERR63;
7423               goto FAILED;
7424               }
7425             }
7426           else if (refsign == CHAR_MINUS)
7427             {
7428             if (!IS_DIGIT(ptr[1]))
7429               goto OTHER_CHAR_AFTER_QUERY;
7430             ptr++;
7431             }
7432
7433           recno = 0;
7434           while(IS_DIGIT(*ptr))
7435             {
7436             if (recno > INT_MAX / 10 - 1) /* Integer overflow */
7437               {
7438               while (IS_DIGIT(*ptr)) ptr++;
7439               *errorcodeptr = ERR61;
7440               goto FAILED;
7441               }
7442             recno = recno * 10 + *ptr++ - CHAR_0;
7443             }
7444
7445           if (*ptr != (pcre_uchar)terminator)
7446             {
7447             *errorcodeptr = ERR29;
7448             goto FAILED;
7449             }
7450
7451           if (refsign == CHAR_MINUS)
7452             {
7453             if (recno == 0)
7454               {
7455               *errorcodeptr = ERR58;
7456               goto FAILED;
7457               }
7458             recno = cd->bracount - recno + 1;
7459             if (recno <= 0)
7460               {
7461               *errorcodeptr = ERR15;
7462               goto FAILED;
7463               }
7464             }
7465           else if (refsign == CHAR_PLUS)
7466             {
7467             if (recno == 0)
7468               {
7469               *errorcodeptr = ERR58;
7470               goto FAILED;
7471               }
7472             recno += cd->bracount;
7473             }
7474
7475           /* Come here from code above that handles a named recursion */
7476
7477           HANDLE_RECURSION:
7478
7479           previous = code;
7480           item_hwm_offset = cd->hwm - cd->start_workspace;
7481           called = cd->start_code;
7482
7483           /* When we are actually compiling, find the bracket that is being
7484           referenced. Temporarily end the regex in case it doesn't exist before
7485           this point. If we end up with a forward reference, first check that
7486           the bracket does occur later so we can give the error (and position)
7487           now. Then remember this forward reference in the workspace so it can
7488           be filled in at the end. */
7489
7490           if (lengthptr == NULL)
7491             {
7492             *code = OP_END;
7493             if (recno != 0)
7494               called = PRIV(find_bracket)(cd->start_code, utf, recno);
7495
7496             /* Forward reference */
7497
7498             if (called == NULL)
7499               {
7500               if (recno > cd->final_bracount)
7501                 {
7502                 *errorcodeptr = ERR15;
7503                 goto FAILED;
7504                 }
7505
7506               /* Fudge the value of "called" so that when it is inserted as an
7507               offset below, what is actually inserted is the reference number
7508               of the group. Then remember the forward reference. */
7509
7510               called = cd->start_code + recno;
7511               if (cd->hwm >= cd->start_workspace + cd->workspace_size -
7512                   WORK_SIZE_SAFETY_MARGIN)
7513                 {
7514                 *errorcodeptr = expand_workspace(cd);
7515                 if (*errorcodeptr != 0) goto FAILED;
7516                 }
7517               PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
7518               }
7519
7520             /* If not a forward reference, and the subpattern is still open,
7521             this is a recursive call. We check to see if this is a left
7522             recursion that could loop for ever, and diagnose that case. We
7523             must not, however, do this check if we are in a conditional
7524             subpattern because the condition might be testing for recursion in
7525             a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
7526             Forever loops are also detected at runtime, so those that occur in
7527             conditional subpatterns will be picked up then. */
7528
7529             else if (GET(called, 1) == 0 && cond_depth <= 0 &&
7530                      could_be_empty(called, code, bcptr, utf, cd))
7531               {
7532               *errorcodeptr = ERR40;
7533               goto FAILED;
7534               }
7535             }
7536
7537           /* Insert the recursion/subroutine item. It does not have a set first
7538           character (relevant if it is repeated, because it will then be
7539           wrapped with ONCE brackets). */
7540
7541           *code = OP_RECURSE;
7542           PUT(code, 1, (int)(called - cd->start_code));
7543           code += 1 + LINK_SIZE;
7544           groupsetfirstchar = FALSE;
7545           }
7546
7547         /* Can't determine a first byte now */
7548
7549         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7550         continue;
7551
7552
7553         /* ------------------------------------------------------------ */
7554         default:              /* Other characters: check option setting */
7555         OTHER_CHAR_AFTER_QUERY:
7556         set = unset = 0;
7557         optset = &set;
7558
7559         while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
7560           {
7561           switch (*ptr++)
7562             {
7563             case CHAR_MINUS: optset = &unset; break;
7564
7565             case CHAR_J:    /* Record that it changed in the external options */
7566             *optset |= PCRE_DUPNAMES;
7567             cd->external_flags |= PCRE_JCHANGED;
7568             break;
7569
7570             case CHAR_i: *optset |= PCRE_CASELESS; break;
7571             case CHAR_m: *optset |= PCRE_MULTILINE; break;
7572             case CHAR_s: *optset |= PCRE_DOTALL; break;
7573             case CHAR_x: *optset |= PCRE_EXTENDED; break;
7574             case CHAR_U: *optset |= PCRE_UNGREEDY; break;
7575             case CHAR_X: *optset |= PCRE_EXTRA; break;
7576
7577             default:  *errorcodeptr = ERR12;
7578                       ptr--;    /* Correct the offset */
7579                       goto FAILED;
7580             }
7581           }
7582
7583         /* Set up the changed option bits, but don't change anything yet. */
7584
7585         newoptions = (options | set) & (~unset);
7586
7587         /* If the options ended with ')' this is not the start of a nested
7588         group with option changes, so the options change at this level. If this
7589         item is right at the start of the pattern, the options can be
7590         abstracted and made external in the pre-compile phase, and ignored in
7591         the compile phase. This can be helpful when matching -- for instance in
7592         caseless checking of required bytes.
7593
7594         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
7595         definitely *not* at the start of the pattern because something has been
7596         compiled. In the pre-compile phase, however, the code pointer can have
7597         that value after the start, because it gets reset as code is discarded
7598         during the pre-compile. However, this can happen only at top level - if
7599         we are within parentheses, the starting BRA will still be present. At
7600         any parenthesis level, the length value can be used to test if anything
7601         has been compiled at that level. Thus, a test for both these conditions
7602         is necessary to ensure we correctly detect the start of the pattern in
7603         both phases.
7604
7605         If we are not at the pattern start, reset the greedy defaults and the
7606         case value for firstchar and reqchar. */
7607
7608         if (*ptr == CHAR_RIGHT_PARENTHESIS)
7609           {
7610           if (code == cd->start_code + 1 + LINK_SIZE &&
7611                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
7612             {
7613             cd->external_options = newoptions;
7614             }
7615           else
7616             {
7617             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
7618             greedy_non_default = greedy_default ^ 1;
7619             req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
7620             }
7621
7622           /* Change options at this level, and pass them back for use
7623           in subsequent branches. */
7624
7625           *optionsptr = options = newoptions;
7626           previous = NULL;       /* This item can't be repeated */
7627           continue;              /* It is complete */
7628           }
7629
7630         /* If the options ended with ':' we are heading into a nested group
7631         with possible change of options. Such groups are non-capturing and are
7632         not assertions of any kind. All we need to do is skip over the ':';
7633         the newoptions value is handled below. */
7634
7635         bravalue = OP_BRA;
7636         ptr++;
7637         }     /* End of switch for character following (? */
7638       }       /* End of (? handling */
7639
7640     /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
7641     is set, all unadorned brackets become non-capturing and behave like (?:...)
7642     brackets. */
7643
7644     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
7645       {
7646       bravalue = OP_BRA;
7647       }
7648
7649     /* Else we have a capturing group. */
7650
7651     else
7652       {
7653       NUMBERED_GROUP:
7654       cd->bracount += 1;
7655       PUT2(code, 1+LINK_SIZE, cd->bracount);
7656       skipbytes = IMM2_SIZE;
7657       }
7658
7659     /* Process nested bracketed regex. First check for parentheses nested too
7660     deeply. */
7661
7662     if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT)
7663       {
7664       *errorcodeptr = ERR82;
7665       goto FAILED;
7666       }
7667
7668     /* All assertions used not to be repeatable, but this was changed for Perl
7669     compatibility. All kinds can now be repeated except for assertions that are
7670     conditions (Perl also forbids these to be repeated). We copy code into a
7671     non-register variable (tempcode) in order to be able to pass its address
7672     because some compilers complain otherwise. At the start of a conditional
7673     group whose condition is an assertion, cd->iscondassert is set. We unset it
7674     here so as to allow assertions later in the group to be quantified. */
7675
7676     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT &&
7677         cd->iscondassert)
7678       {
7679       previous = NULL;
7680       cd->iscondassert = FALSE;
7681       }
7682     else
7683       {
7684       previous = code;
7685       item_hwm_offset = cd->hwm - cd->start_workspace;
7686       }
7687
7688     *code = bravalue;
7689     tempcode = code;
7690     tempreqvary = cd->req_varyopt;        /* Save value before bracket */
7691     tempbracount = cd->bracount;          /* Save value before bracket */
7692     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
7693
7694     if (!compile_regex(
7695          newoptions,                      /* The complete new option state */
7696          &tempcode,                       /* Where to put code (updated) */
7697          &ptr,                            /* Input pointer (updated) */
7698          errorcodeptr,                    /* Where to put an error message */
7699          (bravalue == OP_ASSERTBACK ||
7700           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
7701          reset_bracount,                  /* True if (?| group */
7702          skipbytes,                       /* Skip over bracket number */
7703          cond_depth +
7704            ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
7705          &subfirstchar,                   /* For possible first char */
7706          &subfirstcharflags,
7707          &subreqchar,                     /* For possible last char */
7708          &subreqcharflags,
7709          bcptr,                           /* Current branch chain */
7710          cd,                              /* Tables block */
7711          (lengthptr == NULL)? NULL :      /* Actual compile phase */
7712            &length_prevgroup              /* Pre-compile phase */
7713          ))
7714       goto FAILED;
7715
7716     cd->parens_depth -= 1;
7717
7718     /* If this was an atomic group and there are no capturing groups within it,
7719     generate OP_ONCE_NC instead of OP_ONCE. */
7720
7721     if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
7722       *code = OP_ONCE_NC;
7723
7724     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
7725       cd->assert_depth -= 1;
7726
7727     /* At the end of compiling, code is still pointing to the start of the
7728     group, while tempcode has been updated to point past the end of the group.
7729     The pattern pointer (ptr) is on the bracket.
7730
7731     If this is a conditional bracket, check that there are no more than
7732     two branches in the group, or just one if it's a DEFINE group. We do this
7733     in the real compile phase, not in the pre-pass, where the whole group may
7734     not be available. */
7735
7736     if (bravalue == OP_COND && lengthptr == NULL)
7737       {
7738       pcre_uchar *tc = code;
7739       int condcount = 0;
7740
7741       do {
7742          condcount++;
7743          tc += GET(tc,1);
7744          }
7745       while (*tc != OP_KET);
7746
7747       /* A DEFINE group is never obeyed inline (the "condition" is always
7748       false). It must have only one branch. */
7749
7750       if (code[LINK_SIZE+1] == OP_DEF)
7751         {
7752         if (condcount > 1)
7753           {
7754           *errorcodeptr = ERR54;
7755           goto FAILED;
7756           }
7757         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
7758         }
7759
7760       /* A "normal" conditional group. If there is just one branch, we must not
7761       make use of its firstchar or reqchar, because this is equivalent to an
7762       empty second branch. */
7763
7764       else
7765         {
7766         if (condcount > 2)
7767           {
7768           *errorcodeptr = ERR27;
7769           goto FAILED;
7770           }
7771         if (condcount == 1) subfirstcharflags = subreqcharflags = REQ_NONE;
7772         }
7773       }
7774
7775     /* Error if hit end of pattern */
7776
7777     if (*ptr != CHAR_RIGHT_PARENTHESIS)
7778       {
7779       *errorcodeptr = ERR14;
7780       goto FAILED;
7781       }
7782
7783     /* In the pre-compile phase, update the length by the length of the group,
7784     less the brackets at either end. Then reduce the compiled code to just a
7785     set of non-capturing brackets so that it doesn't use much memory if it is
7786     duplicated by a quantifier.*/
7787
7788     if (lengthptr != NULL)
7789       {
7790       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
7791         {
7792         *errorcodeptr = ERR20;
7793         goto FAILED;
7794         }
7795       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
7796       code++;   /* This already contains bravalue */
7797       PUTINC(code, 0, 1 + LINK_SIZE);
7798       *code++ = OP_KET;
7799       PUTINC(code, 0, 1 + LINK_SIZE);
7800       break;    /* No need to waste time with special character handling */
7801       }
7802
7803     /* Otherwise update the main code pointer to the end of the group. */
7804
7805     code = tempcode;
7806
7807     /* For a DEFINE group, required and first character settings are not
7808     relevant. */
7809
7810     if (bravalue == OP_DEF) break;
7811
7812     /* Handle updating of the required and first characters for other types of
7813     group. Update for normal brackets of all kinds, and conditions with two
7814     branches (see code above). If the bracket is followed by a quantifier with
7815     zero repeat, we have to back off. Hence the definition of zeroreqchar and
7816     zerofirstchar outside the main loop so that they can be accessed for the
7817     back off. */
7818
7819     zeroreqchar = reqchar;
7820     zeroreqcharflags = reqcharflags;
7821     zerofirstchar = firstchar;
7822     zerofirstcharflags = firstcharflags;
7823     groupsetfirstchar = FALSE;
7824
7825     if (bravalue >= OP_ONCE)
7826       {
7827       /* If we have not yet set a firstchar in this branch, take it from the
7828       subpattern, remembering that it was set here so that a repeat of more
7829       than one can replicate it as reqchar if necessary. If the subpattern has
7830       no firstchar, set "none" for the whole branch. In both cases, a zero
7831       repeat forces firstchar to "none". */
7832
7833       if (firstcharflags == REQ_UNSET)
7834         {
7835         if (subfirstcharflags >= 0)
7836           {
7837           firstchar = subfirstchar;
7838           firstcharflags = subfirstcharflags;
7839           groupsetfirstchar = TRUE;
7840           }
7841         else firstcharflags = REQ_NONE;
7842         zerofirstcharflags = REQ_NONE;
7843         }
7844
7845       /* If firstchar was previously set, convert the subpattern's firstchar
7846       into reqchar if there wasn't one, using the vary flag that was in
7847       existence beforehand. */
7848
7849       else if (subfirstcharflags >= 0 && subreqcharflags < 0)
7850         {
7851         subreqchar = subfirstchar;
7852         subreqcharflags = subfirstcharflags | tempreqvary;
7853         }
7854
7855       /* If the subpattern set a required byte (or set a first byte that isn't
7856       really the first byte - see above), set it. */
7857
7858       if (subreqcharflags >= 0)
7859         {
7860         reqchar = subreqchar;
7861         reqcharflags = subreqcharflags;
7862         }
7863       }
7864
7865     /* For a forward assertion, we take the reqchar, if set. This can be
7866     helpful if the pattern that follows the assertion doesn't set a different
7867     char. For example, it's useful for /(?=abcde).+/. We can't set firstchar
7868     for an assertion, however because it leads to incorrect effect for patterns
7869     such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead
7870     of a firstchar. This is overcome by a scan at the end if there's no
7871     firstchar, looking for an asserted first char. */
7872
7873     else if (bravalue == OP_ASSERT && subreqcharflags >= 0)
7874       {
7875       reqchar = subreqchar;
7876       reqcharflags = subreqcharflags;
7877       }
7878     break;     /* End of processing '(' */
7879
7880
7881     /* ===================================================================*/
7882     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
7883     are used directly as the corresponding OP_ values in the default case
7884     when PCRE_UCP is not set. For back references, the value is the negative
7885     of the reference number. Only back references and those types that
7886     consume a character may be repeated. We can test for values between
7887     ESC_b and ESC_Z for the latter; this may have to change if any new ones are
7888     ever created. */
7889
7890     case CHAR_BACKSLASH:
7891     tempptr = ptr;
7892     escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, FALSE);
7893     if (*errorcodeptr != 0) goto FAILED;
7894
7895     if (escape == 0)                  /* The escape coded a single character */
7896       c = ec;
7897     else
7898       {
7899       if (escape == ESC_Q)            /* Handle start of quoted string */
7900         {
7901         if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
7902           ptr += 2;               /* avoid empty string */
7903             else inescq = TRUE;
7904         continue;
7905         }
7906
7907       if (escape == ESC_E) continue;  /* Perl ignores an orphan \E */
7908
7909       /* For metasequences that actually match a character, we disable the
7910       setting of a first character if it hasn't already been set. */
7911
7912       if (firstcharflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
7913         firstcharflags = REQ_NONE;
7914
7915       /* Set values to reset to if this is followed by a zero repeat. */
7916
7917       zerofirstchar = firstchar;
7918       zerofirstcharflags = firstcharflags;
7919       zeroreqchar = reqchar;
7920       zeroreqcharflags = reqcharflags;
7921
7922       /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
7923       is a subroutine call by number (Oniguruma syntax). In fact, the value
7924       ESC_g is returned only for these cases. So we don't need to check for <
7925       or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
7926       -n, and for the Perl syntax \g{name} the result is ESC_k (as
7927       that is a synonym for a named back reference). */
7928
7929       if (escape == ESC_g)
7930         {
7931         const pcre_uchar *p;
7932         pcre_uint32 cf;
7933
7934         item_hwm_offset = cd->hwm - cd->start_workspace;   /* Normally this is set when '(' is read */
7935         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7936           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7937
7938         /* These two statements stop the compiler from warning about possibly
7939         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
7940         fact, because we do the check for a number below, the paths that
7941         would actually be in error are never taken. */
7942
7943         skipbytes = 0;
7944         reset_bracount = FALSE;
7945
7946         /* If it's not a signed or unsigned number, treat it as a name. */
7947
7948         cf = ptr[1];
7949         if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
7950           {
7951           is_recurse = TRUE;
7952           goto NAMED_REF_OR_RECURSE;
7953           }
7954
7955         /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
7956         or a digit. */
7957
7958         p = ptr + 2;
7959         while (IS_DIGIT(*p)) p++;
7960         if (*p != (pcre_uchar)terminator)
7961           {
7962           *errorcodeptr = ERR57;
7963           goto FAILED;
7964           }
7965         ptr++;
7966         goto HANDLE_NUMERICAL_RECURSION;
7967         }
7968
7969       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
7970       We also support \k{name} (.NET syntax).  */
7971
7972       if (escape == ESC_k)
7973         {
7974         if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
7975           ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
7976           {
7977           *errorcodeptr = ERR69;
7978           goto FAILED;
7979           }
7980         is_recurse = FALSE;
7981         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7982           CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
7983           CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
7984         goto NAMED_REF_OR_RECURSE;
7985         }
7986
7987       /* Back references are handled specially; must disable firstchar if
7988       not set to cope with cases like (?=(\w+))\1: which would otherwise set
7989       ':' later. */
7990
7991       if (escape < 0)
7992         {
7993         open_capitem *oc;
7994         recno = -escape;
7995
7996         /* Come here from named backref handling when the reference is to a
7997         single group (i.e. not to a duplicated name). */
7998
7999         HANDLE_REFERENCE:
8000         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
8001         previous = code;
8002         item_hwm_offset = cd->hwm - cd->start_workspace;
8003         *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
8004         PUT2INC(code, 0, recno);
8005         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
8006         if (recno > cd->top_backref) cd->top_backref = recno;
8007
8008         /* Check to see if this back reference is recursive, that is, it
8009         is inside the group that it references. A flag is set so that the
8010         group can be made atomic. */
8011
8012         for (oc = cd->open_caps; oc != NULL; oc = oc->next)
8013           {
8014           if (oc->number == recno)
8015             {
8016             oc->flag = TRUE;
8017             break;
8018             }
8019           }
8020         }
8021
8022       /* So are Unicode property matches, if supported. */
8023
8024 #ifdef SUPPORT_UCP
8025       else if (escape == ESC_P || escape == ESC_p)
8026         {
8027         BOOL negated;
8028         unsigned int ptype = 0, pdata = 0;
8029         if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
8030           goto FAILED;
8031         previous = code;
8032         item_hwm_offset = cd->hwm - cd->start_workspace;
8033         *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
8034         *code++ = ptype;
8035         *code++ = pdata;
8036         }
8037 #else
8038
8039       /* If Unicode properties are not supported, \X, \P, and \p are not
8040       allowed. */
8041
8042       else if (escape == ESC_X || escape == ESC_P || escape == ESC_p)
8043         {
8044         *errorcodeptr = ERR45;
8045         goto FAILED;
8046         }
8047 #endif
8048
8049       /* For the rest (including \X when Unicode properties are supported), the
8050       escape value is used directly as the OP value in the default situation
8051       when PCRE_UCP is not set. When it *is* set, we substitute Unicode
8052       property tests. Note that \b and \B do a one-character lookbehind, and
8053       \A also behaves as if it does. */
8054
8055       else
8056         {
8057         if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) &&
8058              cd->max_lookbehind == 0)
8059           cd->max_lookbehind = 1;
8060 #ifdef SUPPORT_UCP
8061         if (escape >= ESC_DU && escape <= ESC_wu)
8062           {
8063           nestptr = ptr + 1;                   /* Where to resume */
8064           ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
8065           }
8066         else
8067 #endif
8068         /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
8069         so that it works in DFA mode and in lookbehinds. */
8070
8071           {
8072           previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
8073           item_hwm_offset = cd->hwm - cd->start_workspace;
8074           *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
8075           }
8076         }
8077       continue;
8078       }
8079
8080     /* We have a data character whose value is in c. In UTF-8 mode it may have
8081     a value > 127. We set its representation in the length/buffer, and then
8082     handle it as a data character. */
8083
8084 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
8085     if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
8086       mclength = PRIV(ord2utf)(c, mcbuffer);
8087     else
8088 #endif
8089
8090      {
8091      mcbuffer[0] = c;
8092      mclength = 1;
8093      }
8094     goto ONE_CHAR;
8095
8096
8097     /* ===================================================================*/
8098     /* Handle a literal character. It is guaranteed not to be whitespace or #
8099     when the extended flag is set. If we are in a UTF mode, it may be a
8100     multi-unit literal character. */
8101
8102     default:
8103     NORMAL_CHAR:
8104     mclength = 1;
8105     mcbuffer[0] = c;
8106
8107 #ifdef SUPPORT_UTF
8108     if (utf && HAS_EXTRALEN(c))
8109       ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
8110 #endif
8111
8112     /* At this point we have the character's bytes in mcbuffer, and the length
8113     in mclength. When not in UTF-8 mode, the length is always 1. */
8114
8115     ONE_CHAR:
8116     previous = code;
8117     item_hwm_offset = cd->hwm - cd->start_workspace;
8118
8119     /* For caseless UTF-8 mode when UCP support is available, check whether
8120     this character has more than one other case. If so, generate a special
8121     OP_PROP item instead of OP_CHARI. */
8122
8123 #ifdef SUPPORT_UCP
8124     if (utf && (options & PCRE_CASELESS) != 0)
8125       {
8126       GETCHAR(c, mcbuffer);
8127       if ((c = UCD_CASESET(c)) != 0)
8128         {
8129         *code++ = OP_PROP;
8130         *code++ = PT_CLIST;
8131         *code++ = c;
8132         if (firstcharflags == REQ_UNSET)
8133           firstcharflags = zerofirstcharflags = REQ_NONE;
8134         break;
8135         }
8136       }
8137 #endif
8138
8139     /* Caseful matches, or not one of the multicase characters. */
8140
8141     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8142     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
8143
8144     /* Remember if \r or \n were seen */
8145
8146     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8147       cd->external_flags |= PCRE_HASCRORLF;
8148
8149     /* Set the first and required bytes appropriately. If no previous first
8150     byte, set it from this character, but revert to none on a zero repeat.
8151     Otherwise, leave the firstchar value alone, and don't change it on a zero
8152     repeat. */
8153
8154     if (firstcharflags == REQ_UNSET)
8155       {
8156       zerofirstcharflags = REQ_NONE;
8157       zeroreqchar = reqchar;
8158       zeroreqcharflags = reqcharflags;
8159
8160       /* If the character is more than one byte long, we can set firstchar
8161       only if it is not to be matched caselessly. */
8162
8163       if (mclength == 1 || req_caseopt == 0)
8164         {
8165         firstchar = mcbuffer[0] | req_caseopt;
8166         firstchar = mcbuffer[0];
8167         firstcharflags = req_caseopt;
8168
8169         if (mclength != 1)
8170           {
8171           reqchar = code[-1];
8172           reqcharflags = cd->req_varyopt;
8173           }
8174         }
8175       else firstcharflags = reqcharflags = REQ_NONE;
8176       }
8177
8178     /* firstchar was previously set; we can set reqchar only if the length is
8179     1 or the matching is caseful. */
8180
8181     else
8182       {
8183       zerofirstchar = firstchar;
8184       zerofirstcharflags = firstcharflags;
8185       zeroreqchar = reqchar;
8186       zeroreqcharflags = reqcharflags;
8187       if (mclength == 1 || req_caseopt == 0)
8188         {
8189         reqchar = code[-1];
8190         reqcharflags = req_caseopt | cd->req_varyopt;
8191         }
8192       }
8193
8194     break;            /* End of literal character handling */
8195     }
8196   }                   /* end of big loop */
8197
8198
8199 /* Control never reaches here by falling through, only by a goto for all the
8200 error states. Pass back the position in the pattern so that it can be displayed
8201 to the user for diagnosing the error. */
8202
8203 FAILED:
8204 *ptrptr = ptr;
8205 return FALSE;
8206 }
8207
8208
8209
8210 /*************************************************
8211 *     Compile sequence of alternatives           *
8212 *************************************************/
8213
8214 /* On entry, ptr is pointing past the bracket character, but on return it
8215 points to the closing bracket, or vertical bar, or end of string. The code
8216 variable is pointing at the byte into which the BRA operator has been stored.
8217 This function is used during the pre-compile phase when we are trying to find
8218 out the amount of memory needed, as well as during the real compile phase. The
8219 value of lengthptr distinguishes the two phases.
8220
8221 Arguments:
8222   options           option bits, including any changes for this subpattern
8223   codeptr           -> the address of the current code pointer
8224   ptrptr            -> the address of the current pattern pointer
8225   errorcodeptr      -> pointer to error code variable
8226   lookbehind        TRUE if this is a lookbehind assertion
8227   reset_bracount    TRUE to reset the count for each branch
8228   skipbytes         skip this many bytes at start (for brackets and OP_COND)
8229   cond_depth        depth of nesting for conditional subpatterns
8230   firstcharptr      place to put the first required character
8231   firstcharflagsptr place to put the first character flags, or a negative number
8232   reqcharptr        place to put the last required character
8233   reqcharflagsptr   place to put the last required character flags, or a negative number
8234   bcptr             pointer to the chain of currently open branches
8235   cd                points to the data block with tables pointers etc.
8236   lengthptr         NULL during the real compile phase
8237                     points to length accumulator during pre-compile phase
8238
8239 Returns:            TRUE on success
8240 */
8241
8242 static BOOL
8243 compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
8244   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
8245   int cond_depth,
8246   pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
8247   pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
8248   branch_chain *bcptr, compile_data *cd, int *lengthptr)
8249 {
8250 const pcre_uchar *ptr = *ptrptr;
8251 pcre_uchar *code = *codeptr;
8252 pcre_uchar *last_branch = code;
8253 pcre_uchar *start_bracket = code;
8254 pcre_uchar *reverse_count = NULL;
8255 open_capitem capitem;
8256 int capnumber = 0;
8257 pcre_uint32 firstchar, reqchar;
8258 pcre_int32 firstcharflags, reqcharflags;
8259 pcre_uint32 branchfirstchar, branchreqchar;
8260 pcre_int32 branchfirstcharflags, branchreqcharflags;
8261 int length;
8262 unsigned int orig_bracount;
8263 unsigned int max_bracount;
8264 branch_chain bc;
8265 size_t save_hwm_offset;
8266
8267 /* If set, call the external function that checks for stack availability. */
8268
8269 if (PUBL(stack_guard) != NULL && PUBL(stack_guard)())
8270   {
8271   *errorcodeptr= ERR85;
8272   return FALSE;
8273   }
8274
8275 /* Miscellaneous initialization */
8276
8277 bc.outer = bcptr;
8278 bc.current_branch = code;
8279
8280 firstchar = reqchar = 0;
8281 firstcharflags = reqcharflags = REQ_UNSET;
8282
8283 save_hwm_offset = cd->hwm - cd->start_workspace;
8284
8285 /* Accumulate the length for use in the pre-compile phase. Start with the
8286 length of the BRA and KET and any extra bytes that are required at the
8287 beginning. We accumulate in a local variable to save frequent testing of
8288 lengthptr for NULL. We cannot do this by looking at the value of code at the
8289 start and end of each alternative, because compiled items are discarded during
8290 the pre-compile phase so that the work space is not exceeded. */
8291
8292 length = 2 + 2*LINK_SIZE + skipbytes;
8293
8294 /* WARNING: If the above line is changed for any reason, you must also change
8295 the code that abstracts option settings at the start of the pattern and makes
8296 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
8297 pre-compile phase to find out whether anything has yet been compiled or not. */
8298
8299 /* If this is a capturing subpattern, add to the chain of open capturing items
8300 so that we can detect them if (*ACCEPT) is encountered. This is also used to
8301 detect groups that contain recursive back references to themselves. Note that
8302 only OP_CBRA need be tested here; changing this opcode to one of its variants,
8303 e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
8304
8305 if (*code == OP_CBRA)
8306   {
8307   capnumber = GET2(code, 1 + LINK_SIZE);
8308   capitem.number = capnumber;
8309   capitem.next = cd->open_caps;
8310   capitem.flag = FALSE;
8311   cd->open_caps = &capitem;
8312   }
8313
8314 /* Offset is set zero to mark that this bracket is still open */
8315
8316 PUT(code, 1, 0);
8317 code += 1 + LINK_SIZE + skipbytes;
8318
8319 /* Loop for each alternative branch */
8320
8321 orig_bracount = max_bracount = cd->bracount;
8322 for (;;)
8323   {
8324   /* For a (?| group, reset the capturing bracket count so that each branch
8325   uses the same numbers. */
8326
8327   if (reset_bracount) cd->bracount = orig_bracount;
8328
8329   /* Set up dummy OP_REVERSE if lookbehind assertion */
8330
8331   if (lookbehind)
8332     {
8333     *code++ = OP_REVERSE;
8334     reverse_count = code;
8335     PUTINC(code, 0, 0);
8336     length += 1 + LINK_SIZE;
8337     }
8338
8339   /* Now compile the branch; in the pre-compile phase its length gets added
8340   into the length. */
8341
8342   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
8343         &branchfirstcharflags, &branchreqchar, &branchreqcharflags, &bc,
8344         cond_depth, cd, (lengthptr == NULL)? NULL : &length))
8345     {
8346     *ptrptr = ptr;
8347     return FALSE;
8348     }
8349
8350   /* Keep the highest bracket count in case (?| was used and some branch
8351   has fewer than the rest. */
8352
8353   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
8354
8355   /* In the real compile phase, there is some post-processing to be done. */
8356
8357   if (lengthptr == NULL)
8358     {
8359     /* If this is the first branch, the firstchar and reqchar values for the
8360     branch become the values for the regex. */
8361
8362     if (*last_branch != OP_ALT)
8363       {
8364       firstchar = branchfirstchar;
8365       firstcharflags = branchfirstcharflags;
8366       reqchar = branchreqchar;
8367       reqcharflags = branchreqcharflags;
8368       }
8369
8370     /* If this is not the first branch, the first char and reqchar have to
8371     match the values from all the previous branches, except that if the
8372     previous value for reqchar didn't have REQ_VARY set, it can still match,
8373     and we set REQ_VARY for the regex. */
8374
8375     else
8376       {
8377       /* If we previously had a firstchar, but it doesn't match the new branch,
8378       we have to abandon the firstchar for the regex, but if there was
8379       previously no reqchar, it takes on the value of the old firstchar. */
8380
8381       if (firstcharflags >= 0 &&
8382           (firstcharflags != branchfirstcharflags || firstchar != branchfirstchar))
8383         {
8384         if (reqcharflags < 0)
8385           {
8386           reqchar = firstchar;
8387           reqcharflags = firstcharflags;
8388           }
8389         firstcharflags = REQ_NONE;
8390         }
8391
8392       /* If we (now or from before) have no firstchar, a firstchar from the
8393       branch becomes a reqchar if there isn't a branch reqchar. */
8394
8395       if (firstcharflags < 0 && branchfirstcharflags >= 0 && branchreqcharflags < 0)
8396         {
8397         branchreqchar = branchfirstchar;
8398         branchreqcharflags = branchfirstcharflags;
8399         }
8400
8401       /* Now ensure that the reqchars match */
8402
8403       if (((reqcharflags & ~REQ_VARY) != (branchreqcharflags & ~REQ_VARY)) ||
8404           reqchar != branchreqchar)
8405         reqcharflags = REQ_NONE;
8406       else
8407         {
8408         reqchar = branchreqchar;
8409         reqcharflags |= branchreqcharflags; /* To "or" REQ_VARY */
8410         }
8411       }
8412
8413     /* If lookbehind, check that this branch matches a fixed-length string, and
8414     put the length into the OP_REVERSE item. Temporarily mark the end of the
8415     branch with OP_END. If the branch contains OP_RECURSE, the result is -3
8416     because there may be forward references that we can't check here. Set a
8417     flag to cause another lookbehind check at the end. Why not do it all at the
8418     end? Because common, erroneous checks are picked up here and the offset of
8419     the problem can be shown. */
8420
8421     if (lookbehind)
8422       {
8423       int fixed_length;
8424       *code = OP_END;
8425       fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
8426         FALSE, cd, NULL);
8427       DPRINTF(("fixed length = %d\n", fixed_length));
8428       if (fixed_length == -3)
8429         {
8430         cd->check_lookbehind = TRUE;
8431         }
8432       else if (fixed_length < 0)
8433         {
8434         *errorcodeptr = (fixed_length == -2)? ERR36 :
8435                         (fixed_length == -4)? ERR70: ERR25;
8436         *ptrptr = ptr;
8437         return FALSE;
8438         }
8439       else
8440         {
8441         if (fixed_length > cd->max_lookbehind)
8442           cd->max_lookbehind = fixed_length;
8443         PUT(reverse_count, 0, fixed_length);
8444         }
8445       }
8446     }
8447
8448   /* Reached end of expression, either ')' or end of pattern. In the real
8449   compile phase, go back through the alternative branches and reverse the chain
8450   of offsets, with the field in the BRA item now becoming an offset to the
8451   first alternative. If there are no alternatives, it points to the end of the
8452   group. The length in the terminating ket is always the length of the whole
8453   bracketed item. Return leaving the pointer at the terminating char. */
8454
8455   if (*ptr != CHAR_VERTICAL_LINE)
8456     {
8457     if (lengthptr == NULL)
8458       {
8459       int branch_length = (int)(code - last_branch);
8460       do
8461         {
8462         int prev_length = GET(last_branch, 1);
8463         PUT(last_branch, 1, branch_length);
8464         branch_length = prev_length;
8465         last_branch -= branch_length;
8466         }
8467       while (branch_length > 0);
8468       }
8469
8470     /* Fill in the ket */
8471
8472     *code = OP_KET;
8473     PUT(code, 1, (int)(code - start_bracket));
8474     code += 1 + LINK_SIZE;
8475
8476     /* If it was a capturing subpattern, check to see if it contained any
8477     recursive back references. If so, we must wrap it in atomic brackets.
8478     Because we are moving code along, we must ensure that any pending recursive
8479     references are updated. In any event, remove the block from the chain. */
8480
8481     if (capnumber > 0)
8482       {
8483       if (cd->open_caps->flag)
8484         {
8485         *code = OP_END;
8486         adjust_recurse(start_bracket, 1 + LINK_SIZE,
8487           (options & PCRE_UTF8) != 0, cd, save_hwm_offset);
8488         memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
8489           IN_UCHARS(code - start_bracket));
8490         *start_bracket = OP_ONCE;
8491         code += 1 + LINK_SIZE;
8492         PUT(start_bracket, 1, (int)(code - start_bracket));
8493         *code = OP_KET;
8494         PUT(code, 1, (int)(code - start_bracket));
8495         code += 1 + LINK_SIZE;
8496         length += 2 + 2*LINK_SIZE;
8497         }
8498       cd->open_caps = cd->open_caps->next;
8499       }
8500
8501     /* Retain the highest bracket number, in case resetting was used. */
8502
8503     cd->bracount = max_bracount;
8504
8505     /* Set values to pass back */
8506
8507     *codeptr = code;
8508     *ptrptr = ptr;
8509     *firstcharptr = firstchar;
8510     *firstcharflagsptr = firstcharflags;
8511     *reqcharptr = reqchar;
8512     *reqcharflagsptr = reqcharflags;
8513     if (lengthptr != NULL)
8514       {
8515       if (OFLOW_MAX - *lengthptr < length)
8516         {
8517         *errorcodeptr = ERR20;
8518         return FALSE;
8519         }
8520       *lengthptr += length;
8521       }
8522     return TRUE;
8523     }
8524
8525   /* Another branch follows. In the pre-compile phase, we can move the code
8526   pointer back to where it was for the start of the first branch. (That is,
8527   pretend that each branch is the only one.)
8528
8529   In the real compile phase, insert an ALT node. Its length field points back
8530   to the previous branch while the bracket remains open. At the end the chain
8531   is reversed. It's done like this so that the start of the bracket has a
8532   zero offset until it is closed, making it possible to detect recursion. */
8533
8534   if (lengthptr != NULL)
8535     {
8536     code = *codeptr + 1 + LINK_SIZE + skipbytes;
8537     length += 1 + LINK_SIZE;
8538     }
8539   else
8540     {
8541     *code = OP_ALT;
8542     PUT(code, 1, (int)(code - last_branch));
8543     bc.current_branch = last_branch = code;
8544     code += 1 + LINK_SIZE;
8545     }
8546
8547   ptr++;
8548   }
8549 /* Control never reaches here */
8550 }
8551
8552
8553
8554
8555 /*************************************************
8556 *          Check for anchored expression         *
8557 *************************************************/
8558
8559 /* Try to find out if this is an anchored regular expression. Consider each
8560 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8561 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8562 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8563 be found, because ^ generates OP_CIRCM in that mode.
8564
8565 We can also consider a regex to be anchored if OP_SOM starts all its branches.
8566 This is the code for \G, which means "match at start of match position, taking
8567 into account the match offset".
8568
8569 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8570 because that will try the rest of the pattern at all possible matching points,
8571 so there is no point trying again.... er ....
8572
8573 .... except when the .* appears inside capturing parentheses, and there is a
8574 subsequent back reference to those parentheses. We haven't enough information
8575 to catch that case precisely.
8576
8577 At first, the best we could do was to detect when .* was in capturing brackets
8578 and the highest back reference was greater than or equal to that level.
8579 However, by keeping a bitmap of the first 31 back references, we can catch some
8580 of the more common cases more precisely.
8581
8582 ... A second exception is when the .* appears inside an atomic group, because
8583 this prevents the number of characters it matches from being adjusted.
8584
8585 Arguments:
8586   code           points to start of expression (the bracket)
8587   bracket_map    a bitmap of which brackets we are inside while testing; this
8588                   handles up to substring 31; after that we just have to take
8589                   the less precise approach
8590   cd             points to the compile data block
8591   atomcount      atomic group level
8592
8593 Returns:     TRUE or FALSE
8594 */
8595
8596 static BOOL
8597 is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
8598   compile_data *cd, int atomcount)
8599 {
8600 do {
8601    const pcre_uchar *scode = first_significant_code(
8602      code + PRIV(OP_lengths)[*code], FALSE);
8603    register int op = *scode;
8604
8605    /* Non-capturing brackets */
8606
8607    if (op == OP_BRA  || op == OP_BRAPOS ||
8608        op == OP_SBRA || op == OP_SBRAPOS)
8609      {
8610      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8611      }
8612
8613    /* Capturing brackets */
8614
8615    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8616             op == OP_SCBRA || op == OP_SCBRAPOS)
8617      {
8618      int n = GET2(scode, 1+LINK_SIZE);
8619      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8620      if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE;
8621      }
8622
8623    /* Positive forward assertions and conditions */
8624
8625    else if (op == OP_ASSERT || op == OP_COND)
8626      {
8627      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8628      }
8629
8630    /* Atomic groups */
8631
8632    else if (op == OP_ONCE || op == OP_ONCE_NC)
8633      {
8634      if (!is_anchored(scode, bracket_map, cd, atomcount + 1))
8635        return FALSE;
8636      }
8637
8638    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8639    it isn't in brackets that are or may be referenced or inside an atomic
8640    group. */
8641
8642    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8643              op == OP_TYPEPOSSTAR))
8644      {
8645      if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 ||
8646          atomcount > 0 || cd->had_pruneorskip)
8647        return FALSE;
8648      }
8649
8650    /* Check for explicit anchoring */
8651
8652    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8653
8654    code += GET(code, 1);
8655    }
8656 while (*code == OP_ALT);   /* Loop for each alternative */
8657 return TRUE;
8658 }
8659
8660
8661
8662 /*************************************************
8663 *         Check for starting with ^ or .*        *
8664 *************************************************/
8665
8666 /* This is called to find out if every branch starts with ^ or .* so that
8667 "first char" processing can be done to speed things up in multiline
8668 matching and for non-DOTALL patterns that start with .* (which must start at
8669 the beginning or after \n). As in the case of is_anchored() (see above), we
8670 have to take account of back references to capturing brackets that contain .*
8671 because in that case we can't make the assumption. Also, the appearance of .*
8672 inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
8673 count, because once again the assumption no longer holds.
8674
8675 Arguments:
8676   code           points to start of expression (the bracket)
8677   bracket_map    a bitmap of which brackets we are inside while testing; this
8678                   handles up to substring 31; after that we just have to take
8679                   the less precise approach
8680   cd             points to the compile data
8681   atomcount      atomic group level
8682
8683 Returns:         TRUE or FALSE
8684 */
8685
8686 static BOOL
8687 is_startline(const pcre_uchar *code, unsigned int bracket_map,
8688   compile_data *cd, int atomcount)
8689 {
8690 do {
8691    const pcre_uchar *scode = first_significant_code(
8692      code + PRIV(OP_lengths)[*code], FALSE);
8693    register int op = *scode;
8694
8695    /* If we are at the start of a conditional assertion group, *both* the
8696    conditional assertion *and* what follows the condition must satisfy the test
8697    for start of line. Other kinds of condition fail. Note that there may be an
8698    auto-callout at the start of a condition. */
8699
8700    if (op == OP_COND)
8701      {
8702      scode += 1 + LINK_SIZE;
8703      if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8704      switch (*scode)
8705        {
8706        case OP_CREF:
8707        case OP_DNCREF:
8708        case OP_RREF:
8709        case OP_DNRREF:
8710        case OP_DEF:
8711        case OP_FAIL:
8712        return FALSE;
8713
8714        default:     /* Assertion */
8715        if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8716        do scode += GET(scode, 1); while (*scode == OP_ALT);
8717        scode += 1 + LINK_SIZE;
8718        break;
8719        }
8720      scode = first_significant_code(scode, FALSE);
8721      op = *scode;
8722      }
8723
8724    /* Non-capturing brackets */
8725
8726    if (op == OP_BRA  || op == OP_BRAPOS ||
8727        op == OP_SBRA || op == OP_SBRAPOS)
8728      {
8729      if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8730      }
8731
8732    /* Capturing brackets */
8733
8734    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8735             op == OP_SCBRA || op == OP_SCBRAPOS)
8736      {
8737      int n = GET2(scode, 1+LINK_SIZE);
8738      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8739      if (!is_startline(scode, new_map, cd, atomcount)) return FALSE;
8740      }
8741
8742    /* Positive forward assertions */
8743
8744    else if (op == OP_ASSERT)
8745      {
8746      if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8747      }
8748
8749    /* Atomic brackets */
8750
8751    else if (op == OP_ONCE || op == OP_ONCE_NC)
8752      {
8753      if (!is_startline(scode, bracket_map, cd, atomcount + 1)) return FALSE;
8754      }
8755
8756    /* .* means "start at start or after \n" if it isn't in atomic brackets or
8757    brackets that may be referenced, as long as the pattern does not contain
8758    *PRUNE or *SKIP, because these break the feature. Consider, for example,
8759    /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
8760    start of a line. */
8761
8762    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8763      {
8764      if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
8765          atomcount > 0 || cd->had_pruneorskip)
8766        return FALSE;
8767      }
8768
8769    /* Check for explicit circumflex; anything else gives a FALSE result. Note
8770    in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC
8771    because the number of characters matched by .* cannot be adjusted inside
8772    them. */
8773
8774    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8775
8776    /* Move on to the next alternative */
8777
8778    code += GET(code, 1);
8779    }
8780 while (*code == OP_ALT);  /* Loop for each alternative */
8781 return TRUE;
8782 }
8783
8784
8785
8786 /*************************************************
8787 *       Check for asserted fixed first char      *
8788 *************************************************/
8789
8790 /* During compilation, the "first char" settings from forward assertions are
8791 discarded, because they can cause conflicts with actual literals that follow.
8792 However, if we end up without a first char setting for an unanchored pattern,
8793 it is worth scanning the regex to see if there is an initial asserted first
8794 char. If all branches start with the same asserted char, or with a
8795 non-conditional bracket all of whose alternatives start with the same asserted
8796 char (recurse ad lib), then we return that char, with the flags set to zero or
8797 REQ_CASELESS; otherwise return zero with REQ_NONE in the flags.
8798
8799 Arguments:
8800   code       points to start of expression (the bracket)
8801   flags      points to the first char flags, or to REQ_NONE
8802   inassert   TRUE if in an assertion
8803
8804 Returns:     the fixed first char, or 0 with REQ_NONE in flags
8805 */
8806
8807 static pcre_uint32
8808 find_firstassertedchar(const pcre_uchar *code, pcre_int32 *flags,
8809   BOOL inassert)
8810 {
8811 register pcre_uint32 c = 0;
8812 int cflags = REQ_NONE;
8813
8814 *flags = REQ_NONE;
8815 do {
8816    pcre_uint32 d;
8817    int dflags;
8818    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8819              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8820    const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl,
8821      TRUE);
8822    register pcre_uchar op = *scode;
8823
8824    switch(op)
8825      {
8826      default:
8827      return 0;
8828
8829      case OP_BRA:
8830      case OP_BRAPOS:
8831      case OP_CBRA:
8832      case OP_SCBRA:
8833      case OP_CBRAPOS:
8834      case OP_SCBRAPOS:
8835      case OP_ASSERT:
8836      case OP_ONCE:
8837      case OP_ONCE_NC:
8838      d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);
8839      if (dflags < 0)
8840        return 0;
8841      if (cflags < 0) { c = d; cflags = dflags; } else if (c != d || cflags != dflags) return 0;
8842      break;
8843
8844      case OP_EXACT:
8845      scode += IMM2_SIZE;
8846      /* Fall through */
8847
8848      case OP_CHAR:
8849      case OP_PLUS:
8850      case OP_MINPLUS:
8851      case OP_POSPLUS:
8852      if (!inassert) return 0;
8853      if (cflags < 0) { c = scode[1]; cflags = 0; }
8854        else if (c != scode[1]) return 0;
8855      break;
8856
8857      case OP_EXACTI:
8858      scode += IMM2_SIZE;
8859      /* Fall through */
8860
8861      case OP_CHARI:
8862      case OP_PLUSI:
8863      case OP_MINPLUSI:
8864      case OP_POSPLUSI:
8865      if (!inassert) return 0;
8866      if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8867        else if (c != scode[1]) return 0;
8868      break;
8869      }
8870
8871    code += GET(code, 1);
8872    }
8873 while (*code == OP_ALT);
8874
8875 *flags = cflags;
8876 return c;
8877 }
8878
8879
8880
8881 /*************************************************
8882 *     Add an entry to the name/number table      *
8883 *************************************************/
8884
8885 /* This function is called between compiling passes to add an entry to the
8886 name/number table, maintaining alphabetical order. Checking for permitted
8887 and forbidden duplicates has already been done.
8888
8889 Arguments:
8890   cd           the compile data block
8891   name         the name to add
8892   length       the length of the name
8893   groupno      the group number
8894
8895 Returns:       nothing
8896 */
8897
8898 static void
8899 add_name(compile_data *cd, const pcre_uchar *name, int length,
8900   unsigned int groupno)
8901 {
8902 int i;
8903 pcre_uchar *slot = cd->name_table;
8904
8905 for (i = 0; i < cd->names_found; i++)
8906   {
8907   int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(length));
8908   if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8909     crc = -1; /* Current name is a substring */
8910
8911   /* Make space in the table and break the loop for an earlier name. For a
8912   duplicate or later name, carry on. We do this for duplicates so that in the
8913   simple case (when ?(| is not used) they are in order of their numbers. In all
8914   cases they are in the order in which they appear in the pattern. */
8915
8916   if (crc < 0)
8917     {
8918     memmove(slot + cd->name_entry_size, slot,
8919       IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
8920     break;
8921     }
8922
8923   /* Continue the loop for a later or duplicate name */
8924
8925   slot += cd->name_entry_size;
8926   }
8927
8928 PUT2(slot, 0, groupno);
8929 memcpy(slot + IMM2_SIZE, name, IN_UCHARS(length));
8930 slot[IMM2_SIZE + length] = 0;
8931 cd->names_found++;
8932 }
8933
8934
8935
8936 /*************************************************
8937 *        Compile a Regular Expression            *
8938 *************************************************/
8939
8940 /* This function takes a string and returns a pointer to a block of store
8941 holding a compiled version of the expression. The original API for this
8942 function had no error code return variable; it is retained for backwards
8943 compatibility. The new function is given a new name.
8944
8945 Arguments:
8946   pattern       the regular expression
8947   options       various option bits
8948   errorcodeptr  pointer to error code variable (pcre_compile2() only)
8949                   can be NULL if you don't want a code value
8950   errorptr      pointer to pointer to error text
8951   erroroffset   ptr offset in pattern where error was detected
8952   tables        pointer to character tables or NULL
8953
8954 Returns:        pointer to compiled data block, or NULL on error,
8955                 with errorptr and erroroffset set
8956 */
8957
8958 #if defined COMPILE_PCRE8
8959 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
8960 pcre_compile(const char *pattern, int options, const char **errorptr,
8961   int *erroroffset, const unsigned char *tables)
8962 #elif defined COMPILE_PCRE16
8963 PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
8964 pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
8965   int *erroroffset, const unsigned char *tables)
8966 #elif defined COMPILE_PCRE32
8967 PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
8968 pcre32_compile(PCRE_SPTR32 pattern, int options, const char **errorptr,
8969   int *erroroffset, const unsigned char *tables)
8970 #endif
8971 {
8972 #if defined COMPILE_PCRE8
8973 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
8974 #elif defined COMPILE_PCRE16
8975 return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
8976 #elif defined COMPILE_PCRE32
8977 return pcre32_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
8978 #endif
8979 }
8980
8981
8982 #if defined COMPILE_PCRE8
8983 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
8984 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
8985   const char **errorptr, int *erroroffset, const unsigned char *tables)
8986 #elif defined COMPILE_PCRE16
8987 PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
8988 pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
8989   const char **errorptr, int *erroroffset, const unsigned char *tables)
8990 #elif defined COMPILE_PCRE32
8991 PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
8992 pcre32_compile2(PCRE_SPTR32 pattern, int options, int *errorcodeptr,
8993   const char **errorptr, int *erroroffset, const unsigned char *tables)
8994 #endif
8995 {
8996 REAL_PCRE *re;
8997 int length = 1;  /* For final END opcode */
8998 pcre_int32 firstcharflags, reqcharflags;
8999 pcre_uint32 firstchar, reqchar;
9000 pcre_uint32 limit_match = PCRE_UINT32_MAX;
9001 pcre_uint32 limit_recursion = PCRE_UINT32_MAX;
9002 int newline;
9003 int errorcode = 0;
9004 int skipatstart = 0;
9005 BOOL utf;
9006 BOOL never_utf = FALSE;
9007 size_t size;
9008 pcre_uchar *code;
9009 const pcre_uchar *codestart;
9010 const pcre_uchar *ptr;
9011 compile_data compile_block;
9012 compile_data *cd = &compile_block;
9013
9014 /* This space is used for "compiling" into during the first phase, when we are
9015 computing the amount of memory that is needed. Compiled items are thrown away
9016 as soon as possible, so that a fairly large buffer should be sufficient for
9017 this purpose. The same space is used in the second phase for remembering where
9018 to fill in forward references to subpatterns. That may overflow, in which case
9019 new memory is obtained from malloc(). */
9020
9021 pcre_uchar cworkspace[COMPILE_WORK_SIZE];
9022
9023 /* This vector is used for remembering name groups during the pre-compile. In a
9024 similar way to cworkspace, it can be expanded using malloc() if necessary. */
9025
9026 named_group named_groups[NAMED_GROUP_LIST_SIZE];
9027
9028 /* Set this early so that early errors get offset 0. */
9029
9030 ptr = (const pcre_uchar *)pattern;
9031
9032 /* We can't pass back an error message if errorptr is NULL; I guess the best we
9033 can do is just return NULL, but we can set a code value if there is a code
9034 pointer. */
9035
9036 if (errorptr == NULL)
9037   {
9038   if (errorcodeptr != NULL) *errorcodeptr = 99;
9039   return NULL;
9040   }
9041
9042 *errorptr = NULL;
9043 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
9044
9045 /* However, we can give a message for this error */
9046
9047 if (erroroffset == NULL)
9048   {
9049   errorcode = ERR16;
9050   goto PCRE_EARLY_ERROR_RETURN2;
9051   }
9052
9053 *erroroffset = 0;
9054
9055 /* Set up pointers to the individual character tables */
9056
9057 if (tables == NULL) tables = PRIV(default_tables);
9058 cd->lcc = tables + lcc_offset;
9059 cd->fcc = tables + fcc_offset;
9060 cd->cbits = tables + cbits_offset;
9061 cd->ctypes = tables + ctypes_offset;
9062
9063 /* Check that all undefined public option bits are zero */
9064
9065 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
9066   {
9067   errorcode = ERR17;
9068   goto PCRE_EARLY_ERROR_RETURN;
9069   }
9070
9071 /* If PCRE_NEVER_UTF is set, remember it. */
9072
9073 if ((options & PCRE_NEVER_UTF) != 0) never_utf = TRUE;
9074
9075 /* Check for global one-time settings at the start of the pattern, and remember
9076 the offset for later. */
9077
9078 cd->external_flags = 0;   /* Initialize here for LIMIT_MATCH/RECURSION */
9079
9080 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9081        ptr[skipatstart+1] == CHAR_ASTERISK)
9082   {
9083   int newnl = 0;
9084   int newbsr = 0;
9085
9086 /* For completeness and backward compatibility, (*UTFn) is supported in the
9087 relevant libraries, but (*UTF) is generic and always supported. Note that
9088 PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. */
9089
9090 #ifdef COMPILE_PCRE8
9091   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF8_RIGHTPAR, 5) == 0)
9092     { skipatstart += 7; options |= PCRE_UTF8; continue; }
9093 #endif
9094 #ifdef COMPILE_PCRE16
9095   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF16_RIGHTPAR, 6) == 0)
9096     { skipatstart += 8; options |= PCRE_UTF16; continue; }
9097 #endif
9098 #ifdef COMPILE_PCRE32
9099   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF32_RIGHTPAR, 6) == 0)
9100     { skipatstart += 8; options |= PCRE_UTF32; continue; }
9101 #endif
9102
9103   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 4) == 0)
9104     { skipatstart += 6; options |= PCRE_UTF8; continue; }
9105   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
9106     { skipatstart += 6; options |= PCRE_UCP; continue; }
9107   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_AUTO_POSSESS_RIGHTPAR, 16) == 0)
9108     { skipatstart += 18; options |= PCRE_NO_AUTO_POSSESS; continue; }
9109   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
9110     { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
9111
9112   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_MATCH_EQ, 12) == 0)
9113     {
9114     pcre_uint32 c = 0;
9115     int p = skipatstart + 14;
9116     while (isdigit(ptr[p]))
9117       {
9118       if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow */
9119       c = c*10 + ptr[p++] - CHAR_0;
9120       }
9121     if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9122     if (c < limit_match)
9123       {
9124       limit_match = c;
9125       cd->external_flags |= PCRE_MLSET;
9126       }
9127     skipatstart = p;
9128     continue;
9129     }
9130
9131   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_RECURSION_EQ, 16) == 0)
9132     {
9133     pcre_uint32 c = 0;
9134     int p = skipatstart + 18;
9135     while (isdigit(ptr[p]))
9136       {
9137       if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow check */
9138       c = c*10 + ptr[p++] - CHAR_0;
9139       }
9140     if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9141     if (c < limit_recursion)
9142       {
9143       limit_recursion = c;
9144       cd->external_flags |= PCRE_RLSET;
9145       }
9146     skipatstart = p;
9147     continue;
9148     }
9149
9150   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0)
9151     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
9152   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3)  == 0)
9153     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
9154   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5)  == 0)
9155     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
9156   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0)
9157     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
9158   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0)
9159     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
9160
9161   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
9162     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
9163   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
9164     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
9165
9166   if (newnl != 0)
9167     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
9168   else if (newbsr != 0)
9169     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
9170   else break;
9171   }
9172
9173 /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
9174 utf = (options & PCRE_UTF8) != 0;
9175 if (utf && never_utf)
9176   {
9177   errorcode = ERR78;
9178   goto PCRE_EARLY_ERROR_RETURN2;
9179   }
9180
9181 /* Can't support UTF unless PCRE has been compiled to include the code. The
9182 return of an error code from PRIV(valid_utf)() is a new feature, introduced in
9183 release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
9184 not used here. */
9185
9186 #ifdef SUPPORT_UTF
9187 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
9188      (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
9189   {
9190 #if defined COMPILE_PCRE8
9191   errorcode = ERR44;
9192 #elif defined COMPILE_PCRE16
9193   errorcode = ERR74;
9194 #elif defined COMPILE_PCRE32
9195   errorcode = ERR77;
9196 #endif
9197   goto PCRE_EARLY_ERROR_RETURN2;
9198   }
9199 #else
9200 if (utf)
9201   {
9202   errorcode = ERR32;
9203   goto PCRE_EARLY_ERROR_RETURN;
9204   }
9205 #endif
9206
9207 /* Can't support UCP unless PCRE has been compiled to include the code. */
9208
9209 #ifndef SUPPORT_UCP
9210 if ((options & PCRE_UCP) != 0)
9211   {
9212   errorcode = ERR67;
9213   goto PCRE_EARLY_ERROR_RETURN;
9214   }
9215 #endif
9216
9217 /* Check validity of \R options. */
9218
9219 if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
9220      (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
9221   {
9222   errorcode = ERR56;
9223   goto PCRE_EARLY_ERROR_RETURN;
9224   }
9225
9226 /* Handle different types of newline. The three bits give seven cases. The
9227 current code allows for fixed one- or two-byte sequences, plus "any" and
9228 "anycrlf". */
9229
9230 switch (options & PCRE_NEWLINE_BITS)
9231   {
9232   case 0: newline = NEWLINE; break;   /* Build-time default */
9233   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
9234   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
9235   case PCRE_NEWLINE_CR+
9236        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
9237   case PCRE_NEWLINE_ANY: newline = -1; break;
9238   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
9239   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
9240   }
9241
9242 if (newline == -2)
9243   {
9244   cd->nltype = NLTYPE_ANYCRLF;
9245   }
9246 else if (newline < 0)
9247   {
9248   cd->nltype = NLTYPE_ANY;
9249   }
9250 else
9251   {
9252   cd->nltype = NLTYPE_FIXED;
9253   if (newline > 255)
9254     {
9255     cd->nllen = 2;
9256     cd->nl[0] = (newline >> 8) & 255;
9257     cd->nl[1] = newline & 255;
9258     }
9259   else
9260     {
9261     cd->nllen = 1;
9262     cd->nl[0] = newline;
9263     }
9264   }
9265
9266 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9267 references to help in deciding whether (.*) can be treated as anchored or not.
9268 */
9269
9270 cd->top_backref = 0;
9271 cd->backref_map = 0;
9272
9273 /* Reflect pattern for debugging output */
9274
9275 DPRINTF(("------------------------------------------------------------------\n"));
9276 #ifdef PCRE_DEBUG
9277 print_puchar(stdout, (PCRE_PUCHAR)pattern);
9278 #endif
9279 DPRINTF(("\n"));
9280
9281 /* Pretend to compile the pattern while actually just accumulating the length
9282 of memory required. This behaviour is triggered by passing a non-NULL final
9283 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
9284 to compile parts of the pattern into; the compiled code is discarded when it is
9285 no longer needed, so hopefully this workspace will never overflow, though there
9286 is a test for its doing so. */
9287
9288 cd->bracount = cd->final_bracount = 0;
9289 cd->names_found = 0;
9290 cd->name_entry_size = 0;
9291 cd->name_table = NULL;
9292 cd->dupnames = FALSE;
9293 cd->dupgroups = FALSE;
9294 cd->namedrefcount = 0;
9295 cd->start_code = cworkspace;
9296 cd->hwm = cworkspace;
9297 cd->iscondassert = FALSE;
9298 cd->start_workspace = cworkspace;
9299 cd->workspace_size = COMPILE_WORK_SIZE;
9300 cd->named_groups = named_groups;
9301 cd->named_group_list_size = NAMED_GROUP_LIST_SIZE;
9302 cd->start_pattern = (const pcre_uchar *)pattern;
9303 cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
9304 cd->req_varyopt = 0;
9305 cd->parens_depth = 0;
9306 cd->assert_depth = 0;
9307 cd->max_lookbehind = 0;
9308 cd->external_options = options;
9309 cd->open_caps = NULL;
9310
9311 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
9312 don't need to look at the result of the function here. The initial options have
9313 been put into the cd block so that they can be changed if an option setting is
9314 found within the regex right at the beginning. Bringing initial option settings
9315 outside can help speed up starting point checks. */
9316
9317 ptr += skipatstart;
9318 code = cworkspace;
9319 *code = OP_BRA;
9320
9321 (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
9322   FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,
9323   cd, &length);
9324 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
9325
9326 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
9327   (int)(cd->hwm - cworkspace)));
9328
9329 if (length > MAX_PATTERN_SIZE)
9330   {
9331   errorcode = ERR20;
9332   goto PCRE_EARLY_ERROR_RETURN;
9333   }
9334
9335 /* Compute the size of the data block for storing the compiled pattern. Integer
9336 overflow should no longer be possible because nowadays we limit the maximum
9337 value of cd->names_found and cd->name_entry_size. */
9338
9339 size = sizeof(REAL_PCRE) +
9340   (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
9341
9342 /* Get the memory. */
9343
9344 re = (REAL_PCRE *)(PUBL(malloc))(size);
9345 if (re == NULL)
9346   {
9347   errorcode = ERR21;
9348   goto PCRE_EARLY_ERROR_RETURN;
9349   }
9350
9351 /* Put in the magic number, and save the sizes, initial options, internal
9352 flags, and character table pointer. NULL is used for the default character
9353 tables. The nullpad field is at the end; it's there to help in the case when a
9354 regex compiled on a system with 4-byte pointers is run on another with 8-byte
9355 pointers. */
9356
9357 re->magic_number = MAGIC_NUMBER;
9358 re->size = (int)size;
9359 re->options = cd->external_options;
9360 re->flags = cd->external_flags;
9361 re->limit_match = limit_match;
9362 re->limit_recursion = limit_recursion;
9363 re->first_char = 0;
9364 re->req_char = 0;
9365 re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
9366 re->name_entry_size = cd->name_entry_size;
9367 re->name_count = cd->names_found;
9368 re->ref_count = 0;
9369 re->tables = (tables == PRIV(default_tables))? NULL : tables;
9370 re->nullpad = NULL;
9371 #ifdef COMPILE_PCRE32
9372 re->dummy = 0;
9373 #else
9374 re->dummy1 = re->dummy2 = re->dummy3 = 0;
9375 #endif
9376
9377 /* The starting points of the name/number translation table and of the code are
9378 passed around in the compile data block. The start/end pattern and initial
9379 options are already set from the pre-compile phase, as is the name_entry_size
9380 field. Reset the bracket count and the names_found field. Also reset the hwm
9381 field; this time it's used for remembering forward references to subpatterns.
9382 */
9383
9384 cd->final_bracount = cd->bracount;  /* Save for checking forward references */
9385 cd->parens_depth = 0;
9386 cd->assert_depth = 0;
9387 cd->bracount = 0;
9388 cd->max_lookbehind = 0;
9389 cd->name_table = (pcre_uchar *)re + re->name_table_offset;
9390 codestart = cd->name_table + re->name_entry_size * re->name_count;
9391 cd->start_code = codestart;
9392 cd->hwm = (pcre_uchar *)(cd->start_workspace);
9393 cd->iscondassert = FALSE;
9394 cd->req_varyopt = 0;
9395 cd->had_accept = FALSE;
9396 cd->had_pruneorskip = FALSE;
9397 cd->check_lookbehind = FALSE;
9398 cd->open_caps = NULL;
9399
9400 /* If any named groups were found, create the name/number table from the list
9401 created in the first pass. */
9402
9403 if (cd->names_found > 0)
9404   {
9405   int i = cd->names_found;
9406   named_group *ng = cd->named_groups;
9407   cd->names_found = 0;
9408   for (; i > 0; i--, ng++)
9409     add_name(cd, ng->name, ng->length, ng->number);
9410   if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
9411     (PUBL(free))((void *)cd->named_groups);
9412   }
9413
9414 /* Set up a starting, non-extracting bracket, then compile the expression. On
9415 error, errorcode will be set non-zero, so we don't need to look at the result
9416 of the function here. */
9417
9418 ptr = (const pcre_uchar *)pattern + skipatstart;
9419 code = (pcre_uchar *)codestart;
9420 *code = OP_BRA;
9421 (void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
9422   &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL, cd, NULL);
9423 re->top_bracket = cd->bracount;
9424 re->top_backref = cd->top_backref;
9425 re->max_lookbehind = cd->max_lookbehind;
9426 re->flags = cd->external_flags | PCRE_MODE;
9427
9428 if (cd->had_accept)
9429   {
9430   reqchar = 0;              /* Must disable after (*ACCEPT) */
9431   reqcharflags = REQ_NONE;
9432   }
9433
9434 /* If not reached end of pattern on success, there's an excess bracket. */
9435
9436 if (errorcode == 0 && *ptr != CHAR_NULL) errorcode = ERR22;
9437
9438 /* Fill in the terminating state and check for disastrous overflow, but
9439 if debugging, leave the test till after things are printed out. */
9440
9441 *code++ = OP_END;
9442
9443 #ifndef PCRE_DEBUG
9444 if (code - codestart > length) errorcode = ERR23;
9445 #endif
9446
9447 #ifdef SUPPORT_VALGRIND
9448 /* If the estimated length exceeds the really used length, mark the extra
9449 allocated memory as unaddressable, so that any out-of-bound reads can be
9450 detected. */
9451 VALGRIND_MAKE_MEM_NOACCESS(code, (length - (code - codestart)) * sizeof(pcre_uchar));
9452 #endif
9453
9454 /* Fill in any forward references that are required. There may be repeated
9455 references; optimize for them, as searching a large regex takes time. */
9456
9457 if (cd->hwm > cd->start_workspace)
9458   {
9459   int prev_recno = -1;
9460   const pcre_uchar *groupptr = NULL;
9461   while (errorcode == 0 && cd->hwm > cd->start_workspace)
9462     {
9463     int offset, recno;
9464     cd->hwm -= LINK_SIZE;
9465     offset = GET(cd->hwm, 0);
9466
9467     /* Check that the hwm handling hasn't gone wrong. This whole area is
9468     rewritten in PCRE2 because there are some obscure cases. */
9469
9470     if (offset == 0 || codestart[offset-1] != OP_RECURSE)
9471       {
9472       errorcode = ERR10;
9473       break;
9474       }
9475
9476     recno = GET(codestart, offset);
9477     if (recno != prev_recno)
9478       {
9479       groupptr = PRIV(find_bracket)(codestart, utf, recno);
9480       prev_recno = recno;
9481       }
9482     if (groupptr == NULL) errorcode = ERR53;
9483       else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
9484     }
9485   }
9486
9487 /* If the workspace had to be expanded, free the new memory. Set the pointer to
9488 NULL to indicate that forward references have been filled in. */
9489
9490 if (cd->workspace_size > COMPILE_WORK_SIZE)
9491   (PUBL(free))((void *)cd->start_workspace);
9492 cd->start_workspace = NULL;
9493
9494 /* Give an error if there's a back reference to a non-existent capturing
9495 subpattern. */
9496
9497 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
9498
9499 /* Unless disabled, check whether any single character iterators can be
9500 auto-possessified. The function overwrites the appropriate opcode values, so
9501 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
9502 used in this code because at least one compiler gives a warning about loss of
9503 "const" attribute if the cast (pcre_uchar *)codestart is used directly in the
9504 function call. */
9505
9506 if (errorcode == 0 && (options & PCRE_NO_AUTO_POSSESS) == 0)
9507   {
9508   pcre_uchar *temp = (pcre_uchar *)codestart;
9509   auto_possessify(temp, utf, cd);
9510   }
9511
9512 /* If there were any lookbehind assertions that contained OP_RECURSE
9513 (recursions or subroutine calls), a flag is set for them to be checked here,
9514 because they may contain forward references. Actual recursions cannot be fixed
9515 length, but subroutine calls can. It is done like this so that those without
9516 OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
9517 exceptional ones forgo this. We scan the pattern to check that they are fixed
9518 length, and set their lengths. */
9519
9520 if (errorcode == 0 && cd->check_lookbehind)
9521   {
9522   pcre_uchar *cc = (pcre_uchar *)codestart;
9523
9524   /* Loop, searching for OP_REVERSE items, and process those that do not have
9525   their length set. (Actually, it will also re-process any that have a length
9526   of zero, but that is a pathological case, and it does no harm.) When we find
9527   one, we temporarily terminate the branch it is in while we scan it. */
9528
9529   for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
9530        cc != NULL;
9531        cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
9532     {
9533     if (GET(cc, 1) == 0)
9534       {
9535       int fixed_length;
9536       pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
9537       int end_op = *be;
9538       *be = OP_END;
9539       fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
9540         cd, NULL);
9541       *be = end_op;
9542       DPRINTF(("fixed length = %d\n", fixed_length));
9543       if (fixed_length < 0)
9544         {
9545         errorcode = (fixed_length == -2)? ERR36 :
9546                     (fixed_length == -4)? ERR70 : ERR25;
9547         break;
9548         }
9549       if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
9550       PUT(cc, 1, fixed_length);
9551       }
9552     cc += 1 + LINK_SIZE;
9553     }
9554   }
9555
9556 /* Failed to compile, or error while post-processing */
9557
9558 if (errorcode != 0)
9559   {
9560   (PUBL(free))(re);
9561   PCRE_EARLY_ERROR_RETURN:
9562   *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
9563   PCRE_EARLY_ERROR_RETURN2:
9564   *errorptr = find_error_text(errorcode);
9565   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
9566   return NULL;
9567   }
9568
9569 /* If the anchored option was not passed, set the flag if we can determine that
9570 the pattern is anchored by virtue of ^ characters or \A or anything else, such
9571 as starting with non-atomic .* when DOTALL is set and there are no occurrences
9572 of *PRUNE or *SKIP.
9573
9574 Otherwise, if we know what the first byte has to be, save it, because that
9575 speeds up unanchored matches no end. If not, see if we can set the
9576 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
9577 start with ^, and also when all branches start with non-atomic .* for
9578 non-DOTALL matches when *PRUNE and *SKIP are not present. */
9579
9580 if ((re->options & PCRE_ANCHORED) == 0)
9581   {
9582   if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED;
9583   else
9584     {
9585     if (firstcharflags < 0)
9586       firstchar = find_firstassertedchar(codestart, &firstcharflags, FALSE);
9587     if (firstcharflags >= 0)   /* Remove caseless flag for non-caseable chars */
9588       {
9589 #if defined COMPILE_PCRE8
9590       re->first_char = firstchar & 0xff;
9591 #elif defined COMPILE_PCRE16
9592       re->first_char = firstchar & 0xffff;
9593 #elif defined COMPILE_PCRE32
9594       re->first_char = firstchar;
9595 #endif
9596       if ((firstcharflags & REQ_CASELESS) != 0)
9597         {
9598 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9599         /* We ignore non-ASCII first chars in 8 bit mode. */
9600         if (utf)
9601           {
9602           if (re->first_char < 128)
9603             {
9604             if (cd->fcc[re->first_char] != re->first_char)
9605               re->flags |= PCRE_FCH_CASELESS;
9606             }
9607           else if (UCD_OTHERCASE(re->first_char) != re->first_char)
9608             re->flags |= PCRE_FCH_CASELESS;
9609           }
9610         else
9611 #endif
9612         if (MAX_255(re->first_char)
9613             && cd->fcc[re->first_char] != re->first_char)
9614           re->flags |= PCRE_FCH_CASELESS;
9615         }
9616
9617       re->flags |= PCRE_FIRSTSET;
9618       }
9619
9620     else if (is_startline(codestart, 0, cd, 0)) re->flags |= PCRE_STARTLINE;
9621     }
9622   }
9623
9624 /* For an anchored pattern, we use the "required byte" only if it follows a
9625 variable length item in the regex. Remove the caseless flag for non-caseable
9626 bytes. */
9627
9628 if (reqcharflags >= 0 &&
9629      ((re->options & PCRE_ANCHORED) == 0 || (reqcharflags & REQ_VARY) != 0))
9630   {
9631 #if defined COMPILE_PCRE8
9632   re->req_char = reqchar & 0xff;
9633 #elif defined COMPILE_PCRE16
9634   re->req_char = reqchar & 0xffff;
9635 #elif defined COMPILE_PCRE32
9636   re->req_char = reqchar;
9637 #endif
9638   if ((reqcharflags & REQ_CASELESS) != 0)
9639     {
9640 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9641     /* We ignore non-ASCII required chars in 8 bit mode. */
9642     if (utf)
9643       {
9644       if (re->req_char < 128)
9645         {
9646         if (cd->fcc[re->req_char] != re->req_char)
9647           re->flags |= PCRE_RCH_CASELESS;
9648         }
9649       else if (UCD_OTHERCASE(re->req_char) != re->req_char)
9650         re->flags |= PCRE_RCH_CASELESS;
9651       }
9652     else
9653 #endif
9654     if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
9655       re->flags |= PCRE_RCH_CASELESS;
9656     }
9657
9658   re->flags |= PCRE_REQCHSET;
9659   }
9660
9661 /* Print out the compiled data if debugging is enabled. This is never the
9662 case when building a production library. */
9663
9664 #ifdef PCRE_DEBUG
9665 printf("Length = %d top_bracket = %d top_backref = %d\n",
9666   length, re->top_bracket, re->top_backref);
9667
9668 printf("Options=%08x\n", re->options);
9669
9670 if ((re->flags & PCRE_FIRSTSET) != 0)
9671   {
9672   pcre_uchar ch = re->first_char;
9673   const char *caseless =
9674     ((re->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)";
9675   if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless);
9676     else printf("First char = \\x%02x%s\n", ch, caseless);
9677   }
9678
9679 if ((re->flags & PCRE_REQCHSET) != 0)
9680   {
9681   pcre_uchar ch = re->req_char;
9682   const char *caseless =
9683     ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)";
9684   if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless);
9685     else printf("Req char = \\x%02x%s\n", ch, caseless);
9686   }
9687
9688 #if defined COMPILE_PCRE8
9689 pcre_printint((pcre *)re, stdout, TRUE);
9690 #elif defined COMPILE_PCRE16
9691 pcre16_printint((pcre *)re, stdout, TRUE);
9692 #elif defined COMPILE_PCRE32
9693 pcre32_printint((pcre *)re, stdout, TRUE);
9694 #endif
9695
9696 /* This check is done here in the debugging case so that the code that
9697 was compiled can be seen. */
9698
9699 if (code - codestart > length)
9700   {
9701   (PUBL(free))(re);
9702   *errorptr = find_error_text(ERR23);
9703   *erroroffset = ptr - (pcre_uchar *)pattern;
9704   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
9705   return NULL;
9706   }
9707 #endif   /* PCRE_DEBUG */
9708
9709 /* Check for a pattern that can match an empty string, so that this information
9710 can be provided to applications. */
9711
9712 do
9713   {
9714   if (could_be_empty_branch(codestart, code, utf, cd, NULL))
9715     {
9716     re->flags |= PCRE_MATCH_EMPTY;
9717     break;
9718     }
9719   codestart += GET(codestart, 1);
9720   }
9721 while (*codestart == OP_ALT);
9722
9723 #if defined COMPILE_PCRE8
9724 return (pcre *)re;
9725 #elif defined COMPILE_PCRE16
9726 return (pcre16 *)re;
9727 #elif defined COMPILE_PCRE32
9728 return (pcre32 *)re;
9729 #endif
9730 }
9731
9732 /* End of pcre_compile.c */