1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
41 /* This module contains the external function pcre_study(), along with local
42 supporting functions. */
49 #include "pcre_internal.h"
51 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
53 /* Returns from set_start_bits() */
55 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
59 /*************************************************
60 * Find the minimum subject length for a group *
61 *************************************************/
63 /* Scan a parenthesized group and compute the minimum length of subject that
64 is needed to match it. This is a lower bound; it does not mean there is a
65 string of that length that matches. In UTF8 mode, the result is in characters
69 re compiled pattern block
70 code pointer to start of group (the bracket)
71 startcode pointer to start of the whole pattern's code
72 options the compiling options
75 Returns: the minimum length
76 -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
77 -2 internal error (missing capturing bracket)
78 -3 internal error (opcode not listed)
82 find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
83 const pcre_uchar *startcode, int options, int recurse_depth)
86 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
87 BOOL utf = (options & PCRE_UTF8) != 0;
88 BOOL had_recurse = FALSE;
89 register int branchlength = 0;
90 register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
92 if (*code == OP_CBRA || *code == OP_SCBRA ||
93 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
95 /* Scan along the opcodes for this branch. If we get to the end of the
96 branch, check the length against that of the other branches. */
102 register pcre_uchar op = *cc;
109 /* If there is only one branch in a condition, the implied branch has zero
110 length, so we don't add anything. This covers the DEFINE "condition"
113 cs = cc + GET(cc, 1);
116 cc = cs + 1 + LINK_SIZE;
120 /* Otherwise we can fall through and treat it the same as any other
133 d = find_minlength(re, cc, startcode, options, recurse_depth);
136 do cc += GET(cc, 1); while (*cc == OP_ALT);
140 /* ACCEPT makes things far too complicated; we have to give up. */
143 case OP_ASSERT_ACCEPT:
146 /* Reached end of a branch; if it's a ket it is the end of a nested
147 call. If it's ALT it is an alternation in a nested call. If it is END it's
148 the end of the outer call. All can be handled by the same code. If an
149 ACCEPT was previously encountered, use the length that was in force at that
150 time, and pass back the shortest ACCEPT length. */
158 if (length < 0 || (!had_recurse && branchlength < length))
159 length = branchlength;
160 if (op != OP_ALT) return length;
166 /* Skip over assertive subpatterns */
171 case OP_ASSERTBACK_NOT:
172 do cc += GET(cc, 1); while (*cc == OP_ALT);
175 /* Skip over things that don't match chars */
192 case OP_NOT_WORD_BOUNDARY:
193 case OP_WORD_BOUNDARY:
194 cc += PRIV(OP_lengths)[*cc];
197 /* Skip over a subpattern that has a {0} or {0,x} quantifier */
203 cc += PRIV(OP_lengths)[*cc];
204 do cc += GET(cc, 1); while (*cc == OP_ALT);
208 /* Handle literal characters and + repetitions */
229 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
237 cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
240 /* Handle exact repetitions. The count is already in characters, but we
241 need to skip over a multibyte character in UTF8 mode. */
247 branchlength += GET2(cc,1);
250 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
255 branchlength += GET2(cc,1);
256 cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
257 || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
260 /* Handle single-char non-literal matchers */
269 case OP_NOT_WHITESPACE:
271 case OP_NOT_WORDCHAR:
284 /* "Any newline" might match two characters, but it also might match just
292 /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
293 non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
294 appear, but leave the code, just in case.) */
304 /* For repeated character types, we have to test for \p and \P, which have
305 an extra two bytes of parameters. */
310 case OP_TYPEMINQUERY:
312 case OP_TYPEPOSQUERY:
313 if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
314 cc += PRIV(OP_lengths)[op];
320 if (cc[1 + IMM2_SIZE] == OP_PROP
321 || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
322 cc += PRIV(OP_lengths)[op];
325 /* Check a class for variable quantification */
329 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
331 /* The original code caused an unsigned overflow in 64 bit systems,
332 so now we use a conditional statement. */
336 cc += PRIV(OP_lengths)[OP_CLASS];
338 cc += PRIV(OP_lengths)[OP_CLASS];
361 branchlength += GET2(cc,1);
362 cc += 1 + 2 * IMM2_SIZE;
371 /* Backreferences and subroutine calls are treated in the same way: we find
372 the minimum length for the subpattern. A recursion, however, causes an
373 a flag to be set that causes the length of this branch to be ignored. The
374 logic is that a recursion can only make sense if there is another
375 alternation that stops the recursing. That will provide the minimum length
376 (when no recursion happens). A backreference within the group that it is
377 referencing behaves in the same way.
379 If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
380 matches an empty string (by default it causes a matching failure), so in
381 that case we must set the minimum length to zero. */
383 case OP_DNREF: /* Duplicate named pattern back reference */
385 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
387 int count = GET2(cc, 1+IMM2_SIZE);
388 pcre_uchar *slot = (pcre_uchar *)re +
389 re->name_table_offset + GET2(cc, 1) * re->name_entry_size;
393 ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
394 if (cs == NULL) return -2;
395 do ce += GET(ce, 1); while (*ce == OP_ALT);
396 if (cc > cs && cc < ce)
404 int dd = find_minlength(re, cs, startcode, options, recurse_depth);
407 slot += re->name_entry_size;
411 cc += 1 + 2*IMM2_SIZE;
412 goto REPEAT_BACK_REFERENCE;
414 case OP_REF: /* Single back reference */
416 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
418 ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
419 if (cs == NULL) return -2;
420 do ce += GET(ce, 1); while (*ce == OP_ALT);
421 if (cc > cs && cc < ce)
428 d = find_minlength(re, cs, startcode, options, recurse_depth);
434 /* Handle repeated back references */
436 REPEAT_BACK_REFERENCE:
460 cc += 1 + 2 * IMM2_SIZE;
468 branchlength += min * d;
471 /* We can easily detect direct recursion, but not mutual recursion. This is
472 caught by a recursion depth count. */
475 cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
476 do ce += GET(ce, 1); while (*ce == OP_ALT);
477 if ((cc > cs && cc < ce) || recurse_depth > 10)
481 branchlength += find_minlength(re, cs, startcode, options,
487 /* Anything else does not or need not match a character. We can get the
488 item's length from the table, but for those that can match zero occurrences
489 of a character, we must take special action for UTF-8 characters. As it
490 happens, the "NOT" versions of these opcodes are used at present only for
491 ASCII characters, so they could be omitted from this list. However, in
492 future that may change, so we include them here so as not to leave a
493 gotcha for a future maintainer. */
528 case OP_NOTMINQUERYI:
532 case OP_NOTPOSQUERYI:
534 cc += PRIV(OP_lengths)[op];
536 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
540 /* Skip these, but we need to add in the name length. */
546 cc += PRIV(OP_lengths)[op] + cc[1];
549 /* The remaining opcodes are just skipped over. */
558 cc += PRIV(OP_lengths)[op];
561 /* This should not occur: we list all opcodes explicitly so that when
562 new ones get added they are properly considered. */
568 /* Control never gets here */
573 /*************************************************
574 * Set a bit and maybe its alternate case *
575 *************************************************/
577 /* Given a character, set its first byte's bit in the table, and also the
578 corresponding bit for the other version of a letter if we are caseless. In
579 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
580 when Unicode property support is available.
583 start_bits points to the bit map
584 p points to the character
585 caseless the caseless flag
586 cd the block with char table pointers
587 utf TRUE for UTF-8 / UTF-16 / UTF-32 mode
589 Returns: pointer after the character
592 static const pcre_uchar *
593 set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
594 compile_data *cd, BOOL utf)
609 c = UCD_OTHERCASE(c);
610 (void)PRIV(ord2utf)(c, buff);
613 #endif /* Not SUPPORT_UCP */
616 #else /* Not SUPPORT_UTF */
617 (void)(utf); /* Stops warning for unused parameter */
618 #endif /* SUPPORT_UTF */
620 /* Not UTF-8 mode, or character is less than 127. */
622 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
624 #endif /* COMPILE_PCRE8 */
626 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
641 c = UCD_OTHERCASE(c);
646 #endif /* SUPPORT_UCP */
649 #else /* Not SUPPORT_UTF */
650 (void)(utf); /* Stops warning for unused parameter */
651 #endif /* SUPPORT_UTF */
653 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
660 /*************************************************
661 * Set bits for a positive character type *
662 *************************************************/
664 /* This function sets starting bits for a character type. In UTF-8 mode, we can
665 only do a direct setting for bytes less than 128, as otherwise there can be
666 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
667 environment, the tables will only recognize ASCII characters anyway, but in at
668 least one Windows environment, some higher bytes bits were set in the tables.
669 So we deal with that case by considering the UTF-8 encoding.
672 start_bits the starting bitmap
673 cbit type the type of character wanted
674 table_limit 32 for non-UTF-8; 16 for UTF-8
675 cd the block with char table pointers
681 set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
684 register pcre_uint32 c;
685 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
686 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
687 if (table_limit == 32) return;
688 for (c = 128; c < 256; c++)
690 if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
693 (void)PRIV(ord2utf)(c, buff);
701 /*************************************************
702 * Set bits for a negative character type *
703 *************************************************/
705 /* This function sets starting bits for a negative character type such as \D.
706 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
707 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
708 Unlike in the positive case, where we can set appropriate starting bits for
709 specific high-valued UTF-8 characters, in this case we have to set the bits for
710 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
711 0xc0 (192) for simplicity.
714 start_bits the starting bitmap
715 cbit type the type of character wanted
716 table_limit 32 for non-UTF-8; 16 for UTF-8
717 cd the block with char table pointers
723 set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
726 register pcre_uint32 c;
727 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
728 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
729 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
735 /*************************************************
736 * Create bitmap of starting bytes *
737 *************************************************/
739 /* This function scans a compiled unanchored expression recursively and
740 attempts to build a bitmap of the set of possible starting bytes. As time goes
741 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
742 useful for parenthesized groups in patterns such as (a*)b where the group
743 provides some optional starting bytes but scanning must continue at the outer
744 level to find at least one mandatory byte. At the outermost level, this
745 function fails unless the result is SSB_DONE.
748 code points to an expression
749 start_bits points to a 32-byte table, initialized to 0
750 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
751 cd the block with char table pointers
753 Returns: SSB_FAIL => Failed to find any starting bytes
754 SSB_DONE => Found mandatory starting bytes
755 SSB_CONTINUE => Found optional starting bytes
756 SSB_UNKNOWN => Hit an unrecognized opcode
760 set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
763 register pcre_uint32 c;
764 int yield = SSB_DONE;
765 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
766 int table_limit = utf? 16:32;
768 int table_limit = 32;
772 /* ========================================================================= */
773 /* The following comment and code was inserted in January 1999. In May 2006,
774 when it was observed to cause compiler warnings about unused values, I took it
775 out again. If anybody is still using OS/2, they will have to put it back
778 /* This next statement and the later reference to dummy are here in order to
779 trick the optimizer of the IBM C compiler for OS/2 into generating correct
780 code. Apparently IBM isn't going to fix the problem, and we would rather not
781 disable optimization (in this module it actually makes a big difference, and
782 the pcre module can use all the optimization it can get). */
785 /* ========================================================================= */
790 BOOL try_next = TRUE;
791 const pcre_uchar *tcode = code + 1 + LINK_SIZE;
793 if (*code == OP_CBRA || *code == OP_SCBRA ||
794 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
796 while (try_next) /* Loop for items in this branch */
802 /* If we reach something we don't understand, it means a new opcode has
803 been created that hasn't been added to this code. Hopefully this problem
804 will be discovered during testing. */
809 /* Fail for a valid opcode that implies no starting bits. */
812 case OP_ASSERT_ACCEPT:
842 case OP_NOTMINQUERYI:
852 case OP_NOTPOSQUERYI:
884 /* We can ignore word boundary tests. */
886 case OP_WORD_BOUNDARY:
887 case OP_NOT_WORD_BOUNDARY:
891 /* If we hit a bracket or a positive lookahead assertion, recurse to set
892 bits from within the subpattern. If it can't find anything, we have to
893 give up. If it finds some mandatory character(s), we are done for this
894 branch. Otherwise, carry on scanning after the subpattern. */
907 rc = set_start_bits(tcode, start_bits, utf, cd);
908 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
909 if (rc == SSB_DONE) try_next = FALSE; else
911 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
912 tcode += 1 + LINK_SIZE;
916 /* If we hit ALT or KET, it means we haven't found anything mandatory in
917 this branch, though we might have found something optional. For ALT, we
918 continue with the next alternative, but we have to arrange that the final
919 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
920 return SSB_CONTINUE: if this is the top level, that indicates failure,
921 but after a nested subpattern, it causes scanning to continue. */
924 yield = SSB_CONTINUE;
934 /* Skip over callout */
937 tcode += 2 + 2*LINK_SIZE;
940 /* Skip over lookbehind and negative lookahead assertions */
944 case OP_ASSERTBACK_NOT:
945 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
946 tcode += 1 + LINK_SIZE;
949 /* BRAZERO does the bracket, but carries on. */
954 rc = set_start_bits(++tcode, start_bits, utf, cd);
955 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
956 /* =========================================================================
957 See the comment at the head of this function concerning the next line,
958 which was an old fudge for the benefit of OS/2.
960 ========================================================================= */
961 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
962 tcode += 1 + LINK_SIZE;
965 /* SKIPZERO skips the bracket. */
969 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
970 tcode += 1 + LINK_SIZE;
973 /* Single-char * or ? sets the bit and tries the next item */
981 tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
990 tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
993 /* Single-char upto sets the bit and tries the next */
998 tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
1004 tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
1007 /* At least one single char sets the bit and stops */
1016 (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1027 (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1031 /* Special spacing and line-terminating items. These recognize specific
1032 lists of characters. The difference between VSPACE and ANYNL is that the
1033 latter can match the two-character CRLF sequence, but that is not
1034 relevant for finding the first character, so their code here is
1039 SET_BIT(CHAR_SPACE);
1043 #ifdef COMPILE_PCRE8
1044 SET_BIT(0xC2); /* For U+00A0 */
1045 SET_BIT(0xE1); /* For U+1680, U+180E */
1046 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1047 SET_BIT(0xE3); /* For U+3000 */
1048 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1050 SET_BIT(0xFF); /* For characters > 255 */
1051 #endif /* COMPILE_PCRE[8|16|32] */
1054 #endif /* SUPPORT_UTF */
1058 #endif /* Not EBCDIC */
1059 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1060 SET_BIT(0xFF); /* For characters > 255 */
1061 #endif /* COMPILE_PCRE[16|32] */
1075 #ifdef COMPILE_PCRE8
1076 SET_BIT(0xC2); /* For U+0085 */
1077 SET_BIT(0xE2); /* For U+2028, U+2029 */
1078 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1080 SET_BIT(0xFF); /* For characters > 255 */
1081 #endif /* COMPILE_PCRE[8|16|32] */
1084 #endif /* SUPPORT_UTF */
1087 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1088 SET_BIT(0xFF); /* For characters > 255 */
1094 /* Single character types set the bits and stop. Note that if PCRE_UCP
1095 is set, we do not see these op codes because \d etc are converted to
1096 properties. Therefore, these apply in the case when only characters less
1097 than 256 are recognized to match the types. */
1100 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1105 set_type_bits(start_bits, cbit_digit, table_limit, cd);
1109 /* The cbit_space table has vertical tab as whitespace; we have to
1110 ensure it is set as not whitespace. Luckily, the code value is the same
1111 (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate bit. */
1113 case OP_NOT_WHITESPACE:
1114 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1115 start_bits[1] |= 0x08;
1119 /* The cbit_space table has vertical tab as whitespace; we have to not
1120 set it from the table. Luckily, the code value is the same (0x0b) in
1121 ASCII and EBCDIC, so we can just adjust the appropriate bit. */
1124 c = start_bits[1]; /* Save in case it was already set */
1125 set_type_bits(start_bits, cbit_space, table_limit, cd);
1126 start_bits[1] = (start_bits[1] & ~0x08) | c;
1130 case OP_NOT_WORDCHAR:
1131 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1136 set_type_bits(start_bits, cbit_word, table_limit, cd);
1140 /* One or more character type fudges the pointer and restarts, knowing
1141 it will hit a single character type and stop there. */
1144 case OP_TYPEMINPLUS:
1145 case OP_TYPEPOSPLUS:
1150 tcode += 1 + IMM2_SIZE;
1153 /* Zero or more repeats of character types set the bits and then
1157 case OP_TYPEMINUPTO:
1158 case OP_TYPEPOSUPTO:
1159 tcode += IMM2_SIZE; /* Fall through */
1162 case OP_TYPEMINSTAR:
1163 case OP_TYPEPOSSTAR:
1165 case OP_TYPEMINQUERY:
1166 case OP_TYPEPOSQUERY:
1176 SET_BIT(CHAR_SPACE);
1180 #ifdef COMPILE_PCRE8
1181 SET_BIT(0xC2); /* For U+00A0 */
1182 SET_BIT(0xE1); /* For U+1680, U+180E */
1183 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1184 SET_BIT(0xE3); /* For U+3000 */
1185 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1187 SET_BIT(0xFF); /* For characters > 255 */
1188 #endif /* COMPILE_PCRE[8|16|32] */
1191 #endif /* SUPPORT_UTF */
1194 #endif /* Not EBCDIC */
1206 #ifdef COMPILE_PCRE8
1207 SET_BIT(0xC2); /* For U+0085 */
1208 SET_BIT(0xE2); /* For U+2028, U+2029 */
1209 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1211 SET_BIT(0xFF); /* For characters > 255 */
1212 #endif /* COMPILE_PCRE16 */
1215 #endif /* SUPPORT_UTF */
1220 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1224 set_type_bits(start_bits, cbit_digit, table_limit, cd);
1227 /* The cbit_space table has vertical tab as whitespace; we no longer
1228 have to play fancy tricks because Perl added VT to its whitespace at
1229 release 5.18. PCRE added it at release 8.34. */
1231 case OP_NOT_WHITESPACE:
1232 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1236 set_type_bits(start_bits, cbit_space, table_limit, cd);
1239 case OP_NOT_WORDCHAR:
1240 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1244 set_type_bits(start_bits, cbit_word, table_limit, cd);
1251 /* Character class where all the information is in a bit map: set the
1252 bits and either carry on or not, according to the repeat count. If it was
1253 a negative class, and we are operating with UTF-8 characters, any byte
1254 with a value >= 0xc4 is a potentially valid starter because it starts a
1255 character with a value > 255. */
1257 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1259 if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0)
1261 /* All bits are set. */
1262 if ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0 && (tcode[1 + LINK_SIZE] & XCL_NOT) != 0)
1268 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1271 start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1272 memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1275 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1276 SET_BIT(0xFF); /* For characters > 255 */
1283 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1285 if (*tcode == OP_XCLASS)
1287 if ((tcode[1 + LINK_SIZE] & XCL_MAP) != 0)
1288 map = (pcre_uint8 *)(tcode + 1 + LINK_SIZE + 1);
1289 tcode += GET(tcode, 1);
1295 map = (pcre_uint8 *)tcode;
1296 tcode += 32 / sizeof(pcre_uchar);
1299 /* In UTF-8 mode, the bits in a bit map correspond to character
1300 values, not to byte values. However, the bit map we are constructing is
1301 for byte values. So we have to do a conversion for characters whose
1302 value is > 127. In fact, there are only two possible starting bytes for
1303 characters in the range 128 - 255. */
1305 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1309 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1312 for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1313 for (c = 128; c < 256; c++)
1315 if ((map[c/8] && (1 << (c&7))) != 0)
1317 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1318 start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
1319 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1326 /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1327 for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1331 /* Advance past the bit map, and act on what follows. For a zero
1332 minimum repeat, continue; otherwise stop processing. */
1348 if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1349 else try_next = FALSE;
1357 break; /* End of bitmap class handling */
1359 } /* End of switch */
1360 } /* End of try_next loop */
1362 code += GET(code, 1); /* Advance to next branch */
1364 while (*code == OP_ALT);
1372 /*************************************************
1373 * Study a compiled expression *
1374 *************************************************/
1376 /* This function is handed a compiled expression that it must study to produce
1377 information that will speed up the matching. It returns a pcre[16]_extra block
1378 which then gets handed back to pcre_exec().
1381 re points to the compiled expression
1382 options contains option bits
1383 errorptr points to where to place error messages;
1384 set NULL unless error
1386 Returns: pointer to a pcre[16]_extra block, with study_data filled in and
1387 the appropriate flags set;
1388 NULL on error or if no optimization possible
1391 #if defined COMPILE_PCRE8
1392 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1393 pcre_study(const pcre *external_re, int options, const char **errorptr)
1394 #elif defined COMPILE_PCRE16
1395 PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1396 pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1397 #elif defined COMPILE_PCRE32
1398 PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
1399 pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
1403 BOOL bits_set = FALSE;
1404 pcre_uint8 start_bits[32];
1405 PUBL(extra) *extra = NULL;
1406 pcre_study_data *study;
1407 const pcre_uint8 *tables;
1409 compile_data compile_block;
1410 const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1415 if (re == NULL || re->magic_number != MAGIC_NUMBER)
1417 *errorptr = "argument is not a compiled regular expression";
1421 if ((re->flags & PCRE_MODE) == 0)
1423 #if defined COMPILE_PCRE8
1424 *errorptr = "argument not compiled in 8 bit mode";
1425 #elif defined COMPILE_PCRE16
1426 *errorptr = "argument not compiled in 16 bit mode";
1427 #elif defined COMPILE_PCRE32
1428 *errorptr = "argument not compiled in 32 bit mode";
1433 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1435 *errorptr = "unknown or incorrect option bit(s) set";
1439 code = (pcre_uchar *)re + re->name_table_offset +
1440 (re->name_count * re->name_entry_size);
1442 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1443 a multiline pattern that matches only at "line starts", there is no point in
1444 seeking a list of starting bytes. */
1446 if ((re->options & PCRE_ANCHORED) == 0 &&
1447 (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1451 /* Set the character tables in the block that is passed around */
1453 tables = re->tables;
1455 #if defined COMPILE_PCRE8
1457 (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1459 #elif defined COMPILE_PCRE16
1461 (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1463 #elif defined COMPILE_PCRE32
1465 (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1469 compile_block.lcc = tables + lcc_offset;
1470 compile_block.fcc = tables + fcc_offset;
1471 compile_block.cbits = tables + cbits_offset;
1472 compile_block.ctypes = tables + ctypes_offset;
1474 /* See if we can find a fixed set of initial characters for the pattern. */
1476 memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1477 rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1479 bits_set = rc == SSB_DONE;
1480 if (rc == SSB_UNKNOWN)
1482 *errorptr = "internal error: opcode not recognized";
1487 /* Find the minimum length of subject string. */
1489 switch(min = find_minlength(re, code, code, re->options, 0))
1491 case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1492 case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1496 /* If a set of starting bytes has been identified, or if the minimum length is
1497 greater than zero, or if JIT optimization has been requested, or if
1498 PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1499 pcre_study_data block. The study data is put in the latter, which is pointed to
1500 by the former, which may also get additional data set later by the calling
1501 program. At the moment, the size of pcre_study_data is fixed. We nevertheless
1502 save it in a field for returning via the pcre_fullinfo() function so that if it
1503 becomes variable in the future, we don't have to change that code. */
1505 if (bits_set || min > 0 || (options & (
1507 PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
1508 PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
1510 PCRE_STUDY_EXTRA_NEEDED)) != 0)
1512 extra = (PUBL(extra) *)(PUBL(malloc))
1513 (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1516 *errorptr = "failed to get memory";
1520 study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1521 extra->flags = PCRE_EXTRA_STUDY_DATA;
1522 extra->study_data = study;
1524 study->size = sizeof(pcre_study_data);
1527 /* Set the start bits always, to avoid unset memory errors if the
1528 study data is written to a file, but set the flag only if any of the bits
1529 are set, to save time looking when none are. */
1533 study->flags |= PCRE_STUDY_MAPPED;
1534 memcpy(study->start_bits, start_bits, sizeof(start_bits));
1536 else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1541 pcre_uint8 *ptr = start_bits;
1544 printf("Start bits:\n");
1545 for (i = 0; i < 32; i++)
1546 printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1550 /* Always set the minlength value in the block, because the JIT compiler
1551 makes use of it. However, don't set the bit unless the length is greater than
1552 zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1553 checking the zero case. */
1557 study->flags |= PCRE_STUDY_MINLEN;
1558 study->minlength = min;
1560 else study->minlength = 0;
1562 /* If JIT support was compiled and requested, attempt the JIT compilation.
1563 If no starting bytes were found, and the minimum length is zero, and JIT
1564 compilation fails, abandon the extra block and return NULL, unless
1565 PCRE_STUDY_EXTRA_NEEDED is set. */
1568 extra->executable_jit = NULL;
1569 if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1570 PRIV(jit_compile)(re, extra, JIT_COMPILE);
1571 if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1572 PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1573 if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1574 PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1576 if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
1577 (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
1579 #if defined COMPILE_PCRE8
1580 pcre_free_study(extra);
1581 #elif defined COMPILE_PCRE16
1582 pcre16_free_study(extra);
1583 #elif defined COMPILE_PCRE32
1584 pcre32_free_study(extra);
1595 /*************************************************
1596 * Free the study data *
1597 *************************************************/
1599 /* This function frees the memory that was obtained by pcre_study().
1601 Argument: a pointer to the pcre[16]_extra block
1605 #if defined COMPILE_PCRE8
1607 pcre_free_study(pcre_extra *extra)
1608 #elif defined COMPILE_PCRE16
1610 pcre16_free_study(pcre16_extra *extra)
1611 #elif defined COMPILE_PCRE32
1613 pcre32_free_study(pcre32_extra *extra)
1619 if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1620 extra->executable_jit != NULL)
1621 PRIV(jit_free)(extra->executable_jit);
1626 /* End of pcre_study.c */