1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
41 /* This module contains the external function pcre_study(), along with local
42 supporting functions. */
49 #include "pcre_internal.h"
51 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
53 /* Returns from set_start_bits() */
55 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
59 /*************************************************
60 * Find the minimum subject length for a group *
61 *************************************************/
63 /* Scan a parenthesized group and compute the minimum length of subject that
64 is needed to match it. This is a lower bound; it does not mean there is a
65 string of that length that matches. In UTF8 mode, the result is in characters
69 re compiled pattern block
70 code pointer to start of group (the bracket)
71 startcode pointer to start of the whole pattern's code
72 options the compiling options
73 recurses chain of recurse_check to catch mutual recursion
74 countptr pointer to call count (to catch over complexity)
76 Returns: the minimum length
77 -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
78 -2 internal error (missing capturing bracket)
79 -3 internal error (opcode not listed)
83 find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
84 const pcre_uchar *startcode, int options, recurse_check *recurses,
88 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
89 BOOL utf = (options & PCRE_UTF8) != 0;
90 BOOL had_recurse = FALSE;
91 recurse_check this_recurse;
92 register int branchlength = 0;
93 register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
95 if ((*countptr)++ > 1000) return -1; /* too complex */
97 if (*code == OP_CBRA || *code == OP_SCBRA ||
98 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
100 /* Scan along the opcodes for this branch. If we get to the end of the
101 branch, check the length against that of the other branches. */
107 register pcre_uchar op = *cc;
114 /* If there is only one branch in a condition, the implied branch has zero
115 length, so we don't add anything. This covers the DEFINE "condition"
118 cs = cc + GET(cc, 1);
121 cc = cs + 1 + LINK_SIZE;
125 /* Otherwise we can fall through and treat it the same as any other
138 d = find_minlength(re, cc, startcode, options, recurses, countptr);
141 do cc += GET(cc, 1); while (*cc == OP_ALT);
145 /* ACCEPT makes things far too complicated; we have to give up. */
148 case OP_ASSERT_ACCEPT:
151 /* Reached end of a branch; if it's a ket it is the end of a nested
152 call. If it's ALT it is an alternation in a nested call. If it is END it's
153 the end of the outer call. All can be handled by the same code. If an
154 ACCEPT was previously encountered, use the length that was in force at that
155 time, and pass back the shortest ACCEPT length. */
163 if (length < 0 || (!had_recurse && branchlength < length))
164 length = branchlength;
165 if (op != OP_ALT) return length;
171 /* Skip over assertive subpatterns */
176 case OP_ASSERTBACK_NOT:
177 do cc += GET(cc, 1); while (*cc == OP_ALT);
180 /* Skip over things that don't match chars */
197 case OP_NOT_WORD_BOUNDARY:
198 case OP_WORD_BOUNDARY:
199 cc += PRIV(OP_lengths)[*cc];
202 /* Skip over a subpattern that has a {0} or {0,x} quantifier */
208 cc += PRIV(OP_lengths)[*cc];
209 do cc += GET(cc, 1); while (*cc == OP_ALT);
213 /* Handle literal characters and + repetitions */
234 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
242 cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
245 /* Handle exact repetitions. The count is already in characters, but we
246 need to skip over a multibyte character in UTF8 mode. */
252 branchlength += GET2(cc,1);
255 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
260 branchlength += GET2(cc,1);
261 cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
262 || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
265 /* Handle single-char non-literal matchers */
274 case OP_NOT_WHITESPACE:
276 case OP_NOT_WORDCHAR:
289 /* "Any newline" might match two characters, but it also might match just
297 /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
298 non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
299 appear, but leave the code, just in case.) */
309 /* For repeated character types, we have to test for \p and \P, which have
310 an extra two bytes of parameters. */
315 case OP_TYPEMINQUERY:
317 case OP_TYPEPOSQUERY:
318 if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
319 cc += PRIV(OP_lengths)[op];
325 if (cc[1 + IMM2_SIZE] == OP_PROP
326 || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
327 cc += PRIV(OP_lengths)[op];
330 /* Check a class for variable quantification */
334 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
336 /* The original code caused an unsigned overflow in 64 bit systems,
337 so now we use a conditional statement. */
341 cc += PRIV(OP_lengths)[OP_CLASS];
343 cc += PRIV(OP_lengths)[OP_CLASS];
366 branchlength += GET2(cc,1);
367 cc += 1 + 2 * IMM2_SIZE;
376 /* Backreferences and subroutine calls are treated in the same way: we find
377 the minimum length for the subpattern. A recursion, however, causes an
378 a flag to be set that causes the length of this branch to be ignored. The
379 logic is that a recursion can only make sense if there is another
380 alternation that stops the recursing. That will provide the minimum length
381 (when no recursion happens). A backreference within the group that it is
382 referencing behaves in the same way.
384 If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
385 matches an empty string (by default it causes a matching failure), so in
386 that case we must set the minimum length to zero. */
388 case OP_DNREF: /* Duplicate named pattern back reference */
390 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
392 int count = GET2(cc, 1+IMM2_SIZE);
393 pcre_uchar *slot = (pcre_uchar *)re +
394 re->name_table_offset + GET2(cc, 1) * re->name_entry_size;
398 ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
399 if (cs == NULL) return -2;
400 do ce += GET(ce, 1); while (*ce == OP_ALT);
401 if (cc > cs && cc < ce) /* Simple recursion */
409 recurse_check *r = recurses;
410 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
411 if (r != NULL) /* Mutual recursion */
420 this_recurse.prev = recurses;
421 this_recurse.group = cs;
422 dd = find_minlength(re, cs, startcode, options, &this_recurse,
427 slot += re->name_entry_size;
431 cc += 1 + 2*IMM2_SIZE;
432 goto REPEAT_BACK_REFERENCE;
434 case OP_REF: /* Single back reference */
436 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
438 ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
439 if (cs == NULL) return -2;
440 do ce += GET(ce, 1); while (*ce == OP_ALT);
441 if (cc > cs && cc < ce) /* Simple recursion */
448 recurse_check *r = recurses;
449 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
450 if (r != NULL) /* Mutual recursion */
457 this_recurse.prev = recurses;
458 this_recurse.group = cs;
459 d = find_minlength(re, cs, startcode, options, &this_recurse,
467 /* Handle repeated back references */
469 REPEAT_BACK_REFERENCE:
493 cc += 1 + 2 * IMM2_SIZE;
501 branchlength += min * d;
504 /* We can easily detect direct recursion, but not mutual recursion. This is
505 caught by a recursion depth count. */
508 cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
509 do ce += GET(ce, 1); while (*ce == OP_ALT);
510 if (cc > cs && cc < ce) /* Simple recursion */
514 recurse_check *r = recurses;
515 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
516 if (r != NULL) /* Mutual recursion */
520 this_recurse.prev = recurses;
521 this_recurse.group = cs;
522 branchlength += find_minlength(re, cs, startcode, options,
523 &this_recurse, countptr);
529 /* Anything else does not or need not match a character. We can get the
530 item's length from the table, but for those that can match zero occurrences
531 of a character, we must take special action for UTF-8 characters. As it
532 happens, the "NOT" versions of these opcodes are used at present only for
533 ASCII characters, so they could be omitted from this list. However, in
534 future that may change, so we include them here so as not to leave a
535 gotcha for a future maintainer. */
570 case OP_NOTMINQUERYI:
574 case OP_NOTPOSQUERYI:
576 cc += PRIV(OP_lengths)[op];
578 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
582 /* Skip these, but we need to add in the name length. */
588 cc += PRIV(OP_lengths)[op] + cc[1];
591 /* The remaining opcodes are just skipped over. */
600 cc += PRIV(OP_lengths)[op];
603 /* This should not occur: we list all opcodes explicitly so that when
604 new ones get added they are properly considered. */
610 /* Control never gets here */
615 /*************************************************
616 * Set a bit and maybe its alternate case *
617 *************************************************/
619 /* Given a character, set its first byte's bit in the table, and also the
620 corresponding bit for the other version of a letter if we are caseless. In
621 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
622 when Unicode property support is available.
625 start_bits points to the bit map
626 p points to the character
627 caseless the caseless flag
628 cd the block with char table pointers
629 utf TRUE for UTF-8 / UTF-16 / UTF-32 mode
631 Returns: pointer after the character
634 static const pcre_uchar *
635 set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
636 compile_data *cd, BOOL utf)
651 c = UCD_OTHERCASE(c);
652 (void)PRIV(ord2utf)(c, buff);
655 #endif /* Not SUPPORT_UCP */
658 #else /* Not SUPPORT_UTF */
659 (void)(utf); /* Stops warning for unused parameter */
660 #endif /* SUPPORT_UTF */
662 /* Not UTF-8 mode, or character is less than 127. */
664 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
666 #endif /* COMPILE_PCRE8 */
668 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
683 c = UCD_OTHERCASE(c);
688 #endif /* SUPPORT_UCP */
691 #else /* Not SUPPORT_UTF */
692 (void)(utf); /* Stops warning for unused parameter */
693 #endif /* SUPPORT_UTF */
695 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
702 /*************************************************
703 * Set bits for a positive character type *
704 *************************************************/
706 /* This function sets starting bits for a character type. In UTF-8 mode, we can
707 only do a direct setting for bytes less than 128, as otherwise there can be
708 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
709 environment, the tables will only recognize ASCII characters anyway, but in at
710 least one Windows environment, some higher bytes bits were set in the tables.
711 So we deal with that case by considering the UTF-8 encoding.
714 start_bits the starting bitmap
715 cbit type the type of character wanted
716 table_limit 32 for non-UTF-8; 16 for UTF-8
717 cd the block with char table pointers
723 set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
726 register pcre_uint32 c;
727 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
728 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
729 if (table_limit == 32) return;
730 for (c = 128; c < 256; c++)
732 if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
735 (void)PRIV(ord2utf)(c, buff);
743 /*************************************************
744 * Set bits for a negative character type *
745 *************************************************/
747 /* This function sets starting bits for a negative character type such as \D.
748 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
749 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
750 Unlike in the positive case, where we can set appropriate starting bits for
751 specific high-valued UTF-8 characters, in this case we have to set the bits for
752 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
753 0xc0 (192) for simplicity.
756 start_bits the starting bitmap
757 cbit type the type of character wanted
758 table_limit 32 for non-UTF-8; 16 for UTF-8
759 cd the block with char table pointers
765 set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
768 register pcre_uint32 c;
769 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
770 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
771 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
777 /*************************************************
778 * Create bitmap of starting bytes *
779 *************************************************/
781 /* This function scans a compiled unanchored expression recursively and
782 attempts to build a bitmap of the set of possible starting bytes. As time goes
783 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
784 useful for parenthesized groups in patterns such as (a*)b where the group
785 provides some optional starting bytes but scanning must continue at the outer
786 level to find at least one mandatory byte. At the outermost level, this
787 function fails unless the result is SSB_DONE.
790 code points to an expression
791 start_bits points to a 32-byte table, initialized to 0
792 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
793 cd the block with char table pointers
795 Returns: SSB_FAIL => Failed to find any starting bytes
796 SSB_DONE => Found mandatory starting bytes
797 SSB_CONTINUE => Found optional starting bytes
798 SSB_UNKNOWN => Hit an unrecognized opcode
802 set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
805 register pcre_uint32 c;
806 int yield = SSB_DONE;
807 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
808 int table_limit = utf? 16:32;
810 int table_limit = 32;
814 /* ========================================================================= */
815 /* The following comment and code was inserted in January 1999. In May 2006,
816 when it was observed to cause compiler warnings about unused values, I took it
817 out again. If anybody is still using OS/2, they will have to put it back
820 /* This next statement and the later reference to dummy are here in order to
821 trick the optimizer of the IBM C compiler for OS/2 into generating correct
822 code. Apparently IBM isn't going to fix the problem, and we would rather not
823 disable optimization (in this module it actually makes a big difference, and
824 the pcre module can use all the optimization it can get). */
827 /* ========================================================================= */
832 BOOL try_next = TRUE;
833 const pcre_uchar *tcode = code + 1 + LINK_SIZE;
835 if (*code == OP_CBRA || *code == OP_SCBRA ||
836 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
838 while (try_next) /* Loop for items in this branch */
844 /* If we reach something we don't understand, it means a new opcode has
845 been created that hasn't been added to this code. Hopefully this problem
846 will be discovered during testing. */
851 /* Fail for a valid opcode that implies no starting bits. */
854 case OP_ASSERT_ACCEPT:
884 case OP_NOTMINQUERYI:
894 case OP_NOTPOSQUERYI:
925 /* A "real" property test implies no starting bits, but the fake property
926 PT_CLIST identifies a list of characters. These lists are short, as they
927 are used for characters with more than one "other case", so there is no
928 point in recognizing them for OP_NOTPROP. */
931 if (tcode[1] != PT_CLIST) return SSB_FAIL;
933 const pcre_uint32 *p = PRIV(ucd_caseless_sets) + tcode[2];
934 while ((c = *p++) < NOTACHAR)
936 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
940 (void)PRIV(ord2utf)(c, buff);
944 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
950 /* We can ignore word boundary tests. */
952 case OP_WORD_BOUNDARY:
953 case OP_NOT_WORD_BOUNDARY:
957 /* If we hit a bracket or a positive lookahead assertion, recurse to set
958 bits from within the subpattern. If it can't find anything, we have to
959 give up. If it finds some mandatory character(s), we are done for this
960 branch. Otherwise, carry on scanning after the subpattern. */
973 rc = set_start_bits(tcode, start_bits, utf, cd);
974 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
975 if (rc == SSB_DONE) try_next = FALSE; else
977 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
978 tcode += 1 + LINK_SIZE;
982 /* If we hit ALT or KET, it means we haven't found anything mandatory in
983 this branch, though we might have found something optional. For ALT, we
984 continue with the next alternative, but we have to arrange that the final
985 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
986 return SSB_CONTINUE: if this is the top level, that indicates failure,
987 but after a nested subpattern, it causes scanning to continue. */
990 yield = SSB_CONTINUE;
1000 /* Skip over callout */
1003 tcode += 2 + 2*LINK_SIZE;
1006 /* Skip over lookbehind and negative lookahead assertions */
1010 case OP_ASSERTBACK_NOT:
1011 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1012 tcode += 1 + LINK_SIZE;
1015 /* BRAZERO does the bracket, but carries on. */
1020 rc = set_start_bits(++tcode, start_bits, utf, cd);
1021 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1022 /* =========================================================================
1023 See the comment at the head of this function concerning the next line,
1024 which was an old fudge for the benefit of OS/2.
1026 ========================================================================= */
1027 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1028 tcode += 1 + LINK_SIZE;
1031 /* SKIPZERO skips the bracket. */
1035 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1036 tcode += 1 + LINK_SIZE;
1039 /* Single-char * or ? sets the bit and tries the next item */
1047 tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1056 tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1059 /* Single-char upto sets the bit and tries the next */
1064 tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
1070 tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
1073 /* At least one single char sets the bit and stops */
1082 (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1093 (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1097 /* Special spacing and line-terminating items. These recognize specific
1098 lists of characters. The difference between VSPACE and ANYNL is that the
1099 latter can match the two-character CRLF sequence, but that is not
1100 relevant for finding the first character, so their code here is
1105 SET_BIT(CHAR_SPACE);
1109 #ifdef COMPILE_PCRE8
1110 SET_BIT(0xC2); /* For U+00A0 */
1111 SET_BIT(0xE1); /* For U+1680, U+180E */
1112 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1113 SET_BIT(0xE3); /* For U+3000 */
1114 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1116 SET_BIT(0xFF); /* For characters > 255 */
1117 #endif /* COMPILE_PCRE[8|16|32] */
1120 #endif /* SUPPORT_UTF */
1124 #endif /* Not EBCDIC */
1125 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1126 SET_BIT(0xFF); /* For characters > 255 */
1127 #endif /* COMPILE_PCRE[16|32] */
1141 #ifdef COMPILE_PCRE8
1142 SET_BIT(0xC2); /* For U+0085 */
1143 SET_BIT(0xE2); /* For U+2028, U+2029 */
1144 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1146 SET_BIT(0xFF); /* For characters > 255 */
1147 #endif /* COMPILE_PCRE[8|16|32] */
1150 #endif /* SUPPORT_UTF */
1153 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1154 SET_BIT(0xFF); /* For characters > 255 */
1160 /* Single character types set the bits and stop. Note that if PCRE_UCP
1161 is set, we do not see these op codes because \d etc are converted to
1162 properties. Therefore, these apply in the case when only characters less
1163 than 256 are recognized to match the types. */
1166 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1171 set_type_bits(start_bits, cbit_digit, table_limit, cd);
1175 /* The cbit_space table has vertical tab as whitespace; we no longer
1176 have to play fancy tricks because Perl added VT to its whitespace at
1177 release 5.18. PCRE added it at release 8.34. */
1179 case OP_NOT_WHITESPACE:
1180 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1185 set_type_bits(start_bits, cbit_space, table_limit, cd);
1189 case OP_NOT_WORDCHAR:
1190 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1195 set_type_bits(start_bits, cbit_word, table_limit, cd);
1199 /* One or more character type fudges the pointer and restarts, knowing
1200 it will hit a single character type and stop there. */
1203 case OP_TYPEMINPLUS:
1204 case OP_TYPEPOSPLUS:
1209 tcode += 1 + IMM2_SIZE;
1212 /* Zero or more repeats of character types set the bits and then
1216 case OP_TYPEMINUPTO:
1217 case OP_TYPEPOSUPTO:
1218 tcode += IMM2_SIZE; /* Fall through */
1221 case OP_TYPEMINSTAR:
1222 case OP_TYPEPOSSTAR:
1224 case OP_TYPEMINQUERY:
1225 case OP_TYPEPOSQUERY:
1235 SET_BIT(CHAR_SPACE);
1239 #ifdef COMPILE_PCRE8
1240 SET_BIT(0xC2); /* For U+00A0 */
1241 SET_BIT(0xE1); /* For U+1680, U+180E */
1242 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1243 SET_BIT(0xE3); /* For U+3000 */
1244 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1246 SET_BIT(0xFF); /* For characters > 255 */
1247 #endif /* COMPILE_PCRE[8|16|32] */
1250 #endif /* SUPPORT_UTF */
1253 #endif /* Not EBCDIC */
1265 #ifdef COMPILE_PCRE8
1266 SET_BIT(0xC2); /* For U+0085 */
1267 SET_BIT(0xE2); /* For U+2028, U+2029 */
1268 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1270 SET_BIT(0xFF); /* For characters > 255 */
1271 #endif /* COMPILE_PCRE16 */
1274 #endif /* SUPPORT_UTF */
1279 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1283 set_type_bits(start_bits, cbit_digit, table_limit, cd);
1286 /* The cbit_space table has vertical tab as whitespace; we no longer
1287 have to play fancy tricks because Perl added VT to its whitespace at
1288 release 5.18. PCRE added it at release 8.34. */
1290 case OP_NOT_WHITESPACE:
1291 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1295 set_type_bits(start_bits, cbit_space, table_limit, cd);
1298 case OP_NOT_WORDCHAR:
1299 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1303 set_type_bits(start_bits, cbit_word, table_limit, cd);
1310 /* Character class where all the information is in a bit map: set the
1311 bits and either carry on or not, according to the repeat count. If it was
1312 a negative class, and we are operating with UTF-8 characters, any byte
1313 with a value >= 0xc4 is a potentially valid starter because it starts a
1314 character with a value > 255. */
1316 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1318 if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0)
1320 /* All bits are set. */
1321 if ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0 && (tcode[1 + LINK_SIZE] & XCL_NOT) != 0)
1327 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1330 start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1331 memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1334 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1335 SET_BIT(0xFF); /* For characters > 255 */
1342 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1344 if (*tcode == OP_XCLASS)
1346 if ((tcode[1 + LINK_SIZE] & XCL_MAP) != 0)
1347 map = (pcre_uint8 *)(tcode + 1 + LINK_SIZE + 1);
1348 tcode += GET(tcode, 1);
1354 map = (pcre_uint8 *)tcode;
1355 tcode += 32 / sizeof(pcre_uchar);
1358 /* In UTF-8 mode, the bits in a bit map correspond to character
1359 values, not to byte values. However, the bit map we are constructing is
1360 for byte values. So we have to do a conversion for characters whose
1361 value is > 127. In fact, there are only two possible starting bytes for
1362 characters in the range 128 - 255. */
1364 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1368 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1371 for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1372 for (c = 128; c < 256; c++)
1374 if ((map[c/8] & (1 << (c&7))) != 0)
1376 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1377 start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
1378 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1385 /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1386 for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1390 /* Advance past the bit map, and act on what follows. For a zero
1391 minimum repeat, continue; otherwise stop processing. */
1407 if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1408 else try_next = FALSE;
1416 break; /* End of bitmap class handling */
1418 } /* End of switch */
1419 } /* End of try_next loop */
1421 code += GET(code, 1); /* Advance to next branch */
1423 while (*code == OP_ALT);
1431 /*************************************************
1432 * Study a compiled expression *
1433 *************************************************/
1435 /* This function is handed a compiled expression that it must study to produce
1436 information that will speed up the matching. It returns a pcre[16]_extra block
1437 which then gets handed back to pcre_exec().
1440 re points to the compiled expression
1441 options contains option bits
1442 errorptr points to where to place error messages;
1443 set NULL unless error
1445 Returns: pointer to a pcre[16]_extra block, with study_data filled in and
1446 the appropriate flags set;
1447 NULL on error or if no optimization possible
1450 #if defined COMPILE_PCRE8
1451 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1452 pcre_study(const pcre *external_re, int options, const char **errorptr)
1453 #elif defined COMPILE_PCRE16
1454 PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1455 pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1456 #elif defined COMPILE_PCRE32
1457 PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
1458 pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
1463 BOOL bits_set = FALSE;
1464 pcre_uint8 start_bits[32];
1465 PUBL(extra) *extra = NULL;
1466 pcre_study_data *study;
1467 const pcre_uint8 *tables;
1469 compile_data compile_block;
1470 const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1475 if (re == NULL || re->magic_number != MAGIC_NUMBER)
1477 *errorptr = "argument is not a compiled regular expression";
1481 if ((re->flags & PCRE_MODE) == 0)
1483 #if defined COMPILE_PCRE8
1484 *errorptr = "argument not compiled in 8 bit mode";
1485 #elif defined COMPILE_PCRE16
1486 *errorptr = "argument not compiled in 16 bit mode";
1487 #elif defined COMPILE_PCRE32
1488 *errorptr = "argument not compiled in 32 bit mode";
1493 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1495 *errorptr = "unknown or incorrect option bit(s) set";
1499 code = (pcre_uchar *)re + re->name_table_offset +
1500 (re->name_count * re->name_entry_size);
1502 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1503 a multiline pattern that matches only at "line starts", there is no point in
1504 seeking a list of starting bytes. */
1506 if ((re->options & PCRE_ANCHORED) == 0 &&
1507 (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1511 /* Set the character tables in the block that is passed around */
1513 tables = re->tables;
1515 #if defined COMPILE_PCRE8
1517 (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1519 #elif defined COMPILE_PCRE16
1521 (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1523 #elif defined COMPILE_PCRE32
1525 (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1529 compile_block.lcc = tables + lcc_offset;
1530 compile_block.fcc = tables + fcc_offset;
1531 compile_block.cbits = tables + cbits_offset;
1532 compile_block.ctypes = tables + ctypes_offset;
1534 /* See if we can find a fixed set of initial characters for the pattern. */
1536 memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1537 rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1539 bits_set = rc == SSB_DONE;
1540 if (rc == SSB_UNKNOWN)
1542 *errorptr = "internal error: opcode not recognized";
1547 /* Find the minimum length of subject string. */
1549 switch(min = find_minlength(re, code, code, re->options, NULL, &count))
1551 case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1552 case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1556 /* If a set of starting bytes has been identified, or if the minimum length is
1557 greater than zero, or if JIT optimization has been requested, or if
1558 PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1559 pcre_study_data block. The study data is put in the latter, which is pointed to
1560 by the former, which may also get additional data set later by the calling
1561 program. At the moment, the size of pcre_study_data is fixed. We nevertheless
1562 save it in a field for returning via the pcre_fullinfo() function so that if it
1563 becomes variable in the future, we don't have to change that code. */
1565 if (bits_set || min > 0 || (options & (
1567 PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
1568 PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
1570 PCRE_STUDY_EXTRA_NEEDED)) != 0)
1572 extra = (PUBL(extra) *)(PUBL(malloc))
1573 (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1576 *errorptr = "failed to get memory";
1580 study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1581 extra->flags = PCRE_EXTRA_STUDY_DATA;
1582 extra->study_data = study;
1584 study->size = sizeof(pcre_study_data);
1587 /* Set the start bits always, to avoid unset memory errors if the
1588 study data is written to a file, but set the flag only if any of the bits
1589 are set, to save time looking when none are. */
1593 study->flags |= PCRE_STUDY_MAPPED;
1594 memcpy(study->start_bits, start_bits, sizeof(start_bits));
1596 else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1601 pcre_uint8 *ptr = start_bits;
1604 printf("Start bits:\n");
1605 for (i = 0; i < 32; i++)
1606 printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1610 /* Always set the minlength value in the block, because the JIT compiler
1611 makes use of it. However, don't set the bit unless the length is greater than
1612 zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1613 checking the zero case. */
1617 study->flags |= PCRE_STUDY_MINLEN;
1618 study->minlength = min;
1620 else study->minlength = 0;
1622 /* If JIT support was compiled and requested, attempt the JIT compilation.
1623 If no starting bytes were found, and the minimum length is zero, and JIT
1624 compilation fails, abandon the extra block and return NULL, unless
1625 PCRE_STUDY_EXTRA_NEEDED is set. */
1628 extra->executable_jit = NULL;
1629 if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1630 PRIV(jit_compile)(re, extra, JIT_COMPILE);
1631 if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1632 PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1633 if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1634 PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1636 if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
1637 (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
1639 #if defined COMPILE_PCRE8
1640 pcre_free_study(extra);
1641 #elif defined COMPILE_PCRE16
1642 pcre16_free_study(extra);
1643 #elif defined COMPILE_PCRE32
1644 pcre32_free_study(extra);
1655 /*************************************************
1656 * Free the study data *
1657 *************************************************/
1659 /* This function frees the memory that was obtained by pcre_study().
1661 Argument: a pointer to the pcre[16]_extra block
1665 #if defined COMPILE_PCRE8
1667 pcre_free_study(pcre_extra *extra)
1668 #elif defined COMPILE_PCRE16
1670 pcre16_free_study(pcre16_extra *extra)
1671 #elif defined COMPILE_PCRE32
1673 pcre32_free_study(pcre32_extra *extra)
1679 if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1680 extra->executable_jit != NULL)
1681 PRIV(jit_free)(extra->executable_jit);
1686 /* End of pcre_study.c */