1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Copyright (c) 1997-2014 University of Cambridge
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
/* These three names are defined ahead of the pcre_internal.h include that
follows. In this file "md" is the match_data pointer parameter, and
start_subject / end_subject are fields read through it.
NOTE(review): presumably macros in pcre_internal.h expand these names - confirm
against that header. */
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
52 #include "pcre_internal.h"
54 /* Undefine some potentially clashing cpp symbols */
59 /* The md->capture_last field uses the lower 16 bits for the last captured
60 substring (which can never be greater than 65535) and a bit in the top half
61 to mean "capture vector overflowed". This odd way of doing things was
62 implemented when it was realized that preserving and restoring the overflow bit
63 whenever the last capture number was saved/restored made for a neater
64 interface, and doing it this way saved on (a) another variable, which would
65 have increased the stack frame size (a big NO-NO in PCRE) and (b) another
66 separate set of save/restore instructions. The defines below implement this. */
69 #define CAPLMASK 0x0000ffff /* Low 16 bits of md->capture_last: number of the last captured substring */
70 #define OVFLMASK 0xffff0000 /* High 16 bits of md->capture_last: the part that carries the overflow flag */
71 #define OVFLBIT 0x00010000 /* The one bit within OVFLMASK that is set for overflow */
73 /* Values for setting in md->match_function_type to indicate two special types
74 of call to match(). We do it this way to save on using another stack variable,
75 as stack usage is to be discouraged. */
/* A caller stores one of these in md->match_function_type just before the
RMATCH it applies to; match() resets the field to 0 once it has acted on
MATCH_CBEGROUP. */
77 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
78 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
80 /* Non-error returns from the match() function. Error returns are externally
81 defined PCRE_ERROR_xxx codes, which are all negative. */
84 #define MATCH_NOMATCH 0 /* Failed to match; non-negative, unlike the error returns */
86 /* Special internal returns from the match() function. Make them sufficiently
87 negative to avoid the external error codes. */
89 #define MATCH_ACCEPT (-999)
90 #define MATCH_KETRPOS (-998)
91 #define MATCH_ONCE (-997) /* End of an atomic group was reached, but later matching failed */
92 /* The next 5 must be kept together and in sequence so that a test that checks
93 for any one of them can use a range. */
94 #define MATCH_COMMIT (-996) /* Passed back when matching after the verb gave MATCH_NOMATCH */
95 #define MATCH_PRUNE (-995) /* Passed back when matching after the verb gave MATCH_NOMATCH */
96 #define MATCH_SKIP (-994) /* Subject position is passed back in md->start_match_ptr */
97 #define MATCH_SKIP_ARG (-993) /* Skip name is passed back in md->start_match_ptr */
98 #define MATCH_THEN (-992) /* Opcode address is passed back in md->start_match_ptr */
99 #define MATCH_BACKTRACK_MAX MATCH_THEN /* Upper end of the range, -992 */
100 #define MATCH_BACKTRACK_MIN MATCH_COMMIT /* Lower end of the range, -996 */
102 /* Maximum number of ints of offset to save on the stack for recursive calls.
103 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
104 because the offset vector is always a multiple of 3 long. */
106 #define REC_STACK_SAVE_MAX 30 /* Dimension of stacksave[] and of Xstacksave[] in the heap frame; 30 = 10 triples */
108 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
/* The two tables have 11 entries each and are read in parallel with the same
index.
NOTE(review): presumably the index is a repeat opcode's offset from the first
simple-repeat opcode - confirm against the opcode list in pcre_internal.h. */
110 static const char rep_min[] = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, };
111 static const char rep_max[] = { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, };
114 /*************************************************
115 * Debugging function to print chars *
116 *************************************************/
118 /* Print a sequence of chars in printable format, stopping at the end of the
119 subject if the requested length would run past it.
122 p points to characters
123 length number to print
124 is_subject TRUE if printing from within md->start_subject
125 md pointer to matching data block, if is_subject is TRUE
131 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
135 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
137 if (isprint(c = UCHAR21INCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
143 /*************************************************
144 * Match a back-reference *
145 *************************************************/
147 /* Normally, if a back reference hasn't been set, the length that is passed is
148 negative, so the match always fails. However, in JavaScript compatibility mode,
149 the length passed is zero. Note that in caseless UTF-8 mode, the number of
150 subject bytes matched may be different to the number of reference bytes.
153 offset index into the offset vector
154 eptr pointer into the subject
155 length length of reference to be matched (number of bytes)
156 md points to match data block
157 caseless TRUE if caseless
159 Returns: >= 0 the number of subject bytes matched
161 -2 partial match; always given if at end of subject
165 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
168 PCRE_PUCHAR eptr_start = eptr;
169 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
170 #if defined SUPPORT_UTF && defined SUPPORT_UCP
175 if (eptr >= md->end_subject)
176 printf("matching subject <null>");
179 printf("matching subject ");
180 pchars(eptr, length, TRUE, md);
182 printf(" against backref ");
183 pchars(p, length, FALSE, md);
187 /* Always fail if reference not set (and not JavaScript compatible - in that
188 case the length is passed as zero). */
190 if (length < 0) return -1;
192 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
193 properly if Unicode properties are supported. Otherwise, we can check only
198 #if defined SUPPORT_UTF && defined SUPPORT_UCP
201 /* Match characters up to the end of the reference. NOTE: the number of
202 data units matched may differ, because in UTF-8 there are some characters
203 whose upper and lower case versions have different numbers of bytes.
204 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
205 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
206 sequence of two of the latter. It is important, therefore, to check the
207 length along the reference, not along the subject (earlier code did this
210 PCRE_PUCHAR endptr = p + length;
214 const ucd_record *ur;
215 if (eptr >= md->end_subject) return -2; /* Partial match */
219 if (c != d && c != d + ur->other_case)
221 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
224 if (c < *pp) return -1;
225 if (c == *pp++) break;
233 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
234 is no UCP support. */
239 if (eptr >= md->end_subject) return -2; /* Partial match */
240 cc = UCHAR21TEST(eptr);
242 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
249 /* In the caseful case, we can just compare the bytes, whether or not we
250 are in UTF-8 mode. */
256 if (eptr >= md->end_subject) return -2; /* Partial match */
257 if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1;
261 return (int)(eptr - eptr_start);
266 /***************************************************************************
267 ****************************************************************************
268 RECURSION IN THE match() FUNCTION
270 The match() function is highly recursive, though not every recursive call
271 increases the recursive depth. Nevertheless, some regular expressions can cause
272 it to recurse to a great depth. I was writing for Unix, so I just let it call
273 itself recursively. This uses the stack for saving everything that has to be
274 saved for a recursive call. On Unix, the stack can be large, and this works
277 It turns out that on some non-Unix-like systems there are problems with
278 programs that use a lot of stack. (This despite the fact that every last chip
279 has oodles of memory these days, and techniques for extending the stack have
280 been known for decades.) So....
282 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
283 calls by keeping local variables that need to be preserved in blocks of memory
284 obtained from malloc() instead of on the stack. Macros are used to
285 achieve this so that the actual code doesn't look very different to what it
288 The original heap-recursive code used longjmp(). However, it seems that this
289 can be very slow on some operating systems. Following a suggestion from Stan
290 Switzer, the use of longjmp() has been abolished, at the cost of having to
291 provide a unique number for each call to RMATCH. There is no way of generating
292 a sequence of numbers at compile time in C. I have given them names, to make
293 them stand out more clearly.
295 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
296 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
297 tests. Furthermore, not using longjmp() means that local dynamic variables
298 don't have indeterminate values; this has meant that the frame size can be
299 reduced because the result can be "passed back" by straight setting of the
300 variable instead of being passed in the frame.
301 ****************************************************************************
302 ***************************************************************************/
304 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
305 below must be updated in sync. */
/* Labels are numbered consecutively from RM1 = 1 up to RM67 = 67. Each call of
RMATCH passes its own label as the final ("rw") argument. */
307 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
308 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
309 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
310 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
311 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
312 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
313 RM61, RM62, RM63, RM64, RM65, RM66, RM67 };
315 /* These versions of the macros use the stack, as normal. There are debugging
316 versions and production versions. Note that the "rw" argument of RMATCH isn't
317 actually used in this definition. */
320 #define REGISTER register /* Qualifies the eptr and ecode parameters in match()'s declaration */
323 #define RMATCH(ra,rb,rc,rd,re,rw) \
325 printf("match() called in line %d\n", __LINE__); \
326 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
327 printf("to line %d\n", __LINE__); \
329 #define RRETURN(ra) \
331 printf("match() returned %d from line %d\n", ra, __LINE__); \
/* Forms without debugging output. RMATCH is a direct recursive call of match()
whose result is left in rrc; mstart and rdepth are picked up from the enclosing
scope (rdepth is passed on incremented by one) and "rw" is ignored. RRETURN is
a plain return of its argument. */
335 #define RMATCH(ra,rb,rc,rd,re,rw) \
336 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
337 #define RRETURN(ra) return ra
343 /* These versions of the macros manage a private stack on the heap. Note that
344 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
345 argument of match(), which never changes. */
349 #define RMATCH(ra,rb,rc,rd,re,rw)\
351 heapframe *newframe = frame->Xnextframe;\
352 if (newframe == NULL)\
354 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
355 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
356 newframe->Xnextframe = NULL;\
357 frame->Xnextframe = newframe;\
360 newframe->Xeptr = ra;\
361 newframe->Xecode = rb;\
362 newframe->Xmstart = mstart;\
363 newframe->Xoffset_top = rc;\
364 newframe->Xeptrb = re;\
365 newframe->Xrdepth = frame->Xrdepth + 1;\
366 newframe->Xprevframe = frame;\
368 DPRINTF(("restarting from line %d\n", __LINE__));\
371 DPRINTF(("jumped back to line %d\n", __LINE__));\
376 heapframe *oldframe = frame;\
377 frame = oldframe->Xprevframe;\
387 /* Structure for remembering the local variables in a private frame */
389 typedef struct heapframe {
390 struct heapframe *Xprevframe;
391 struct heapframe *Xnextframe;
393 /* Function arguments that may change */
396 const pcre_uchar *Xecode;
400 unsigned int Xrdepth;
402 /* Function local variables */
404 PCRE_PUCHAR Xcallpat;
406 PCRE_PUCHAR Xcharptr;
412 PCRE_PUCHAR Xsaved_eptr;
414 recursion_info Xnew_recursive;
422 unsigned int Xprop_value;
423 int Xprop_fail_result;
425 pcre_uchar Xocchars[6];
435 unsigned int Xnumber;
438 pcre_int32 Xsave_capture_last;
439 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
440 int Xstacksave[REC_STACK_SAVE_MAX];
444 /* Where to jump back to */
453 /***************************************************************************
454 ***************************************************************************/
458 /*************************************************
459 * Match from current position *
460 *************************************************/
462 /* This function is called recursively in many circumstances. Whenever it
463 returns a negative (error) response, the outer incarnation must also return the
466 /* These macros pack up tests that are used for partial matching, and which
467 appear several times in the code. We set the "hit end" flag if the pointer is
468 at the end of the subject and also past the start of the subject (i.e.
469 something has been matched). For hard partial matching, we then return
470 immediately. The second one is used when we already know we are past the end of
473 #define CHECK_PARTIAL()\
474 if (md->partial != 0 && eptr >= md->end_subject && \
475 eptr > md->start_used_ptr) \
478 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
481 #define SCHECK_PARTIAL()\
482 if (md->partial != 0 && eptr > md->start_used_ptr) \
485 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
489 /* Performance note: It might be tempting to extract commonly used fields from
490 the md structure (e.g. utf, end_subject) into individual variables to improve
491 performance. Tests using gcc on a SPARC disproved this; in the first case, it
492 made performance worse.
495 eptr pointer to current character in subject
496 ecode pointer to current position in compiled code
497 mstart pointer to the current match start position (can be modified
499 offset_top current top pointer
500 md pointer to "static" info for the match
501 eptrb pointer to chain of blocks containing eptr at start of
502 brackets - for testing for empty matches
503 rdepth the recursion depth
505 Returns: MATCH_MATCH if matched ) these values are >= 0
506 MATCH_NOMATCH if failed to match )
507 a negative MATCH_xxx value for PRUNE, SKIP, etc
508 a negative PCRE_ERROR_xxx value if aborted by an error condition
509 (e.g. stopped by repeated call or recursion limit)
514 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
515 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
516 unsigned int rdepth) __attribute__((noinline,noclone));
519 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
520 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
523 /* These variables do not need to be preserved over recursion in this function,
524 so they can be ordinary variables in all cases. Mark some of them with
525 "register" because they are used a lot in loops. */
527 register int rrc; /* Returns from recursive calls */
528 register int i; /* Used for loops not involving calls to RMATCH() */
529 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
530 register BOOL utf; /* Local copy of UTF flag for speed */
532 BOOL minimize, possessive; /* Quantifier options */
536 /* When recursion is not being used, all "local" variables that have to be
537 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
538 frame on the stack here; subsequent instantiations are obtained from the heap
539 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
540 the top-level on the stack rather than malloc-ing them all gives a performance
541 boost in many cases where there is not much "recursion". */
544 heapframe *frame = (heapframe *)md->match_frames_base;
546 /* Copy in the original argument variables */
549 frame->Xecode = ecode;
550 frame->Xmstart = mstart;
551 frame->Xoffset_top = offset_top;
552 frame->Xeptrb = eptrb;
553 frame->Xrdepth = rdepth;
555 /* This is where control jumps back to in order to effect "recursion" */
559 /* Macros make the argument variables come from the current frame */
561 #define eptr frame->Xeptr
562 #define ecode frame->Xecode
563 #define mstart frame->Xmstart
564 #define offset_top frame->Xoffset_top
565 #define eptrb frame->Xeptrb
566 #define rdepth frame->Xrdepth
568 /* Ditto for the local variables */
571 #define charptr frame->Xcharptr
573 #define callpat frame->Xcallpat
574 #define codelink frame->Xcodelink
575 #define data frame->Xdata
576 #define next frame->Xnext
577 #define pp frame->Xpp
578 #define prev frame->Xprev
579 #define saved_eptr frame->Xsaved_eptr
581 #define new_recursive frame->Xnew_recursive
583 #define cur_is_word frame->Xcur_is_word
584 #define condition frame->Xcondition
585 #define prev_is_word frame->Xprev_is_word
588 #define prop_type frame->Xprop_type
589 #define prop_value frame->Xprop_value
590 #define prop_fail_result frame->Xprop_fail_result
591 #define oclength frame->Xoclength
592 #define occhars frame->Xocchars
595 #define ctype frame->Xctype
596 #define fc frame->Xfc
597 #define fi frame->Xfi
598 #define length frame->Xlength
599 #define max frame->Xmax
600 #define min frame->Xmin
601 #define number frame->Xnumber
602 #define offset frame->Xoffset
603 #define op frame->Xop
604 #define save_capture_last frame->Xsave_capture_last
605 #define save_offset1 frame->Xsave_offset1
606 #define save_offset2 frame->Xsave_offset2
607 #define save_offset3 frame->Xsave_offset3
608 #define stacksave frame->Xstacksave
610 #define newptrb frame->Xnewptrb
612 /* When recursion is being used, local variables are allocated on the stack and
613 get preserved during recursion in the normal way. In this environment, fi and
614 i, and fc and c, can be the same variables. */
616 #else /* NO_RECURSE not defined */
620 /* Many of the following variables are used only in small blocks of the code.
621 My normal style of coding would have declared them within each of those blocks.
622 However, in order to accommodate the version of this code that uses an external
623 "stack" implemented on the heap, it is easier to declare them all here, so the
624 declarations can be cut out in a block. The only declarations within blocks
625 below are for variables that do not have to be preserved over a recursive call
629 const pcre_uchar *charptr;
631 const pcre_uchar *callpat;
632 const pcre_uchar *data;
633 const pcre_uchar *next;
635 const pcre_uchar *prev;
636 PCRE_PUCHAR saved_eptr;
638 recursion_info new_recursive;
646 unsigned int prop_value;
647 int prop_fail_result;
649 pcre_uchar occhars[6];
660 pcre_int32 save_capture_last;
661 int save_offset1, save_offset2, save_offset3;
662 int stacksave[REC_STACK_SAVE_MAX];
666 /* There is a special fudge for calling match() in a way that causes it to
667 measure the size of its basic stack frame when the stack is being used for
668 recursion. The second argument (ecode) being NULL triggers this behaviour. It
669 cannot normally ever be NULL. The return is the negated value of the frame
675 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
678 int len = (char *)&rdepth - (char *)eptr;
679 return (len > 0)? -len : len;
682 #endif /* NO_RECURSE */
684 /* To save space on the stack and in the heap frame, I have doubled up on some
685 of the local variables that are used only in localised parts of the code, but
686 still need to be preserved over recursive calls of match(). These macros define
687 the alternative names that are used. */
689 #define allow_zero cur_is_word
690 #define cbegroup condition
691 #define code_offset codelink
692 #define condassert condition
693 #define matched_once prev_is_word
695 #define save_mark data
697 /* These statements are here to stop the compiler complaining about uninitialized
702 prop_fail_result = 0;
706 /* This label is used for tail recursion, which is used in a few cases even
707 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
708 used. Thanks to Ian Taylor for noticing this possibility and sending the
713 /* OK, now we can get on with the real code of the function. Recursive calls
714 are specified by the macro RMATCH and RRETURN is used to return. When
715 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
716 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
717 defined). However, RMATCH isn't like a function call because it's quite a
718 complicated macro. It has to be used in one particular way. This shouldn't,
719 however, impact performance when true recursion is being used. */
722 utf = md->utf; /* Local copy of the flag */
727 /* First check that we haven't called match() too many times, or that we
728 haven't exceeded the recursive call limit. */
730 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
731 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
733 /* At the start of a group with an unlimited repeat that may match an empty
734 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
735 done this way to save having to use another function argument, which would take
736 up space on the stack. See also MATCH_CONDASSERT below.
738 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
739 such remembered pointers, to be checked when we hit the closing ket, in order
740 to break infinite loops that match no characters. When match() is called in
741 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
742 NOT be used with tail recursion, because the memory block that is used is on
743 the stack, so a new one may be required for each match(). */
745 if (md->match_function_type == MATCH_CBEGROUP)
747 newptrb.epb_saved_eptr = eptr;
748 newptrb.epb_prev = eptrb;
750 md->match_function_type = 0;
753 /* Now start processing the opcodes. */
757 minimize = possessive = FALSE;
763 md->nomatch_mark = ecode + 2;
764 md->mark = NULL; /* In case previously set by assertion */
765 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
767 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
768 md->mark == NULL) md->mark = ecode + 2;
770 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
771 argument, and we must check whether that argument matches this MARK's
772 argument. It is passed back in md->start_match_ptr (an overloading of that
773 variable). If it does match, we reset that variable to the current subject
774 position and return MATCH_SKIP. Otherwise, pass back the return code
777 else if (rrc == MATCH_SKIP_ARG &&
778 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
780 md->start_match_ptr = eptr;
786 RRETURN(MATCH_NOMATCH);
789 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
791 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
792 RRETURN(MATCH_COMMIT);
795 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
797 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
798 RRETURN(MATCH_PRUNE);
801 md->nomatch_mark = ecode + 2;
802 md->mark = NULL; /* In case previously set by assertion */
803 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
805 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
806 md->mark == NULL) md->mark = ecode + 2;
807 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
808 RRETURN(MATCH_PRUNE);
811 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
813 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
814 md->start_match_ptr = eptr; /* Pass back current position */
817 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
818 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
819 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
820 that failed and any that precede it (either they also failed, or were not
821 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
822 SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg
823 set to the count of the one that failed. */
826 md->skip_arg_count++;
827 if (md->skip_arg_count <= md->ignore_skip_arg)
829 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
832 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
834 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
836 /* Pass back the current skip name by overloading md->start_match_ptr and
837 returning the special MATCH_SKIP_ARG return code. This will either be
838 caught by a matching MARK, or get to the top, where it causes a rematch
839 with md->ignore_skip_arg set to the value of md->skip_arg_count. */
841 md->start_match_ptr = ecode + 2;
842 RRETURN(MATCH_SKIP_ARG);
844 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
845 the branch in which it occurs can be determined. Overload the start of
846 match pointer to do this. */
849 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
851 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
852 md->start_match_ptr = ecode;
856 md->nomatch_mark = ecode + 2;
857 md->mark = NULL; /* In case previously set by assertion */
858 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
860 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
861 md->mark == NULL) md->mark = ecode + 2;
862 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
863 md->start_match_ptr = ecode;
866 /* Handle an atomic group that does not contain any capturing parentheses.
867 This can be handled like an assertion. Prior to 8.13, all atomic groups
868 were handled this way. In 8.13, the code was changed as below for ONCE, so
869 that backups pass through the group and thereby reset captured values.
870 However, this uses a lot more stack, so in 8.20, atomic groups that do not
871 contain any captures generate OP_ONCE_NC, which can be handled in the old,
872 less stack intensive way.
874 Check the alternative branches in turn - the matching won't pass the KET
875 for this kind of subpattern. If any one branch matches, we carry on as at
876 the end of a normal bracket, leaving the subject pointer, but resetting
877 the start-of-match value in case it was changed by \K. */
882 save_mark = md->mark;
885 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
886 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
888 mstart = md->start_match_ptr;
891 if (rrc == MATCH_THEN)
893 next = ecode + GET(ecode,1);
894 if (md->start_match_ptr < next &&
895 (*ecode == OP_ALT || *next == OP_ALT))
899 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
900 ecode += GET(ecode,1);
901 md->mark = save_mark;
903 while (*ecode == OP_ALT);
905 /* If hit the end of the group (which could be repeated), fail */
907 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
909 /* Continue as from after the group, updating the offsets high water
910 mark, since extracts may have been taken. */
912 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
914 offset_top = md->end_offset_top;
915 eptr = md->end_match_ptr;
917 /* For a non-repeating ket, just continue at this level. This also
918 happens for a repeating ket if no characters were matched in the group.
919 This is the forcible breaking of infinite loops as implemented in Perl
922 if (*ecode == OP_KET || eptr == saved_eptr)
924 ecode += 1+LINK_SIZE;
928 /* The repeating kets try the rest of the pattern or restart from the
929 preceding bracket, in the appropriate order. The second "call" of match()
930 uses tail recursion, to avoid using another stack frame. */
932 if (*ecode == OP_KETRMIN)
934 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
935 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
939 else /* OP_KETRMAX */
941 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
942 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
943 ecode += 1 + LINK_SIZE;
946 /* Control never gets here */
948 /* Handle a capturing bracket, other than those that are possessive with an
949 unlimited repeat. If there is space in the offset vector, save the current
950 subject position in the working slot at the top of the vector. We mustn't
951 change the current values of the data slot, because they may be set from a
952 previous iteration of this group, and be referred to by a reference inside
953 the group. A failure to match might occur after the group has succeeded,
954 if something later on doesn't match. For this reason, we need to restore
955 the working value and also the values of the final offsets, in case they
956 were set by a previous iteration of the same bracket.
958 If there isn't enough space in the offset vector, treat this as if it were
959 a non-capturing bracket. Don't worry about setting the flag for the error
960 case here; that is handled in the code for KET. */
964 number = GET2(ecode, 1+LINK_SIZE);
965 offset = number << 1;
968 printf("start bracket %d\n", number);
970 pchars(eptr, 16, TRUE, md);
974 if (offset < md->offset_max)
976 save_offset1 = md->offset_vector[offset];
977 save_offset2 = md->offset_vector[offset+1];
978 save_offset3 = md->offset_vector[md->offset_end - number];
979 save_capture_last = md->capture_last;
980 save_mark = md->mark;
982 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
983 md->offset_vector[md->offset_end - number] =
984 (int)(eptr - md->start_subject);
988 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
989 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
991 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
993 /* If we backed up to a THEN, check whether it is within the current
994 branch by comparing the address of the THEN that is passed back with
995 the end of the branch. If it is within the current branch, and the
996 branch is one of two or more alternatives (it either starts or ends
997 with OP_ALT), we have reached the limit of THEN's action, so convert
998 the return code to NOMATCH, which will cause normal backtracking to
999 happen from now on. Otherwise, THEN is passed back to an outer
1000 alternative. This implements Perl's treatment of parenthesized groups,
1001 where a group not containing | does not affect the current alternative,
1002 that is, (X) is NOT the same as (X|(*F)). */
1004 if (rrc == MATCH_THEN)
1006 next = ecode + GET(ecode,1);
1007 if (md->start_match_ptr < next &&
1008 (*ecode == OP_ALT || *next == OP_ALT))
1009 rrc = MATCH_NOMATCH;
1012 /* Anything other than NOMATCH is passed back. */
1014 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1015 md->capture_last = save_capture_last;
1016 ecode += GET(ecode, 1);
1017 md->mark = save_mark;
1018 if (*ecode != OP_ALT) break;
1021 DPRINTF(("bracket %d failed\n", number));
1022 md->offset_vector[offset] = save_offset1;
1023 md->offset_vector[offset+1] = save_offset2;
1024 md->offset_vector[md->offset_end - number] = save_offset3;
1026 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1031 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1032 as a non-capturing bracket. */
1034 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1035 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1037 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1039 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1040 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1042 /* Non-capturing or atomic group, except for possessive with unlimited
1043 repeat and ONCE group with no captures. Loop for all the alternatives.
1045 When we get to the final alternative within the brackets, we used to return
1046 the result of a recursive call to match() whatever happened so it was
1047 possible to reduce stack usage by turning this into a tail recursion,
1048 except in the case of a possibly empty group. However, now that there is
1049 the possibility of (*THEN) occurring in the final alternative, this
1050 optimization is no longer always possible.
1052 We can optimize if we know there are no (*THEN)s in the pattern; at present
1053 this is the best that can be done.
1055 MATCH_ONCE is returned when the end of an atomic group is successfully
1056 reached, but subsequent matching fails. It passes back up the tree (causing
1057 captured values to be reset) until the original atomic group level is
1058 reached. This is tested by comparing md->once_target with the start of the
1059 group. At this point, the return is converted into MATCH_NOMATCH so that
1060 previous backup points can be taken. */
1065 DPRINTF(("start non-capturing bracket\n"));
1069 if (op >= OP_SBRA || op == OP_ONCE)
1070 md->match_function_type = MATCH_CBEGROUP;
1072 /* If this is not a possibly empty group, and there are no (*THEN)s in
1073 the pattern, and this is the final alternative, optimize as described
1076 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1078 ecode += PRIV(OP_lengths)[*ecode];
1082 /* In all other cases, we have to make another call to match(). */
1084 save_mark = md->mark;
1085 save_capture_last = md->capture_last;
1086 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1089 /* See comment in the code for capturing groups above about handling
1092 if (rrc == MATCH_THEN)
1094 next = ecode + GET(ecode,1);
1095 if (md->start_match_ptr < next &&
1096 (*ecode == OP_ALT || *next == OP_ALT))
1097 rrc = MATCH_NOMATCH;
1100 if (rrc != MATCH_NOMATCH)
1102 if (rrc == MATCH_ONCE)
1104 const pcre_uchar *scode = ecode;
1105 if (*scode != OP_ONCE) /* If not at start, find it */
1107 while (*scode == OP_ALT) scode += GET(scode, 1);
1108 scode -= GET(scode, 1);
1110 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1114 ecode += GET(ecode, 1);
1115 md->mark = save_mark;
1116 if (*ecode != OP_ALT) break;
1117 md->capture_last = save_capture_last;
1120 RRETURN(MATCH_NOMATCH);
1122 /* Handle possessive capturing brackets with an unlimited repeat. We come
1123 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1124 handled similarly to the normal case above. However, the matching is
1125 different. The end of these brackets will always be OP_KETRPOS, which
1126 returns MATCH_KETRPOS without going further in the pattern. By this means
1127 we can handle the group by iteration rather than recursion, thereby
1128 reducing the amount of stack needed. */
1135 number = GET2(ecode, 1+LINK_SIZE);
1136 offset = number << 1;
1139 printf("start possessive bracket %d\n", number);
1141 pchars(eptr, 16, TRUE, md);
1145 if (offset >= md->offset_max) goto POSSESSIVE_NON_CAPTURE;
1147 matched_once = FALSE;
1148 code_offset = (int)(ecode - md->start_code);
1150 save_offset1 = md->offset_vector[offset];
1151 save_offset2 = md->offset_vector[offset+1];
1152 save_offset3 = md->offset_vector[md->offset_end - number];
1153 save_capture_last = md->capture_last;
1155 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1157 /* Each time round the loop, save the current subject position for use
1158 when the group matches. For MATCH_MATCH, the group has matched, so we
1159 restart it with a new subject starting position, remembering that we had
1160 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1161 usual. If we haven't matched any alternatives in any iteration, check to
1162 see if a previous iteration matched. If so, the group has matched;
1163 continue from afterwards. Otherwise it has failed; restore the previous
1164 capture values before returning NOMATCH. */
1168 md->offset_vector[md->offset_end - number] =
1169 (int)(eptr - md->start_subject);
1170 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1171 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1173 if (rrc == MATCH_KETRPOS)
1175 offset_top = md->end_offset_top;
1176 ecode = md->start_code + code_offset;
1177 save_capture_last = md->capture_last;
1178 matched_once = TRUE;
1179 mstart = md->start_match_ptr; /* In case \K changed it */
1180 if (eptr == md->end_match_ptr) /* Matched an empty string */
1182 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1185 eptr = md->end_match_ptr;
1189 /* See comment in the code for capturing groups above about handling
1192 if (rrc == MATCH_THEN)
1194 next = ecode + GET(ecode,1);
1195 if (md->start_match_ptr < next &&
1196 (*ecode == OP_ALT || *next == OP_ALT))
1197 rrc = MATCH_NOMATCH;
1200 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1201 md->capture_last = save_capture_last;
1202 ecode += GET(ecode, 1);
1203 if (*ecode != OP_ALT) break;
1208 md->offset_vector[offset] = save_offset1;
1209 md->offset_vector[offset+1] = save_offset2;
1210 md->offset_vector[md->offset_end - number] = save_offset3;
1213 if (allow_zero || matched_once)
1215 ecode += 1 + LINK_SIZE;
1219 RRETURN(MATCH_NOMATCH);
1221 /* Non-capturing possessive bracket with unlimited repeat. We come here
1222 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1223 without the capturing complication. It is written out separately for speed
1230 POSSESSIVE_NON_CAPTURE:
1231 matched_once = FALSE;
1232 code_offset = (int)(ecode - md->start_code);
1233 save_capture_last = md->capture_last;
1237 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1238 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1240 if (rrc == MATCH_KETRPOS)
1242 offset_top = md->end_offset_top;
1243 ecode = md->start_code + code_offset;
1244 matched_once = TRUE;
1245 mstart = md->start_match_ptr; /* In case \K reset it */
1246 if (eptr == md->end_match_ptr) /* Matched an empty string */
1248 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1251 eptr = md->end_match_ptr;
1255 /* See comment in the code for capturing groups above about handling
1258 if (rrc == MATCH_THEN)
1260 next = ecode + GET(ecode,1);
1261 if (md->start_match_ptr < next &&
1262 (*ecode == OP_ALT || *next == OP_ALT))
1263 rrc = MATCH_NOMATCH;
1266 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1267 ecode += GET(ecode, 1);
1268 if (*ecode != OP_ALT) break;
1269 md->capture_last = save_capture_last;
1272 if (matched_once || allow_zero)
1274 ecode += 1 + LINK_SIZE;
1277 RRETURN(MATCH_NOMATCH);
1279 /* Control never reaches here. */
1281 /* Conditional group: compilation checked that there are no more than two
1282 branches. If the condition is false, skipping the first branch takes us
1283 past the end of the item if there is only one branch, but that's exactly
1289 /* The variable codelink will be added to ecode when the condition is
1290 false, to get to the second branch. Setting it to the offset to the ALT
1291 or KET, then incrementing ecode achieves this effect. We now have ecode
1292 pointing to the condition or callout. */
1294 codelink = GET(ecode, 1); /* Offset to the second branch */
1295 ecode += 1 + LINK_SIZE; /* From this opcode */
1297 /* Because of the way auto-callout works during compile, a callout item is
1298 inserted between OP_COND and an assertion condition. */
1300 if (*ecode == OP_CALLOUT)
1302 if (PUBL(callout) != NULL)
1304 PUBL(callout_block) cb;
1305 cb.version = 2; /* Version 1 of the callout block */
1306 cb.callout_number = ecode[1];
1307 cb.offset_vector = md->offset_vector;
1308 #if defined COMPILE_PCRE8
1309 cb.subject = (PCRE_SPTR)md->start_subject;
1310 #elif defined COMPILE_PCRE16
1311 cb.subject = (PCRE_SPTR16)md->start_subject;
1312 #elif defined COMPILE_PCRE32
1313 cb.subject = (PCRE_SPTR32)md->start_subject;
1315 cb.subject_length = (int)(md->end_subject - md->start_subject);
1316 cb.start_match = (int)(mstart - md->start_subject);
1317 cb.current_position = (int)(eptr - md->start_subject);
1318 cb.pattern_position = GET(ecode, 2);
1319 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1320 cb.capture_top = offset_top/2;
1321 cb.capture_last = md->capture_last & CAPLMASK;
1322 /* Internal change requires this for API compatibility. */
1323 if (cb.capture_last == 0) cb.capture_last = -1;
1324 cb.callout_data = md->callout_data;
1325 cb.mark = md->nomatch_mark;
1326 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1327 if (rrc < 0) RRETURN(rrc);
1330 /* Advance ecode past the callout, so it now points to the condition. We
1331 must adjust codelink so that the value of ecode+codelink is unchanged. */
1333 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1334 codelink -= PRIV(OP_lengths)[OP_CALLOUT];
1337 /* Test the various possible conditions */
1340 switch(condcode = *ecode)
1342 case OP_RREF: /* Numbered group recursion test */
1343 if (md->recursive != NULL) /* Not recursing => FALSE */
1345 unsigned int recno = GET2(ecode, 1); /* Recursion group number*/
1346 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1350 case OP_DNRREF: /* Duplicate named group recursion test */
1351 if (md->recursive != NULL)
1353 int count = GET2(ecode, 1 + IMM2_SIZE);
1354 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1357 unsigned int recno = GET2(slot, 0);
1358 condition = recno == md->recursive->group_num;
1359 if (condition) break;
1360 slot += md->name_entry_size;
1365 case OP_CREF: /* Numbered group used test */
1366 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1367 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1370 case OP_DNCREF: /* Duplicate named group used test */
1372 int count = GET2(ecode, 1 + IMM2_SIZE);
1373 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1376 offset = GET2(slot, 0) << 1;
1377 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1378 if (condition) break;
1379 slot += md->name_entry_size;
1384 case OP_DEF: /* DEFINE - always false */
1385 case OP_FAIL: /* From optimized (?!) condition */
1388 /* The condition is an assertion. Call match() to evaluate it - setting
1389 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end
1393 md->match_function_type = MATCH_CONDASSERT;
1394 RMATCH(eptr, ecode, offset_top, md, NULL, RM3);
1395 if (rrc == MATCH_MATCH)
1397 if (md->end_offset_top > offset_top)
1398 offset_top = md->end_offset_top; /* Captures may have happened */
1401 /* Advance ecode past the assertion to the start of the first branch,
1402 but adjust it so that the general choosing code below works. If the
1403 assertion has a quantifier that allows zero repeats we must skip over
1404 the BRAZERO. This is a lunatic thing to do, but somebody did! */
1406 if (*ecode == OP_BRAZERO) ecode++;
1407 ecode += GET(ecode, 1);
1408 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1409 ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode];
1412 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1413 assertion; it is therefore treated as NOMATCH. Any other return is an
1416 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1418 RRETURN(rrc); /* Need braces because of following else */
1423 /* Choose branch according to the condition */
1425 ecode += condition? PRIV(OP_lengths)[condcode] : codelink;
1427 /* We are now at the branch that is to be obeyed. As there is only one, we
1428 can use tail recursion to avoid using another stack frame, except when
1429 there is unlimited repeat of a possibly empty group. In the latter case, a
1430 recursive call to match() is always required, unless the second alternative
1431 doesn't exist, in which case we can just plough on. Note that, for
1432 compatibility with Perl, the | in a conditional group is NOT treated as
1433 creating two alternatives. If a THEN is encountered in the branch, it
1434 propagates out to the enclosing alternative (unless nested in a deeper set
1435 of alternatives, of course). */
1437 if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT)
1444 md->match_function_type = MATCH_CBEGROUP;
1445 RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
1449 /* Condition false & no alternative; continue after the group. */
1457 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1458 to close any currently open capturing brackets. */
1461 number = GET2(ecode, 1); /* Must be less than 65536 */
1462 offset = number << 1;
1465 printf("end bracket %d at *ACCEPT", number);
1469 md->capture_last = (md->capture_last & OVFLMASK) | number;
1470 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1472 md->offset_vector[offset] =
1473 md->offset_vector[md->offset_end - number];
1474 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1476 /* If this group is at or above the current highwater mark, ensure that
1477 any groups between the current high water mark and this group are marked
1478 unset and then update the high water mark. */
1480 if (offset >= offset_top)
1482 register int *iptr = md->offset_vector + offset_top;
1483 register int *iend = md->offset_vector + offset;
1484 while (iptr < iend) *iptr++ = -1;
1485 offset_top = offset + 2;
1488 ecode += 1 + IMM2_SIZE;
1492 /* End of the pattern, either real or forced. */
1496 case OP_ASSERT_ACCEPT:
1498 /* If we have matched an empty string, fail if not in an assertion and not
1499 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1500 is set and we have matched at the start of the subject. In both cases,
1501 backtracking will then try other alternatives, if any. */
1503 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1504 md->recursive == NULL &&
1506 (md->notempty_atstart &&
1507 mstart == md->start_subject + md->start_offset)))
1508 RRETURN(MATCH_NOMATCH);
1510 /* Otherwise, we have a match. */
1512 md->end_match_ptr = eptr; /* Record where we ended */
1513 md->end_offset_top = offset_top; /* and how many extracts were taken */
1514 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1516 /* For some reason, the macros don't work properly if an expression is
1517 given as the argument to RRETURN when the heap is in use. */
1519 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1522 /* Assertion brackets. Check the alternative branches in turn - the
1523 matching won't pass the KET for an assertion. If any one branch matches,
1524 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1525 start of each branch to move the current point backwards, so the code at
1526 this level is identical to the lookahead case. When the assertion is part
1527 of a condition, we want to return immediately afterwards. The caller of
1528 this incarnation of the match() function will have set MATCH_CONDASSERT in
1529 md->match_function_type, and one of these opcodes will be the first opcode
1530 that is processed. We use a local variable that is preserved over calls to
1531 match() to remember this case. */
1535 save_mark = md->mark;
1536 if (md->match_function_type == MATCH_CONDASSERT)
1539 md->match_function_type = 0;
1541 else condassert = FALSE;
1543 /* Loop for each branch */
1547 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1549 /* A match means that the assertion is true; break out of the loop
1550 that matches its alternatives. */
1552 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1554 mstart = md->start_match_ptr; /* In case \K reset it */
1558 /* If not matched, restore the previous mark setting. */
1560 md->mark = save_mark;
1562 /* See comment in the code for capturing groups above about handling
1565 if (rrc == MATCH_THEN)
1567 next = ecode + GET(ecode,1);
1568 if (md->start_match_ptr < next &&
1569 (*ecode == OP_ALT || *next == OP_ALT))
1570 rrc = MATCH_NOMATCH;
1573 /* Anything other than NOMATCH causes the entire assertion to fail,
1574 passing back the return code. This includes COMMIT, SKIP, PRUNE and an
1575 uncaptured THEN, which means they take their normal effect. This
1576 consistent approach does not always have exactly the same effect as in
1579 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1580 ecode += GET(ecode, 1);
1582 while (*ecode == OP_ALT); /* Continue for next alternative */
1584 /* If we have tried all the alternative branches, the assertion has
1585 failed. If not, we broke out after a match. */
1587 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1589 /* If checking an assertion for a condition, return MATCH_MATCH. */
1591 if (condassert) RRETURN(MATCH_MATCH);
1593 /* Continue from after a successful assertion, updating the offsets high
1594 water mark, since extracts may have been taken during the assertion. */
1596 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1597 ecode += 1 + LINK_SIZE;
1598 offset_top = md->end_offset_top;
1601 /* Negative assertion: all branches must fail to match for the assertion to
1605 case OP_ASSERTBACK_NOT:
1606 save_mark = md->mark;
1607 if (md->match_function_type == MATCH_CONDASSERT)
1610 md->match_function_type = 0;
1612 else condassert = FALSE;
1614 /* Loop for each alternative branch. */
1618 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1619 md->mark = save_mark; /* Always restore the mark setting */
1623 case MATCH_MATCH: /* A successful match means */
1624 case MATCH_ACCEPT: /* the assertion has failed. */
1625 RRETURN(MATCH_NOMATCH);
1627 case MATCH_NOMATCH: /* Carry on with next branch */
1630 /* See comment in the code for capturing groups above about handling
1634 next = ecode + GET(ecode,1);
1635 if (md->start_match_ptr < next &&
1636 (*ecode == OP_ALT || *next == OP_ALT))
1638 rrc = MATCH_NOMATCH;
1641 /* Otherwise fall through. */
1643 /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole
1644 assertion to fail to match, without considering any more alternatives.
1645 Failing to match means the assertion is true. This is a consistent
1646 approach, but does not always have the same effect as in Perl. */
1650 case MATCH_SKIP_ARG:
1652 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1653 goto NEG_ASSERT_TRUE; /* Break out of alternation loop */
1655 /* Anything else is an error */
1661 /* Continue with next branch */
1663 ecode += GET(ecode,1);
1665 while (*ecode == OP_ALT);
1667 /* All branches in the assertion failed to match. */
1670 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1671 ecode += 1 + LINK_SIZE; /* Continue with current branch */
1674 /* Move the subject pointer back. This occurs only at the start of
1675 each branch of a lookbehind assertion. If we are too close to the start to
1676 move back, this match function fails. When working with UTF-8 we move
1677 back a number of characters, not bytes. */
1687 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1694 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1697 eptr -= GET(ecode, 1);
1698 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1701 /* Save the earliest consulted character, then skip to next op code */
1703 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1704 ecode += 1 + LINK_SIZE;
1707 /* The callout item calls an external function, if one is provided, passing
1708 details of the match so far. This is mainly for debugging, though the
1709 function is able to force a failure. */
1712 if (PUBL(callout) != NULL)
1714 PUBL(callout_block) cb;
1715 cb.version = 2; /* Version 1 of the callout block */
1716 cb.callout_number = ecode[1];
1717 cb.offset_vector = md->offset_vector;
1718 #if defined COMPILE_PCRE8
1719 cb.subject = (PCRE_SPTR)md->start_subject;
1720 #elif defined COMPILE_PCRE16
1721 cb.subject = (PCRE_SPTR16)md->start_subject;
1722 #elif defined COMPILE_PCRE32
1723 cb.subject = (PCRE_SPTR32)md->start_subject;
1725 cb.subject_length = (int)(md->end_subject - md->start_subject);
1726 cb.start_match = (int)(mstart - md->start_subject);
1727 cb.current_position = (int)(eptr - md->start_subject);
1728 cb.pattern_position = GET(ecode, 2);
1729 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1730 cb.capture_top = offset_top/2;
1731 cb.capture_last = md->capture_last & CAPLMASK;
1732 /* Internal change requires this for API compatibility. */
1733 if (cb.capture_last == 0) cb.capture_last = -1;
1734 cb.callout_data = md->callout_data;
1735 cb.mark = md->nomatch_mark;
1736 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1737 if (rrc < 0) RRETURN(rrc);
1739 ecode += 2 + 2*LINK_SIZE;
1742 /* Recursion either matches the current regex, or some subexpression. The
1743 offset data is the offset to the starting bracket from the start of the
1744 whole pattern. (This is so that it works from duplicated subpatterns.)
1746 The state of the capturing groups is preserved over recursion, and
1747 re-instated afterwards. We don't know how many are started and not yet
1748 finished (offset_top records the completed total) so we just have to save
1749 all the potential data. There may be up to 65535 such values, which is too
1750 large to put on the stack, but using malloc for small numbers seems
1751 expensive. As a compromise, the stack is used when there are no more than
1752 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1754 There are also other values that have to be saved. We use a chained
1755 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1756 for the original version of this logic. It has, however, been hacked around
1757 a lot, so he is not to blame for the current way it works. */
1764 callpat = md->start_code + GET(ecode, 1);
1765 recno = (callpat == md->start_code)? 0 :
1766 GET2(callpat, 1 + LINK_SIZE);
1768 /* Check for repeating a recursion without advancing the subject pointer.
1769 This should catch convoluted mutual recursions. (Some simple cases are
1770 caught at compile time.) */
1772 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1773 if (recno == ri->group_num && eptr == ri->subject_position)
1774 RRETURN(PCRE_ERROR_RECURSELOOP);
1776 /* Add to "recursing stack" */
1778 new_recursive.group_num = recno;
1779 new_recursive.saved_capture_last = md->capture_last;
1780 new_recursive.subject_position = eptr;
1781 new_recursive.prevrec = md->recursive;
1782 md->recursive = &new_recursive;
1784 /* Where to continue from afterwards */
1786 ecode += 1 + LINK_SIZE;
1788 /* Now save the offset data */
1790 new_recursive.saved_max = md->offset_end;
1791 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1792 new_recursive.offset_save = stacksave;
1795 new_recursive.offset_save =
1796 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1797 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1799 memcpy(new_recursive.offset_save, md->offset_vector,
1800 new_recursive.saved_max * sizeof(int));
1802 /* OK, now we can do the recursion. After processing each alternative,
1803 restore the offset data and the last captured value. If there were nested
1804 recursions, md->recursive might be changed, so reset it before looping.
1807 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1808 cbegroup = (*callpat >= OP_SBRA);
1811 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1812 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1814 memcpy(md->offset_vector, new_recursive.offset_save,
1815 new_recursive.saved_max * sizeof(int));
1816 md->capture_last = new_recursive.saved_capture_last;
1817 md->recursive = new_recursive.prevrec;
1818 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1820 DPRINTF(("Recursion matched\n"));
1821 if (new_recursive.offset_save != stacksave)
1822 (PUBL(free))(new_recursive.offset_save);
1824 /* Set where we got to in the subject, and reset the start in case
1825 it was changed by \K. This *is* propagated back out of a recursion,
1826 for Perl compatibility. */
1828 eptr = md->end_match_ptr;
1829 mstart = md->start_match_ptr;
1830 goto RECURSION_MATCHED; /* Exit loop; end processing */
1833 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
1834 recursion; they cause a NOMATCH for the entire recursion. These codes
1835 are defined in a range that can be tested for. */
1837 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
1839 if (new_recursive.offset_save != stacksave)
1840 (PUBL(free))(new_recursive.offset_save);
1841 RRETURN(MATCH_NOMATCH);
1844 /* Any return code other than NOMATCH is an error. */
1846 if (rrc != MATCH_NOMATCH)
1848 DPRINTF(("Recursion gave error %d\n", rrc));
1849 if (new_recursive.offset_save != stacksave)
1850 (PUBL(free))(new_recursive.offset_save);
1854 md->recursive = &new_recursive;
1855 callpat += GET(callpat, 1);
1857 while (*callpat == OP_ALT);
1859 DPRINTF(("Recursion didn't match\n"));
1860 md->recursive = new_recursive.prevrec;
1861 if (new_recursive.offset_save != stacksave)
1862 (PUBL(free))(new_recursive.offset_save);
1863 RRETURN(MATCH_NOMATCH);
1869 /* An alternation is the end of a branch; scan along to find the end of the
1870 bracketed group and go to there. */
1873 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1876 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1877 indicating that it may occur zero times. It may repeat infinitely, or not
1878 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1879 with fixed upper repeat limits are compiled as a number of copies, with the
1880 optional ones preceded by BRAZERO or BRAMINZERO. */
1884 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1885 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1886 do next += GET(next, 1); while (*next == OP_ALT);
1887 ecode = next + 1 + LINK_SIZE;
1892 do next += GET(next, 1); while (*next == OP_ALT);
1893 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1894 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1900 do next += GET(next,1); while (*next == OP_ALT);
1901 ecode = next + 1 + LINK_SIZE;
1904 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1905 here; just jump to the group, with allow_zero set TRUE. */
1910 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1911 goto POSSESSIVE_NON_CAPTURE;
1913 /* End of a group, repeated or non-repeating. */
1919 prev = ecode - GET(ecode, 1);
1921 /* If this was a group that remembered the subject start, in order to break
1922 infinite repeats of empty string matches, retrieve the subject start from
1923 the chain. Otherwise, set it NULL. */
1925 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1927 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1928 eptrb = eptrb->epb_prev; /* Backup to previous group */
1930 else saved_eptr = NULL;
1932 /* If we are at the end of an assertion group or a non-capturing atomic
1933 group, stop matching and return MATCH_MATCH, but record the current high
1934 water mark for use by positive assertions. We also need to record the match
1935 start in case it was changed by \K. */
1937 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1938 *prev == OP_ONCE_NC)
1940 md->end_match_ptr = eptr; /* For ONCE_NC */
1941 md->end_offset_top = offset_top;
1942 md->start_match_ptr = mstart;
1943 RRETURN(MATCH_MATCH); /* Sets md->mark */
1946 /* For capturing groups we have to check the group number back at the start
1947 and if necessary complete handling an extraction by setting the offsets and
1948 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1949 into group 0, so it won't be picked up here. Instead, we catch it when the
1950 OP_END is reached. Other recursion is handled here. We just have to record
1951 the current subject position and start match pointer and give a MATCH
1954 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1955 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1957 number = GET2(prev, 1+LINK_SIZE);
1958 offset = number << 1;
1961 printf("end bracket %d", number);
1965 /* Handle a recursively called group. */
1967 if (md->recursive != NULL && md->recursive->group_num == number)
1969 md->end_match_ptr = eptr;
1970 md->start_match_ptr = mstart;
1971 RRETURN(MATCH_MATCH);
1974 /* Deal with capturing */
1976 md->capture_last = (md->capture_last & OVFLMASK) | number;
1977 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1979 /* If offset is greater than offset_top, it means that we are
1980 "skipping" a capturing group, and that group's offsets must be marked
1981 unset. In earlier versions of PCRE, all the offsets were unset at the
1982 start of matching, but this doesn't work because atomic groups and
1983 assertions can cause a value to be set that should later be unset.
1984 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1985 part of the atomic group, but this is not on the final matching path,
1986 so must be unset when 2 is set. (If there is no group 2, there is no
1987 problem, because offset_top will then be 2, indicating no capture.) */
1989 if (offset > offset_top)
1991 register int *iptr = md->offset_vector + offset_top;
1992 register int *iend = md->offset_vector + offset;
1993 while (iptr < iend) *iptr++ = -1;
1996 /* Now make the extraction */
1998 md->offset_vector[offset] =
1999 md->offset_vector[md->offset_end - number];
2000 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
2001 if (offset_top <= offset) offset_top = offset + 2;
2005 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2006 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2007 at a time from the outer level, thus saving stack. This must precede the
2008 empty string test - in this case that test is done at the outer level. */
2010 if (*ecode == OP_KETRPOS)
2012 md->start_match_ptr = mstart; /* In case \K reset it */
2013 md->end_match_ptr = eptr;
2014 md->end_offset_top = offset_top;
2015 RRETURN(MATCH_KETRPOS);
2018 /* For an ordinary non-repeating ket, just continue at this level. This
2019 also happens for a repeating ket if no characters were matched in the
2020 group. This is the forcible breaking of infinite loops as implemented in
2021 Perl 5.005. For a non-repeating atomic group that includes captures,
2022 establish a backup point by processing the rest of the pattern at a lower
2023 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
2024 original OP_ONCE level, thereby bypassing intermediate backup points, but
2025 resetting any captures that happened along the way. */
2027 if (*ecode == OP_KET || eptr == saved_eptr)
2029 if (*prev == OP_ONCE)
2031 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
2032 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2033 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2034 RRETURN(MATCH_ONCE);
2036 ecode += 1 + LINK_SIZE; /* Carry on at this level */
2040 /* The normal repeating kets try the rest of the pattern or restart from
2041 the preceding bracket, in the appropriate order. In the second case, we can
2042 use tail recursion to avoid using another stack frame, unless we have an
2043 atomic group or an unlimited repeat of a group that can match an empty
2046 if (*ecode == OP_KETRMIN)
2048 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2049 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2050 if (*prev == OP_ONCE)
2052 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2053 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2054 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2055 RRETURN(MATCH_ONCE);
2057 if (*prev >= OP_SBRA) /* Could match an empty string */
2059 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2065 else /* OP_KETRMAX */
2067 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2068 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2069 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2070 if (*prev == OP_ONCE)
2072 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2073 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2074 md->once_target = prev;
2075 RRETURN(MATCH_ONCE);
2077 ecode += 1 + LINK_SIZE;
2080 /* Control never gets here */
2082 /* Not multiline mode: start of subject assertion, unless notbol. */
2085 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2087 /* Start of subject assertion */
2090 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2094 /* Multiline mode: start of subject unless notbol, or after any newline. */
2097 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2098 if (eptr != md->start_subject &&
2099 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2100 RRETURN(MATCH_NOMATCH);
2104 /* Start of match assertion */
2107 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2111 /* Reset the start of match point */
2118 /* Multiline mode: assert before any newline, or before end of subject
2119 unless noteol is set. */
2122 if (eptr < md->end_subject)
2124 if (!IS_NEWLINE(eptr))
2126 if (md->partial != 0 &&
2127 eptr + 1 >= md->end_subject &&
2128 NLBLOCK->nltype == NLTYPE_FIXED &&
2129 NLBLOCK->nllen == 2 &&
2130 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2133 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2135 RRETURN(MATCH_NOMATCH);
2140 if (md->noteol) RRETURN(MATCH_NOMATCH);
2146 /* Not multiline mode: assert before a terminating newline or before end of
2147 subject unless noteol is set. */
2150 if (md->noteol) RRETURN(MATCH_NOMATCH);
2151 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2153 /* ... else fall through for endonly */
2155 /* End of subject assertion (\z) */
2158 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2163 /* End of subject or ending \n assertion (\Z) */
2167 if (eptr < md->end_subject &&
2168 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2170 if (md->partial != 0 &&
2171 eptr + 1 >= md->end_subject &&
2172 NLBLOCK->nltype == NLTYPE_FIXED &&
2173 NLBLOCK->nllen == 2 &&
2174 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2177 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2179 RRETURN(MATCH_NOMATCH);
2182 /* Either at end of string or \n before end. */
2188 /* Word boundary assertions */
2190 case OP_NOT_WORD_BOUNDARY:
2191 case OP_WORD_BOUNDARY:
2194 /* Find out if the previous and current characters are "word" characters.
2195 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2196 be "non-word" characters. Remember the earliest consulted character for
2197 partial matching. */
2202 /* Get status of previous character */
2204 if (eptr == md->start_subject) prev_is_word = FALSE; else
2206 PCRE_PUCHAR lastptr = eptr - 1;
2208 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2209 GETCHAR(c, lastptr);
2213 if (c == '_') prev_is_word = TRUE; else
2215 int cat = UCD_CATEGORY(c);
2216 prev_is_word = (cat == ucp_L || cat == ucp_N);
2221 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2224 /* Get status of next character */
2226 if (eptr >= md->end_subject)
2229 cur_is_word = FALSE;
2237 if (c == '_') cur_is_word = TRUE; else
2239 int cat = UCD_CATEGORY(c);
2240 cur_is_word = (cat == ucp_L || cat == ucp_N);
2245 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2251 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2252 consistency with the behaviour of \w we do use it in this case. */
2255 /* Get status of previous character */
2257 if (eptr == md->start_subject) prev_is_word = FALSE; else
2259 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2264 if (c == '_') prev_is_word = TRUE; else
2266 int cat = UCD_CATEGORY(c);
2267 prev_is_word = (cat == ucp_L || cat == ucp_N);
2272 prev_is_word = MAX_255(eptr[-1])
2273 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2276 /* Get status of next character */
2278 if (eptr >= md->end_subject)
2281 cur_is_word = FALSE;
2288 if (c == '_') cur_is_word = TRUE; else
2290 int cat = UCD_CATEGORY(c);
2291 cur_is_word = (cat == ucp_L || cat == ucp_N);
2296 cur_is_word = MAX_255(*eptr)
2297 && ((md->ctypes[*eptr] & ctype_word) != 0);
2300 /* Now see if the situation is what we want */
2302 if ((*ecode++ == OP_WORD_BOUNDARY)?
2303 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2304 RRETURN(MATCH_NOMATCH);
2308 /* Match any single character type except newline; have to take care with
2309 CRLF newlines and partial matching. */
2312 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2313 if (md->partial != 0 &&
2314 eptr + 1 >= md->end_subject &&
2315 NLBLOCK->nltype == NLTYPE_FIXED &&
2316 NLBLOCK->nllen == 2 &&
2317 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2320 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2325 /* Match any single character whatsoever. */
2328 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2329 { /* not be updated before SCHECK_PARTIAL. */
2331 RRETURN(MATCH_NOMATCH);
2335 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2340 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2341 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2344 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2345 { /* not be updated before SCHECK_PARTIAL. */
2347 RRETURN(MATCH_NOMATCH);
2354 if (eptr >= md->end_subject)
2357 RRETURN(MATCH_NOMATCH);
2359 GETCHARINCTEST(c, eptr);
2361 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2364 (md->ctypes[c] & ctype_digit) != 0
2366 RRETURN(MATCH_NOMATCH);
2371 if (eptr >= md->end_subject)
2374 RRETURN(MATCH_NOMATCH);
2376 GETCHARINCTEST(c, eptr);
2378 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2381 (md->ctypes[c] & ctype_digit) == 0
2383 RRETURN(MATCH_NOMATCH);
2387 case OP_NOT_WHITESPACE:
2388 if (eptr >= md->end_subject)
2391 RRETURN(MATCH_NOMATCH);
2393 GETCHARINCTEST(c, eptr);
2395 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2398 (md->ctypes[c] & ctype_space) != 0
2400 RRETURN(MATCH_NOMATCH);
2405 if (eptr >= md->end_subject)
2408 RRETURN(MATCH_NOMATCH);
2410 GETCHARINCTEST(c, eptr);
2412 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2415 (md->ctypes[c] & ctype_space) == 0
2417 RRETURN(MATCH_NOMATCH);
2421 case OP_NOT_WORDCHAR:
2422 if (eptr >= md->end_subject)
2425 RRETURN(MATCH_NOMATCH);
2427 GETCHARINCTEST(c, eptr);
2429 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2432 (md->ctypes[c] & ctype_word) != 0
2434 RRETURN(MATCH_NOMATCH);
2439 if (eptr >= md->end_subject)
2442 RRETURN(MATCH_NOMATCH);
2444 GETCHARINCTEST(c, eptr);
2446 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2449 (md->ctypes[c] & ctype_word) == 0
2451 RRETURN(MATCH_NOMATCH);
2456 if (eptr >= md->end_subject)
2459 RRETURN(MATCH_NOMATCH);
2461 GETCHARINCTEST(c, eptr);
2464 default: RRETURN(MATCH_NOMATCH);
2467 if (eptr >= md->end_subject)
2471 else if (UCHAR21TEST(eptr) == CHAR_LF) eptr++;
2483 #endif /* Not EBCDIC */
2484 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2491 if (eptr >= md->end_subject)
2494 RRETURN(MATCH_NOMATCH);
2496 GETCHARINCTEST(c, eptr);
2499 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2506 if (eptr >= md->end_subject)
2509 RRETURN(MATCH_NOMATCH);
2511 GETCHARINCTEST(c, eptr);
2514 HSPACE_CASES: break; /* Byte and multibyte cases */
2515 default: RRETURN(MATCH_NOMATCH);
2521 if (eptr >= md->end_subject)
2524 RRETURN(MATCH_NOMATCH);
2526 GETCHARINCTEST(c, eptr);
2529 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2536 if (eptr >= md->end_subject)
2539 RRETURN(MATCH_NOMATCH);
2541 GETCHARINCTEST(c, eptr);
2544 VSPACE_CASES: break;
2545 default: RRETURN(MATCH_NOMATCH);
2551 /* Check the next character by Unicode property. We will get here only
2552 if the support is in the binary; otherwise a compile-time error occurs. */
2556 if (eptr >= md->end_subject)
2559 RRETURN(MATCH_NOMATCH);
2561 GETCHARINCTEST(c, eptr);
2563 const pcre_uint32 *cp;
2564 const ucd_record *prop = GET_UCD(c);
2569 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2573 if ((prop->chartype == ucp_Lu ||
2574 prop->chartype == ucp_Ll ||
2575 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2576 RRETURN(MATCH_NOMATCH);
2580 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2581 RRETURN(MATCH_NOMATCH);
2585 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2586 RRETURN(MATCH_NOMATCH);
2590 if ((ecode[2] != prop->script) == (op == OP_PROP))
2591 RRETURN(MATCH_NOMATCH);
2594 /* These are specials */
2597 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2598 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2599 RRETURN(MATCH_NOMATCH);
2602 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2603 which means that Perl space and POSIX space are now identical. PCRE
2604 was changed at release 8.34. */
2606 case PT_SPACE: /* Perl space */
2607 case PT_PXSPACE: /* POSIX space */
2612 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2616 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
2617 (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
2623 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2624 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2625 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2626 RRETURN(MATCH_NOMATCH);
2630 cp = PRIV(ucd_caseless_sets) + ecode[2];
2634 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2636 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2641 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2642 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2643 c >= 0xe000) == (op == OP_NOTPROP))
2644 RRETURN(MATCH_NOMATCH);
2647 /* This should never occur */
2650 RRETURN(PCRE_ERROR_INTERNAL);
2657 /* Match an extended Unicode sequence. We will get here only if the support
2658 is in the binary; otherwise a compile-time error occurs. */
2661 if (eptr >= md->end_subject)
2664 RRETURN(MATCH_NOMATCH);
2669 GETCHARINCTEST(c, eptr);
2670 lgb = UCD_GRAPHBREAK(c);
2671 while (eptr < md->end_subject)
2674 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2675 rgb = UCD_GRAPHBREAK(c);
2676 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2684 #endif /* SUPPORT_UCP */
2687 /* Match a back reference, possibly repeatedly. Look past the end of the
2688 item to see if there is repeat information following. The code is similar
2689 to that for character classes, but repeated for efficiency. Then obey
2690 similar code to character type repeats - written out again for speed.
2691 However, if the referenced string is the empty string, always treat
2692 it as matched, any number of times (otherwise there could be infinite
2693 loops). If the reference is unset, there are two possibilities:
2695 (a) In the default, Perl-compatible state, set the length negative;
2696 this ensures that every attempt at a match fails. We can't just fail
2697 here, because of the possibility of quantifiers with zero minima.
2699 (b) If the JavaScript compatibility flag is set, set the length to zero
2700 so that the back reference matches an empty string.
2702 Otherwise, set the length to the length of what was matched by the
2703 referenced subpattern.
2705 The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
2706 or to a non-duplicated named group. For a duplicated named group, OP_DNREF
2707 and OP_DNREFI are used. In this case we must scan the list of groups to
2708 which the name refers, and use the first one that is set. */
2712 caseless = op == OP_DNREFI;
2714 int count = GET2(ecode, 1+IMM2_SIZE);
2715 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
2716 ecode += 1 + 2*IMM2_SIZE;
2718 /* Setting the default length first and initializing 'offset' avoids
2719 compiler warnings in the REF_REPEAT code. */
2721 length = (md->jscript_compat)? 0 : -1;
2726 offset = GET2(slot, 0) << 1;
2727 if (offset < offset_top && md->offset_vector[offset] >= 0)
2729 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2732 slot += md->name_entry_size;
2739 caseless = op == OP_REFI;
2740 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2741 ecode += 1 + IMM2_SIZE;
2742 if (offset >= offset_top || md->offset_vector[offset] < 0)
2743 length = (md->jscript_compat)? 0 : -1;
2745 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2747 /* Set up for repetition, or handle the non-repeated case */
2758 c = *ecode++ - OP_CRSTAR;
2759 minimize = (c & 1) != 0;
2760 min = rep_min[c]; /* Pick up values from tables; */
2761 max = rep_max[c]; /* zero for max => infinity */
2762 if (max == 0) max = INT_MAX;
2767 minimize = (*ecode == OP_CRMINRANGE);
2768 min = GET2(ecode, 1);
2769 max = GET2(ecode, 1 + IMM2_SIZE);
2770 if (max == 0) max = INT_MAX;
2771 ecode += 1 + 2 * IMM2_SIZE;
2774 default: /* No repeat follows */
2775 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2777 if (length == -2) eptr = md->end_subject; /* Partial match */
2779 RRETURN(MATCH_NOMATCH);
2782 continue; /* With the main loop */
2785 /* Handle repeated back references. If the length of the reference is
2786 zero, just continue with the main loop. If the length is negative, it
2787 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2788 zero, we can continue at the same level without recursion. For any other
2789 minimum, carrying on will result in NOMATCH. */
2791 if (length == 0) continue;
2792 if (length < 0 && min == 0) continue;
2794 /* First, ensure the minimum number of matches are present. We get back
2795 the length of the reference string explicitly rather than passing the
2796 address of eptr, so that eptr can be a register variable. */
2798 for (i = 1; i <= min; i++)
2801 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2803 if (slength == -2) eptr = md->end_subject; /* Partial match */
2805 RRETURN(MATCH_NOMATCH);
2810 /* If min = max, continue at the same level without recursion.
2811 They are not both allowed to be zero. */
2813 if (min == max) continue;
2815 /* If minimizing, keep trying and advancing the pointer */
2819 for (fi = min;; fi++)
2822 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2823 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2824 if (fi >= max) RRETURN(MATCH_NOMATCH);
2825 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2827 if (slength == -2) eptr = md->end_subject; /* Partial match */
2829 RRETURN(MATCH_NOMATCH);
2833 /* Control never gets here */
2836 /* If maximizing, find the longest string and work backwards */
2841 for (i = min; i < max; i++)
2844 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2846 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2847 the soft partial matching case. */
2849 if (slength == -2 && md->partial != 0 &&
2850 md->end_subject > md->start_used_ptr)
2853 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2862 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2863 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2866 RRETURN(MATCH_NOMATCH);
2868 /* Control never gets here */
2870 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2871 used when all the characters in the class have values in the range 0-255,
2872 and either the matching is caseful, or the characters are in the range
2873 0-127 when UTF-8 processing is enabled. The only difference between
2874 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2877 First, look past the end of the item to see if there is repeat information
2878 following. Then obey similar code to character type repeats - written out
2884 /* The data variable is saved across frames, so the byte map needs to
2886 #define BYTE_MAP ((pcre_uint8 *)data)
2887 data = ecode + 1; /* Save for matching */
2888 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2901 c = *ecode++ - OP_CRSTAR;
2902 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
2903 else possessive = TRUE;
2904 min = rep_min[c]; /* Pick up values from tables; */
2905 max = rep_max[c]; /* zero for max => infinity */
2906 if (max == 0) max = INT_MAX;
2912 minimize = (*ecode == OP_CRMINRANGE);
2913 possessive = (*ecode == OP_CRPOSRANGE);
2914 min = GET2(ecode, 1);
2915 max = GET2(ecode, 1 + IMM2_SIZE);
2916 if (max == 0) max = INT_MAX;
2917 ecode += 1 + 2 * IMM2_SIZE;
2920 default: /* No repeat follows */
2925 /* First, ensure the minimum number of matches are present. */
2930 for (i = 1; i <= min; i++)
2932 if (eptr >= md->end_subject)
2935 RRETURN(MATCH_NOMATCH);
2937 GETCHARINC(c, eptr);
2940 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2943 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2950 for (i = 1; i <= min; i++)
2952 if (eptr >= md->end_subject)
2955 RRETURN(MATCH_NOMATCH);
2958 #ifndef COMPILE_PCRE8
2961 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2965 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2969 /* If max == min we can continue with the main loop without the
2972 if (min == max) continue;
2974 /* If minimizing, keep testing the rest of the expression and advancing
2975 the pointer while it matches the class. */
2982 for (fi = min;; fi++)
2984 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2985 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2986 if (fi >= max) RRETURN(MATCH_NOMATCH);
2987 if (eptr >= md->end_subject)
2990 RRETURN(MATCH_NOMATCH);
2992 GETCHARINC(c, eptr);
2995 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2998 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3005 for (fi = min;; fi++)
3007 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
3008 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3009 if (fi >= max) RRETURN(MATCH_NOMATCH);
3010 if (eptr >= md->end_subject)
3013 RRETURN(MATCH_NOMATCH);
3016 #ifndef COMPILE_PCRE8
3019 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3023 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3026 /* Control never gets here */
3029 /* If maximizing, find the longest possible run, then work backwards. */
3038 for (i = min; i < max; i++)
3041 if (eptr >= md->end_subject)
3046 GETCHARLEN(c, eptr, len);
3049 if (op == OP_CLASS) break;
3052 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3056 if (possessive) continue; /* No backtracking */
3060 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3061 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3062 if (eptr-- == pp) break; /* Stop if tried at original pos */
3070 for (i = min; i < max; i++)
3072 if (eptr >= md->end_subject)
3078 #ifndef COMPILE_PCRE8
3081 if (op == OP_CLASS) break;
3085 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3089 if (possessive) continue; /* No backtracking */
3093 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3094 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3099 RRETURN(MATCH_NOMATCH);
3103 /* Control never gets here */
3106 /* Match an extended character class. In the 8-bit library, this opcode is
3107 encountered only when UTF-8 mode is supported. In the 16-bit and
3108 32-bit libraries, codepoints greater than 255 may be encountered even when
3109 UTF is not supported. */
3111 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3114 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3115 ecode += GET(ecode, 1); /* Advance past the item */
3128 c = *ecode++ - OP_CRSTAR;
3129 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
3130 else possessive = TRUE;
3131 min = rep_min[c]; /* Pick up values from tables; */
3132 max = rep_max[c]; /* zero for max => infinity */
3133 if (max == 0) max = INT_MAX;
3139 minimize = (*ecode == OP_CRMINRANGE);
3140 possessive = (*ecode == OP_CRPOSRANGE);
3141 min = GET2(ecode, 1);
3142 max = GET2(ecode, 1 + IMM2_SIZE);
3143 if (max == 0) max = INT_MAX;
3144 ecode += 1 + 2 * IMM2_SIZE;
3147 default: /* No repeat follows */
3152 /* First, ensure the minimum number of matches are present. */
3154 for (i = 1; i <= min; i++)
3156 if (eptr >= md->end_subject)
3159 RRETURN(MATCH_NOMATCH);
3161 GETCHARINCTEST(c, eptr);
3162 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3165 /* If max == min we can continue with the main loop without the
3168 if (min == max) continue;
3170 /* If minimizing, keep testing the rest of the expression and advancing
3171 the pointer while it matches the class. */
3175 for (fi = min;; fi++)
3177 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3178 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3179 if (fi >= max) RRETURN(MATCH_NOMATCH);
3180 if (eptr >= md->end_subject)
3183 RRETURN(MATCH_NOMATCH);
3185 GETCHARINCTEST(c, eptr);
3186 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3188 /* Control never gets here */
3191 /* If maximizing, find the longest possible run, then work backwards. */
3196 for (i = min; i < max; i++)
3199 if (eptr >= md->end_subject)
3205 GETCHARLENTEST(c, eptr, len);
3209 if (!PRIV(xclass)(c, data, utf)) break;
3213 if (possessive) continue; /* No backtracking */
3217 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3218 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3219 if (eptr-- == pp) break; /* Stop if tried at original pos */
3221 if (utf) BACKCHAR(eptr);
3224 RRETURN(MATCH_NOMATCH);
3227 /* Control never gets here */
3229 #endif /* End of XCLASS */
3231 /* Match a single character, casefully */
3239 GETCHARLEN(fc, ecode, length);
3240 if (length > md->end_subject - eptr)
3242 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3243 RRETURN(MATCH_NOMATCH);
3245 while (length-- > 0) if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH);
3251 if (md->end_subject - eptr < 1)
3253 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3254 RRETURN(MATCH_NOMATCH);
3256 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3261 /* Match a single character, caselessly. If we are at the end of the
3262 subject, give up immediately. */
3265 if (eptr >= md->end_subject)
3268 RRETURN(MATCH_NOMATCH);
3276 GETCHARLEN(fc, ecode, length);
3278 /* If the pattern character's value is < 128, we have only one byte, and
3279 we know that its other case must also be one byte long, so we can use the
3280 fast lookup table. We know that there is at least one byte left in the
3285 pcre_uint32 cc = UCHAR21(eptr);
3286 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3291 /* Otherwise we must pick up the subject character. Note that we cannot
3292 use the value of "length" to check for sufficient bytes left, because the
3293 other case of the character may have more or fewer bytes. */
3298 GETCHARINC(dc, eptr);
3301 /* If we have Unicode property support, we can use it to test the other
3302 case of the character, if there is one. */
3307 if (dc != UCD_OTHERCASE(fc))
3309 RRETURN(MATCH_NOMATCH);
3314 #endif /* SUPPORT_UTF */
3318 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3319 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3325 /* Match a single character repeatedly. */
3329 min = max = GET2(ecode, 1);
3330 ecode += 1 + IMM2_SIZE;
3343 max = GET2(ecode, 1);
3344 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3345 ecode += 1 + IMM2_SIZE;
3384 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3385 minimize = (c & 1) != 0;
3386 min = rep_min[c]; /* Pick up values from tables; */
3387 max = rep_max[c]; /* zero for max => infinity */
3388 if (max == 0) max = INT_MAX;
3390 /* Common code for all repeated single-character matches. We first check
3391 for the minimum number of characters. If the minimum equals the maximum, we
3392 are done. Otherwise, if minimizing, check the rest of the pattern for a
3393 match; if there isn't one, advance up to the maximum, one character at a
3396 If maximizing, advance up to the maximum number of matching characters,
3397 until eptr is past the end of the maximum run. If possessive, we are
3398 then done (no backing up). Otherwise, match at this position; anything
3399 other than no match is immediately returned. For nomatch, back up one
3400 character, unless we are matching \R and the last thing matched was
3401 \r\n, in which case, back up two bytes. When we reach the first optional
3402 character position, we can save stack by doing a tail recurse.
3404 The various UTF/non-UTF and caseful/caseless cases are handled separately,
3413 GETCHARLEN(fc, ecode, length);
3416 /* Handle multibyte character matching specially here. There is
3417 support for caseless matching if UCP support is present. */
3422 pcre_uint32 othercase;
3423 if (op >= OP_STARI && /* Caseless */
3424 (othercase = UCD_OTHERCASE(fc)) != fc)
3425 oclength = PRIV(ord2utf)(othercase, occhars);
3427 #endif /* SUPPORT_UCP */
3429 for (i = 1; i <= min; i++)
3431 if (eptr <= md->end_subject - length &&
3432 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3434 else if (oclength > 0 &&
3435 eptr <= md->end_subject - oclength &&
3436 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3437 #endif /* SUPPORT_UCP */
3441 RRETURN(MATCH_NOMATCH);
3445 if (min == max) continue;
3449 for (fi = min;; fi++)
3451 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3452 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3453 if (fi >= max) RRETURN(MATCH_NOMATCH);
3454 if (eptr <= md->end_subject - length &&
3455 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3457 else if (oclength > 0 &&
3458 eptr <= md->end_subject - oclength &&
3459 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3460 #endif /* SUPPORT_UCP */
3464 RRETURN(MATCH_NOMATCH);
3467 /* Control never gets here */
3473 for (i = min; i < max; i++)
3475 if (eptr <= md->end_subject - length &&
3476 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3478 else if (oclength > 0 &&
3479 eptr <= md->end_subject - oclength &&
3480 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3481 #endif /* SUPPORT_UCP */
3489 if (possessive) continue; /* No backtracking */
3492 if (eptr <= pp) goto TAIL_RECURSE;
3493 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3494 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3498 #else /* without SUPPORT_UCP */
3500 #endif /* SUPPORT_UCP */
3503 /* Control never gets here */
3506 /* If the length of a UTF-8 character is 1, we fall through here, and
3507 obey the code as for non-UTF-8 characters below, though in this case the
3508 value of fc will always be < 128. */
3511 #endif /* SUPPORT_UTF */
3512 /* When not in UTF-8 mode, load a single-byte character. */
3515 /* The value of fc at this point is always one character, though we may
3516 or may not be in UTF mode. The code is duplicated for the caseless and
3517 caseful cases, for speed, since matching characters is likely to be quite
3518 common. First, ensure the minimum number of matches are present. If min =
3519 max, continue at the same level without recursing. Otherwise, if
3520 minimizing, keep trying the rest of the expression and advancing one
3521 matching character if failing, up to the maximum. Alternatively, if
3522 maximizing, find the maximum number of characters and work backwards. */
3524 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3525 max, (char *)eptr));
3527 if (op >= OP_STARI) /* Caseless */
3529 #ifdef COMPILE_PCRE8
3530 /* fc must be < 128 if UTF is enabled. */
3535 if (utf && fc > 127)
3536 foc = UCD_OTHERCASE(fc);
3538 if (utf && fc > 127)
3540 #endif /* SUPPORT_UCP */
3542 #endif /* SUPPORT_UTF */
3543 foc = TABLE_GET(fc, md->fcc, fc);
3544 #endif /* COMPILE_PCRE8 */
3546 for (i = 1; i <= min; i++)
3548 pcre_uint32 cc; /* Faster than pcre_uchar */
3549 if (eptr >= md->end_subject)
3552 RRETURN(MATCH_NOMATCH);
3554 cc = UCHAR21TEST(eptr);
3555 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3558 if (min == max) continue;
3561 for (fi = min;; fi++)
3563 pcre_uint32 cc; /* Faster than pcre_uchar */
3564 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3565 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3566 if (fi >= max) RRETURN(MATCH_NOMATCH);
3567 if (eptr >= md->end_subject)
3570 RRETURN(MATCH_NOMATCH);
3572 cc = UCHAR21TEST(eptr);
3573 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3576 /* Control never gets here */
3581 for (i = min; i < max; i++)
3583 pcre_uint32 cc; /* Faster than pcre_uchar */
3584 if (eptr >= md->end_subject)
3589 cc = UCHAR21TEST(eptr);
3590 if (fc != cc && foc != cc) break;
3593 if (possessive) continue; /* No backtracking */
3596 if (eptr == pp) goto TAIL_RECURSE;
3597 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3599 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3601 /* Control never gets here */
3605 /* Caseful comparisons (includes all multi-byte characters) */
3609 for (i = 1; i <= min; i++)
3611 if (eptr >= md->end_subject)
3614 RRETURN(MATCH_NOMATCH);
3616 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3619 if (min == max) continue;
3623 for (fi = min;; fi++)
3625 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3626 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3627 if (fi >= max) RRETURN(MATCH_NOMATCH);
3628 if (eptr >= md->end_subject)
3631 RRETURN(MATCH_NOMATCH);
3633 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3635 /* Control never gets here */
3640 for (i = min; i < max; i++)
3642 if (eptr >= md->end_subject)
3647 if (fc != UCHAR21TEST(eptr)) break;
3650 if (possessive) continue; /* No backtracking */
3653 if (eptr == pp) goto TAIL_RECURSE;
3654 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3656 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3658 /* Control never gets here */
3661 /* Control never gets here */
3663 /* Match a negated single one-byte character. The character we are
3664 checking can be multibyte. */
3668 if (eptr >= md->end_subject)
3671 RRETURN(MATCH_NOMATCH);
3676 register pcre_uint32 ch, och;
3679 GETCHARINC(ch, ecode);
3680 GETCHARINC(c, eptr);
3684 if (ch == c) RRETURN(MATCH_NOMATCH);
3690 och = UCD_OTHERCASE(ch);
3694 #endif /* SUPPORT_UCP */
3696 och = TABLE_GET(ch, md->fcc, ch);
3697 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3703 register pcre_uint32 ch = ecode[1];
3705 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3706 RRETURN(MATCH_NOMATCH);
3711 /* Match a negated single one-byte character repeatedly. This is almost a
3712 repeat of the code for a repeated single character, but I haven't found a
3713 nice way of commoning these up that doesn't require a test of the
3714 positive/negative option for each character match. Maybe that wouldn't add
3715 very much to the time taken, but character matching *is* what this is all
3720 min = max = GET2(ecode, 1);
3721 ecode += 1 + IMM2_SIZE;
3727 case OP_NOTMINUPTOI:
3729 max = GET2(ecode, 1);
3730 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3731 ecode += 1 + IMM2_SIZE;
3735 case OP_NOTPOSSTARI:
3743 case OP_NOTPOSPLUSI:
3750 case OP_NOTPOSQUERY:
3751 case OP_NOTPOSQUERYI:
3759 case OP_NOTPOSUPTOI:
3762 max = GET2(ecode, 1);
3763 ecode += 1 + IMM2_SIZE;
3769 case OP_NOTMINSTARI:
3773 case OP_NOTMINPLUSI:
3776 case OP_NOTMINQUERY:
3777 case OP_NOTMINQUERYI:
3778 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3779 minimize = (c & 1) != 0;
3780 min = rep_min[c]; /* Pick up values from tables; */
3781 max = rep_max[c]; /* zero for max => infinity */
3782 if (max == 0) max = INT_MAX;
3784 /* Common code for all repeated single-byte matches. */
3787 GETCHARINCTEST(fc, ecode);
3789 /* The code is duplicated for the caseless and caseful cases, for speed,
3790 since matching characters is likely to be quite common. First, ensure the
3791 minimum number of matches are present. If min = max, continue at the same
3792 level without recursing. Otherwise, if minimizing, keep trying the rest of
3793 the expression and advancing one matching character if failing, up to the
3794 maximum. Alternatively, if maximizing, find the maximum number of
3795 characters and work backwards. */
3797 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3798 max, (char *)eptr));
3800 if (op >= OP_NOTSTARI) /* Caseless */
3804 if (utf && fc > 127)
3805 foc = UCD_OTHERCASE(fc);
3807 if (utf && fc > 127)
3809 #endif /* SUPPORT_UCP */
3811 #endif /* SUPPORT_UTF */
3812 foc = TABLE_GET(fc, md->fcc, fc);
3817 register pcre_uint32 d;
3818 for (i = 1; i <= min; i++)
3820 if (eptr >= md->end_subject)
3823 RRETURN(MATCH_NOMATCH);
3825 GETCHARINC(d, eptr);
3826 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3830 #endif /* SUPPORT_UTF */
3833 for (i = 1; i <= min; i++)
3835 if (eptr >= md->end_subject)
3838 RRETURN(MATCH_NOMATCH);
3840 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3845 if (min == max) continue;
3852 register pcre_uint32 d;
3853 for (fi = min;; fi++)
3855 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3856 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3857 if (fi >= max) RRETURN(MATCH_NOMATCH);
3858 if (eptr >= md->end_subject)
3861 RRETURN(MATCH_NOMATCH);
3863 GETCHARINC(d, eptr);
3864 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3868 #endif /*SUPPORT_UTF */
3871 for (fi = min;; fi++)
3873 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3874 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3875 if (fi >= max) RRETURN(MATCH_NOMATCH);
3876 if (eptr >= md->end_subject)
3879 RRETURN(MATCH_NOMATCH);
3881 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3885 /* Control never gets here */
3897 register pcre_uint32 d;
3898 for (i = min; i < max; i++)
3901 if (eptr >= md->end_subject)
3906 GETCHARLEN(d, eptr, len);
3907 if (fc == d || (unsigned int)foc == d) break;
3910 if (possessive) continue; /* No backtracking */
3913 if (eptr <= pp) goto TAIL_RECURSE;
3914 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3915 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3921 #endif /* SUPPORT_UTF */
3924 for (i = min; i < max; i++)
3926 if (eptr >= md->end_subject)
3931 if (fc == *eptr || foc == *eptr) break;
3934 if (possessive) continue; /* No backtracking */
3937 if (eptr == pp) goto TAIL_RECURSE;
3938 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3939 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3943 /* Control never gets here */
3947 /* Caseful comparisons */
3954 register pcre_uint32 d;
3955 for (i = 1; i <= min; i++)
3957 if (eptr >= md->end_subject)
3960 RRETURN(MATCH_NOMATCH);
3962 GETCHARINC(d, eptr);
3963 if (fc == d) RRETURN(MATCH_NOMATCH);
3970 for (i = 1; i <= min; i++)
3972 if (eptr >= md->end_subject)
3975 RRETURN(MATCH_NOMATCH);
3977 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3981 if (min == max) continue;
3988 register pcre_uint32 d;
3989 for (fi = min;; fi++)
3991 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3992 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3993 if (fi >= max) RRETURN(MATCH_NOMATCH);
3994 if (eptr >= md->end_subject)
3997 RRETURN(MATCH_NOMATCH);
3999 GETCHARINC(d, eptr);
4000 if (fc == d) RRETURN(MATCH_NOMATCH);
4007 for (fi = min;; fi++)
4009 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
4010 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4011 if (fi >= max) RRETURN(MATCH_NOMATCH);
4012 if (eptr >= md->end_subject)
4015 RRETURN(MATCH_NOMATCH);
4017 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
4020 /* Control never gets here */
4032 register pcre_uint32 d;
4033 for (i = min; i < max; i++)
4036 if (eptr >= md->end_subject)
4041 GETCHARLEN(d, eptr, len);
4045 if (possessive) continue; /* No backtracking */
4048 if (eptr <= pp) goto TAIL_RECURSE;
4049 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
4050 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4059 for (i = min; i < max; i++)
4061 if (eptr >= md->end_subject)
4066 if (fc == *eptr) break;
4069 if (possessive) continue; /* No backtracking */
4072 if (eptr == pp) goto TAIL_RECURSE;
4073 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
4074 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4078 /* Control never gets here */
4081 /* Control never gets here */
4083 /* Match a single character type repeatedly; several different opcodes
4084 share code. This is very similar to the code for single characters, but we
4085 repeat it in the interests of efficiency. */
4088 min = max = GET2(ecode, 1);
4090 ecode += 1 + IMM2_SIZE;
4094 case OP_TYPEMINUPTO:
4096 max = GET2(ecode, 1);
4097 minimize = *ecode == OP_TYPEMINUPTO;
4098 ecode += 1 + IMM2_SIZE;
4101 case OP_TYPEPOSSTAR:
4108 case OP_TYPEPOSPLUS:
4115 case OP_TYPEPOSQUERY:
4122 case OP_TYPEPOSUPTO:
4125 max = GET2(ecode, 1);
4126 ecode += 1 + IMM2_SIZE;
4130 case OP_TYPEMINSTAR:
4132 case OP_TYPEMINPLUS:
4134 case OP_TYPEMINQUERY:
4135 c = *ecode++ - OP_TYPESTAR;
4136 minimize = (c & 1) != 0;
4137 min = rep_min[c]; /* Pick up values from tables; */
4138 max = rep_max[c]; /* zero for max => infinity */
4139 if (max == 0) max = INT_MAX;
4141 /* Common code for all repeated single character type matches. Note that
4142 in UTF-8 mode, '.' matches a character of any length, but for the other
4143 character types, the valid characters are all one-byte long. */
4146 ctype = *ecode++; /* Code for the character type */
4149 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4151 prop_fail_result = ctype == OP_NOTPROP;
4152 prop_type = *ecode++;
4153 prop_value = *ecode++;
4155 else prop_type = -1;
4158 /* First, ensure the minimum number of matches are present. Use inline
4159 code for maximizing the speed, and do the type test once at the start
4160 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4161 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4162 and single-bytes. */
4172 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4173 for (i = 1; i <= min; i++)
4175 if (eptr >= md->end_subject)
4178 RRETURN(MATCH_NOMATCH);
4180 GETCHARINCTEST(c, eptr);
4185 for (i = 1; i <= min; i++)
4188 if (eptr >= md->end_subject)
4191 RRETURN(MATCH_NOMATCH);
4193 GETCHARINCTEST(c, eptr);
4194 chartype = UCD_CHARTYPE(c);
4195 if ((chartype == ucp_Lu ||
4196 chartype == ucp_Ll ||
4197 chartype == ucp_Lt) == prop_fail_result)
4198 RRETURN(MATCH_NOMATCH);
4203 for (i = 1; i <= min; i++)
4205 if (eptr >= md->end_subject)
4208 RRETURN(MATCH_NOMATCH);
4210 GETCHARINCTEST(c, eptr);
4211 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4212 RRETURN(MATCH_NOMATCH);
4217 for (i = 1; i <= min; i++)
4219 if (eptr >= md->end_subject)
4222 RRETURN(MATCH_NOMATCH);
4224 GETCHARINCTEST(c, eptr);
4225 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4226 RRETURN(MATCH_NOMATCH);
4231 for (i = 1; i <= min; i++)
4233 if (eptr >= md->end_subject)
4236 RRETURN(MATCH_NOMATCH);
4238 GETCHARINCTEST(c, eptr);
4239 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4240 RRETURN(MATCH_NOMATCH);
4245 for (i = 1; i <= min; i++)
4248 if (eptr >= md->end_subject)
4251 RRETURN(MATCH_NOMATCH);
4253 GETCHARINCTEST(c, eptr);
4254 category = UCD_CATEGORY(c);
4255 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4256 RRETURN(MATCH_NOMATCH);
4260 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4261 which means that Perl space and POSIX space are now identical. PCRE
4262 was changed at release 8.34. */
4264 case PT_SPACE: /* Perl space */
4265 case PT_PXSPACE: /* POSIX space */
4266 for (i = 1; i <= min; i++)
4268 if (eptr >= md->end_subject)
4271 RRETURN(MATCH_NOMATCH);
4273 GETCHARINCTEST(c, eptr);
4278 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4282 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
4283 RRETURN(MATCH_NOMATCH);
4290 for (i = 1; i <= min; i++)
4293 if (eptr >= md->end_subject)
4296 RRETURN(MATCH_NOMATCH);
4298 GETCHARINCTEST(c, eptr);
4299 category = UCD_CATEGORY(c);
4300 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4301 == prop_fail_result)
4302 RRETURN(MATCH_NOMATCH);
4307 for (i = 1; i <= min; i++)
4309 const pcre_uint32 *cp;
4310 if (eptr >= md->end_subject)
4313 RRETURN(MATCH_NOMATCH);
4315 GETCHARINCTEST(c, eptr);
4316 cp = PRIV(ucd_caseless_sets) + prop_value;
4320 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4322 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4328 for (i = 1; i <= min; i++)
4330 if (eptr >= md->end_subject)
4333 RRETURN(MATCH_NOMATCH);
4335 GETCHARINCTEST(c, eptr);
4336 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4337 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4338 c >= 0xe000) == prop_fail_result)
4339 RRETURN(MATCH_NOMATCH);
4343 /* This should not occur */
4346 RRETURN(PCRE_ERROR_INTERNAL);
4350 /* Match extended Unicode sequences. We will get here only if the
4351 support is in the binary; otherwise a compile-time error occurs. */
4353 else if (ctype == OP_EXTUNI)
4355 for (i = 1; i <= min; i++)
4357 if (eptr >= md->end_subject)
4360 RRETURN(MATCH_NOMATCH);
4365 GETCHARINCTEST(c, eptr);
4366 lgb = UCD_GRAPHBREAK(c);
4367 while (eptr < md->end_subject)
4370 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4371 rgb = UCD_GRAPHBREAK(c);
4372 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4382 #endif /* SUPPORT_UCP */
4384 /* Handle all other cases when the coding is UTF-8 */
4387 if (utf) switch(ctype)
4390 for (i = 1; i <= min; i++)
4392 if (eptr >= md->end_subject)
4395 RRETURN(MATCH_NOMATCH);
4397 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4398 if (md->partial != 0 &&
4399 eptr + 1 >= md->end_subject &&
4400 NLBLOCK->nltype == NLTYPE_FIXED &&
4401 NLBLOCK->nllen == 2 &&
4402 UCHAR21(eptr) == NLBLOCK->nl[0])
4405 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4408 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4413 for (i = 1; i <= min; i++)
4415 if (eptr >= md->end_subject)
4418 RRETURN(MATCH_NOMATCH);
4421 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4426 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4431 for (i = 1; i <= min; i++)
4433 if (eptr >= md->end_subject)
4436 RRETURN(MATCH_NOMATCH);
4438 GETCHARINC(c, eptr);
4441 default: RRETURN(MATCH_NOMATCH);
4444 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
4456 #endif /* Not EBCDIC */
4457 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4464 for (i = 1; i <= min; i++)
4466 if (eptr >= md->end_subject)
4469 RRETURN(MATCH_NOMATCH);
4471 GETCHARINC(c, eptr);
4474 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4481 for (i = 1; i <= min; i++)
4483 if (eptr >= md->end_subject)
4486 RRETURN(MATCH_NOMATCH);
4488 GETCHARINC(c, eptr);
4491 HSPACE_CASES: break; /* Byte and multibyte cases */
4492 default: RRETURN(MATCH_NOMATCH);
4498 for (i = 1; i <= min; i++)
4500 if (eptr >= md->end_subject)
4503 RRETURN(MATCH_NOMATCH);
4505 GETCHARINC(c, eptr);
4508 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4515 for (i = 1; i <= min; i++)
4517 if (eptr >= md->end_subject)
4520 RRETURN(MATCH_NOMATCH);
4522 GETCHARINC(c, eptr);
4525 VSPACE_CASES: break;
4526 default: RRETURN(MATCH_NOMATCH);
4532 for (i = 1; i <= min; i++)
4534 if (eptr >= md->end_subject)
4537 RRETURN(MATCH_NOMATCH);
4539 GETCHARINC(c, eptr);
4540 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4541 RRETURN(MATCH_NOMATCH);
4546 for (i = 1; i <= min; i++)
4549 if (eptr >= md->end_subject)
4552 RRETURN(MATCH_NOMATCH);
4555 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4556 RRETURN(MATCH_NOMATCH);
4558 /* No need to skip more bytes - we know it's a 1-byte character */
4562 case OP_NOT_WHITESPACE:
4563 for (i = 1; i <= min; i++)
4566 if (eptr >= md->end_subject)
4569 RRETURN(MATCH_NOMATCH);
4572 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4573 RRETURN(MATCH_NOMATCH);
4575 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4580 for (i = 1; i <= min; i++)
4583 if (eptr >= md->end_subject)
4586 RRETURN(MATCH_NOMATCH);
4589 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4590 RRETURN(MATCH_NOMATCH);
4592 /* No need to skip more bytes - we know it's a 1-byte character */
4596 case OP_NOT_WORDCHAR:
4597 for (i = 1; i <= min; i++)
4600 if (eptr >= md->end_subject)
4603 RRETURN(MATCH_NOMATCH);
4606 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4607 RRETURN(MATCH_NOMATCH);
4609 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4614 for (i = 1; i <= min; i++)
4617 if (eptr >= md->end_subject)
4620 RRETURN(MATCH_NOMATCH);
4623 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4624 RRETURN(MATCH_NOMATCH);
4626 /* No need to skip more bytes - we know it's a 1-byte character */
4631 RRETURN(PCRE_ERROR_INTERNAL);
4632 } /* End switch(ctype) */
4635 #endif /* SUPPORT_UTF */
4637 /* Code for the non-UTF-8 case for minimum matching of operators other
4638 than OP_PROP and OP_NOTPROP. */
4643 for (i = 1; i <= min; i++)
4645 if (eptr >= md->end_subject)
4648 RRETURN(MATCH_NOMATCH);
4650 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4651 if (md->partial != 0 &&
4652 eptr + 1 >= md->end_subject &&
4653 NLBLOCK->nltype == NLTYPE_FIXED &&
4654 NLBLOCK->nllen == 2 &&
4655 *eptr == NLBLOCK->nl[0])
4658 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4665 if (eptr > md->end_subject - min)
4668 RRETURN(MATCH_NOMATCH);
4674 if (eptr > md->end_subject - min)
4677 RRETURN(MATCH_NOMATCH);
4683 for (i = 1; i <= min; i++)
4685 if (eptr >= md->end_subject)
4688 RRETURN(MATCH_NOMATCH);
4692 default: RRETURN(MATCH_NOMATCH);
4695 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4704 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4708 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4715 for (i = 1; i <= min; i++)
4717 if (eptr >= md->end_subject)
4720 RRETURN(MATCH_NOMATCH);
4726 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4727 HSPACE_MULTIBYTE_CASES:
4729 RRETURN(MATCH_NOMATCH);
4735 for (i = 1; i <= min; i++)
4737 if (eptr >= md->end_subject)
4740 RRETURN(MATCH_NOMATCH);
4744 default: RRETURN(MATCH_NOMATCH);
4746 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4747 HSPACE_MULTIBYTE_CASES:
4755 for (i = 1; i <= min; i++)
4757 if (eptr >= md->end_subject)
4760 RRETURN(MATCH_NOMATCH);
4765 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4766 VSPACE_MULTIBYTE_CASES:
4768 RRETURN(MATCH_NOMATCH);
4775 for (i = 1; i <= min; i++)
4777 if (eptr >= md->end_subject)
4780 RRETURN(MATCH_NOMATCH);
4784 default: RRETURN(MATCH_NOMATCH);
4786 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4787 VSPACE_MULTIBYTE_CASES:
4795 for (i = 1; i <= min; i++)
4797 if (eptr >= md->end_subject)
4800 RRETURN(MATCH_NOMATCH);
4802 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4803 RRETURN(MATCH_NOMATCH);
4809 for (i = 1; i <= min; i++)
4811 if (eptr >= md->end_subject)
4814 RRETURN(MATCH_NOMATCH);
4816 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4817 RRETURN(MATCH_NOMATCH);
4822 case OP_NOT_WHITESPACE:
4823 for (i = 1; i <= min; i++)
4825 if (eptr >= md->end_subject)
4828 RRETURN(MATCH_NOMATCH);
4830 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4831 RRETURN(MATCH_NOMATCH);
4837 for (i = 1; i <= min; i++)
4839 if (eptr >= md->end_subject)
4842 RRETURN(MATCH_NOMATCH);
4844 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4845 RRETURN(MATCH_NOMATCH);
4850 case OP_NOT_WORDCHAR:
4851 for (i = 1; i <= min; i++)
4853 if (eptr >= md->end_subject)
4856 RRETURN(MATCH_NOMATCH);
4858 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4859 RRETURN(MATCH_NOMATCH);
4865 for (i = 1; i <= min; i++)
4867 if (eptr >= md->end_subject)
4870 RRETURN(MATCH_NOMATCH);
4872 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4873 RRETURN(MATCH_NOMATCH);
4879 RRETURN(PCRE_ERROR_INTERNAL);
4883 /* If min = max, continue at the same level without recursing */
4885 if (min == max) continue;
4887 /* If minimizing, we have to test the rest of the pattern before each
4888 subsequent match. Again, separate the UTF-8 case for speed, and also
4889 separate the UCP cases. */
4899 for (fi = min;; fi++)
4901 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4902 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4903 if (fi >= max) RRETURN(MATCH_NOMATCH);
4904 if (eptr >= md->end_subject)
4907 RRETURN(MATCH_NOMATCH);
4909 GETCHARINCTEST(c, eptr);
4910 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4912 /* Control never gets here */
4915 for (fi = min;; fi++)
4918 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4919 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4920 if (fi >= max) RRETURN(MATCH_NOMATCH);
4921 if (eptr >= md->end_subject)
4924 RRETURN(MATCH_NOMATCH);
4926 GETCHARINCTEST(c, eptr);
4927 chartype = UCD_CHARTYPE(c);
4928 if ((chartype == ucp_Lu ||
4929 chartype == ucp_Ll ||
4930 chartype == ucp_Lt) == prop_fail_result)
4931 RRETURN(MATCH_NOMATCH);
4933 /* Control never gets here */
4936 for (fi = min;; fi++)
4938 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4939 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4940 if (fi >= max) RRETURN(MATCH_NOMATCH);
4941 if (eptr >= md->end_subject)
4944 RRETURN(MATCH_NOMATCH);
4946 GETCHARINCTEST(c, eptr);
4947 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4948 RRETURN(MATCH_NOMATCH);
4950 /* Control never gets here */
4953 for (fi = min;; fi++)
4955 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4956 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4957 if (fi >= max) RRETURN(MATCH_NOMATCH);
4958 if (eptr >= md->end_subject)
4961 RRETURN(MATCH_NOMATCH);
4963 GETCHARINCTEST(c, eptr);
4964 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4965 RRETURN(MATCH_NOMATCH);
4967 /* Control never gets here */
4970 for (fi = min;; fi++)
4972 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4973 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4974 if (fi >= max) RRETURN(MATCH_NOMATCH);
4975 if (eptr >= md->end_subject)
4978 RRETURN(MATCH_NOMATCH);
4980 GETCHARINCTEST(c, eptr);
4981 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4982 RRETURN(MATCH_NOMATCH);
4984 /* Control never gets here */
4987 for (fi = min;; fi++)
4990 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4991 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4992 if (fi >= max) RRETURN(MATCH_NOMATCH);
4993 if (eptr >= md->end_subject)
4996 RRETURN(MATCH_NOMATCH);
4998 GETCHARINCTEST(c, eptr);
4999 category = UCD_CATEGORY(c);
5000 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5001 RRETURN(MATCH_NOMATCH);
5003 /* Control never gets here */
5005 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5006 which means that Perl space and POSIX space are now identical. PCRE
5007 was changed at release 8.34. */
5009 case PT_SPACE: /* Perl space */
5010 case PT_PXSPACE: /* POSIX space */
5011 for (fi = min;; fi++)
5013 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
5014 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5015 if (fi >= max) RRETURN(MATCH_NOMATCH);
5016 if (eptr >= md->end_subject)
5019 RRETURN(MATCH_NOMATCH);
5021 GETCHARINCTEST(c, eptr);
5026 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
5030 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5031 RRETURN(MATCH_NOMATCH);
5035 /* Control never gets here */
5038 for (fi = min;; fi++)
5041 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
5042 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5043 if (fi >= max) RRETURN(MATCH_NOMATCH);
5044 if (eptr >= md->end_subject)
5047 RRETURN(MATCH_NOMATCH);
5049 GETCHARINCTEST(c, eptr);
5050 category = UCD_CATEGORY(c);
5051 if ((category == ucp_L ||
5052 category == ucp_N ||
5053 c == CHAR_UNDERSCORE)
5054 == prop_fail_result)
5055 RRETURN(MATCH_NOMATCH);
5057 /* Control never gets here */
5060 for (fi = min;; fi++)
5062 const pcre_uint32 *cp;
5063 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
5064 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5065 if (fi >= max) RRETURN(MATCH_NOMATCH);
5066 if (eptr >= md->end_subject)
5069 RRETURN(MATCH_NOMATCH);
5071 GETCHARINCTEST(c, eptr);
5072 cp = PRIV(ucd_caseless_sets) + prop_value;
5076 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5078 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5081 /* Control never gets here */
5084 for (fi = min;; fi++)
5086 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
5087 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5088 if (fi >= max) RRETURN(MATCH_NOMATCH);
5089 if (eptr >= md->end_subject)
5092 RRETURN(MATCH_NOMATCH);
5094 GETCHARINCTEST(c, eptr);
5095 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5096 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5097 c >= 0xe000) == prop_fail_result)
5098 RRETURN(MATCH_NOMATCH);
5100 /* Control never gets here */
5102 /* This should never occur */
5104 RRETURN(PCRE_ERROR_INTERNAL);
5108 /* Match extended Unicode sequences. We will get here only if the
5109 support is in the binary; otherwise a compile-time error occurs. */
5111 else if (ctype == OP_EXTUNI)
5113 for (fi = min;; fi++)
5115 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5116 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5117 if (fi >= max) RRETURN(MATCH_NOMATCH);
5118 if (eptr >= md->end_subject)
5121 RRETURN(MATCH_NOMATCH);
5126 GETCHARINCTEST(c, eptr);
5127 lgb = UCD_GRAPHBREAK(c);
5128 while (eptr < md->end_subject)
5131 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5132 rgb = UCD_GRAPHBREAK(c);
5133 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5142 #endif /* SUPPORT_UCP */
5147 for (fi = min;; fi++)
5149 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5150 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5151 if (fi >= max) RRETURN(MATCH_NOMATCH);
5152 if (eptr >= md->end_subject)
5155 RRETURN(MATCH_NOMATCH);
5157 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5158 RRETURN(MATCH_NOMATCH);
5159 GETCHARINC(c, eptr);
5162 case OP_ANY: /* This is the non-NL case */
5163 if (md->partial != 0 && /* Take care with CRLF partial */
5164 eptr >= md->end_subject &&
5165 NLBLOCK->nltype == NLTYPE_FIXED &&
5166 NLBLOCK->nllen == 2 &&
5167 c == NLBLOCK->nl[0])
5170 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5181 default: RRETURN(MATCH_NOMATCH);
5183 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
5195 #endif /* Not EBCDIC */
5196 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5204 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5212 HSPACE_CASES: break;
5213 default: RRETURN(MATCH_NOMATCH);
5220 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5228 VSPACE_CASES: break;
5229 default: RRETURN(MATCH_NOMATCH);
5234 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5235 RRETURN(MATCH_NOMATCH);
5239 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5240 RRETURN(MATCH_NOMATCH);
5243 case OP_NOT_WHITESPACE:
5244 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5245 RRETURN(MATCH_NOMATCH);
5249 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5250 RRETURN(MATCH_NOMATCH);
5253 case OP_NOT_WORDCHAR:
5254 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5255 RRETURN(MATCH_NOMATCH);
5259 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5260 RRETURN(MATCH_NOMATCH);
5264 RRETURN(PCRE_ERROR_INTERNAL);
5272 for (fi = min;; fi++)
5274 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5275 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5276 if (fi >= max) RRETURN(MATCH_NOMATCH);
5277 if (eptr >= md->end_subject)
5280 RRETURN(MATCH_NOMATCH);
5282 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5283 RRETURN(MATCH_NOMATCH);
5287 case OP_ANY: /* This is the non-NL case */
5288 if (md->partial != 0 && /* Take care with CRLF partial */
5289 eptr >= md->end_subject &&
5290 NLBLOCK->nltype == NLTYPE_FIXED &&
5291 NLBLOCK->nllen == 2 &&
5292 c == NLBLOCK->nl[0])
5295 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5306 default: RRETURN(MATCH_NOMATCH);
5308 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5317 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5321 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5331 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5332 HSPACE_MULTIBYTE_CASES:
5334 RRETURN(MATCH_NOMATCH);
5341 default: RRETURN(MATCH_NOMATCH);
5343 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5344 HSPACE_MULTIBYTE_CASES:
5355 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5356 VSPACE_MULTIBYTE_CASES:
5358 RRETURN(MATCH_NOMATCH);
5365 default: RRETURN(MATCH_NOMATCH);
5367 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5368 VSPACE_MULTIBYTE_CASES:
5375 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5379 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5382 case OP_NOT_WHITESPACE:
5383 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5387 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5390 case OP_NOT_WORDCHAR:
5391 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5395 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5399 RRETURN(PCRE_ERROR_INTERNAL);
5403 /* Control never gets here */
5406 /* If maximizing, it is worth using inline code for speed, doing the type
5407 test once at the start (i.e. keep it out of the loop). Again, keep the
5408 UTF-8 and UCP stuff separate. */
5412 pp = eptr; /* Remember where we started */
5420 for (i = min; i < max; i++)
5423 if (eptr >= md->end_subject)
5428 GETCHARLENTEST(c, eptr, len);
5429 if (prop_fail_result) break;
5435 for (i = min; i < max; i++)
5439 if (eptr >= md->end_subject)
5444 GETCHARLENTEST(c, eptr, len);
5445 chartype = UCD_CHARTYPE(c);
5446 if ((chartype == ucp_Lu ||
5447 chartype == ucp_Ll ||
5448 chartype == ucp_Lt) == prop_fail_result)
5455 for (i = min; i < max; i++)
5458 if (eptr >= md->end_subject)
5463 GETCHARLENTEST(c, eptr, len);
5464 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5470 for (i = min; i < max; i++)
5473 if (eptr >= md->end_subject)
5478 GETCHARLENTEST(c, eptr, len);
5479 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5485 for (i = min; i < max; i++)
5488 if (eptr >= md->end_subject)
5493 GETCHARLENTEST(c, eptr, len);
5494 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5500 for (i = min; i < max; i++)
5504 if (eptr >= md->end_subject)
5509 GETCHARLENTEST(c, eptr, len);
5510 category = UCD_CATEGORY(c);
5511 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5517 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5518 which means that Perl space and POSIX space are now identical. PCRE
5519 was changed at release 8.34. */
5521 case PT_SPACE: /* Perl space */
5522 case PT_PXSPACE: /* POSIX space */
5523 for (i = min; i < max; i++)
5526 if (eptr >= md->end_subject)
5531 GETCHARLENTEST(c, eptr, len);
5536 if (prop_fail_result) goto ENDLOOP99; /* Break the loop */
5540 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5541 goto ENDLOOP99; /* Break the loop */
5550 for (i = min; i < max; i++)
5554 if (eptr >= md->end_subject)
5559 GETCHARLENTEST(c, eptr, len);
5560 category = UCD_CATEGORY(c);
5561 if ((category == ucp_L || category == ucp_N ||
5562 c == CHAR_UNDERSCORE) == prop_fail_result)
5569 for (i = min; i < max; i++)
5571 const pcre_uint32 *cp;
5573 if (eptr >= md->end_subject)
5578 GETCHARLENTEST(c, eptr, len);
5579 cp = PRIV(ucd_caseless_sets) + prop_value;
5583 { if (prop_fail_result) break; else goto GOT_MAX; }
5585 { if (prop_fail_result) goto GOT_MAX; else break; }
5593 for (i = min; i < max; i++)
5596 if (eptr >= md->end_subject)
5601 GETCHARLENTEST(c, eptr, len);
5602 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5603 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5604 c >= 0xe000) == prop_fail_result)
5611 RRETURN(PCRE_ERROR_INTERNAL);
5614 /* eptr is now past the end of the maximum run */
5616 if (possessive) continue; /* No backtracking */
5619 if (eptr <= pp) goto TAIL_RECURSE;
5620 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5621 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5623 if (utf) BACKCHAR(eptr);
5627 /* Match extended Unicode grapheme clusters. We will get here only if the
5628 support is in the binary; otherwise a compile-time error occurs. */
5630 else if (ctype == OP_EXTUNI)
5632 for (i = min; i < max; i++)
5634 if (eptr >= md->end_subject)
5642 GETCHARINCTEST(c, eptr);
5643 lgb = UCD_GRAPHBREAK(c);
5644 while (eptr < md->end_subject)
5647 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5648 rgb = UCD_GRAPHBREAK(c);
5649 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5657 /* eptr is now past the end of the maximum run */
5659 if (possessive) continue; /* No backtracking */
5661 /* We use <= pp rather than == pp to detect the start of the run while
5662 backtracking because the use of \C in UTF mode can cause BACKCHAR to
5663 move back past pp. This is just palliative; the use of \C in UTF mode
5664 is fraught with danger. */
5671 if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */
5672 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5673 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5675 /* Backtracking over an extended grapheme cluster involves inspecting
5676 the previous two characters (if present) to see if a break is
5677 permitted between them. */
5680 if (!utf) c = *eptr; else
5685 rgb = UCD_GRAPHBREAK(c);
5689 if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */
5691 if (!utf) c = *fptr; else
5696 lgb = UCD_GRAPHBREAK(c);
5697 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5705 #endif /* SUPPORT_UCP */
5713 for (i = min; i < max; i++)
5715 if (eptr >= md->end_subject)
5720 if (IS_NEWLINE(eptr)) break;
5721 if (md->partial != 0 && /* Take care with CRLF partial */
5722 eptr + 1 >= md->end_subject &&
5723 NLBLOCK->nltype == NLTYPE_FIXED &&
5724 NLBLOCK->nllen == 2 &&
5725 UCHAR21(eptr) == NLBLOCK->nl[0])
5728 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5731 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5738 for (i = min; i < max; i++)
5740 if (eptr >= md->end_subject)
5746 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5751 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5756 /* The byte case is the same as non-UTF8 */
5760 if (c > (unsigned int)(md->end_subject - eptr))
5762 eptr = md->end_subject;
5769 for (i = min; i < max; i++)
5772 if (eptr >= md->end_subject)
5777 GETCHARLEN(c, eptr, len);
5780 if (++eptr >= md->end_subject) break;
5781 if (UCHAR21(eptr) == CHAR_LF) eptr++;
5787 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5789 && c != 0x2028 && c != 0x2029
5790 #endif /* Not EBCDIC */
5800 for (i = min; i < max; i++)
5804 if (eptr >= md->end_subject)
5809 GETCHARLEN(c, eptr, len);
5812 HSPACE_CASES: gotspace = TRUE; break;
5813 default: gotspace = FALSE; break;
5815 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5822 for (i = min; i < max; i++)
5826 if (eptr >= md->end_subject)
5831 GETCHARLEN(c, eptr, len);
5834 VSPACE_CASES: gotspace = TRUE; break;
5835 default: gotspace = FALSE; break;
5837 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5843 for (i = min; i < max; i++)
5846 if (eptr >= md->end_subject)
5851 GETCHARLEN(c, eptr, len);
5852 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5858 for (i = min; i < max; i++)
5861 if (eptr >= md->end_subject)
5866 GETCHARLEN(c, eptr, len);
5867 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5872 case OP_NOT_WHITESPACE:
5873 for (i = min; i < max; i++)
5876 if (eptr >= md->end_subject)
5881 GETCHARLEN(c, eptr, len);
5882 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5888 for (i = min; i < max; i++)
5891 if (eptr >= md->end_subject)
5896 GETCHARLEN(c, eptr, len);
5897 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5902 case OP_NOT_WORDCHAR:
5903 for (i = min; i < max; i++)
5906 if (eptr >= md->end_subject)
5911 GETCHARLEN(c, eptr, len);
5912 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5918 for (i = min; i < max; i++)
5921 if (eptr >= md->end_subject)
5926 GETCHARLEN(c, eptr, len);
5927 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5933 RRETURN(PCRE_ERROR_INTERNAL);
5936 if (possessive) continue; /* No backtracking */
5939 if (eptr <= pp) goto TAIL_RECURSE;
5940 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5941 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5944 if (ctype == OP_ANYNL && eptr > pp && UCHAR21(eptr) == CHAR_NL &&
5945 UCHAR21(eptr - 1) == CHAR_CR) eptr--;
5949 #endif /* SUPPORT_UTF */
5955 for (i = min; i < max; i++)
5957 if (eptr >= md->end_subject)
5962 if (IS_NEWLINE(eptr)) break;
5963 if (md->partial != 0 && /* Take care with CRLF partial */
5964 eptr + 1 >= md->end_subject &&
5965 NLBLOCK->nltype == NLTYPE_FIXED &&
5966 NLBLOCK->nllen == 2 &&
5967 *eptr == NLBLOCK->nl[0])
5970 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5979 if (c > (unsigned int)(md->end_subject - eptr))
5981 eptr = md->end_subject;
5988 for (i = min; i < max; i++)
5990 if (eptr >= md->end_subject)
5998 if (++eptr >= md->end_subject) break;
5999 if (*eptr == CHAR_LF) eptr++;
6003 if (c != CHAR_LF && (md->bsr_anycrlf ||
6004 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
6005 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6006 && c != 0x2028 && c != 0x2029
6015 for (i = min; i < max; i++)
6017 if (eptr >= md->end_subject)
6024 default: eptr++; break;
6026 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6027 HSPACE_MULTIBYTE_CASES:
6036 for (i = min; i < max; i++)
6038 if (eptr >= md->end_subject)
6045 default: goto ENDLOOP01;
6047 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6048 HSPACE_MULTIBYTE_CASES:
6057 for (i = min; i < max; i++)
6059 if (eptr >= md->end_subject)
6066 default: eptr++; break;
6068 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6069 VSPACE_MULTIBYTE_CASES:
6078 for (i = min; i < max; i++)
6080 if (eptr >= md->end_subject)
6087 default: goto ENDLOOP03;
6089 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6090 VSPACE_MULTIBYTE_CASES:
6099 for (i = min; i < max; i++)
6101 if (eptr >= md->end_subject)
6106 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6112 for (i = min; i < max; i++)
6114 if (eptr >= md->end_subject)
6119 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6124 case OP_NOT_WHITESPACE:
6125 for (i = min; i < max; i++)
6127 if (eptr >= md->end_subject)
6132 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6138 for (i = min; i < max; i++)
6140 if (eptr >= md->end_subject)
6145 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6150 case OP_NOT_WORDCHAR:
6151 for (i = min; i < max; i++)
6153 if (eptr >= md->end_subject)
6158 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6164 for (i = min; i < max; i++)
6166 if (eptr >= md->end_subject)
6171 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6177 RRETURN(PCRE_ERROR_INTERNAL);
6180 if (possessive) continue; /* No backtracking */
6183 if (eptr == pp) goto TAIL_RECURSE;
6184 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6185 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6187 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6188 eptr[-1] == CHAR_CR) eptr--;
6192 /* Control never gets here */
6195 /* There's been some horrible disaster. Arrival here can only mean there is
6196 something seriously wrong in the code above or the OP_xxx definitions. */
6199 DPRINTF(("Unknown opcode %d\n", *ecode));
6200 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6203 /* Do not stick any code in here without much thought; it is assumed
6204 that "continue" in the code above comes out to here to repeat the main
6207 } /* End of main loop */
6208 /* Control never reaches here */
6211 /* When compiling to use the heap rather than the stack for recursive calls to
6212 match(), the RRETURN() macro jumps here. The number that is saved in
6213 frame->Xwhere indicates which label we actually want to return to. */
6216 #define LBL(val) case val: goto L_RM##val;
6218 switch (frame->Xwhere)
6220 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6221 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6222 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6223 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6224 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6226 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6231 LBL(22) LBL(23) LBL(28) LBL(30)
6232 LBL(32) LBL(34) LBL(42) LBL(46)
6234 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6235 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67)
6236 #endif /* SUPPORT_UCP */
6237 #endif /* SUPPORT_UTF */
6239 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6240 return PCRE_ERROR_INTERNAL;
6243 #endif /* NO_RECURSE */
6247 /***************************************************************************
6248 ****************************************************************************
6249 RECURSION IN THE match() FUNCTION
6251 Undefine all the macros that were defined above to handle this. */
6269 #undef new_recursive /* Macro belonging to the match() recursion handling described above */
6282 #undef save_capture_last /* Likewise undefined now that match() is complete */
6292 /* These two are defined as macros in both cases */
6297 /***************************************************************************
6298 ***************************************************************************/
6302 /*************************************************
6303 * Release allocated heap frames *
6304 *************************************************/
6306 /* This function releases all the allocated frames. The base frame is on the
6307 machine stack, and so must not be freed.
6309 Argument: the address of the base frame
6314 release_match_heapframes (heapframe *frame_base)
/* Begin with the first frame chained after the base frame; frame_base itself is
never handed to stack_free. */
6316 heapframe *nextframe = frame_base->Xnextframe;
/* Follow the Xnextframe links until the end of the chain (NULL) is reached. */
6317 while (nextframe != NULL)
6319 heapframe *oldframe = nextframe;
6320 nextframe = nextframe->Xnextframe; /* Fetch the link before the frame is freed */
6321 (PUBL(stack_free))(oldframe);
6327 /*************************************************
6328 * Execute a Regular Expression *
6329 *************************************************/
6331 /* This function applies a compiled re to a subject string and picks out
6332 portions of the string if it matches. Two elements in the vector are set for
6333 each substring: the offsets to the start and end of the substring.
6336 argument_re points to the compiled expression
6337 extra_data points to extra data or is NULL
6338 subject points to the subject string
6339 length length of subject string (may contain binary zeros)
6340 start_offset where to start in the subject string
6342 offsets points to a vector of ints to be filled in with offsets
6343 offsetcount the number of elements in the vector
6345 Returns: > 0 => success; value is the number of elements filled in
6346 = 0 => success, but offsets is not big enough
6347 -1 => failed to match
6348 < -1 => some kind of unexpected problem
6351 #if defined COMPILE_PCRE8
6352 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6353 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6354 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6356 #elif defined COMPILE_PCRE16
6357 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6358 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6359 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6361 #elif defined COMPILE_PCRE32
6362 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6363 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6364 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6368 int rc, ocount, arg_offset_max;
6370 BOOL using_temporary_offsets = FALSE;
6375 BOOL has_first_char = FALSE;
6376 BOOL has_req_char = FALSE;
6377 pcre_uchar first_char = 0;
6378 pcre_uchar first_char2 = 0;
6379 pcre_uchar req_char = 0;
6380 pcre_uchar req_char2 = 0;
6381 match_data match_block;
6382 match_data *md = &match_block;
6383 const pcre_uint8 *tables;
6384 const pcre_uint8 *start_bits = NULL;
6385 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6386 PCRE_PUCHAR end_subject;
6387 PCRE_PUCHAR start_partial = NULL;
6388 PCRE_PUCHAR match_partial = NULL;
6389 PCRE_PUCHAR req_char_ptr = start_match - 1;
6391 const pcre_study_data *study;
6392 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6395 heapframe frame_zero;
6396 frame_zero.Xprevframe = NULL; /* Marks the top level */
6397 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6398 md->match_frames_base = &frame_zero;
6401 /* Check for the special magic call that measures the size of the stack used
6402 per recursive call of match(). Without the funny casting for sizeof, a Windows
6403 compiler gave this error: "unary minus operator applied to unsigned type,
6404 result still unsigned". Hopefully the cast fixes that. */
6406 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6407 start_offset == -999)
6409 return -((int)sizeof(heapframe));
6411 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6414 /* Plausibility checks */
6416 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6417 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6418 return PCRE_ERROR_NULL;
6419 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6420 if (length < 0) return PCRE_ERROR_BADLENGTH;
6421 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6423 /* Check that the first field in the block is the magic number. If it is not,
6424 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6425 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6426 means that the pattern is likely compiled with different endianness. */
6428 if (re->magic_number != MAGIC_NUMBER)
6429 return re->magic_number == REVERSED_MAGIC_NUMBER?
6430 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6431 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6433 /* These two settings are used in the code for checking a UTF-8 string that
6434 follows immediately afterwards. Other values in the md block are used only
6435 during "normal" pcre_exec() processing, not when the JIT support is in use,
6436 so they are set up later. */
6438 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6439 utf = md->utf = (re->options & PCRE_UTF8) != 0;
/* md->partial encodes the partial-matching mode: 2 for PCRE_PARTIAL_HARD, 1 for
PCRE_PARTIAL_SOFT, and 0 when neither option is set. */
6440 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6441 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6443 /* Check a UTF-8 string if required. Pass back the character offset and error
6444 code for an invalid string if a results vector is available. */
6447 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6450 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6453 if (offsetcount >= 2)
6455 offsets[0] = erroroffset;
6456 offsets[1] = errorcode;
6458 #if defined COMPILE_PCRE8
/* With hard partial matching (md->partial > 1), error codes up to
PCRE_UTF8_ERR5 are reported as PCRE_ERROR_SHORTUTF8 rather than
PCRE_ERROR_BADUTF8. */
6459 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6460 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6461 #elif defined COMPILE_PCRE16
6462 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6463 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6464 #elif defined COMPILE_PCRE32
6465 return PCRE_ERROR_BADUTF32;
6468 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6469 /* Check that a start_offset points to the start of a UTF character. */
6470 if (start_offset > 0 && start_offset < length &&
6471 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6472 return PCRE_ERROR_BADUTF8_OFFSET;
6477 /* If the pattern was successfully studied with JIT support, run the JIT
6478 executable instead of the rest of this function. Most options must be set at
6479 compile time for the JIT code to be usable. Fall back to the normal code path if
6480 an unsupported flag is set. */
6483 if (extra_data != NULL
6484 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6485 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6486 && extra_data->executable_jit != NULL
6487 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6489 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6490 start_offset, options, offsets, offsetcount);
6492 /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial matching
6493 mode is not compiled. In this case we simply fall back to the interpreter. */
6495 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6499 /* Carry on with non-JIT matching. This information is for finding all the
6500 numbers associated with a given name, for condition testing. */
6502 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6503 md->name_count = re->name_count;
6504 md->name_entry_size = re->name_entry_size;
6506 /* Fish out the optional data from the extra_data structure, first setting
6507 the default values. */
6510 md->match_limit = MATCH_LIMIT;
6511 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6512 md->callout_data = NULL;
6514 /* The table pointer is always in native byte order. */
6516 tables = re->tables;
6518 /* The two limit values override the defaults, whatever their value. */
6520 if (extra_data != NULL)
6522 unsigned long int flags = extra_data->flags;
6523 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6524 study = (const pcre_study_data *)extra_data->study_data;
6525 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6526 md->match_limit = extra_data->match_limit;
6527 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6528 md->match_limit_recursion = extra_data->match_limit_recursion;
6529 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6530 md->callout_data = extra_data->callout_data;
6531 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6534 /* Limits in the regex override only if they are smaller. */
6536 if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit)
6537 md->match_limit = re->limit_match;
6539 if ((re->flags & PCRE_RLSET) != 0 &&
6540 re->limit_recursion < md->match_limit_recursion)
6541 md->match_limit_recursion = re->limit_recursion;
6543 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6544 is a feature that makes it possible to save compiled regex and re-use them
6545 in other programs later. */
6547 if (tables == NULL) tables = PRIV(default_tables);
6549 /* Set up other data */
6551 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6552 startline = (re->flags & PCRE_STARTLINE) != 0;
6553 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6555 /* The code starts after the real_pcre block and the capture name table. */
6557 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6558 re->name_count * re->name_entry_size;
6560 md->start_subject = (PCRE_PUCHAR)subject;
6561 md->start_offset = start_offset;
6562 md->end_subject = md->start_subject + length;
6563 end_subject = md->end_subject;
6565 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6566 md->use_ucp = (re->options & PCRE_UCP) != 0;
6567 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6568 md->ignore_skip_arg = 0;
6570 /* Some options are unpacked into BOOL variables in the hope that testing
6571 them will be faster than individual option bits. */
6573 md->notbol = (options & PCRE_NOTBOL) != 0;
6574 md->noteol = (options & PCRE_NOTEOL) != 0;
6575 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6576 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6579 md->mark = md->nomatch_mark = NULL; /* In case never set */
6581 md->recursive = NULL; /* No recursion at top level */
6582 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6584 md->lcc = tables + lcc_offset;
6585 md->fcc = tables + fcc_offset;
6586 md->ctypes = tables + ctypes_offset;
6588 /* Handle different \R options. */
6590 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6593 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6594 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6597 md->bsr_anycrlf = TRUE;
6599 md->bsr_anycrlf = FALSE;
6603 case PCRE_BSR_ANYCRLF:
6604 md->bsr_anycrlf = TRUE;
6607 case PCRE_BSR_UNICODE:
6608 md->bsr_anycrlf = FALSE;
6611 default: return PCRE_ERROR_BADNEWLINE;
6614 /* Handle different types of newline. The three bits give eight cases. If
6615 nothing is set at run time, whatever was used at compile time applies. */
6617 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6618 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6620 case 0: newline = NEWLINE; break; /* Compile-time default */
6621 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6622 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
/* A two-character newline is packed as (first << 8) | second; it is unpacked
into md->nl[0] and md->nl[1] further down. */
6623 case PCRE_NEWLINE_CR+
6624 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6625 case PCRE_NEWLINE_ANY: newline = -1; break;
6626 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6627 default: return PCRE_ERROR_BADNEWLINE;
6632 md->nltype = NLTYPE_ANYCRLF;
6634 else if (newline < 0)
6636 md->nltype = NLTYPE_ANY;
6640 md->nltype = NLTYPE_FIXED;
6644 md->nl[0] = (newline >> 8) & 255;
6645 md->nl[1] = newline & 255;
6650 md->nl[0] = newline;
6654 /* Partial matching was originally supported only for a restricted set of
6655 regexes; from release 8.00 there are no restrictions, but the bits are still
6656 defined (though never set). So there's no harm in leaving this code. */
6658 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6659 return PCRE_ERROR_BADPARTIAL;
6661 /* If the expression has got more back references than the offsets supplied can
6662 hold, we get a temporary chunk of working store to use during the matching.
6663 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6666 ocount = offsetcount - (offsetcount % 3);
6667 arg_offset_max = (2*ocount)/3;
6669 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6671 ocount = re->top_backref * 3 + 3;
6672 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6673 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6674 using_temporary_offsets = TRUE;
6675 DPRINTF(("Got memory to hold back references\n"));
6677 else md->offset_vector = offsets;
6678 md->offset_end = ocount;
6679 md->offset_max = (2*ocount)/3;
6680 md->capture_last = 0;
6682 /* Reset the working variable associated with each extraction. These should
6683 never be used unless previously set, but they get saved and restored, and so we
6684 initialize them to avoid reading uninitialized locations. Also, unset the
6685 offsets for the matched string. This is really just for tidiness with callouts,
6686 in case they inspect these fields. */
6688 if (md->offset_vector != NULL)
6690 register int *iptr = md->offset_vector + ocount;
6691 register int *iend = iptr - re->top_bracket;
/* Never go below slot 2 in this loop: slots 0 and 1 are reset separately just
below. */
6692 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6693 while (--iptr >= iend) *iptr = -1;
6694 if (offsetcount > 0) md->offset_vector[0] = -1;
6695 if (offsetcount > 1) md->offset_vector[1] = -1;
6698 /* Set up the first character to match, if available. The first_char value is
6699 never set for an anchored regular expression, but the anchoring may be forced
6700 at run time, so we have to test for anchoring. The first char may be unset for
6701 an unanchored pattern, of course. If there's no first char and the pattern was
6702 studied, there may be a bitmap of possible first characters. */
6706 if ((re->flags & PCRE_FIRSTSET) != 0)
6708 has_first_char = TRUE;
6709 first_char = first_char2 = (pcre_uchar)(re->first_char);
6710 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6712 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6713 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6714 if (utf && first_char > 127)
6715 first_char2 = UCD_OTHERCASE(first_char);
6720 if (!startline && study != NULL &&
6721 (study->flags & PCRE_STUDY_MAPPED) != 0)
6722 start_bits = study->start_bits;
6725 /* For anchored or unanchored matches, there may be a "last known required
6728 if ((re->flags & PCRE_REQCHSET) != 0)
6730 has_req_char = TRUE;
6731 req_char = req_char2 = (pcre_uchar)(re->req_char);
6732 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6734 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6735 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6736 if (utf && req_char > 127)
6737 req_char2 = UCD_OTHERCASE(req_char);
6743 /* ==========================================================================*/
6745 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6746 the loop runs just once. */
6750 PCRE_PUCHAR save_end_subject = end_subject;
6751 PCRE_PUCHAR new_start_match;
6753 /* If firstline is TRUE, the start of the match is constrained to the first
6754 line of a multiline string. That is, the match must be before or at the first
6755 newline. Implement this by temporarily adjusting end_subject so that we stop
6756 scanning at a newline. If the match fails at the newline, later code breaks
6761 PCRE_PUCHAR t = start_match;
6765 while (t < md->end_subject && !IS_NEWLINE(t))
6768 ACROSSCHAR(t < end_subject, *t, t++);
6773 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6777 /* There are some optimizations that avoid running the match if a known
6778 starting point is not found, or if a known later character is not present.
6779 However, there is an option that disables these, for testing and for ensuring
6780 that all callouts do actually occur. The option can be set in the regex by
6781 (*NO_START_OPT) or passed in match-time options. */
6783 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6785 /* Advance to a unique first char if there is one. */
6791 if (first_char != first_char2)
6792 while (start_match < end_subject &&
6793 (smc = UCHAR21TEST(start_match)) != first_char && smc != first_char2)
6796 while (start_match < end_subject && UCHAR21TEST(start_match) != first_char)
6800 /* Or to just after a linebreak for a multiline match */
6804 if (start_match > md->start_subject + start_offset)
6809 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6812 ACROSSCHAR(start_match < end_subject, *start_match,
6818 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6821 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6822 and we are now at a LF, advance the match position by one more character.
6825 if (start_match[-1] == CHAR_CR &&
6826 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6827 start_match < end_subject &&
6828 UCHAR21TEST(start_match) == CHAR_NL)
6833 /* Or to a non-unique first byte after study */
6835 else if (start_bits != NULL)
6837 while (start_match < end_subject)
6839 register pcre_uint32 c = UCHAR21TEST(start_match);
/* start_bits is tested as a bitmap: byte c/8, bit c&7. Outside the 8-bit
library, values above 255 are first clamped to 255. */
6840 #ifndef COMPILE_PCRE8
6841 if (c > 255) c = 255;
6843 if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
6847 } /* Starting optimizations */
6849 /* Restore fudged end_subject */
6851 end_subject = save_end_subject;
6853 /* The following two optimizations are disabled for partial matching or if
6854 disabling is explicitly requested. */
6856 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6858 /* If the pattern was studied, a minimum subject length may be set. This is
6859 a lower bound; no actual string of that length may actually match the
6860 pattern. Although the value is, strictly, in characters, we treat it as
6861 bytes to avoid spending too much time in this optimization. */
6863 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6864 (pcre_uint32)(end_subject - start_match) < study->minlength)
6870 /* If req_char is set, we know that that character must appear in the
6871 subject for the match to succeed. If the first character is set, req_char
6872 must be later in the subject; otherwise the test starts at the match point.
6873 This optimization can save a huge amount of backtracking in patterns with
6874 nested unlimited repeats that aren't going to match. Writing separate code
6875 for cased/caseless versions makes it go faster, as does using an
6876 autoincrement and backing off on a match.
6878 HOWEVER: when the subject string is very, very long, searching to its end
6879 can take a long time, and give bad performance on quite ordinary patterns.
6880 This showed up when somebody was matching something like /^\d+C/ on a
6881 32-megabyte string... so we don't do this when the string is sufficiently
6884 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6886 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6888 /* We don't need to repeat the search if we haven't yet reached the
6889 place we found it at last time. */
6891 if (p > req_char_ptr)
6893 if (req_char != req_char2)
6895 while (p < end_subject)
6897 register pcre_uint32 pp = UCHAR21INCTEST(p);
6898 if (pp == req_char || pp == req_char2) { p--; break; }
6903 while (p < end_subject)
6905 if (UCHAR21INCTEST(p) == req_char) { p--; break; }
6909 /* If we can't find the required character, break the matching loop,
6910 forcing a match failure. */
6912 if (p >= end_subject)
6918 /* If we have found the required character, save the point where we
6919 found it, so that we don't search again next time round the loop if
6920 the start hasn't passed this character yet. */
6927 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6928 printf(">>>> Match against: ");
6929 pchars(start_match, end_subject - start_match, TRUE, md);
6933 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6934 first starting point for which a partial match was found. */
6936 md->start_match_ptr = start_match;
6937 md->start_used_ptr = start_match;
6938 md->match_call_count = 0;
6939 md->match_function_type = 0;
6940 md->end_offset_top = 0;
6941 md->skip_arg_count = 0;
/* NOTE(review): the literal 2 is presumably the initial offset_top, with slots
0 and 1 reserved for the overall match -- confirm against match()'s parameter
list. */
6942 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
6943 if (md->hitend && start_partial == NULL)
6945 start_partial = md->start_used_ptr;
6946 match_partial = start_match;
6951 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6952 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6953 entirely. The only way we can do that is to re-do the match at the same
6954 point, with a flag to force SKIP with an argument to be ignored. Just
6955 treating this case as NOMATCH does not work because it does not check other
6956 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6958 case MATCH_SKIP_ARG:
6959 new_start_match = start_match;
6960 md->ignore_skip_arg = md->skip_arg_count;
6963 /* SKIP passes back the next starting point explicitly, but if it is no
6964 greater than the match we have just done, treat it as NOMATCH. */
6967 if (md->start_match_ptr > start_match)
6969 new_start_match = md->start_match_ptr;
6974 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6975 exactly like PRUNE. Unset ignore SKIP-with-argument. */
6980 md->ignore_skip_arg = 0;
6981 new_start_match = start_match + 1;
6984 ACROSSCHAR(new_start_match < end_subject, *new_start_match,
6989 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6995 /* Any other return is either a match, or some kind of error. */
7001 /* Control reaches here for the various types of "no match at this point"
7002 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
7006 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
7007 newline in the subject (though it may continue over the newline). Therefore,
7008 if we have just failed to match, starting at a newline, do not continue. */
7010 if (firstline && IS_NEWLINE(start_match)) break;
7012 /* Advance to new matching position */
7014 start_match = new_start_match;
7016 /* Break the loop if the pattern is anchored or if we have passed the end of
7019 if (anchored || start_match > end_subject) break;
7021 /* If we have just passed a CR and we are now at a LF, and the pattern does
7022 not contain any explicit matches for \r or \n, and the newline option is CRLF
7023 or ANY or ANYCRLF, advance the match position by one more character. In
7024 normal matching start_match will always be greater than the first position at
7025 this stage, but a failed *SKIP can cause a return at the same point, which is
7026 why the first test exists. */
7028 if (start_match > (PCRE_PUCHAR)subject + start_offset &&
7029 start_match[-1] == CHAR_CR &&
7030 start_match < end_subject &&
7031 *start_match == CHAR_NL &&
7032 (re->flags & PCRE_HASCRORLF) == 0 &&
7033 (md->nltype == NLTYPE_ANY ||
7034 md->nltype == NLTYPE_ANYCRLF ||
7038 md->mark = NULL; /* Reset for start of next match attempt */
7039 } /* End of for(;;) "bumpalong" loop */
7041 /* ==========================================================================*/
7043 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
7046 (1) The pattern is anchored or the match was failed by (*COMMIT);
7048 (2) We are past the end of the subject;
7050 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
7051 this option requests that a match occur at or before the first newline in
7054 When we have a match and the offset vector is big enough to deal with any
7055 backreferences, captured substring offsets will already be set up. In the case
7056 where we had to get some local store to hold offsets for backreference
7057 processing, copy those that we can. In this case there need not be overflow if
7058 certain parts of the pattern were not used, even though there are more
7059 capturing parentheses than vector slots. */
7063 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
7065 if (using_temporary_offsets)
7067 if (arg_offset_max >= 4)
7069 memcpy(offsets + 2, md->offset_vector + 2,
7070 (arg_offset_max - 2) * sizeof(int));
7071 DPRINTF(("Copied offsets from temporary memory\n"));
/* More offsets were set than the caller's vector can hold: flag overflow so
that the return code computed below becomes 0. */
7073 if (md->end_offset_top > arg_offset_max) md->capture_last |= OVFLBIT;
7074 DPRINTF(("Freeing temporary memory\n"));
7075 (PUBL(free))(md->offset_vector);
7078 /* Set the return code to the number of captured strings, or 0 if there were
7079 too many to fit into the vector. */
7081 rc = ((md->capture_last & OVFLBIT) != 0 &&
7082 md->end_offset_top >= arg_offset_max)?
7083 0 : md->end_offset_top/2;
7085 /* If there is space in the offset vector, set any unused pairs at the end of
7086 the pattern to -1 for backwards compatibility. It is documented that this
7087 happens. In earlier versions, the whole set of potential capturing offsets
7088 was set to -1 each time round the loop, but this is handled differently now.
7089 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
7090 those at the end that need unsetting here. We can't just unset them all at
7091 the start of the whole thing because they may get set in one branch that is
7092 not the final matching branch. */
7094 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
7096 register int *iptr, *iend;
7097 int resetcount = 2 + re->top_bracket * 2;
7098 if (resetcount > offsetcount) resetcount = offsetcount;
7099 iptr = offsets + md->end_offset_top;
7100 iend = offsets + resetcount;
7101 while (iptr < iend) *iptr++ = -1;
7104 /* If there is space, set up the whole thing as substring 0. The value of
7105 md->start_match_ptr might be modified if \K was encountered on the success
7108 if (offsetcount < 2) rc = 0; else
7110 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
7111 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
7114 /* Return MARK data if requested */
7116 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
7117 *(extra_data->mark) = (pcre_uchar *)md->mark;
7118 DPRINTF((">>>> returning %d\n", rc));
7120 release_match_heapframes(&frame_zero);
7125 /* Control gets here if there has been an error, or if the overall match
7126 attempt has failed at all permitted starting positions. */
7128 if (using_temporary_offsets)
7130 DPRINTF(("Freeing temporary memory\n"));
7131 (PUBL(free))(md->offset_vector);
7134 /* For anything other than nomatch or partial match, just return the code. */
7136 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
7138 DPRINTF((">>>> error: returning %d\n", rc));
7140 release_match_heapframes(&frame_zero);
7145 /* Handle partial matches - disable any mark data */
7147 if (match_partial != NULL)
7149 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
/* Report the partial match relative to the subject: offsets[0] comes from
start_partial (the start_used_ptr saved from the first attempt that set hitend),
offsets[1] is the end of the subject, and, when there is room, offsets[2] is
match_partial (the start_match of that same attempt). */
7151 if (offsetcount > 1)
7153 offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject);
7154 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
7155 if (offsetcount > 2)
7156 offsets[2] = (int)(match_partial - (PCRE_PUCHAR)subject);
7158 rc = PCRE_ERROR_PARTIAL;
7161 /* This is the classic nomatch case */
7165 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
7166 rc = PCRE_ERROR_NOMATCH;
7169 /* Return the MARK data if it has been requested. */
7171 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
7172 *(extra_data->mark) = (pcre_uchar *)md->nomatch_mark;
7174 release_match_heapframes(&frame_zero);
7179 /* End of pcre_exec.c */