1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Copyright (c) 1997-2014 University of Cambridge
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
52 #include "pcre_internal.h"
54 /* Undefine some potentially clashing cpp symbols */
59 /* The md->capture_last field uses the lower 16 bits for the last captured
60 substring (which can never be greater than 65535) and a bit in the top half
61 to mean "capture vector overflowed". This odd way of doing things was
62 implemented when it was realized that preserving and restoring the overflow bit
63 whenever the last capture number was saved/restored made for a neater
64 interface, and doing it this way saved on (a) another variable, which would
65 have increased the stack frame size (a big NO-NO in PCRE) and (b) another
66 separate set of save/restore instructions. The following defines are used in
69 #define CAPLMASK 0x0000ffff /* The bits used for last_capture */
70 #define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */
71 #define OVFLBIT 0x00010000 /* The bit that is set for overflow */
73 /* Values for setting in md->match_function_type to indicate two special types
74 of call to match(). We do it this way to save on using another stack variable,
75 as stack usage is to be discouraged. */
77 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
78 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
80 /* Non-error returns from the match() function. Error returns are externally
81 defined PCRE_ERROR_xxx codes, which are all negative. */
84 #define MATCH_NOMATCH 0
86 /* Special internal returns from the match() function. Make them sufficiently
87 negative to avoid the external error codes. */
89 #define MATCH_ACCEPT (-999)
90 #define MATCH_KETRPOS (-998)
91 #define MATCH_ONCE (-997)
92 /* The next 5 must be kept together and in sequence so that a test that checks
93 for any one of them can use a range. */
94 #define MATCH_COMMIT (-996)
95 #define MATCH_PRUNE (-995)
96 #define MATCH_SKIP (-994)
97 #define MATCH_SKIP_ARG (-993)
98 #define MATCH_THEN (-992)
99 #define MATCH_BACKTRACK_MAX MATCH_THEN
100 #define MATCH_BACKTRACK_MIN MATCH_COMMIT
102 /* Maximum number of ints of offset to save on the stack for recursive calls.
103 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
104 because the offset vector is always a multiple of 3 long. */
106 #define REC_STACK_SAVE_MAX 30
108 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
110 static const char rep_min[] = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, };
111 static const char rep_max[] = { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, };
114 /*************************************************
115 * Debugging function to print chars *
116 *************************************************/
118 /* Print a sequence of chars in printable format, stopping at the end of the
119 subject if requested.
122 p points to characters
123 length number to print
124 is_subject TRUE if printing from within md->start_subject
125 md pointer to matching data block, if is_subject is TRUE
131 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
135 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
137 if (isprint(c = UCHAR21INCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
143 /*************************************************
144 * Match a back-reference *
145 *************************************************/
147 /* Normally, if a back reference hasn't been set, the length that is passed is
148 negative, so the match always fails. However, in JavaScript compatibility mode,
149 the length passed is zero. Note that in caseless UTF-8 mode, the number of
150 subject bytes matched may be different to the number of reference bytes.
153 offset index into the offset vector
154 eptr pointer into the subject
155 length length of reference to be matched (number of bytes)
156 md points to match data block
157 caseless TRUE if caseless
159 Returns: >= 0 the number of subject bytes matched
161 -2 partial match; always given if at end of subject
165 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
168 PCRE_PUCHAR eptr_start = eptr;
169 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
170 #if defined SUPPORT_UTF && defined SUPPORT_UCP
175 if (eptr >= md->end_subject)
176 printf("matching subject <null>");
179 printf("matching subject ");
180 pchars(eptr, length, TRUE, md);
182 printf(" against backref ");
183 pchars(p, length, FALSE, md);
187 /* Always fail if reference not set (and not JavaScript compatible - in that
188 case the length is passed as zero). */
190 if (length < 0) return -1;
192 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
193 properly if Unicode properties are supported. Otherwise, we can check only
198 #if defined SUPPORT_UTF && defined SUPPORT_UCP
201 /* Match characters up to the end of the reference. NOTE: the number of
202 data units matched may differ, because in UTF-8 there are some characters
203 whose upper and lower case versions have different numbers of bytes.
204 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
205 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
206 sequence of two of the latter. It is important, therefore, to check the
207 length along the reference, not along the subject (earlier code did this
210 PCRE_PUCHAR endptr = p + length;
214 const ucd_record *ur;
215 if (eptr >= md->end_subject) return -2; /* Partial match */
219 if (c != d && c != d + ur->other_case)
221 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
224 if (c < *pp) return -1;
225 if (c == *pp++) break;
233 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
234 is no UCP support. */
239 if (eptr >= md->end_subject) return -2; /* Partial match */
240 cc = UCHAR21TEST(eptr);
242 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
249 /* In the caseful case, we can just compare the bytes, whether or not we
250 are in UTF-8 mode. */
256 if (eptr >= md->end_subject) return -2; /* Partial match */
257 if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1;
261 return (int)(eptr - eptr_start);
266 /***************************************************************************
267 ****************************************************************************
268 RECURSION IN THE match() FUNCTION
270 The match() function is highly recursive, though not every recursive call
271 increases the recursive depth. Nevertheless, some regular expressions can cause
272 it to recurse to a great depth. I was writing for Unix, so I just let it call
273 itself recursively. This uses the stack for saving everything that has to be
274 saved for a recursive call. On Unix, the stack can be large, and this works
277 It turns out that on some non-Unix-like systems there are problems with
278 programs that use a lot of stack. (This despite the fact that every last chip
279 has oodles of memory these days, and techniques for extending the stack have
280 been known for decades.) So....
282 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
283 calls by keeping local variables that need to be preserved in blocks of memory
284 obtained from malloc() instead of on the stack. Macros are used to
285 achieve this so that the actual code doesn't look very different to what it
288 The original heap-recursive code used longjmp(). However, it seems that this
289 can be very slow on some operating systems. Following a suggestion from Stan
290 Switzer, the use of longjmp() has been abolished, at the cost of having to
291 provide a unique number for each call to RMATCH. There is no way of generating
292 a sequence of numbers at compile time in C. I have given them names, to make
293 them stand out more clearly.
295 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
296 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
297 tests. Furthermore, not using longjmp() means that local dynamic variables
298 don't have indeterminate values; this has meant that the frame size can be
299 reduced because the result can be "passed back" by straight setting of the
300 variable instead of being passed in the frame.
301 ****************************************************************************
302 ***************************************************************************/
304 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
305 below must be updated in sync. */
307 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
308 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
309 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
310 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
311 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
312 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
313 RM61, RM62, RM63, RM64, RM65, RM66, RM67 };
315 /* These versions of the macros use the stack, as normal. There are debugging
316 versions and production versions. Note that the "rw" argument of RMATCH isn't
317 actually used in this definition. */
320 #define REGISTER register
323 #define RMATCH(ra,rb,rc,rd,re,rw) \
325 printf("match() called in line %d\n", __LINE__); \
326 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
327 printf("to line %d\n", __LINE__); \
329 #define RRETURN(ra) \
331 printf("match() returned %d from line %d\n", ra, __LINE__); \
335 #define RMATCH(ra,rb,rc,rd,re,rw) \
336 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
337 #define RRETURN(ra) return ra
343 /* These versions of the macros manage a private stack on the heap. Note that
344 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
345 argument of match(), which never changes. */
349 #define RMATCH(ra,rb,rc,rd,re,rw)\
351 heapframe *newframe = frame->Xnextframe;\
352 if (newframe == NULL)\
354 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
355 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
356 newframe->Xnextframe = NULL;\
357 frame->Xnextframe = newframe;\
360 newframe->Xeptr = ra;\
361 newframe->Xecode = rb;\
362 newframe->Xmstart = mstart;\
363 newframe->Xoffset_top = rc;\
364 newframe->Xeptrb = re;\
365 newframe->Xrdepth = frame->Xrdepth + 1;\
366 newframe->Xprevframe = frame;\
368 DPRINTF(("restarting from line %d\n", __LINE__));\
371 DPRINTF(("jumped back to line %d\n", __LINE__));\
376 heapframe *oldframe = frame;\
377 frame = oldframe->Xprevframe;\
387 /* Structure for remembering the local variables in a private frame */
389 typedef struct heapframe {
390 struct heapframe *Xprevframe;
391 struct heapframe *Xnextframe;
393 /* Function arguments that may change */
396 const pcre_uchar *Xecode;
400 unsigned int Xrdepth;
402 /* Function local variables */
404 PCRE_PUCHAR Xcallpat;
406 PCRE_PUCHAR Xcharptr;
412 PCRE_PUCHAR Xsaved_eptr;
414 recursion_info Xnew_recursive;
422 unsigned int Xprop_value;
423 int Xprop_fail_result;
425 pcre_uchar Xocchars[6];
435 unsigned int Xnumber;
438 pcre_int32 Xsave_capture_last;
439 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
440 int Xstacksave[REC_STACK_SAVE_MAX];
444 /* Where to jump back to */
453 /***************************************************************************
454 ***************************************************************************/
458 /*************************************************
459 * Match from current position *
460 *************************************************/
462 /* This function is called recursively in many circumstances. Whenever it
463 returns a negative (error) response, the outer incarnation must also return the
466 /* These macros pack up tests that are used for partial matching, and which
467 appear several times in the code. We set the "hit end" flag if the pointer is
468 at the end of the subject and also past the start of the subject (i.e.
469 something has been matched). For hard partial matching, we then return
470 immediately. The second one is used when we already know we are past the end of
473 #define CHECK_PARTIAL()\
474 if (md->partial != 0 && eptr >= md->end_subject && \
475 eptr > md->start_used_ptr) \
478 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
481 #define SCHECK_PARTIAL()\
482 if (md->partial != 0 && eptr > md->start_used_ptr) \
485 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
489 /* Performance note: It might be tempting to extract commonly used fields from
490 the md structure (e.g. utf, end_subject) into individual variables to improve
491 performance. Tests using gcc on a SPARC disproved this; in the first case, it
492 made performance worse.
495 eptr pointer to current character in subject
496 ecode pointer to current position in compiled code
497 mstart pointer to the current match start position (can be modified
499 offset_top current top pointer
500 md pointer to "static" info for the match
501 eptrb pointer to chain of blocks containing eptr at start of
502 brackets - for testing for empty matches
503 rdepth the recursion depth
505 Returns: MATCH_MATCH if matched ) these values are >= 0
506 MATCH_NOMATCH if failed to match )
507 a negative MATCH_xxx value for PRUNE, SKIP, etc
508 a negative PCRE_ERROR_xxx value if aborted by an error condition
509 (e.g. stopped by repeated call or recursion limit)
513 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
514 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
517 /* These variables do not need to be preserved over recursion in this function,
518 so they can be ordinary variables in all cases. Mark some of them with
519 "register" because they are used a lot in loops. */
521 register int rrc; /* Returns from recursive calls */
522 register int i; /* Used for loops not involving calls to RMATCH() */
523 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
524 register BOOL utf; /* Local copy of UTF flag for speed */
526 BOOL minimize, possessive; /* Quantifier options */
530 /* When recursion is not being used, all "local" variables that have to be
531 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
532 frame on the stack here; subsequent instantiations are obtained from the heap
533 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
534 the top-level on the stack rather than malloc-ing them all gives a performance
535 boost in many cases where there is not much "recursion". */
538 heapframe *frame = (heapframe *)md->match_frames_base;
540 /* Copy in the original argument variables */
543 frame->Xecode = ecode;
544 frame->Xmstart = mstart;
545 frame->Xoffset_top = offset_top;
546 frame->Xeptrb = eptrb;
547 frame->Xrdepth = rdepth;
549 /* This is where control jumps back to in order to effect "recursion" */
553 /* Macros make the argument variables come from the current frame */
555 #define eptr frame->Xeptr
556 #define ecode frame->Xecode
557 #define mstart frame->Xmstart
558 #define offset_top frame->Xoffset_top
559 #define eptrb frame->Xeptrb
560 #define rdepth frame->Xrdepth
562 /* Ditto for the local variables */
565 #define charptr frame->Xcharptr
567 #define callpat frame->Xcallpat
568 #define codelink frame->Xcodelink
569 #define data frame->Xdata
570 #define next frame->Xnext
571 #define pp frame->Xpp
572 #define prev frame->Xprev
573 #define saved_eptr frame->Xsaved_eptr
575 #define new_recursive frame->Xnew_recursive
577 #define cur_is_word frame->Xcur_is_word
578 #define condition frame->Xcondition
579 #define prev_is_word frame->Xprev_is_word
582 #define prop_type frame->Xprop_type
583 #define prop_value frame->Xprop_value
584 #define prop_fail_result frame->Xprop_fail_result
585 #define oclength frame->Xoclength
586 #define occhars frame->Xocchars
589 #define ctype frame->Xctype
590 #define fc frame->Xfc
591 #define fi frame->Xfi
592 #define length frame->Xlength
593 #define max frame->Xmax
594 #define min frame->Xmin
595 #define number frame->Xnumber
596 #define offset frame->Xoffset
597 #define op frame->Xop
598 #define save_capture_last frame->Xsave_capture_last
599 #define save_offset1 frame->Xsave_offset1
600 #define save_offset2 frame->Xsave_offset2
601 #define save_offset3 frame->Xsave_offset3
602 #define stacksave frame->Xstacksave
604 #define newptrb frame->Xnewptrb
606 /* When recursion is being used, local variables are allocated on the stack and
607 get preserved during recursion in the normal way. In this environment, fi and
608 i, and fc and c, can be the same variables. */
610 #else /* NO_RECURSE not defined */
614 /* Many of the following variables are used only in small blocks of the code.
615 My normal style of coding would have declared them within each of those blocks.
616 However, in order to accommodate the version of this code that uses an external
617 "stack" implemented on the heap, it is easier to declare them all here, so the
618 declarations can be cut out in a block. The only declarations within blocks
619 below are for variables that do not have to be preserved over a recursive call
623 const pcre_uchar *charptr;
625 const pcre_uchar *callpat;
626 const pcre_uchar *data;
627 const pcre_uchar *next;
629 const pcre_uchar *prev;
630 PCRE_PUCHAR saved_eptr;
632 recursion_info new_recursive;
640 unsigned int prop_value;
641 int prop_fail_result;
643 pcre_uchar occhars[6];
654 pcre_int32 save_capture_last;
655 int save_offset1, save_offset2, save_offset3;
656 int stacksave[REC_STACK_SAVE_MAX];
660 /* There is a special fudge for calling match() in a way that causes it to
661 measure the size of its basic stack frame when the stack is being used for
662 recursion. The second argument (ecode) being NULL triggers this behaviour. It
663 cannot normally ever be NULL. The return is the negated value of the frame
669 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
672 int len = (char *)&rdepth - (char *)eptr;
673 return (len > 0)? -len : len;
676 #endif /* NO_RECURSE */
678 /* To save space on the stack and in the heap frame, I have doubled up on some
679 of the local variables that are used only in localised parts of the code, but
680 still need to be preserved over recursive calls of match(). These macros define
681 the alternative names that are used. */
683 #define allow_zero cur_is_word
684 #define cbegroup condition
685 #define code_offset codelink
686 #define condassert condition
687 #define matched_once prev_is_word
689 #define save_mark data
691 /* These statements are here to stop the compiler complaining about uninitialized
696 prop_fail_result = 0;
700 /* This label is used for tail recursion, which is used in a few cases even
701 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
702 used. Thanks to Ian Taylor for noticing this possibility and sending the
707 /* OK, now we can get on with the real code of the function. Recursive calls
708 are specified by the macro RMATCH and RRETURN is used to return. When
709 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
710 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
711 defined). However, RMATCH isn't like a function call because it's quite a
712 complicated macro. It has to be used in one particular way. This shouldn't,
713 however, impact performance when true recursion is being used. */
716 utf = md->utf; /* Local copy of the flag */
721 /* First check that we haven't called match() too many times, or that we
722 haven't exceeded the recursive call limit. */
724 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
725 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
727 /* At the start of a group with an unlimited repeat that may match an empty
728 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
729 done this way to save having to use another function argument, which would take
730 up space on the stack. See also MATCH_CONDASSERT below.
732 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
733 such remembered pointers, to be checked when we hit the closing ket, in order
734 to break infinite loops that match no characters. When match() is called in
735 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
736 NOT be used with tail recursion, because the memory block that is used is on
737 the stack, so a new one may be required for each match(). */
739 if (md->match_function_type == MATCH_CBEGROUP)
741 newptrb.epb_saved_eptr = eptr;
742 newptrb.epb_prev = eptrb;
744 md->match_function_type = 0;
747 /* Now start processing the opcodes. */
751 minimize = possessive = FALSE;
757 md->nomatch_mark = ecode + 2;
758 md->mark = NULL; /* In case previously set by assertion */
759 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
761 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
762 md->mark == NULL) md->mark = ecode + 2;
764 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
765 argument, and we must check whether that argument matches this MARK's
766 argument. It is passed back in md->start_match_ptr (an overloading of that
767 variable). If it does match, we reset that variable to the current subject
768 position and return MATCH_SKIP. Otherwise, pass back the return code
771 else if (rrc == MATCH_SKIP_ARG &&
772 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
774 md->start_match_ptr = eptr;
780 RRETURN(MATCH_NOMATCH);
783 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
786 RRETURN(MATCH_COMMIT);
789 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
791 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
792 RRETURN(MATCH_PRUNE);
795 md->nomatch_mark = ecode + 2;
796 md->mark = NULL; /* In case previously set by assertion */
797 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
799 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
800 md->mark == NULL) md->mark = ecode + 2;
801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 RRETURN(MATCH_PRUNE);
805 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
807 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
808 md->start_match_ptr = eptr; /* Pass back current position */
811 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
812 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
813 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
814 that failed and any that precede it (either they also failed, or were not
815 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
816 SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg
817 set to the count of the one that failed. */
820 md->skip_arg_count++;
821 if (md->skip_arg_count <= md->ignore_skip_arg)
823 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
826 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
828 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
830 /* Pass back the current skip name by overloading md->start_match_ptr and
831 returning the special MATCH_SKIP_ARG return code. This will either be
832 caught by a matching MARK, or get to the top, where it causes a rematch
833 with md->ignore_skip_arg set to the value of md->skip_arg_count. */
835 md->start_match_ptr = ecode + 2;
836 RRETURN(MATCH_SKIP_ARG);
838 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
839 the branch in which it occurs can be determined. Overload the start of
840 match pointer to do this. */
843 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
845 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
846 md->start_match_ptr = ecode;
850 md->nomatch_mark = ecode + 2;
851 md->mark = NULL; /* In case previously set by assertion */
852 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
854 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
855 md->mark == NULL) md->mark = ecode + 2;
856 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
857 md->start_match_ptr = ecode;
860 /* Handle an atomic group that does not contain any capturing parentheses.
861 This can be handled like an assertion. Prior to 8.13, all atomic groups
862 were handled this way. In 8.13, the code was changed as below for ONCE, so
863 that backups pass through the group and thereby reset captured values.
864 However, this uses a lot more stack, so in 8.20, atomic groups that do not
865 contain any captures generate OP_ONCE_NC, which can be handled in the old,
866 less stack intensive way.
868 Check the alternative branches in turn - the matching won't pass the KET
869 for this kind of subpattern. If any one branch matches, we carry on as at
870 the end of a normal bracket, leaving the subject pointer, but resetting
871 the start-of-match value in case it was changed by \K. */
876 save_mark = md->mark;
879 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
880 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
882 mstart = md->start_match_ptr;
885 if (rrc == MATCH_THEN)
887 next = ecode + GET(ecode,1);
888 if (md->start_match_ptr < next &&
889 (*ecode == OP_ALT || *next == OP_ALT))
893 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
894 ecode += GET(ecode,1);
895 md->mark = save_mark;
897 while (*ecode == OP_ALT);
899 /* If hit the end of the group (which could be repeated), fail */
901 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
903 /* Continue as from after the group, updating the offsets high water
904 mark, since extracts may have been taken. */
906 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
908 offset_top = md->end_offset_top;
909 eptr = md->end_match_ptr;
911 /* For a non-repeating ket, just continue at this level. This also
912 happens for a repeating ket if no characters were matched in the group.
913 This is the forcible breaking of infinite loops as implemented in Perl
916 if (*ecode == OP_KET || eptr == saved_eptr)
918 ecode += 1+LINK_SIZE;
922 /* The repeating kets try the rest of the pattern or restart from the
923 preceding bracket, in the appropriate order. The second "call" of match()
924 uses tail recursion, to avoid using another stack frame. */
926 if (*ecode == OP_KETRMIN)
928 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
929 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
933 else /* OP_KETRMAX */
935 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
936 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
937 ecode += 1 + LINK_SIZE;
940 /* Control never gets here */
942 /* Handle a capturing bracket, other than those that are possessive with an
943 unlimited repeat. If there is space in the offset vector, save the current
944 subject position in the working slot at the top of the vector. We mustn't
945 change the current values of the data slot, because they may be set from a
946 previous iteration of this group, and be referred to by a reference inside
947 the group. A failure to match might occur after the group has succeeded,
948 if something later on doesn't match. For this reason, we need to restore
949 the working value and also the values of the final offsets, in case they
950 were set by a previous iteration of the same bracket.
952 If there isn't enough space in the offset vector, treat this as if it were
953 a non-capturing bracket. Don't worry about setting the flag for the error
954 case here; that is handled in the code for KET. */
958 number = GET2(ecode, 1+LINK_SIZE);
959 offset = number << 1;
962 printf("start bracket %d\n", number);
964 pchars(eptr, 16, TRUE, md);
968 if (offset < md->offset_max)
970 save_offset1 = md->offset_vector[offset];
971 save_offset2 = md->offset_vector[offset+1];
972 save_offset3 = md->offset_vector[md->offset_end - number];
973 save_capture_last = md->capture_last;
974 save_mark = md->mark;
976 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
977 md->offset_vector[md->offset_end - number] =
978 (int)(eptr - md->start_subject);
982 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
983 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
985 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
987 /* If we backed up to a THEN, check whether it is within the current
988 branch by comparing the address of the THEN that is passed back with
989 the end of the branch. If it is within the current branch, and the
990 branch is one of two or more alternatives (it either starts or ends
991 with OP_ALT), we have reached the limit of THEN's action, so convert
992 the return code to NOMATCH, which will cause normal backtracking to
993 happen from now on. Otherwise, THEN is passed back to an outer
994 alternative. This implements Perl's treatment of parenthesized groups,
995 where a group not containing | does not affect the current alternative,
996 that is, (X) is NOT the same as (X|(*F)). */
998 if (rrc == MATCH_THEN)
1000 next = ecode + GET(ecode,1);
1001 if (md->start_match_ptr < next &&
1002 (*ecode == OP_ALT || *next == OP_ALT))
1003 rrc = MATCH_NOMATCH;
1006 /* Anything other than NOMATCH is passed back. */
1008 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1009 md->capture_last = save_capture_last;
1010 ecode += GET(ecode, 1);
1011 md->mark = save_mark;
1012 if (*ecode != OP_ALT) break;
1015 DPRINTF(("bracket %d failed\n", number));
1016 md->offset_vector[offset] = save_offset1;
1017 md->offset_vector[offset+1] = save_offset2;
1018 md->offset_vector[md->offset_end - number] = save_offset3;
1020 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1025 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1026 as a non-capturing bracket. */
1028 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1029 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1031 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1033 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1034 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1036 /* Non-capturing or atomic group, except for possessive with unlimited
1037 repeat and ONCE group with no captures. Loop for all the alternatives.
1039 When we get to the final alternative within the brackets, we used to return
1040 the result of a recursive call to match() whatever happened so it was
1041 possible to reduce stack usage by turning this into a tail recursion,
1042 except in the case of a possibly empty group. However, now that there is
1043 the possibility of (*THEN) occurring in the final alternative, this
1044 optimization is no longer always possible.
1046 We can optimize if we know there are no (*THEN)s in the pattern; at present
1047 this is the best that can be done.
1049 MATCH_ONCE is returned when the end of an atomic group is successfully
1050 reached, but subsequent matching fails. It passes back up the tree (causing
1051 captured values to be reset) until the original atomic group level is
1052 reached. This is tested by comparing md->once_target with the start of the
1053 group. At this point, the return is converted into MATCH_NOMATCH so that
1054 previous backup points can be taken. */
1059 DPRINTF(("start non-capturing bracket\n"));
1063 if (op >= OP_SBRA || op == OP_ONCE)
1064 md->match_function_type = MATCH_CBEGROUP;
1066 /* If this is not a possibly empty group, and there are no (*THEN)s in
1067 the pattern, and this is the final alternative, optimize as described
1070 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1072 ecode += PRIV(OP_lengths)[*ecode];
1076 /* In all other cases, we have to make another call to match(). */
1078 save_mark = md->mark;
1079 save_capture_last = md->capture_last;
1080 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1083 /* See comment in the code for capturing groups above about handling
1086 if (rrc == MATCH_THEN)
1088 next = ecode + GET(ecode,1);
1089 if (md->start_match_ptr < next &&
1090 (*ecode == OP_ALT || *next == OP_ALT))
1091 rrc = MATCH_NOMATCH;
1094 if (rrc != MATCH_NOMATCH)
1096 if (rrc == MATCH_ONCE)
1098 const pcre_uchar *scode = ecode;
1099 if (*scode != OP_ONCE) /* If not at start, find it */
1101 while (*scode == OP_ALT) scode += GET(scode, 1);
1102 scode -= GET(scode, 1);
1104 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1108 ecode += GET(ecode, 1);
1109 md->mark = save_mark;
1110 if (*ecode != OP_ALT) break;
1111 md->capture_last = save_capture_last;
1114 RRETURN(MATCH_NOMATCH);
1116 /* Handle possessive capturing brackets with an unlimited repeat. We come
1117 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1118 handled similarly to the normal case above. However, the matching is
1119 different. The end of these brackets will always be OP_KETRPOS, which
1120 returns MATCH_KETRPOS without going further in the pattern. By this means
1121 we can handle the group by iteration rather than recursion, thereby
1122 reducing the amount of stack needed. */
1129 number = GET2(ecode, 1+LINK_SIZE);
1130 offset = number << 1;
1133 printf("start possessive bracket %d\n", number);
1135 pchars(eptr, 16, TRUE, md);
1139 if (offset < md->offset_max)
1141 matched_once = FALSE;
1142 code_offset = (int)(ecode - md->start_code);
1144 save_offset1 = md->offset_vector[offset];
1145 save_offset2 = md->offset_vector[offset+1];
1146 save_offset3 = md->offset_vector[md->offset_end - number];
1147 save_capture_last = md->capture_last;
1149 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1151 /* Each time round the loop, save the current subject position for use
1152 when the group matches. For MATCH_MATCH, the group has matched, so we
1153 restart it with a new subject starting position, remembering that we had
1154 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1155 usual. If we haven't matched any alternatives in any iteration, check to
1156 see if a previous iteration matched. If so, the group has matched;
1157 continue from afterwards. Otherwise it has failed; restore the previous
1158 capture values before returning NOMATCH. */
1162 md->offset_vector[md->offset_end - number] =
1163 (int)(eptr - md->start_subject);
1164 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1165 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1167 if (rrc == MATCH_KETRPOS)
1169 offset_top = md->end_offset_top;
1170 eptr = md->end_match_ptr;
1171 ecode = md->start_code + code_offset;
1172 save_capture_last = md->capture_last;
1173 matched_once = TRUE;
1174 mstart = md->start_match_ptr; /* In case \K changed it */
1178 /* See comment in the code for capturing groups above about handling
1181 if (rrc == MATCH_THEN)
1183 next = ecode + GET(ecode,1);
1184 if (md->start_match_ptr < next &&
1185 (*ecode == OP_ALT || *next == OP_ALT))
1186 rrc = MATCH_NOMATCH;
1189 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1190 md->capture_last = save_capture_last;
1191 ecode += GET(ecode, 1);
1192 if (*ecode != OP_ALT) break;
1197 md->offset_vector[offset] = save_offset1;
1198 md->offset_vector[offset+1] = save_offset2;
1199 md->offset_vector[md->offset_end - number] = save_offset3;
1202 if (allow_zero || matched_once)
1204 ecode += 1 + LINK_SIZE;
1208 RRETURN(MATCH_NOMATCH);
1211 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1212 as a non-capturing bracket. */
1214 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1215 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1217 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1219 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1220 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1222 /* Non-capturing possessive bracket with unlimited repeat. We come here
1223 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1224 without the capturing complication. It is written out separately for speed
1231 POSSESSIVE_NON_CAPTURE:
1232 matched_once = FALSE;
1233 code_offset = (int)(ecode - md->start_code);
1234 save_capture_last = md->capture_last;
1238 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1239 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1241 if (rrc == MATCH_KETRPOS)
1243 offset_top = md->end_offset_top;
1244 eptr = md->end_match_ptr;
1245 ecode = md->start_code + code_offset;
1246 matched_once = TRUE;
1247 mstart = md->start_match_ptr; /* In case \K reset it */
1251 /* See comment in the code for capturing groups above about handling
1254 if (rrc == MATCH_THEN)
1256 next = ecode + GET(ecode,1);
1257 if (md->start_match_ptr < next &&
1258 (*ecode == OP_ALT || *next == OP_ALT))
1259 rrc = MATCH_NOMATCH;
1262 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1263 ecode += GET(ecode, 1);
1264 if (*ecode != OP_ALT) break;
1265 md->capture_last = save_capture_last;
1268 if (matched_once || allow_zero)
1270 ecode += 1 + LINK_SIZE;
1273 RRETURN(MATCH_NOMATCH);
1275 /* Control never reaches here. */
1277 /* Conditional group: compilation checked that there are no more than two
1278 branches. If the condition is false, skipping the first branch takes us
1279 past the end of the item if there is only one branch, but that's exactly
1285 /* The variable codelink will be added to ecode when the condition is
1286 false, to get to the second branch. Setting it to the offset to the ALT
1287 or KET, then incrementing ecode achieves this effect. We now have ecode
1288 pointing to the condition or callout. */
1290 codelink = GET(ecode, 1); /* Offset to the second branch */
1291 ecode += 1 + LINK_SIZE; /* From this opcode */
1293 /* Because of the way auto-callout works during compile, a callout item is
1294 inserted between OP_COND and an assertion condition. */
1296 if (*ecode == OP_CALLOUT)
1298 if (PUBL(callout) != NULL)
1300 PUBL(callout_block) cb;
1301 cb.version = 2; /* Version 2 of the callout block */
1302 cb.callout_number = ecode[1];
1303 cb.offset_vector = md->offset_vector;
1304 #if defined COMPILE_PCRE8
1305 cb.subject = (PCRE_SPTR)md->start_subject;
1306 #elif defined COMPILE_PCRE16
1307 cb.subject = (PCRE_SPTR16)md->start_subject;
1308 #elif defined COMPILE_PCRE32
1309 cb.subject = (PCRE_SPTR32)md->start_subject;
1311 cb.subject_length = (int)(md->end_subject - md->start_subject);
1312 cb.start_match = (int)(mstart - md->start_subject);
1313 cb.current_position = (int)(eptr - md->start_subject);
1314 cb.pattern_position = GET(ecode, 2);
1315 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1316 cb.capture_top = offset_top/2;
1317 cb.capture_last = md->capture_last & CAPLMASK;
1318 /* Internal change requires this for API compatibility. */
1319 if (cb.capture_last == 0) cb.capture_last = -1;
1320 cb.callout_data = md->callout_data;
1321 cb.mark = md->nomatch_mark;
1322 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1323 if (rrc < 0) RRETURN(rrc);
1326 /* Advance ecode past the callout, so it now points to the condition. We
1327 must adjust codelink so that the value of ecode+codelink is unchanged. */
1329 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1330 codelink -= PRIV(OP_lengths)[OP_CALLOUT];
1333 /* Test the various possible conditions */
1336 switch(condcode = *ecode)
1338 case OP_RREF: /* Numbered group recursion test */
1339 if (md->recursive != NULL) /* Not recursing => FALSE */
1341 unsigned int recno = GET2(ecode, 1); /* Recursion group number*/
1342 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1346 case OP_DNRREF: /* Duplicate named group recursion test */
1347 if (md->recursive != NULL)
1349 int count = GET2(ecode, 1 + IMM2_SIZE);
1350 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1353 unsigned int recno = GET2(slot, 0);
1354 condition = recno == md->recursive->group_num;
1355 if (condition) break;
1356 slot += md->name_entry_size;
1361 case OP_CREF: /* Numbered group used test */
1362 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1363 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1366 case OP_DNCREF: /* Duplicate named group used test */
1368 int count = GET2(ecode, 1 + IMM2_SIZE);
1369 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1372 offset = GET2(slot, 0) << 1;
1373 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1374 if (condition) break;
1375 slot += md->name_entry_size;
1380 case OP_DEF: /* DEFINE - always false */
1383 /* The condition is an assertion. Call match() to evaluate it - setting
1384 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end
1388 md->match_function_type = MATCH_CONDASSERT;
1389 RMATCH(eptr, ecode, offset_top, md, NULL, RM3);
1390 if (rrc == MATCH_MATCH)
1392 if (md->end_offset_top > offset_top)
1393 offset_top = md->end_offset_top; /* Captures may have happened */
1396 /* Advance ecode past the assertion to the start of the first branch,
1397 but adjust it so that the general choosing code below works. If the
1398 assertion has a quantifier that allows zero repeats we must skip over
1399 the BRAZERO. This is a lunatic thing to do, but somebody did! */
1401 if (*ecode == OP_BRAZERO) ecode++;
1402 ecode += GET(ecode, 1);
1403 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1404 ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode];
1407 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1408 assertion; it is therefore treated as NOMATCH. Any other return is an
1411 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1413 RRETURN(rrc); /* Need braces because of following else */
1418 /* Choose branch according to the condition */
1420 ecode += condition? PRIV(OP_lengths)[condcode] : codelink;
1422 /* We are now at the branch that is to be obeyed. As there is only one, we
1423 can use tail recursion to avoid using another stack frame, except when
1424 there is unlimited repeat of a possibly empty group. In the latter case, a
1425 recursive call to match() is always required, unless the second alternative
1426 doesn't exist, in which case we can just plough on. Note that, for
1427 compatibility with Perl, the | in a conditional group is NOT treated as
1428 creating two alternatives. If a THEN is encountered in the branch, it
1429 propagates out to the enclosing alternative (unless nested in a deeper set
1430 of alternatives, of course). */
1432 if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT)
1439 md->match_function_type = MATCH_CBEGROUP;
1440 RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
1444 /* Condition false & no alternative; continue after the group. */
1452 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1453 to close any currently open capturing brackets. */
1456 number = GET2(ecode, 1); /* Must be less than 65536 */
1457 offset = number << 1;
1460 printf("end bracket %d at *ACCEPT", number);
1464 md->capture_last = (md->capture_last & OVFLMASK) | number;
1465 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1467 md->offset_vector[offset] =
1468 md->offset_vector[md->offset_end - number];
1469 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1471 /* If this group is at or above the current high water mark, ensure that
1472 any groups between the current high water mark and this group are marked
1473 unset and then update the high water mark. */
1475 if (offset >= offset_top)
1477 register int *iptr = md->offset_vector + offset_top;
1478 register int *iend = md->offset_vector + offset;
1479 while (iptr < iend) *iptr++ = -1;
1480 offset_top = offset + 2;
1483 ecode += 1 + IMM2_SIZE;
1487 /* End of the pattern, either real or forced. */
1491 case OP_ASSERT_ACCEPT:
1493 /* If we have matched an empty string, fail if not in an assertion and not
1494 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1495 is set and we have matched at the start of the subject. In both cases,
1496 backtracking will then try other alternatives, if any. */
1498 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1499 md->recursive == NULL &&
1501 (md->notempty_atstart &&
1502 mstart == md->start_subject + md->start_offset)))
1503 RRETURN(MATCH_NOMATCH);
1505 /* Otherwise, we have a match. */
1507 md->end_match_ptr = eptr; /* Record where we ended */
1508 md->end_offset_top = offset_top; /* and how many extracts were taken */
1509 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1511 /* For some reason, the macros don't work properly if an expression is
1512 given as the argument to RRETURN when the heap is in use. */
1514 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1517 /* Assertion brackets. Check the alternative branches in turn - the
1518 matching won't pass the KET for an assertion. If any one branch matches,
1519 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1520 start of each branch to move the current point backwards, so the code at
1521 this level is identical to the lookahead case. When the assertion is part
1522 of a condition, we want to return immediately afterwards. The caller of
1523 this incarnation of the match() function will have set MATCH_CONDASSERT in
1524 md->match_function_type, and one of these opcodes will be the first opcode
1525 that is processed. We use a local variable that is preserved over calls to
1526 match() to remember this case. */
1530 save_mark = md->mark;
1531 if (md->match_function_type == MATCH_CONDASSERT)
1534 md->match_function_type = 0;
1536 else condassert = FALSE;
1538 /* Loop for each branch */
1542 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1544 /* A match means that the assertion is true; break out of the loop
1545 that matches its alternatives. */
1547 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1549 mstart = md->start_match_ptr; /* In case \K reset it */
1553 /* If not matched, restore the previous mark setting. */
1555 md->mark = save_mark;
1557 /* See comment in the code for capturing groups above about handling
1560 if (rrc == MATCH_THEN)
1562 next = ecode + GET(ecode,1);
1563 if (md->start_match_ptr < next &&
1564 (*ecode == OP_ALT || *next == OP_ALT))
1565 rrc = MATCH_NOMATCH;
1568 /* Anything other than NOMATCH causes the entire assertion to fail,
1569 passing back the return code. This includes COMMIT, SKIP, PRUNE and an
1570 uncaptured THEN, which means they take their normal effect. This
1571 consistent approach does not always have exactly the same effect as in
1574 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1575 ecode += GET(ecode, 1);
1577 while (*ecode == OP_ALT); /* Continue for next alternative */
1579 /* If we have tried all the alternative branches, the assertion has
1580 failed. If not, we broke out after a match. */
1582 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1584 /* If checking an assertion for a condition, return MATCH_MATCH. */
1586 if (condassert) RRETURN(MATCH_MATCH);
1588 /* Continue from after a successful assertion, updating the offsets high
1589 water mark, since extracts may have been taken during the assertion. */
1591 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1592 ecode += 1 + LINK_SIZE;
1593 offset_top = md->end_offset_top;
1596 /* Negative assertion: all branches must fail to match for the assertion to
1600 case OP_ASSERTBACK_NOT:
1601 save_mark = md->mark;
1602 if (md->match_function_type == MATCH_CONDASSERT)
1605 md->match_function_type = 0;
1607 else condassert = FALSE;
1609 /* Loop for each alternative branch. */
1613 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1614 md->mark = save_mark; /* Always restore the mark setting */
1618 case MATCH_MATCH: /* A successful match means */
1619 case MATCH_ACCEPT: /* the assertion has failed. */
1620 RRETURN(MATCH_NOMATCH);
1622 case MATCH_NOMATCH: /* Carry on with next branch */
1625 /* See comment in the code for capturing groups above about handling
1629 next = ecode + GET(ecode,1);
1630 if (md->start_match_ptr < next &&
1631 (*ecode == OP_ALT || *next == OP_ALT))
1633 rrc = MATCH_NOMATCH;
1636 /* Otherwise fall through. */
1638 /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole
1639 assertion to fail to match, without considering any more alternatives.
1640 Failing to match means the assertion is true. This is a consistent
1641 approach, but does not always have the same effect as in Perl. */
1645 case MATCH_SKIP_ARG:
1647 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1648 goto NEG_ASSERT_TRUE; /* Break out of alternation loop */
1650 /* Anything else is an error */
1656 /* Continue with next branch */
1658 ecode += GET(ecode,1);
1660 while (*ecode == OP_ALT);
1662 /* All branches in the assertion failed to match. */
1665 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1666 ecode += 1 + LINK_SIZE; /* Continue with current branch */
1669 /* Move the subject pointer back. This occurs only at the start of
1670 each branch of a lookbehind assertion. If we are too close to the start to
1671 move back, this match function fails. When working with UTF-8 we move
1672 back a number of characters, not bytes. */
1682 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1689 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1692 eptr -= GET(ecode, 1);
1693 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1696 /* Save the earliest consulted character, then skip to next op code */
1698 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1699 ecode += 1 + LINK_SIZE;
1702 /* The callout item calls an external function, if one is provided, passing
1703 details of the match so far. This is mainly for debugging, though the
1704 function is able to force a failure. */
1707 if (PUBL(callout) != NULL)
1709 PUBL(callout_block) cb;
1710 cb.version = 2; /* Version 2 of the callout block */
1711 cb.callout_number = ecode[1];
1712 cb.offset_vector = md->offset_vector;
1713 #if defined COMPILE_PCRE8
1714 cb.subject = (PCRE_SPTR)md->start_subject;
1715 #elif defined COMPILE_PCRE16
1716 cb.subject = (PCRE_SPTR16)md->start_subject;
1717 #elif defined COMPILE_PCRE32
1718 cb.subject = (PCRE_SPTR32)md->start_subject;
1720 cb.subject_length = (int)(md->end_subject - md->start_subject);
1721 cb.start_match = (int)(mstart - md->start_subject);
1722 cb.current_position = (int)(eptr - md->start_subject);
1723 cb.pattern_position = GET(ecode, 2);
1724 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1725 cb.capture_top = offset_top/2;
1726 cb.capture_last = md->capture_last & CAPLMASK;
1727 /* Internal change requires this for API compatibility. */
1728 if (cb.capture_last == 0) cb.capture_last = -1;
1729 cb.callout_data = md->callout_data;
1730 cb.mark = md->nomatch_mark;
1731 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1732 if (rrc < 0) RRETURN(rrc);
1734 ecode += 2 + 2*LINK_SIZE;
1737 /* Recursion either matches the current regex, or some subexpression. The
1738 offset data is the offset to the starting bracket from the start of the
1739 whole pattern. (This is so that it works from duplicated subpatterns.)
1741 The state of the capturing groups is preserved over recursion, and
1742 re-instated afterwards. We don't know how many are started and not yet
1743 finished (offset_top records the completed total) so we just have to save
1744 all the potential data. There may be up to 65535 such values, which is too
1745 large to put on the stack, but using malloc for small numbers seems
1746 expensive. As a compromise, the stack is used when there are no more than
1747 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1749 There are also other values that have to be saved. We use a chained
1750 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1751 for the original version of this logic. It has, however, been hacked around
1752 a lot, so he is not to blame for the current way it works. */
1759 callpat = md->start_code + GET(ecode, 1);
1760 recno = (callpat == md->start_code)? 0 :
1761 GET2(callpat, 1 + LINK_SIZE);
1763 /* Check for repeating a recursion without advancing the subject pointer.
1764 This should catch convoluted mutual recursions. (Some simple cases are
1765 caught at compile time.) */
1767 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1768 if (recno == ri->group_num && eptr == ri->subject_position)
1769 RRETURN(PCRE_ERROR_RECURSELOOP);
1771 /* Add to "recursing stack" */
1773 new_recursive.group_num = recno;
1774 new_recursive.saved_capture_last = md->capture_last;
1775 new_recursive.subject_position = eptr;
1776 new_recursive.prevrec = md->recursive;
1777 md->recursive = &new_recursive;
1779 /* Where to continue from afterwards */
1781 ecode += 1 + LINK_SIZE;
1783 /* Now save the offset data */
1785 new_recursive.saved_max = md->offset_end;
1786 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1787 new_recursive.offset_save = stacksave;
1790 new_recursive.offset_save =
1791 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1792 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1794 memcpy(new_recursive.offset_save, md->offset_vector,
1795 new_recursive.saved_max * sizeof(int));
1797 /* OK, now we can do the recursion. After processing each alternative,
1798 restore the offset data and the last captured value. If there were nested
1799 recursions, md->recursive might be changed, so reset it before looping.
1802 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1803 cbegroup = (*callpat >= OP_SBRA);
1806 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1807 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1809 memcpy(md->offset_vector, new_recursive.offset_save,
1810 new_recursive.saved_max * sizeof(int));
1811 md->capture_last = new_recursive.saved_capture_last;
1812 md->recursive = new_recursive.prevrec;
1813 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1815 DPRINTF(("Recursion matched\n"));
1816 if (new_recursive.offset_save != stacksave)
1817 (PUBL(free))(new_recursive.offset_save);
1819 /* Set where we got to in the subject, and reset the start in case
1820 it was changed by \K. This *is* propagated back out of a recursion,
1821 for Perl compatibility. */
1823 eptr = md->end_match_ptr;
1824 mstart = md->start_match_ptr;
1825 goto RECURSION_MATCHED; /* Exit loop; end processing */
1828 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
1829 recursion; they cause a NOMATCH for the entire recursion. These codes
1830 are defined in a range that can be tested for. */
1832 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
1833 RRETURN(MATCH_NOMATCH);
1835 /* Any return code other than NOMATCH is an error. */
1837 if (rrc != MATCH_NOMATCH)
1839 DPRINTF(("Recursion gave error %d\n", rrc));
1840 if (new_recursive.offset_save != stacksave)
1841 (PUBL(free))(new_recursive.offset_save);
1845 md->recursive = &new_recursive;
1846 callpat += GET(callpat, 1);
1848 while (*callpat == OP_ALT);
1850 DPRINTF(("Recursion didn't match\n"));
1851 md->recursive = new_recursive.prevrec;
1852 if (new_recursive.offset_save != stacksave)
1853 (PUBL(free))(new_recursive.offset_save);
1854 RRETURN(MATCH_NOMATCH);
1860 /* An alternation is the end of a branch; scan along to find the end of the
1861 bracketed group and go to there. */
1864 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1867 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1868 indicating that it may occur zero times. It may repeat infinitely, or not
1869 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1870 with fixed upper repeat limits are compiled as a number of copies, with the
1871 optional ones preceded by BRAZERO or BRAMINZERO. */
1875 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1876 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1877 do next += GET(next, 1); while (*next == OP_ALT);
1878 ecode = next + 1 + LINK_SIZE;
1883 do next += GET(next, 1); while (*next == OP_ALT);
1884 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1885 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1891 do next += GET(next,1); while (*next == OP_ALT);
1892 ecode = next + 1 + LINK_SIZE;
1895 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1896 here; just jump to the group, with allow_zero set TRUE. */
1901 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1902 goto POSSESSIVE_NON_CAPTURE;
1904 /* End of a group, repeated or non-repeating. */
1910 prev = ecode - GET(ecode, 1);
1912 /* If this was a group that remembered the subject start, in order to break
1913 infinite repeats of empty string matches, retrieve the subject start from
1914 the chain. Otherwise, set it NULL. */
1916 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1918 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1919 eptrb = eptrb->epb_prev; /* Backup to previous group */
1921 else saved_eptr = NULL;
1923 /* If we are at the end of an assertion group or a non-capturing atomic
1924 group, stop matching and return MATCH_MATCH, but record the current high
1925 water mark for use by positive assertions. We also need to record the match
1926 start in case it was changed by \K. */
1928 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1929 *prev == OP_ONCE_NC)
1931 md->end_match_ptr = eptr; /* For ONCE_NC */
1932 md->end_offset_top = offset_top;
1933 md->start_match_ptr = mstart;
1934 RRETURN(MATCH_MATCH); /* Sets md->mark */
1937 /* For capturing groups we have to check the group number back at the start
1938 and if necessary complete handling an extraction by setting the offsets and
1939 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1940 into group 0, so it won't be picked up here. Instead, we catch it when the
1941 OP_END is reached. Other recursion is handled here. We just have to record
1942 the current subject position and start match pointer and give a MATCH
1945 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1946 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1948 number = GET2(prev, 1+LINK_SIZE);
1949 offset = number << 1;
1952 printf("end bracket %d", number);
1956 /* Handle a recursively called group. */
1958 if (md->recursive != NULL && md->recursive->group_num == number)
1960 md->end_match_ptr = eptr;
1961 md->start_match_ptr = mstart;
1962 RRETURN(MATCH_MATCH);
1965 /* Deal with capturing */
1967 md->capture_last = (md->capture_last & OVFLMASK) | number;
1968 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1970 /* If offset is greater than offset_top, it means that we are
1971 "skipping" a capturing group, and that group's offsets must be marked
1972 unset. In earlier versions of PCRE, all the offsets were unset at the
1973 start of matching, but this doesn't work because atomic groups and
1974 assertions can cause a value to be set that should later be unset.
1975 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1976 part of the atomic group, but this is not on the final matching path,
1977 so must be unset when 2 is set. (If there is no group 2, there is no
1978 problem, because offset_top will then be 2, indicating no capture.) */
1980 if (offset > offset_top)
1982 register int *iptr = md->offset_vector + offset_top;
1983 register int *iend = md->offset_vector + offset;
1984 while (iptr < iend) *iptr++ = -1;
1987 /* Now make the extraction */
1989 md->offset_vector[offset] =
1990 md->offset_vector[md->offset_end - number];
1991 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1992 if (offset_top <= offset) offset_top = offset + 2;
1996 /* For an ordinary non-repeating ket, just continue at this level. This
1997 also happens for a repeating ket if no characters were matched in the
1998 group. This is the forcible breaking of infinite loops as implemented in
1999 Perl 5.005. For a non-repeating atomic group that includes captures,
2000 establish a backup point by processing the rest of the pattern at a lower
2001 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
2002 original OP_ONCE level, thereby bypassing intermediate backup points, but
2003 resetting any captures that happened along the way. */
2005 if (*ecode == OP_KET || eptr == saved_eptr)
2007 if (*prev == OP_ONCE)
2009 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
2010 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2011 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2012 RRETURN(MATCH_ONCE);
2014 ecode += 1 + LINK_SIZE; /* Carry on at this level */
2018 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2019 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2020 at a time from the outer level, thus saving stack. */
2022 if (*ecode == OP_KETRPOS)
2024 md->start_match_ptr = mstart; /* In case \K reset it */
2025 md->end_match_ptr = eptr;
2026 md->end_offset_top = offset_top;
2027 RRETURN(MATCH_KETRPOS);
2030 /* The normal repeating kets try the rest of the pattern or restart from
2031 the preceding bracket, in the appropriate order. In the second case, we can
2032 use tail recursion to avoid using another stack frame, unless we have an
2033 atomic group or an unlimited repeat of a group that can match an empty
2036 if (*ecode == OP_KETRMIN)
2038 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2039 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2040 if (*prev == OP_ONCE)
2042 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2043 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2044 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2045 RRETURN(MATCH_ONCE);
2047 if (*prev >= OP_SBRA) /* Could match an empty string */
2049 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2055 else /* OP_KETRMAX */
2057 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2058 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2059 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2060 if (*prev == OP_ONCE)
2062 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2063 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2064 md->once_target = prev;
2065 RRETURN(MATCH_ONCE);
2067 ecode += 1 + LINK_SIZE;
2070 /* Control never gets here */
2072 /* Not multiline mode: start of subject assertion, unless notbol. */
2075 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2077 /* Start of subject assertion */
2080 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2084 /* Multiline mode: start of subject unless notbol, or after any newline. */
2087 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2088 if (eptr != md->start_subject &&
2089 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2090 RRETURN(MATCH_NOMATCH);
2094 /* Start of match assertion */
2097 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2101 /* Reset the start of match point */
2108 /* Multiline mode: assert before any newline, or before end of subject
2109 unless noteol is set. */
2112 if (eptr < md->end_subject)
2114 if (!IS_NEWLINE(eptr))
2116 if (md->partial != 0 &&
2117 eptr + 1 >= md->end_subject &&
2118 NLBLOCK->nltype == NLTYPE_FIXED &&
2119 NLBLOCK->nllen == 2 &&
2120 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2123 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2125 RRETURN(MATCH_NOMATCH);
2130 if (md->noteol) RRETURN(MATCH_NOMATCH);
2136 /* Not multiline mode: assert before a terminating newline or before end of
2137 subject unless noteol is set. */
2140 if (md->noteol) RRETURN(MATCH_NOMATCH);
2141 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2143 /* ... else fall through for endonly */
2145 /* End of subject assertion (\z) */
2148 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2153 /* End of subject or ending \n assertion (\Z) */
2157 if (eptr < md->end_subject &&
2158 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2160 if (md->partial != 0 &&
2161 eptr + 1 >= md->end_subject &&
2162 NLBLOCK->nltype == NLTYPE_FIXED &&
2163 NLBLOCK->nllen == 2 &&
2164 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2167 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2169 RRETURN(MATCH_NOMATCH);
2172 /* Either at end of string or \n before end. */
2178 /* Word boundary assertions */
2180 case OP_NOT_WORD_BOUNDARY:
2181 case OP_WORD_BOUNDARY:
2184 /* Find out if the previous and current characters are "word" characters.
2185 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2186 be "non-word" characters. Remember the earliest consulted character for
2187 partial matching. */
2192 /* Get status of previous character */
2194 if (eptr == md->start_subject) prev_is_word = FALSE; else
2196 PCRE_PUCHAR lastptr = eptr - 1;
2198 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2199 GETCHAR(c, lastptr);
2203 if (c == '_') prev_is_word = TRUE; else
2205 int cat = UCD_CATEGORY(c);
2206 prev_is_word = (cat == ucp_L || cat == ucp_N);
2211 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2214 /* Get status of next character */
2216 if (eptr >= md->end_subject)
2219 cur_is_word = FALSE;
2227 if (c == '_') cur_is_word = TRUE; else
2229 int cat = UCD_CATEGORY(c);
2230 cur_is_word = (cat == ucp_L || cat == ucp_N);
2235 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2241 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2242 consistency with the behaviour of \w we do use it in this case. */
2245 /* Get status of previous character */
2247 if (eptr == md->start_subject) prev_is_word = FALSE; else
2249 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2254 if (c == '_') prev_is_word = TRUE; else
2256 int cat = UCD_CATEGORY(c);
2257 prev_is_word = (cat == ucp_L || cat == ucp_N);
2262 prev_is_word = MAX_255(eptr[-1])
2263 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2266 /* Get status of next character */
2268 if (eptr >= md->end_subject)
2271 cur_is_word = FALSE;
2278 if (c == '_') cur_is_word = TRUE; else
2280 int cat = UCD_CATEGORY(c);
2281 cur_is_word = (cat == ucp_L || cat == ucp_N);
2286 cur_is_word = MAX_255(*eptr)
2287 && ((md->ctypes[*eptr] & ctype_word) != 0);
2290 /* Now see if the situation is what we want */
2292 if ((*ecode++ == OP_WORD_BOUNDARY)?
2293 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2294 RRETURN(MATCH_NOMATCH);
2298 /* Match any single character type except newline; have to take care with
2299 CRLF newlines and partial matching. */
2302 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2303 if (md->partial != 0 &&
2304 eptr + 1 >= md->end_subject &&
2305 NLBLOCK->nltype == NLTYPE_FIXED &&
2306 NLBLOCK->nllen == 2 &&
2307 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2310 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2315 /* Match any single character whatsoever. */
2318 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2319 { /* not be updated before SCHECK_PARTIAL. */
2321 RRETURN(MATCH_NOMATCH);
2325 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2330 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2331 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2334 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2335 { /* not be updated before SCHECK_PARTIAL. */
2337 RRETURN(MATCH_NOMATCH);
2344 if (eptr >= md->end_subject)
2347 RRETURN(MATCH_NOMATCH);
2349 GETCHARINCTEST(c, eptr);
2351 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2354 (md->ctypes[c] & ctype_digit) != 0
2356 RRETURN(MATCH_NOMATCH);
2361 if (eptr >= md->end_subject)
2364 RRETURN(MATCH_NOMATCH);
2366 GETCHARINCTEST(c, eptr);
2368 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2371 (md->ctypes[c] & ctype_digit) == 0
2373 RRETURN(MATCH_NOMATCH);
2377 case OP_NOT_WHITESPACE:
2378 if (eptr >= md->end_subject)
2381 RRETURN(MATCH_NOMATCH);
2383 GETCHARINCTEST(c, eptr);
2385 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2388 (md->ctypes[c] & ctype_space) != 0
2390 RRETURN(MATCH_NOMATCH);
2395 if (eptr >= md->end_subject)
2398 RRETURN(MATCH_NOMATCH);
2400 GETCHARINCTEST(c, eptr);
2402 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2405 (md->ctypes[c] & ctype_space) == 0
2407 RRETURN(MATCH_NOMATCH);
2411 case OP_NOT_WORDCHAR:
2412 if (eptr >= md->end_subject)
2415 RRETURN(MATCH_NOMATCH);
2417 GETCHARINCTEST(c, eptr);
2419 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2422 (md->ctypes[c] & ctype_word) != 0
2424 RRETURN(MATCH_NOMATCH);
2429 if (eptr >= md->end_subject)
2432 RRETURN(MATCH_NOMATCH);
2434 GETCHARINCTEST(c, eptr);
2436 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2439 (md->ctypes[c] & ctype_word) == 0
2441 RRETURN(MATCH_NOMATCH);
2446 if (eptr >= md->end_subject)
2449 RRETURN(MATCH_NOMATCH);
2451 GETCHARINCTEST(c, eptr);
2454 default: RRETURN(MATCH_NOMATCH);
2457 if (eptr >= md->end_subject)
2461 else if (UCHAR21TEST(eptr) == CHAR_LF) eptr++;
2473 #endif /* Not EBCDIC */
2474 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2481 if (eptr >= md->end_subject)
2484 RRETURN(MATCH_NOMATCH);
2486 GETCHARINCTEST(c, eptr);
2489 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2496 if (eptr >= md->end_subject)
2499 RRETURN(MATCH_NOMATCH);
2501 GETCHARINCTEST(c, eptr);
2504 HSPACE_CASES: break; /* Byte and multibyte cases */
2505 default: RRETURN(MATCH_NOMATCH);
2511 if (eptr >= md->end_subject)
2514 RRETURN(MATCH_NOMATCH);
2516 GETCHARINCTEST(c, eptr);
2519 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2526 if (eptr >= md->end_subject)
2529 RRETURN(MATCH_NOMATCH);
2531 GETCHARINCTEST(c, eptr);
2534 VSPACE_CASES: break;
2535 default: RRETURN(MATCH_NOMATCH);
2541 /* Check the next character by Unicode property. We will get here only
2542 if the support is in the binary; otherwise a compile-time error occurs. */
2546 if (eptr >= md->end_subject)
2549 RRETURN(MATCH_NOMATCH);
2551 GETCHARINCTEST(c, eptr);
2553 const pcre_uint32 *cp;
2554 const ucd_record *prop = GET_UCD(c);
2559 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2563 if ((prop->chartype == ucp_Lu ||
2564 prop->chartype == ucp_Ll ||
2565 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2566 RRETURN(MATCH_NOMATCH);
2570 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2571 RRETURN(MATCH_NOMATCH);
2575 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2576 RRETURN(MATCH_NOMATCH);
2580 if ((ecode[2] != prop->script) == (op == OP_PROP))
2581 RRETURN(MATCH_NOMATCH);
2584 /* These are specials */
2587 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2588 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2589 RRETURN(MATCH_NOMATCH);
2592 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2593 which means that Perl space and POSIX space are now identical. PCRE
2594 was changed at release 8.34. */
2596 case PT_SPACE: /* Perl space */
2597 case PT_PXSPACE: /* POSIX space */
2602 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2606 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
2607 (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
2613 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2614 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2615 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2616 RRETURN(MATCH_NOMATCH);
2620 cp = PRIV(ucd_caseless_sets) + ecode[2];
2624 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2626 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2631 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2632 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2633 c >= 0xe000) == (op == OP_NOTPROP))
2634 RRETURN(MATCH_NOMATCH);
2637 /* This should never occur */
2640 RRETURN(PCRE_ERROR_INTERNAL);
2647 /* Match an extended Unicode sequence. We will get here only if the support
2648 is in the binary; otherwise a compile-time error occurs. */
2651 if (eptr >= md->end_subject)
2654 RRETURN(MATCH_NOMATCH);
2659 GETCHARINCTEST(c, eptr);
2660 lgb = UCD_GRAPHBREAK(c);
2661 while (eptr < md->end_subject)
2664 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2665 rgb = UCD_GRAPHBREAK(c);
2666 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2674 #endif /* SUPPORT_UCP */
2677 /* Match a back reference, possibly repeatedly. Look past the end of the
2678 item to see if there is repeat information following. The code is similar
2679 to that for character classes, but repeated for efficiency. Then obey
2680 similar code to character type repeats - written out again for speed.
2681 However, if the referenced string is the empty string, always treat
2682 it as matched, any number of times (otherwise there could be infinite
2683 loops). If the reference is unset, there are two possibilities:
2685 (a) In the default, Perl-compatible state, set the length negative;
2686 this ensures that every attempt at a match fails. We can't just fail
2687 here, because of the possibility of quantifiers with zero minima.
2689 (b) If the JavaScript compatibility flag is set, set the length to zero
2690 so that the back reference matches an empty string.
2692 Otherwise, set the length to the length of what was matched by the
2693 referenced subpattern.
2695 The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
2696 or to a non-duplicated named group. For a duplicated named group, OP_DNREF
2697 and OP_DNREFI are used. In this case we must scan the list of groups to
2698 which the name refers, and use the first one that is set. */
2702 caseless = op == OP_DNREFI;
2704 int count = GET2(ecode, 1+IMM2_SIZE);
2705 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
2706 ecode += 1 + 2*IMM2_SIZE;
2708 /* Setting the default length first and initializing 'offset' avoids
2709 compiler warnings in the REF_REPEAT code. */
2711 length = (md->jscript_compat)? 0 : -1;
2716 offset = GET2(slot, 0) << 1;
2717 if (offset < offset_top && md->offset_vector[offset] >= 0)
2719 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2722 slot += md->name_entry_size;
2729 caseless = op == OP_REFI;
2730 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2731 ecode += 1 + IMM2_SIZE;
2732 if (offset >= offset_top || md->offset_vector[offset] < 0)
2733 length = (md->jscript_compat)? 0 : -1;
2735 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2737 /* Set up for repetition, or handle the non-repeated case */
2748 c = *ecode++ - OP_CRSTAR;
2749 minimize = (c & 1) != 0;
2750 min = rep_min[c]; /* Pick up values from tables; */
2751 max = rep_max[c]; /* zero for max => infinity */
2752 if (max == 0) max = INT_MAX;
2757 minimize = (*ecode == OP_CRMINRANGE);
2758 min = GET2(ecode, 1);
2759 max = GET2(ecode, 1 + IMM2_SIZE);
2760 if (max == 0) max = INT_MAX;
2761 ecode += 1 + 2 * IMM2_SIZE;
2764 default: /* No repeat follows */
2765 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2767 if (length == -2) eptr = md->end_subject; /* Partial match */
2769 RRETURN(MATCH_NOMATCH);
2772 continue; /* With the main loop */
2775 /* Handle repeated back references. If the length of the reference is
2776 zero, just continue with the main loop. If the length is negative, it
2777 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2778 zero, we can continue at the same level without recursion. For any other
2779 minimum, carrying on will result in NOMATCH. */
2781 if (length == 0) continue;
2782 if (length < 0 && min == 0) continue;
2784 /* First, ensure the minimum number of matches are present. We get back
2785 the length of the reference string explicitly rather than passing the
2786 address of eptr, so that eptr can be a register variable. */
2788 for (i = 1; i <= min; i++)
2791 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2793 if (slength == -2) eptr = md->end_subject; /* Partial match */
2795 RRETURN(MATCH_NOMATCH);
2800 /* If min = max, continue at the same level without recursion.
2801 They are not both allowed to be zero. */
2803 if (min == max) continue;
2805 /* If minimizing, keep trying and advancing the pointer */
2809 for (fi = min;; fi++)
2812 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2813 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2814 if (fi >= max) RRETURN(MATCH_NOMATCH);
2815 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2817 if (slength == -2) eptr = md->end_subject; /* Partial match */
2819 RRETURN(MATCH_NOMATCH);
2823 /* Control never gets here */
2826 /* If maximizing, find the longest string and work backwards */
2831 for (i = min; i < max; i++)
2834 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2836 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2837 the soft partial matching case. */
2839 if (slength == -2 && md->partial != 0 &&
2840 md->end_subject > md->start_used_ptr)
2843 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2852 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2853 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2856 RRETURN(MATCH_NOMATCH);
2858 /* Control never gets here */
2860 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2861 used when all the characters in the class have values in the range 0-255,
2862 and either the matching is caseful, or the characters are in the range
2863 0-127 when UTF-8 processing is enabled. The only difference between
2864 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2867 First, look past the end of the item to see if there is repeat information
2868 following. Then obey similar code to character type repeats - written out
2874 /* The data variable is saved across frames, so the byte map needs to
2876 #define BYTE_MAP ((pcre_uint8 *)data)
2877 data = ecode + 1; /* Save for matching */
2878 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2891 c = *ecode++ - OP_CRSTAR;
2892 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
2893 else possessive = TRUE;
2894 min = rep_min[c]; /* Pick up values from tables; */
2895 max = rep_max[c]; /* zero for max => infinity */
2896 if (max == 0) max = INT_MAX;
2902 minimize = (*ecode == OP_CRMINRANGE);
2903 possessive = (*ecode == OP_CRPOSRANGE);
2904 min = GET2(ecode, 1);
2905 max = GET2(ecode, 1 + IMM2_SIZE);
2906 if (max == 0) max = INT_MAX;
2907 ecode += 1 + 2 * IMM2_SIZE;
2910 default: /* No repeat follows */
2915 /* First, ensure the minimum number of matches are present. */
2920 for (i = 1; i <= min; i++)
2922 if (eptr >= md->end_subject)
2925 RRETURN(MATCH_NOMATCH);
2927 GETCHARINC(c, eptr);
2930 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2933 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2940 for (i = 1; i <= min; i++)
2942 if (eptr >= md->end_subject)
2945 RRETURN(MATCH_NOMATCH);
2948 #ifndef COMPILE_PCRE8
2951 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2955 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2959 /* If max == min we can continue with the main loop without the
2962 if (min == max) continue;
2964 /* If minimizing, keep testing the rest of the expression and advancing
2965 the pointer while it matches the class. */
2972 for (fi = min;; fi++)
2974 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2975 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2976 if (fi >= max) RRETURN(MATCH_NOMATCH);
2977 if (eptr >= md->end_subject)
2980 RRETURN(MATCH_NOMATCH);
2982 GETCHARINC(c, eptr);
2985 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2988 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2995 for (fi = min;; fi++)
2997 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2998 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2999 if (fi >= max) RRETURN(MATCH_NOMATCH);
3000 if (eptr >= md->end_subject)
3003 RRETURN(MATCH_NOMATCH);
3006 #ifndef COMPILE_PCRE8
3009 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3013 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3016 /* Control never gets here */
3019 /* If maximizing, find the longest possible run, then work backwards. */
3028 for (i = min; i < max; i++)
3031 if (eptr >= md->end_subject)
3036 GETCHARLEN(c, eptr, len);
3039 if (op == OP_CLASS) break;
3042 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3046 if (possessive) continue; /* No backtracking */
3050 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3051 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3052 if (eptr-- == pp) break; /* Stop if tried at original pos */
3060 for (i = min; i < max; i++)
3062 if (eptr >= md->end_subject)
3068 #ifndef COMPILE_PCRE8
3071 if (op == OP_CLASS) break;
3075 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3079 if (possessive) continue; /* No backtracking */
3083 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3084 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3089 RRETURN(MATCH_NOMATCH);
3093 /* Control never gets here */
3096 /* Match an extended character class. In the 8-bit library, this opcode is
3097 encountered only when UTF-8 mode is supported. In the 16-bit and
3098 32-bit libraries, codepoints greater than 255 may be encountered even when
3099 UTF is not supported. */
3101 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3104 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3105 ecode += GET(ecode, 1); /* Advance past the item */
3118 c = *ecode++ - OP_CRSTAR;
3119 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
3120 else possessive = TRUE;
3121 min = rep_min[c]; /* Pick up values from tables; */
3122 max = rep_max[c]; /* zero for max => infinity */
3123 if (max == 0) max = INT_MAX;
3129 minimize = (*ecode == OP_CRMINRANGE);
3130 possessive = (*ecode == OP_CRPOSRANGE);
3131 min = GET2(ecode, 1);
3132 max = GET2(ecode, 1 + IMM2_SIZE);
3133 if (max == 0) max = INT_MAX;
3134 ecode += 1 + 2 * IMM2_SIZE;
3137 default: /* No repeat follows */
3142 /* First, ensure the minimum number of matches are present. */
3144 for (i = 1; i <= min; i++)
3146 if (eptr >= md->end_subject)
3149 RRETURN(MATCH_NOMATCH);
3151 GETCHARINCTEST(c, eptr);
3152 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3155 /* If max == min we can continue with the main loop without the
3158 if (min == max) continue;
3160 /* If minimizing, keep testing the rest of the expression and advancing
3161 the pointer while it matches the class. */
3165 for (fi = min;; fi++)
3167 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3168 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3169 if (fi >= max) RRETURN(MATCH_NOMATCH);
3170 if (eptr >= md->end_subject)
3173 RRETURN(MATCH_NOMATCH);
3175 GETCHARINCTEST(c, eptr);
3176 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3178 /* Control never gets here */
3181 /* If maximizing, find the longest possible run, then work backwards. */
3186 for (i = min; i < max; i++)
3189 if (eptr >= md->end_subject)
3195 GETCHARLENTEST(c, eptr, len);
3199 if (!PRIV(xclass)(c, data, utf)) break;
3203 if (possessive) continue; /* No backtracking */
3207 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3208 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3209 if (eptr-- == pp) break; /* Stop if tried at original pos */
3211 if (utf) BACKCHAR(eptr);
3214 RRETURN(MATCH_NOMATCH);
3217 /* Control never gets here */
3219 #endif /* End of XCLASS */
3221 /* Match a single character, casefully */
3229 GETCHARLEN(fc, ecode, length);
3230 if (length > md->end_subject - eptr)
3232 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3233 RRETURN(MATCH_NOMATCH);
3235 while (length-- > 0) if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH);
3241 if (md->end_subject - eptr < 1)
3243 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3244 RRETURN(MATCH_NOMATCH);
3246 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3251 /* Match a single character, caselessly. If we are at the end of the
3252 subject, give up immediately. */
3255 if (eptr >= md->end_subject)
3258 RRETURN(MATCH_NOMATCH);
3266 GETCHARLEN(fc, ecode, length);
3268 /* If the pattern character's value is < 128, we have only one byte, and
3269 we know that its other case must also be one byte long, so we can use the
3270 fast lookup table. We know that there is at least one byte left in the
3275 pcre_uint32 cc = UCHAR21(eptr);
3276 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3281 /* Otherwise we must pick up the subject character. Note that we cannot
3282 use the value of "length" to check for sufficient bytes left, because the
3283 other case of the character may have more or fewer bytes. */
3288 GETCHARINC(dc, eptr);
3291 /* If we have Unicode property support, we can use it to test the other
3292 case of the character, if there is one. */
3297 if (dc != UCD_OTHERCASE(fc))
3299 RRETURN(MATCH_NOMATCH);
3304 #endif /* SUPPORT_UTF */
3308 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3309 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3315 /* Match a single character repeatedly. */
3319 min = max = GET2(ecode, 1);
3320 ecode += 1 + IMM2_SIZE;
3333 max = GET2(ecode, 1);
3334 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3335 ecode += 1 + IMM2_SIZE;
3374 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3375 minimize = (c & 1) != 0;
3376 min = rep_min[c]; /* Pick up values from tables; */
3377 max = rep_max[c]; /* zero for max => infinity */
3378 if (max == 0) max = INT_MAX;
3380 /* Common code for all repeated single-character matches. We first check
3381 for the minimum number of characters. If the minimum equals the maximum, we
3382 are done. Otherwise, if minimizing, check the rest of the pattern for a
3383 match; if there isn't one, advance up to the maximum, one character at a
3386 If maximizing, advance up to the maximum number of matching characters,
3387 until eptr is past the end of the maximum run. If possessive, we are
3388 then done (no backing up). Otherwise, match at this position; anything
3389 other than no match is immediately returned. For nomatch, back up one
3390 character, unless we are matching \R and the last thing matched was
3391 \r\n, in which case, back up two bytes. When we reach the first optional
3392 character position, we can save stack by doing a tail recurse.
3394 The various UTF/non-UTF and caseful/caseless cases are handled separately,
3403 GETCHARLEN(fc, ecode, length);
3406 /* Handle multibyte character matching specially here. There is
3407 support for caseless matching if UCP support is present. */
3412 pcre_uint32 othercase;
3413 if (op >= OP_STARI && /* Caseless */
3414 (othercase = UCD_OTHERCASE(fc)) != fc)
3415 oclength = PRIV(ord2utf)(othercase, occhars);
3417 #endif /* SUPPORT_UCP */
3419 for (i = 1; i <= min; i++)
3421 if (eptr <= md->end_subject - length &&
3422 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3424 else if (oclength > 0 &&
3425 eptr <= md->end_subject - oclength &&
3426 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3427 #endif /* SUPPORT_UCP */
3431 RRETURN(MATCH_NOMATCH);
3435 if (min == max) continue;
3439 for (fi = min;; fi++)
3441 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3442 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3443 if (fi >= max) RRETURN(MATCH_NOMATCH);
3444 if (eptr <= md->end_subject - length &&
3445 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3447 else if (oclength > 0 &&
3448 eptr <= md->end_subject - oclength &&
3449 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3450 #endif /* SUPPORT_UCP */
3454 RRETURN(MATCH_NOMATCH);
3457 /* Control never gets here */
3463 for (i = min; i < max; i++)
3465 if (eptr <= md->end_subject - length &&
3466 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3468 else if (oclength > 0 &&
3469 eptr <= md->end_subject - oclength &&
3470 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3471 #endif /* SUPPORT_UCP */
3479 if (possessive) continue; /* No backtracking */
3482 if (eptr == pp) goto TAIL_RECURSE;
3483 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3484 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3488 #else /* without SUPPORT_UCP */
3490 #endif /* SUPPORT_UCP */
3493 /* Control never gets here */
3496 /* If the length of a UTF-8 character is 1, we fall through here, and
3497 obey the code as for non-UTF-8 characters below, though in this case the
3498 value of fc will always be < 128. */
3501 #endif /* SUPPORT_UTF */
3502 /* When not in UTF-8 mode, load a single-byte character. */
3505 /* The value of fc at this point is always one character, though we may
3506 or may not be in UTF mode. The code is duplicated for the caseless and
3507 caseful cases, for speed, since matching characters is likely to be quite
3508 common. First, ensure the minimum number of matches are present. If min =
3509 max, continue at the same level without recursing. Otherwise, if
3510 minimizing, keep trying the rest of the expression and advancing one
3511 matching character if failing, up to the maximum. Alternatively, if
3512 maximizing, find the maximum number of characters and work backwards. */
3514 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3515 max, (char *)eptr));
3517 if (op >= OP_STARI) /* Caseless */
3519 #ifdef COMPILE_PCRE8
3520 /* fc must be < 128 if UTF is enabled. */
3525 if (utf && fc > 127)
3526 foc = UCD_OTHERCASE(fc);
3528 if (utf && fc > 127)
3530 #endif /* SUPPORT_UCP */
3532 #endif /* SUPPORT_UTF */
3533 foc = TABLE_GET(fc, md->fcc, fc);
3534 #endif /* COMPILE_PCRE8 */
3536 for (i = 1; i <= min; i++)
3538 pcre_uint32 cc; /* Faster than pcre_uchar */
3539 if (eptr >= md->end_subject)
3542 RRETURN(MATCH_NOMATCH);
3544 cc = UCHAR21TEST(eptr);
3545 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3548 if (min == max) continue;
3551 for (fi = min;; fi++)
3553 pcre_uint32 cc; /* Faster than pcre_uchar */
3554 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3555 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3556 if (fi >= max) RRETURN(MATCH_NOMATCH);
3557 if (eptr >= md->end_subject)
3560 RRETURN(MATCH_NOMATCH);
3562 cc = UCHAR21TEST(eptr);
3563 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3566 /* Control never gets here */
3571 for (i = min; i < max; i++)
3573 pcre_uint32 cc; /* Faster than pcre_uchar */
3574 if (eptr >= md->end_subject)
3579 cc = UCHAR21TEST(eptr);
3580 if (fc != cc && foc != cc) break;
3583 if (possessive) continue; /* No backtracking */
3586 if (eptr == pp) goto TAIL_RECURSE;
3587 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3589 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3591 /* Control never gets here */
3595 /* Caseful comparisons (includes all multi-byte characters) */
3599 for (i = 1; i <= min; i++)
3601 if (eptr >= md->end_subject)
3604 RRETURN(MATCH_NOMATCH);
3606 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3609 if (min == max) continue;
3613 for (fi = min;; fi++)
3615 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3616 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3617 if (fi >= max) RRETURN(MATCH_NOMATCH);
3618 if (eptr >= md->end_subject)
3621 RRETURN(MATCH_NOMATCH);
3623 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3625 /* Control never gets here */
3630 for (i = min; i < max; i++)
3632 if (eptr >= md->end_subject)
3637 if (fc != UCHAR21TEST(eptr)) break;
3640 if (possessive) continue; /* No backtracking */
3643 if (eptr == pp) goto TAIL_RECURSE;
3644 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3646 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3648 /* Control never gets here */
3651 /* Control never gets here */
3653 /* Match a negated single one-byte character. The character we are
3654 checking can be multibyte. */
3658 if (eptr >= md->end_subject)
3661 RRETURN(MATCH_NOMATCH);
3666 register pcre_uint32 ch, och;
3669 GETCHARINC(ch, ecode);
3670 GETCHARINC(c, eptr);
3674 if (ch == c) RRETURN(MATCH_NOMATCH);
3680 och = UCD_OTHERCASE(ch);
3684 #endif /* SUPPORT_UCP */
3686 och = TABLE_GET(ch, md->fcc, ch);
3687 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3693 register pcre_uint32 ch = ecode[1];
3695 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3696 RRETURN(MATCH_NOMATCH);
3701 /* Match a negated single one-byte character repeatedly. This is almost a
3702 repeat of the code for a repeated single character, but I haven't found a
3703 nice way of commoning these up that doesn't require a test of the
3704 positive/negative option for each character match. Maybe that wouldn't add
3705 very much to the time taken, but character matching *is* what this is all
3710 min = max = GET2(ecode, 1);
3711 ecode += 1 + IMM2_SIZE;
3717 case OP_NOTMINUPTOI:
3719 max = GET2(ecode, 1);
3720 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3721 ecode += 1 + IMM2_SIZE;
3725 case OP_NOTPOSSTARI:
3733 case OP_NOTPOSPLUSI:
3740 case OP_NOTPOSQUERY:
3741 case OP_NOTPOSQUERYI:
3749 case OP_NOTPOSUPTOI:
3752 max = GET2(ecode, 1);
3753 ecode += 1 + IMM2_SIZE;
3759 case OP_NOTMINSTARI:
3763 case OP_NOTMINPLUSI:
3766 case OP_NOTMINQUERY:
3767 case OP_NOTMINQUERYI:
3768 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3769 minimize = (c & 1) != 0;
3770 min = rep_min[c]; /* Pick up values from tables; */
3771 max = rep_max[c]; /* zero for max => infinity */
3772 if (max == 0) max = INT_MAX;
3774 /* Common code for all repeated single-byte matches. */
3777 GETCHARINCTEST(fc, ecode);
3779 /* The code is duplicated for the caseless and caseful cases, for speed,
3780 since matching characters is likely to be quite common. First, ensure the
3781 minimum number of matches are present. If min = max, continue at the same
3782 level without recursing. Otherwise, if minimizing, keep trying the rest of
3783 the expression and advancing one matching character if failing, up to the
3784 maximum. Alternatively, if maximizing, find the maximum number of
3785 characters and work backwards. */
3787 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3788 max, (char *)eptr));
3790 if (op >= OP_NOTSTARI) /* Caseless */
3794 if (utf && fc > 127)
3795 foc = UCD_OTHERCASE(fc);
3797 if (utf && fc > 127)
3799 #endif /* SUPPORT_UCP */
3801 #endif /* SUPPORT_UTF */
3802 foc = TABLE_GET(fc, md->fcc, fc);
3807 register pcre_uint32 d;
3808 for (i = 1; i <= min; i++)
3810 if (eptr >= md->end_subject)
3813 RRETURN(MATCH_NOMATCH);
3815 GETCHARINC(d, eptr);
3816 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3820 #endif /* SUPPORT_UTF */
3823 for (i = 1; i <= min; i++)
3825 if (eptr >= md->end_subject)
3828 RRETURN(MATCH_NOMATCH);
3830 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3835 if (min == max) continue;
3842 register pcre_uint32 d;
3843 for (fi = min;; fi++)
3845 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3846 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3847 if (fi >= max) RRETURN(MATCH_NOMATCH);
3848 if (eptr >= md->end_subject)
3851 RRETURN(MATCH_NOMATCH);
3853 GETCHARINC(d, eptr);
3854 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3858 #endif /*SUPPORT_UTF */
3861 for (fi = min;; fi++)
3863 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3864 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3865 if (fi >= max) RRETURN(MATCH_NOMATCH);
3866 if (eptr >= md->end_subject)
3869 RRETURN(MATCH_NOMATCH);
3871 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3875 /* Control never gets here */
3887 register pcre_uint32 d;
3888 for (i = min; i < max; i++)
3891 if (eptr >= md->end_subject)
3896 GETCHARLEN(d, eptr, len);
3897 if (fc == d || (unsigned int)foc == d) break;
3900 if (possessive) continue; /* No backtracking */
3903 if (eptr == pp) goto TAIL_RECURSE;
3904 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3905 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3911 #endif /* SUPPORT_UTF */
3914 for (i = min; i < max; i++)
3916 if (eptr >= md->end_subject)
3921 if (fc == *eptr || foc == *eptr) break;
3924 if (possessive) continue; /* No backtracking */
3927 if (eptr == pp) goto TAIL_RECURSE;
3928 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3929 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3933 /* Control never gets here */
3937 /* Caseful comparisons */
3944 register pcre_uint32 d;
3945 for (i = 1; i <= min; i++)
3947 if (eptr >= md->end_subject)
3950 RRETURN(MATCH_NOMATCH);
3952 GETCHARINC(d, eptr);
3953 if (fc == d) RRETURN(MATCH_NOMATCH);
3960 for (i = 1; i <= min; i++)
3962 if (eptr >= md->end_subject)
3965 RRETURN(MATCH_NOMATCH);
3967 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3971 if (min == max) continue;
3978 register pcre_uint32 d;
3979 for (fi = min;; fi++)
3981 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3982 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3983 if (fi >= max) RRETURN(MATCH_NOMATCH);
3984 if (eptr >= md->end_subject)
3987 RRETURN(MATCH_NOMATCH);
3989 GETCHARINC(d, eptr);
3990 if (fc == d) RRETURN(MATCH_NOMATCH);
3997 for (fi = min;; fi++)
3999 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
4000 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4001 if (fi >= max) RRETURN(MATCH_NOMATCH);
4002 if (eptr >= md->end_subject)
4005 RRETURN(MATCH_NOMATCH);
4007 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
4010 /* Control never gets here */
4022 register pcre_uint32 d;
4023 for (i = min; i < max; i++)
4026 if (eptr >= md->end_subject)
4031 GETCHARLEN(d, eptr, len);
4035 if (possessive) continue; /* No backtracking */
4038 if (eptr == pp) goto TAIL_RECURSE;
4039 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
4040 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4049 for (i = min; i < max; i++)
4051 if (eptr >= md->end_subject)
4056 if (fc == *eptr) break;
4059 if (possessive) continue; /* No backtracking */
4062 if (eptr == pp) goto TAIL_RECURSE;
4063 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
4064 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4068 /* Control never gets here */
4071 /* Control never gets here */
4073 /* Match a single character type repeatedly; several different opcodes
4074 share code. This is very similar to the code for single characters, but we
4075 repeat it in the interests of efficiency. */
4078 min = max = GET2(ecode, 1);
4080 ecode += 1 + IMM2_SIZE;
4084 case OP_TYPEMINUPTO:
4086 max = GET2(ecode, 1);
4087 minimize = *ecode == OP_TYPEMINUPTO;
4088 ecode += 1 + IMM2_SIZE;
4091 case OP_TYPEPOSSTAR:
4098 case OP_TYPEPOSPLUS:
4105 case OP_TYPEPOSQUERY:
4112 case OP_TYPEPOSUPTO:
4115 max = GET2(ecode, 1);
4116 ecode += 1 + IMM2_SIZE;
4120 case OP_TYPEMINSTAR:
4122 case OP_TYPEMINPLUS:
4124 case OP_TYPEMINQUERY:
4125 c = *ecode++ - OP_TYPESTAR;
4126 minimize = (c & 1) != 0;
4127 min = rep_min[c]; /* Pick up values from tables; */
4128 max = rep_max[c]; /* zero for max => infinity */
4129 if (max == 0) max = INT_MAX;
4131 /* Common code for all repeated single character type matches. Note that
4132 in UTF-8 mode, '.' matches a character of any length, but for the other
4133 character types, the valid characters are all one-byte long. */
4136 ctype = *ecode++; /* Code for the character type */
4139 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4141 prop_fail_result = ctype == OP_NOTPROP;
4142 prop_type = *ecode++;
4143 prop_value = *ecode++;
4145 else prop_type = -1;
4148 /* First, ensure the minimum number of matches are present. Use inline
4149 code for maximizing the speed, and do the type test once at the start
4150 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4151 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4152 and single-bytes. */
4162 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4163 for (i = 1; i <= min; i++)
4165 if (eptr >= md->end_subject)
4168 RRETURN(MATCH_NOMATCH);
4170 GETCHARINCTEST(c, eptr);
4175 for (i = 1; i <= min; i++)
4178 if (eptr >= md->end_subject)
4181 RRETURN(MATCH_NOMATCH);
4183 GETCHARINCTEST(c, eptr);
4184 chartype = UCD_CHARTYPE(c);
4185 if ((chartype == ucp_Lu ||
4186 chartype == ucp_Ll ||
4187 chartype == ucp_Lt) == prop_fail_result)
4188 RRETURN(MATCH_NOMATCH);
4193 for (i = 1; i <= min; i++)
4195 if (eptr >= md->end_subject)
4198 RRETURN(MATCH_NOMATCH);
4200 GETCHARINCTEST(c, eptr);
4201 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4202 RRETURN(MATCH_NOMATCH);
4207 for (i = 1; i <= min; i++)
4209 if (eptr >= md->end_subject)
4212 RRETURN(MATCH_NOMATCH);
4214 GETCHARINCTEST(c, eptr);
4215 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4216 RRETURN(MATCH_NOMATCH);
4221 for (i = 1; i <= min; i++)
4223 if (eptr >= md->end_subject)
4226 RRETURN(MATCH_NOMATCH);
4228 GETCHARINCTEST(c, eptr);
4229 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4230 RRETURN(MATCH_NOMATCH);
4235 for (i = 1; i <= min; i++)
4238 if (eptr >= md->end_subject)
4241 RRETURN(MATCH_NOMATCH);
4243 GETCHARINCTEST(c, eptr);
4244 category = UCD_CATEGORY(c);
4245 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4246 RRETURN(MATCH_NOMATCH);
4250 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4251 which means that Perl space and POSIX space are now identical. PCRE
4252 was changed at release 8.34. */
4254 case PT_SPACE: /* Perl space */
4255 case PT_PXSPACE: /* POSIX space */
4256 for (i = 1; i <= min; i++)
4258 if (eptr >= md->end_subject)
4261 RRETURN(MATCH_NOMATCH);
4263 GETCHARINCTEST(c, eptr);
4268 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4272 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
4273 RRETURN(MATCH_NOMATCH);
4280 for (i = 1; i <= min; i++)
4283 if (eptr >= md->end_subject)
4286 RRETURN(MATCH_NOMATCH);
4288 GETCHARINCTEST(c, eptr);
4289 category = UCD_CATEGORY(c);
4290 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4291 == prop_fail_result)
4292 RRETURN(MATCH_NOMATCH);
4297 for (i = 1; i <= min; i++)
4299 const pcre_uint32 *cp;
4300 if (eptr >= md->end_subject)
4303 RRETURN(MATCH_NOMATCH);
4305 GETCHARINCTEST(c, eptr);
4306 cp = PRIV(ucd_caseless_sets) + prop_value;
4310 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4312 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4318 for (i = 1; i <= min; i++)
4320 if (eptr >= md->end_subject)
4323 RRETURN(MATCH_NOMATCH);
4325 GETCHARINCTEST(c, eptr);
4326 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4327 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4328 c >= 0xe000) == prop_fail_result)
4329 RRETURN(MATCH_NOMATCH);
4333 /* This should not occur */
4336 RRETURN(PCRE_ERROR_INTERNAL);
4340 /* Match extended Unicode sequences. We will get here only if the
4341 support is in the binary; otherwise a compile-time error occurs. */
4343 else if (ctype == OP_EXTUNI)
4345 for (i = 1; i <= min; i++)
4347 if (eptr >= md->end_subject)
4350 RRETURN(MATCH_NOMATCH);
4355 GETCHARINCTEST(c, eptr);
4356 lgb = UCD_GRAPHBREAK(c);
4357 while (eptr < md->end_subject)
4360 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4361 rgb = UCD_GRAPHBREAK(c);
4362 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4372 #endif /* SUPPORT_UCP */
4374 /* Handle all other cases when the coding is UTF-8 */
4377 if (utf) switch(ctype)
4380 for (i = 1; i <= min; i++)
4382 if (eptr >= md->end_subject)
4385 RRETURN(MATCH_NOMATCH);
4387 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4388 if (md->partial != 0 &&
4389 eptr + 1 >= md->end_subject &&
4390 NLBLOCK->nltype == NLTYPE_FIXED &&
4391 NLBLOCK->nllen == 2 &&
4392 UCHAR21(eptr) == NLBLOCK->nl[0])
4395 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4398 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4403 for (i = 1; i <= min; i++)
4405 if (eptr >= md->end_subject)
4408 RRETURN(MATCH_NOMATCH);
4411 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4416 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4421 for (i = 1; i <= min; i++)
4423 if (eptr >= md->end_subject)
4426 RRETURN(MATCH_NOMATCH);
4428 GETCHARINC(c, eptr);
4431 default: RRETURN(MATCH_NOMATCH);
4434 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
4446 #endif /* Not EBCDIC */
4447 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4454 for (i = 1; i <= min; i++)
4456 if (eptr >= md->end_subject)
4459 RRETURN(MATCH_NOMATCH);
4461 GETCHARINC(c, eptr);
4464 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4471 for (i = 1; i <= min; i++)
4473 if (eptr >= md->end_subject)
4476 RRETURN(MATCH_NOMATCH);
4478 GETCHARINC(c, eptr);
4481 HSPACE_CASES: break; /* Byte and multibyte cases */
4482 default: RRETURN(MATCH_NOMATCH);
4488 for (i = 1; i <= min; i++)
4490 if (eptr >= md->end_subject)
4493 RRETURN(MATCH_NOMATCH);
4495 GETCHARINC(c, eptr);
4498 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4505 for (i = 1; i <= min; i++)
4507 if (eptr >= md->end_subject)
4510 RRETURN(MATCH_NOMATCH);
4512 GETCHARINC(c, eptr);
4515 VSPACE_CASES: break;
4516 default: RRETURN(MATCH_NOMATCH);
4522 for (i = 1; i <= min; i++)
4524 if (eptr >= md->end_subject)
4527 RRETURN(MATCH_NOMATCH);
4529 GETCHARINC(c, eptr);
4530 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4531 RRETURN(MATCH_NOMATCH);
4536 for (i = 1; i <= min; i++)
4539 if (eptr >= md->end_subject)
4542 RRETURN(MATCH_NOMATCH);
4545 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4546 RRETURN(MATCH_NOMATCH);
4548 /* No need to skip more bytes - we know it's a 1-byte character */
4552 case OP_NOT_WHITESPACE:
4553 for (i = 1; i <= min; i++)
4556 if (eptr >= md->end_subject)
4559 RRETURN(MATCH_NOMATCH);
4562 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4563 RRETURN(MATCH_NOMATCH);
4565 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4570 for (i = 1; i <= min; i++)
4573 if (eptr >= md->end_subject)
4576 RRETURN(MATCH_NOMATCH);
4579 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4580 RRETURN(MATCH_NOMATCH);
4582 /* No need to skip more bytes - we know it's a 1-byte character */
4586 case OP_NOT_WORDCHAR:
4587 for (i = 1; i <= min; i++)
4590 if (eptr >= md->end_subject)
4593 RRETURN(MATCH_NOMATCH);
4596 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4597 RRETURN(MATCH_NOMATCH);
4599 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4604 for (i = 1; i <= min; i++)
4607 if (eptr >= md->end_subject)
4610 RRETURN(MATCH_NOMATCH);
4613 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4614 RRETURN(MATCH_NOMATCH);
4616 /* No need to skip more bytes - we know it's a 1-byte character */
4621 RRETURN(PCRE_ERROR_INTERNAL);
4622 } /* End switch(ctype) */
4625 #endif /* SUPPORT_UTF */
4627 /* Code for the non-UTF-8 case for minimum matching of operators other
4628 than OP_PROP and OP_NOTPROP. */
4633 for (i = 1; i <= min; i++)
4635 if (eptr >= md->end_subject)
4638 RRETURN(MATCH_NOMATCH);
4640 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4641 if (md->partial != 0 &&
4642 eptr + 1 >= md->end_subject &&
4643 NLBLOCK->nltype == NLTYPE_FIXED &&
4644 NLBLOCK->nllen == 2 &&
4645 *eptr == NLBLOCK->nl[0])
4648 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4655 if (eptr > md->end_subject - min)
4658 RRETURN(MATCH_NOMATCH);
4664 if (eptr > md->end_subject - min)
4667 RRETURN(MATCH_NOMATCH);
4673 for (i = 1; i <= min; i++)
4675 if (eptr >= md->end_subject)
4678 RRETURN(MATCH_NOMATCH);
4682 default: RRETURN(MATCH_NOMATCH);
4685 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4694 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4698 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4705 for (i = 1; i <= min; i++)
4707 if (eptr >= md->end_subject)
4710 RRETURN(MATCH_NOMATCH);
4716 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4717 HSPACE_MULTIBYTE_CASES:
4719 RRETURN(MATCH_NOMATCH);
4725 for (i = 1; i <= min; i++)
4727 if (eptr >= md->end_subject)
4730 RRETURN(MATCH_NOMATCH);
4734 default: RRETURN(MATCH_NOMATCH);
4736 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4737 HSPACE_MULTIBYTE_CASES:
4745 for (i = 1; i <= min; i++)
4747 if (eptr >= md->end_subject)
4750 RRETURN(MATCH_NOMATCH);
4755 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4756 VSPACE_MULTIBYTE_CASES:
4758 RRETURN(MATCH_NOMATCH);
4765 for (i = 1; i <= min; i++)
4767 if (eptr >= md->end_subject)
4770 RRETURN(MATCH_NOMATCH);
4774 default: RRETURN(MATCH_NOMATCH);
4776 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4777 VSPACE_MULTIBYTE_CASES:
4785 for (i = 1; i <= min; i++)
4787 if (eptr >= md->end_subject)
4790 RRETURN(MATCH_NOMATCH);
4792 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4793 RRETURN(MATCH_NOMATCH);
4799 for (i = 1; i <= min; i++)
4801 if (eptr >= md->end_subject)
4804 RRETURN(MATCH_NOMATCH);
4806 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4807 RRETURN(MATCH_NOMATCH);
4812 case OP_NOT_WHITESPACE:
4813 for (i = 1; i <= min; i++)
4815 if (eptr >= md->end_subject)
4818 RRETURN(MATCH_NOMATCH);
4820 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4821 RRETURN(MATCH_NOMATCH);
4827 for (i = 1; i <= min; i++)
4829 if (eptr >= md->end_subject)
4832 RRETURN(MATCH_NOMATCH);
4834 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4835 RRETURN(MATCH_NOMATCH);
4840 case OP_NOT_WORDCHAR:
4841 for (i = 1; i <= min; i++)
4843 if (eptr >= md->end_subject)
4846 RRETURN(MATCH_NOMATCH);
4848 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4849 RRETURN(MATCH_NOMATCH);
4855 for (i = 1; i <= min; i++)
4857 if (eptr >= md->end_subject)
4860 RRETURN(MATCH_NOMATCH);
4862 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4863 RRETURN(MATCH_NOMATCH);
4869 RRETURN(PCRE_ERROR_INTERNAL);
4873 /* If min = max, continue at the same level without recursing */
4875 if (min == max) continue;
4877 /* If minimizing, we have to test the rest of the pattern before each
4878 subsequent match. Again, separate the UTF-8 case for speed, and also
4879 separate the UCP cases. */
4889 for (fi = min;; fi++)
4891 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4892 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4893 if (fi >= max) RRETURN(MATCH_NOMATCH);
4894 if (eptr >= md->end_subject)
4897 RRETURN(MATCH_NOMATCH);
4899 GETCHARINCTEST(c, eptr);
4900 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4902 /* Control never gets here */
4905 for (fi = min;; fi++)
4908 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4909 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4910 if (fi >= max) RRETURN(MATCH_NOMATCH);
4911 if (eptr >= md->end_subject)
4914 RRETURN(MATCH_NOMATCH);
4916 GETCHARINCTEST(c, eptr);
4917 chartype = UCD_CHARTYPE(c);
4918 if ((chartype == ucp_Lu ||
4919 chartype == ucp_Ll ||
4920 chartype == ucp_Lt) == prop_fail_result)
4921 RRETURN(MATCH_NOMATCH);
4923 /* Control never gets here */
4926 for (fi = min;; fi++)
4928 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4929 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4930 if (fi >= max) RRETURN(MATCH_NOMATCH);
4931 if (eptr >= md->end_subject)
4934 RRETURN(MATCH_NOMATCH);
4936 GETCHARINCTEST(c, eptr);
4937 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4938 RRETURN(MATCH_NOMATCH);
4940 /* Control never gets here */
4943 for (fi = min;; fi++)
4945 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4946 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4947 if (fi >= max) RRETURN(MATCH_NOMATCH);
4948 if (eptr >= md->end_subject)
4951 RRETURN(MATCH_NOMATCH);
4953 GETCHARINCTEST(c, eptr);
4954 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4955 RRETURN(MATCH_NOMATCH);
4957 /* Control never gets here */
4960 for (fi = min;; fi++)
4962 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4963 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4964 if (fi >= max) RRETURN(MATCH_NOMATCH);
4965 if (eptr >= md->end_subject)
4968 RRETURN(MATCH_NOMATCH);
4970 GETCHARINCTEST(c, eptr);
4971 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4972 RRETURN(MATCH_NOMATCH);
4974 /* Control never gets here */
4977 for (fi = min;; fi++)
4980 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4981 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4982 if (fi >= max) RRETURN(MATCH_NOMATCH);
4983 if (eptr >= md->end_subject)
4986 RRETURN(MATCH_NOMATCH);
4988 GETCHARINCTEST(c, eptr);
4989 category = UCD_CATEGORY(c);
4990 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4991 RRETURN(MATCH_NOMATCH);
4993 /* Control never gets here */
4995 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4996 which means that Perl space and POSIX space are now identical. PCRE
4997 was changed at release 8.34. */
4999 case PT_SPACE: /* Perl space */
5000 case PT_PXSPACE: /* POSIX space */
5001 for (fi = min;; fi++)
5003 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
5004 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5005 if (fi >= max) RRETURN(MATCH_NOMATCH);
5006 if (eptr >= md->end_subject)
5009 RRETURN(MATCH_NOMATCH);
5011 GETCHARINCTEST(c, eptr);
5016 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
5020 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5021 RRETURN(MATCH_NOMATCH);
5025 /* Control never gets here */
5028 for (fi = min;; fi++)
5031 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
5032 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5033 if (fi >= max) RRETURN(MATCH_NOMATCH);
5034 if (eptr >= md->end_subject)
5037 RRETURN(MATCH_NOMATCH);
5039 GETCHARINCTEST(c, eptr);
5040 category = UCD_CATEGORY(c);
5041 if ((category == ucp_L ||
5042 category == ucp_N ||
5043 c == CHAR_UNDERSCORE)
5044 == prop_fail_result)
5045 RRETURN(MATCH_NOMATCH);
5047 /* Control never gets here */
5050 for (fi = min;; fi++)
5052 const pcre_uint32 *cp;
5053 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
5054 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5055 if (fi >= max) RRETURN(MATCH_NOMATCH);
5056 if (eptr >= md->end_subject)
5059 RRETURN(MATCH_NOMATCH);
5061 GETCHARINCTEST(c, eptr);
5062 cp = PRIV(ucd_caseless_sets) + prop_value;
5066 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5068 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5071 /* Control never gets here */
5074 for (fi = min;; fi++)
5076 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
5077 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5078 if (fi >= max) RRETURN(MATCH_NOMATCH);
5079 if (eptr >= md->end_subject)
5082 RRETURN(MATCH_NOMATCH);
5084 GETCHARINCTEST(c, eptr);
5085 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5086 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5087 c >= 0xe000) == prop_fail_result)
5088 RRETURN(MATCH_NOMATCH);
5090 /* Control never gets here */
5092 /* This should never occur */
5094 RRETURN(PCRE_ERROR_INTERNAL);
5098 /* Match extended Unicode sequences. We will get here only if the
5099 support is in the binary; otherwise a compile-time error occurs. */
5101 else if (ctype == OP_EXTUNI)
5103 for (fi = min;; fi++)
5105 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5106 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5107 if (fi >= max) RRETURN(MATCH_NOMATCH);
5108 if (eptr >= md->end_subject)
5111 RRETURN(MATCH_NOMATCH);
5116 GETCHARINCTEST(c, eptr);
5117 lgb = UCD_GRAPHBREAK(c);
5118 while (eptr < md->end_subject)
5121 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5122 rgb = UCD_GRAPHBREAK(c);
5123 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5132 #endif /* SUPPORT_UCP */
5137 for (fi = min;; fi++)
5139 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5140 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5141 if (fi >= max) RRETURN(MATCH_NOMATCH);
5142 if (eptr >= md->end_subject)
5145 RRETURN(MATCH_NOMATCH);
5147 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5148 RRETURN(MATCH_NOMATCH);
5149 GETCHARINC(c, eptr);
5152 case OP_ANY: /* This is the non-NL case */
5153 if (md->partial != 0 && /* Take care with CRLF partial */
5154 eptr >= md->end_subject &&
5155 NLBLOCK->nltype == NLTYPE_FIXED &&
5156 NLBLOCK->nllen == 2 &&
5157 c == NLBLOCK->nl[0])
5160 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5171 default: RRETURN(MATCH_NOMATCH);
5173 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
5185 #endif /* Not EBCDIC */
5186 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5194 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5202 HSPACE_CASES: break;
5203 default: RRETURN(MATCH_NOMATCH);
5210 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5218 VSPACE_CASES: break;
5219 default: RRETURN(MATCH_NOMATCH);
5224 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5225 RRETURN(MATCH_NOMATCH);
5229 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5230 RRETURN(MATCH_NOMATCH);
5233 case OP_NOT_WHITESPACE:
5234 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5235 RRETURN(MATCH_NOMATCH);
5239 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5240 RRETURN(MATCH_NOMATCH);
5243 case OP_NOT_WORDCHAR:
5244 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5245 RRETURN(MATCH_NOMATCH);
5249 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5250 RRETURN(MATCH_NOMATCH);
5254 RRETURN(PCRE_ERROR_INTERNAL);
5262 for (fi = min;; fi++)
5264 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5265 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5266 if (fi >= max) RRETURN(MATCH_NOMATCH);
5267 if (eptr >= md->end_subject)
5270 RRETURN(MATCH_NOMATCH);
5272 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5273 RRETURN(MATCH_NOMATCH);
5277 case OP_ANY: /* This is the non-NL case */
5278 if (md->partial != 0 && /* Take care with CRLF partial */
5279 eptr >= md->end_subject &&
5280 NLBLOCK->nltype == NLTYPE_FIXED &&
5281 NLBLOCK->nllen == 2 &&
5282 c == NLBLOCK->nl[0])
5285 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5296 default: RRETURN(MATCH_NOMATCH);
5298 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5307 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5311 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5321 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5322 HSPACE_MULTIBYTE_CASES:
5324 RRETURN(MATCH_NOMATCH);
5331 default: RRETURN(MATCH_NOMATCH);
5333 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5334 HSPACE_MULTIBYTE_CASES:
5345 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5346 VSPACE_MULTIBYTE_CASES:
5348 RRETURN(MATCH_NOMATCH);
5355 default: RRETURN(MATCH_NOMATCH);
5357 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5358 VSPACE_MULTIBYTE_CASES:
5365 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5369 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5372 case OP_NOT_WHITESPACE:
5373 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5377 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5380 case OP_NOT_WORDCHAR:
5381 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5385 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5389 RRETURN(PCRE_ERROR_INTERNAL);
5393 /* Control never gets here */
5396 /* If maximizing, it is worth using inline code for speed, doing the type
5397 test once at the start (i.e. keep it out of the loop). Again, keep the
5398 UTF-8 and UCP stuff separate. */
5402 pp = eptr; /* Remember where we started */
5410 for (i = min; i < max; i++)
5413 if (eptr >= md->end_subject)
5418 GETCHARLENTEST(c, eptr, len);
5419 if (prop_fail_result) break;
5425 for (i = min; i < max; i++)
5429 if (eptr >= md->end_subject)
5434 GETCHARLENTEST(c, eptr, len);
5435 chartype = UCD_CHARTYPE(c);
5436 if ((chartype == ucp_Lu ||
5437 chartype == ucp_Ll ||
5438 chartype == ucp_Lt) == prop_fail_result)
5445 for (i = min; i < max; i++)
5448 if (eptr >= md->end_subject)
5453 GETCHARLENTEST(c, eptr, len);
5454 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5460 for (i = min; i < max; i++)
5463 if (eptr >= md->end_subject)
5468 GETCHARLENTEST(c, eptr, len);
5469 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5475 for (i = min; i < max; i++)
5478 if (eptr >= md->end_subject)
5483 GETCHARLENTEST(c, eptr, len);
5484 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5490 for (i = min; i < max; i++)
5494 if (eptr >= md->end_subject)
5499 GETCHARLENTEST(c, eptr, len);
5500 category = UCD_CATEGORY(c);
5501 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5507 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5508 which means that Perl space and POSIX space are now identical. PCRE
5509 was changed at release 8.34. */
5511 case PT_SPACE: /* Perl space */
5512 case PT_PXSPACE: /* POSIX space */
5513 for (i = min; i < max; i++)
5516 if (eptr >= md->end_subject)
5521 GETCHARLENTEST(c, eptr, len);
5526 if (prop_fail_result) goto ENDLOOP99; /* Break the loop */
5530 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5531 goto ENDLOOP99; /* Break the loop */
5540 for (i = min; i < max; i++)
5544 if (eptr >= md->end_subject)
5549 GETCHARLENTEST(c, eptr, len);
5550 category = UCD_CATEGORY(c);
5551 if ((category == ucp_L || category == ucp_N ||
5552 c == CHAR_UNDERSCORE) == prop_fail_result)
5559 for (i = min; i < max; i++)
5561 const pcre_uint32 *cp;
5563 if (eptr >= md->end_subject)
5568 GETCHARLENTEST(c, eptr, len);
5569 cp = PRIV(ucd_caseless_sets) + prop_value;
5573 { if (prop_fail_result) break; else goto GOT_MAX; }
5575 { if (prop_fail_result) goto GOT_MAX; else break; }
5583 for (i = min; i < max; i++)
5586 if (eptr >= md->end_subject)
5591 GETCHARLENTEST(c, eptr, len);
5592 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5593 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5594 c >= 0xe000) == prop_fail_result)
5601 RRETURN(PCRE_ERROR_INTERNAL);
5604 /* eptr is now past the end of the maximum run */
5606 if (possessive) continue; /* No backtracking */
5609 if (eptr == pp) goto TAIL_RECURSE;
5610 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5611 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5613 if (utf) BACKCHAR(eptr);
5617 /* Match extended Unicode grapheme clusters. We will get here only if the
5618 support is in the binary; otherwise a compile-time error occurs. */
5620 else if (ctype == OP_EXTUNI)
5622 for (i = min; i < max; i++)
5624 if (eptr >= md->end_subject)
5632 GETCHARINCTEST(c, eptr);
5633 lgb = UCD_GRAPHBREAK(c);
5634 while (eptr < md->end_subject)
5637 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5638 rgb = UCD_GRAPHBREAK(c);
5639 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5647 /* eptr is now past the end of the maximum run */
5649 if (possessive) continue; /* No backtracking */
5656 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
5657 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5658 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5660 /* Backtracking over an extended grapheme cluster involves inspecting
5661 the previous two characters (if present) to see if a break is
5662 permitted between them. */
5665 if (!utf) c = *eptr; else
5670 rgb = UCD_GRAPHBREAK(c);
5674 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
5676 if (!utf) c = *fptr; else
5681 lgb = UCD_GRAPHBREAK(c);
5682 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5690 #endif /* SUPPORT_UCP */
5700 for (i = min; i < max; i++)
5702 if (eptr >= md->end_subject)
5707 if (IS_NEWLINE(eptr)) break;
5708 if (md->partial != 0 && /* Take care with CRLF partial */
5709 eptr + 1 >= md->end_subject &&
5710 NLBLOCK->nltype == NLTYPE_FIXED &&
5711 NLBLOCK->nllen == 2 &&
5712 UCHAR21(eptr) == NLBLOCK->nl[0])
5715 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5718 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5722 /* Handle unlimited UTF-8 repeat */
5726 for (i = min; i < max; i++)
5728 if (eptr >= md->end_subject)
5733 if (IS_NEWLINE(eptr)) break;
5734 if (md->partial != 0 && /* Take care with CRLF partial */
5735 eptr + 1 >= md->end_subject &&
5736 NLBLOCK->nltype == NLTYPE_FIXED &&
5737 NLBLOCK->nllen == 2 &&
5738 UCHAR21(eptr) == NLBLOCK->nl[0])
5741 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5744 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5752 for (i = min; i < max; i++)
5754 if (eptr >= md->end_subject)
5760 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5765 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5770 /* The byte case is the same as non-UTF8 */
5774 if (c > (unsigned int)(md->end_subject - eptr))
5776 eptr = md->end_subject;
5783 for (i = min; i < max; i++)
5786 if (eptr >= md->end_subject)
5791 GETCHARLEN(c, eptr, len);
5794 if (++eptr >= md->end_subject) break;
5795 if (UCHAR21(eptr) == CHAR_LF) eptr++;
5801 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5803 && c != 0x2028 && c != 0x2029
5804 #endif /* Not EBCDIC */
5814 for (i = min; i < max; i++)
5818 if (eptr >= md->end_subject)
5823 GETCHARLEN(c, eptr, len);
5826 HSPACE_CASES: gotspace = TRUE; break;
5827 default: gotspace = FALSE; break;
5829 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5836 for (i = min; i < max; i++)
5840 if (eptr >= md->end_subject)
5845 GETCHARLEN(c, eptr, len);
5848 VSPACE_CASES: gotspace = TRUE; break;
5849 default: gotspace = FALSE; break;
5851 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5857 for (i = min; i < max; i++)
5860 if (eptr >= md->end_subject)
5865 GETCHARLEN(c, eptr, len);
5866 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5872 for (i = min; i < max; i++)
5875 if (eptr >= md->end_subject)
5880 GETCHARLEN(c, eptr, len);
5881 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5886 case OP_NOT_WHITESPACE:
5887 for (i = min; i < max; i++)
5890 if (eptr >= md->end_subject)
5895 GETCHARLEN(c, eptr, len);
5896 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5902 for (i = min; i < max; i++)
5905 if (eptr >= md->end_subject)
5910 GETCHARLEN(c, eptr, len);
5911 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5916 case OP_NOT_WORDCHAR:
5917 for (i = min; i < max; i++)
5920 if (eptr >= md->end_subject)
5925 GETCHARLEN(c, eptr, len);
5926 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5932 for (i = min; i < max; i++)
5935 if (eptr >= md->end_subject)
5940 GETCHARLEN(c, eptr, len);
5941 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5947 RRETURN(PCRE_ERROR_INTERNAL);
5950 if (possessive) continue; /* No backtracking */
5953 if (eptr == pp) goto TAIL_RECURSE;
5954 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5955 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5958 if (ctype == OP_ANYNL && eptr > pp && UCHAR21(eptr) == CHAR_NL &&
5959 UCHAR21(eptr - 1) == CHAR_CR) eptr--;
5963 #endif /* SUPPORT_UTF */
5969 for (i = min; i < max; i++)
5971 if (eptr >= md->end_subject)
5976 if (IS_NEWLINE(eptr)) break;
5977 if (md->partial != 0 && /* Take care with CRLF partial */
5978 eptr + 1 >= md->end_subject &&
5979 NLBLOCK->nltype == NLTYPE_FIXED &&
5980 NLBLOCK->nllen == 2 &&
5981 *eptr == NLBLOCK->nl[0])
5984 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5993 if (c > (unsigned int)(md->end_subject - eptr))
5995 eptr = md->end_subject;
6002 for (i = min; i < max; i++)
6004 if (eptr >= md->end_subject)
6012 if (++eptr >= md->end_subject) break;
6013 if (*eptr == CHAR_LF) eptr++;
6017 if (c != CHAR_LF && (md->bsr_anycrlf ||
6018 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
6019 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6020 && c != 0x2028 && c != 0x2029
6029 for (i = min; i < max; i++)
6031 if (eptr >= md->end_subject)
6038 default: eptr++; break;
6040 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6041 HSPACE_MULTIBYTE_CASES:
6050 for (i = min; i < max; i++)
6052 if (eptr >= md->end_subject)
6059 default: goto ENDLOOP01;
6061 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6062 HSPACE_MULTIBYTE_CASES:
6071 for (i = min; i < max; i++)
6073 if (eptr >= md->end_subject)
6080 default: eptr++; break;
6082 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6083 VSPACE_MULTIBYTE_CASES:
6092 for (i = min; i < max; i++)
6094 if (eptr >= md->end_subject)
6101 default: goto ENDLOOP03;
6103 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6104 VSPACE_MULTIBYTE_CASES:
6113 for (i = min; i < max; i++)
6115 if (eptr >= md->end_subject)
6120 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6126 for (i = min; i < max; i++)
6128 if (eptr >= md->end_subject)
6133 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6138 case OP_NOT_WHITESPACE:
6139 for (i = min; i < max; i++)
6141 if (eptr >= md->end_subject)
6146 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6152 for (i = min; i < max; i++)
6154 if (eptr >= md->end_subject)
6159 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6164 case OP_NOT_WORDCHAR:
6165 for (i = min; i < max; i++)
6167 if (eptr >= md->end_subject)
6172 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6178 for (i = min; i < max; i++)
6180 if (eptr >= md->end_subject)
6185 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6191 RRETURN(PCRE_ERROR_INTERNAL);
6194 if (possessive) continue; /* No backtracking */
6197 if (eptr == pp) goto TAIL_RECURSE;
6198 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6199 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6201 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6202 eptr[-1] == CHAR_CR) eptr--;
6206 /* Control never gets here */
6209 /* There's been some horrible disaster. Arrival here can only mean there is
6210 something seriously wrong in the code above or the OP_xxx definitions. */
6213 DPRINTF(("Unknown opcode %d\n", *ecode));
6214 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6217 /* Do not stick any code in here without much thought; it is assumed
6218 that "continue" in the code above comes out to here to repeat the main
6221 } /* End of main loop */
6222 /* Control never reaches here */
6225 /* When compiling to use the heap rather than the stack for recursive calls to
6226 match(), the RRETURN() macro jumps here. The number that is saved in
6227 frame->Xwhere indicates which label we actually want to return to. */
6230 #define LBL(val) case val: goto L_RM##val;
6232 switch (frame->Xwhere)
6234 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6235 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6236 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6237 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6238 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6240 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6245 LBL(22) LBL(23) LBL(28) LBL(30)
6246 LBL(32) LBL(34) LBL(42) LBL(46)
6248 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6249 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67)
6250 #endif /* SUPPORT_UCP */
6251 #endif /* SUPPORT_UTF */
6253 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6254 return PCRE_ERROR_INTERNAL;
6257 #endif /* NO_RECURSE */
6261 /***************************************************************************
6262 ****************************************************************************
6263 RECURSION IN THE match() FUNCTION
6265 Undefine all the macros that were defined above to handle recursion in
match(). */
6283 #undef new_recursive
6296 #undef save_capture_last
6306 /* These two are defined as macros in both cases */
6311 /***************************************************************************
6312 ***************************************************************************/
6316 /*************************************************
6317 * Release allocated heap frames *
6318 *************************************************/
6320 /* This function releases all the allocated frames. The base frame is on the
6321 machine stack, and so must not be freed.
The walk therefore starts at frame_base->Xnextframe. For each frame, the
Xnextframe link is read first and only then is the frame handed to
PUBL(stack_free); the walk stops when the link is NULL.
6323 Argument: the address of the base frame
6328 release_match_heapframes (heapframe *frame_base)
6330 heapframe *nextframe = frame_base->Xnextframe;
6331 while (nextframe != NULL)
6333 heapframe *oldframe = nextframe;
6334 nextframe = nextframe->Xnextframe;
6335 (PUBL(stack_free))(oldframe);
6341 /*************************************************
6342 * Execute a Regular Expression *
6343 *************************************************/
6345 /* This function applies a compiled re to a subject string and picks out
6346 portions of the string if it matches. Two elements in the vector are set for
6347 each substring: the offsets to the start and end of the substring.
A call in which argument_re, extra_data and subject are all NULL and both
length and start_offset are -999 is not a match request: it is the special
call that reports the size used per recursive call of match().
6350 argument_re points to the compiled expression
6351 extra_data points to extra data or is NULL
6352 subject points to the subject string
6353 length length of subject string (may contain binary zeros)
6354 start_offset where to start in the subject string
options run-time option bits; any bit outside PUBLIC_EXEC_OPTIONS gives
PCRE_ERROR_BADOPTION
6356 offsets points to a vector of ints to be filled in with offsets
6357 offsetcount the number of elements in the vector
6359 Returns: > 0 => success; value is the number of elements filled in
6360 = 0 => success, but offsets is not big enough
6361 -1 => failed to match
6362 < -1 => some kind of unexpected problem
6365 #if defined COMPILE_PCRE8
6366 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6367 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6368 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6370 #elif defined COMPILE_PCRE16
6371 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6372 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6373 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6375 #elif defined COMPILE_PCRE32
6376 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6377 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6378 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6382 int rc, ocount, arg_offset_max;
6384 BOOL using_temporary_offsets = FALSE;
6389 BOOL has_first_char = FALSE;
6390 BOOL has_req_char = FALSE;
6391 pcre_uchar first_char = 0;
6392 pcre_uchar first_char2 = 0;
6393 pcre_uchar req_char = 0;
6394 pcre_uchar req_char2 = 0;
6395 match_data match_block;
6396 match_data *md = &match_block;
6397 const pcre_uint8 *tables;
6398 const pcre_uint8 *start_bits = NULL;
6399 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6400 PCRE_PUCHAR end_subject;
6401 PCRE_PUCHAR start_partial = NULL;
6402 PCRE_PUCHAR match_partial = NULL;
6403 PCRE_PUCHAR req_char_ptr = start_match - 1;
6405 const pcre_study_data *study;
6406 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6409 heapframe frame_zero;
6410 frame_zero.Xprevframe = NULL; /* Marks the top level */
6411 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6412 md->match_frames_base = &frame_zero;
6415 /* Check for the special magic call that measures the size of the stack used
6416 per recursive call of match(). Without the funny casting for sizeof, a Windows
6417 compiler gave this error: "unary minus operator applied to unsigned type,
6418 result still unsigned". Hopefully the cast fixes that.
The magic call is recognized by re, extra_data and subject all being NULL, with
length and start_offset both equal to -999. */
6420 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6421 start_offset == -999)
6423 return -((int)sizeof(heapframe));
6425 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6428 /* Plausibility checks. Each failing check returns its own PCRE_ERROR_xxx
code. */
6430 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6431 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6432 return PCRE_ERROR_NULL;
6433 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6434 if (length < 0) return PCRE_ERROR_BADLENGTH;
6435 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6437 /* Check that the first field in the block is the magic number. If it is not,
6438 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6439 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6440 means that the pattern is likely compiled with different endianness. */
6442 if (re->magic_number != MAGIC_NUMBER)
6443 return re->magic_number == REVERSED_MAGIC_NUMBER?
6444 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6445 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6447 /* These two settings are used in the code for checking a UTF-8 string that
6448 follows immediately afterwards. Other values in the md block are used only
6449 during "normal" pcre_exec() processing, not when the JIT support is in use,
6450 so they are set up later. */
6452 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6453 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6454 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6455 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6457 /* Check a UTF-8 string if required. Pass back the character offset and error
6458 code for an invalid string if a results vector is available. */
6461 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6464 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6467 if (offsetcount >= 2)
6469 offsets[0] = erroroffset;
6470 offsets[1] = errorcode;
6472 #if defined COMPILE_PCRE8
6473 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6474 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6475 #elif defined COMPILE_PCRE16
6476 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6477 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6478 #elif defined COMPILE_PCRE32
6479 return PCRE_ERROR_BADUTF32;
6482 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6483 /* Check that a start_offset points to the start of a UTF character. */
6484 if (start_offset > 0 && start_offset < length &&
6485 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6486 return PCRE_ERROR_BADUTF8_OFFSET;
6491 /* If the pattern was successfully studied with JIT support, run the JIT
6492 executable instead of the rest of this function. Most options must be set at
6493 compile time for the JIT code to be usable. Fall back to the normal code path if
6494 an unsupported flag is set. */
6497 if (extra_data != NULL
6498 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6499 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6500 && extra_data->executable_jit != NULL
6501 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6503 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6504 start_offset, options, offsets, offsetcount);
6506 /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial matching
6507 mode is not compiled. In this case we simply fall back to the interpreter. Any
other value returned by the JIT code is passed straight back to the caller. */
6509 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6513 /* Carry on with non-JIT matching. This information is for finding all the
6514 numbers associated with a given name, for condition testing. */
6516 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6517 md->name_count = re->name_count;
6518 md->name_entry_size = re->name_entry_size;
6520 /* Fish out the optional data from the extra_data structure, first setting
6521 the default values. */
6524 md->match_limit = MATCH_LIMIT;
6525 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6526 md->callout_data = NULL;
6528 /* The table pointer is always in native byte order. */
6530 tables = re->tables;
6532 /* The two limit values override the defaults, whatever their value. */
6534 if (extra_data != NULL)
6536 register unsigned int flags = extra_data->flags;
6537 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6538 study = (const pcre_study_data *)extra_data->study_data;
6539 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6540 md->match_limit = extra_data->match_limit;
6541 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6542 md->match_limit_recursion = extra_data->match_limit_recursion;
6543 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6544 md->callout_data = extra_data->callout_data;
6545 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6548 /* Limits in the regex override only if they are smaller. */
6550 if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit)
6551 md->match_limit = re->limit_match;
6553 if ((re->flags & PCRE_RLSET) != 0 &&
6554 re->limit_recursion < md->match_limit_recursion)
6555 md->match_limit_recursion = re->limit_recursion;
6557 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6558 is a feature that makes it possible to save compiled regex and re-use them
6559 in other programs later. */
6561 if (tables == NULL) tables = PRIV(default_tables);
6563 /* Set up other data */
6565 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6566 startline = (re->flags & PCRE_STARTLINE) != 0;
6567 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6569 /* The code starts after the real_pcre block and the capture name table. */
6571 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6572 re->name_count * re->name_entry_size;
6574 md->start_subject = (PCRE_PUCHAR)subject;
6575 md->start_offset = start_offset;
6576 md->end_subject = md->start_subject + length;
6577 end_subject = md->end_subject;
6579 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6580 md->use_ucp = (re->options & PCRE_UCP) != 0;
6581 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6582 md->ignore_skip_arg = 0;
6584 /* Some options are unpacked into BOOL variables in the hope that testing
6585 them will be faster than individual option bits. */
6587 md->notbol = (options & PCRE_NOTBOL) != 0;
6588 md->noteol = (options & PCRE_NOTEOL) != 0;
6589 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6590 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6593 md->mark = md->nomatch_mark = NULL; /* In case never set */
6595 md->recursive = NULL; /* No recursion at top level */
6596 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6598 md->lcc = tables + lcc_offset;
6599 md->fcc = tables + fcc_offset;
6600 md->ctypes = tables + ctypes_offset;
6602 /* Handle different \R options. An explicit PCRE_BSR_ANYCRLF or
PCRE_BSR_UNICODE in the run-time options sets md->bsr_anycrlf directly, the
compile-time bits in re->options are also consulted, and an unrecognized
combination returns PCRE_ERROR_BADNEWLINE. */
6604 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6607 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6608 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6611 md->bsr_anycrlf = TRUE;
6613 md->bsr_anycrlf = FALSE;
6617 case PCRE_BSR_ANYCRLF:
6618 md->bsr_anycrlf = TRUE;
6621 case PCRE_BSR_UNICODE:
6622 md->bsr_anycrlf = FALSE;
6625 default: return PCRE_ERROR_BADNEWLINE;
6628 /* Handle different types of newline. The three bits give eight cases. If
6629 nothing is set at run time, whatever was used at compile time applies. */
6631 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6632 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6634 case 0: newline = NEWLINE; break; /* Compile-time default */
6635 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6636 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6637 case PCRE_NEWLINE_CR+
6638 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6639 case PCRE_NEWLINE_ANY: newline = -1; break;
6640 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6641 default: return PCRE_ERROR_BADNEWLINE;
6646 md->nltype = NLTYPE_ANYCRLF;
6648 else if (newline < 0)
6650 md->nltype = NLTYPE_ANY;
6654 md->nltype = NLTYPE_FIXED;
6658 md->nl[0] = (newline >> 8) & 255;
6659 md->nl[1] = newline & 255;
6664 md->nl[0] = newline;
6668 /* Partial matching was originally supported only for a restricted set of
6669 regexes; from release 8.00 there are no restrictions, but the bits are still
6670 defined (though never set). So there's no harm in leaving this code. */
6672 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6673 return PCRE_ERROR_BADPARTIAL;
6675 /* If the expression has got more back references than the offsets supplied can
6676 hold, we get a temporary chunk of working store to use during the matching.
6677 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6680 ocount = offsetcount - (offsetcount % 3);
6681 arg_offset_max = (2*ocount)/3;
6683 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6685 ocount = re->top_backref * 3 + 3;
6686 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6687 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6688 using_temporary_offsets = TRUE;
6689 DPRINTF(("Got memory to hold back references\n"));
6691 else md->offset_vector = offsets;
6692 md->offset_end = ocount;
6693 md->offset_max = (2*ocount)/3;
6694 md->capture_last = 0;
6696 /* Reset the working variable associated with each extraction. These should
6697 never be used unless previously set, but they get saved and restored, and so we
6698 initialize them to avoid reading uninitialized locations. Also, unset the
6699 offsets for the matched string. This is really just for tidiness with callouts,
6700 in case they inspect these fields. */
6702 if (md->offset_vector != NULL)
6704 register int *iptr = md->offset_vector + ocount;
6705 register int *iend = iptr - re->top_bracket;
6706 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6707 while (--iptr >= iend) *iptr = -1;
6708 md->offset_vector[0] = md->offset_vector[1] = -1;
6711 /* Set up the first character to match, if available. The first_char value is
6712 never set for an anchored regular expression, but the anchoring may be forced
6713 at run time, so we have to test for anchoring. The first char may be unset for
6714 an unanchored pattern, of course. If there's no first char and the pattern was
6715 studied, there may be a bitmap of possible first characters. */
6719 if ((re->flags & PCRE_FIRSTSET) != 0)
6721 has_first_char = TRUE;
6722 first_char = first_char2 = (pcre_uchar)(re->first_char);
6723 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6725 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6726 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6727 if (utf && first_char > 127)
6728 first_char2 = UCD_OTHERCASE(first_char);
6733 if (!startline && study != NULL &&
6734 (study->flags & PCRE_STUDY_MAPPED) != 0)
6735 start_bits = study->start_bits;
6738 /* For anchored or unanchored matches, there may be a "last known required
6741 if ((re->flags & PCRE_REQCHSET) != 0)
6743 has_req_char = TRUE;
6744 req_char = req_char2 = (pcre_uchar)(re->req_char);
6745 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6747 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6748 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6749 if (utf && req_char > 127)
6750 req_char2 = UCD_OTHERCASE(req_char);
6756 /* ==========================================================================*/
6758 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6759 the loop runs just once. */
6763 PCRE_PUCHAR save_end_subject = end_subject;
6764 PCRE_PUCHAR new_start_match;
6766 /* If firstline is TRUE, the start of the match is constrained to the first
6767 line of a multiline string. That is, the match must be before or at the first
6768 newline. Implement this by temporarily adjusting end_subject so that we stop
6769 scanning at a newline. If the match fails at the newline, later code breaks
6774 PCRE_PUCHAR t = start_match;
6778 while (t < md->end_subject && !IS_NEWLINE(t))
6781 ACROSSCHAR(t < end_subject, *t, t++);
6786 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6790 /* There are some optimizations that avoid running the match if a known
6791 starting point is not found, or if a known later character is not present.
6792 However, there is an option that disables these, for testing and for ensuring
6793 that all callouts do actually occur. The option can be set in the regex by
6794 (*NO_START_OPT) or passed in match-time options. */
6796 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6798 /* Advance to a unique first char if there is one. */
6804 if (first_char != first_char2)
6805 while (start_match < end_subject &&
6806 (smc = UCHAR21TEST(start_match)) != first_char && smc != first_char2)
6809 while (start_match < end_subject && UCHAR21TEST(start_match) != first_char)
6813 /* Or to just after a linebreak for a multiline match */
6817 if (start_match > md->start_subject + start_offset)
6822 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6825 ACROSSCHAR(start_match < end_subject, *start_match,
6831 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6834 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6835 and we are now at a LF, advance the match position by one more character.
6838 if (start_match[-1] == CHAR_CR &&
6839 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6840 start_match < end_subject &&
6841 UCHAR21TEST(start_match) == CHAR_NL)
6846 /* Or to a non-unique first byte after study */
6848 else if (start_bits != NULL)
6850 while (start_match < end_subject)
6852 register pcre_uint32 c = UCHAR21TEST(start_match);
6853 #ifndef COMPILE_PCRE8
6854 if (c > 255) c = 255;
6856 if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
6860 } /* Starting optimizations */
6862 /* Restore fudged end_subject */
6864 end_subject = save_end_subject;
6866 /* The following two optimizations are disabled for partial matching or if
6867 disabling is explicitly requested. */
6869 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6871 /* If the pattern was studied, a minimum subject length may be set. This is
6872 a lower bound; no actual string of that length may actually match the
6873 pattern. Although the value is, strictly, in characters, we treat it as
6874 bytes to avoid spending too much time in this optimization. */
6876 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6877 (pcre_uint32)(end_subject - start_match) < study->minlength)
6883 /* If req_char is set, we know that that character must appear in the
6884 subject for the match to succeed. If the first character is set, req_char
6885 must be later in the subject; otherwise the test starts at the match point.
6886 This optimization can save a huge amount of backtracking in patterns with
6887 nested unlimited repeats that aren't going to match. Writing separate code
6888 for cased/caseless versions makes it go faster, as does using an
6889 autoincrement and backing off on a match.
6891 HOWEVER: when the subject string is very, very long, searching to its end
6892 can take a long time, and give bad performance on quite ordinary patterns.
6893 This showed up when somebody was matching something like /^\d+C/ on a
6894 32-megabyte string... so we don't do this when the string is sufficiently
6897 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6899 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6901 /* We don't need to repeat the search if we haven't yet reached the
6902 place we found it at last time. */
6904 if (p > req_char_ptr)
6906 if (req_char != req_char2)
6908 while (p < end_subject)
6910 register pcre_uint32 pp = UCHAR21INCTEST(p);
6911 if (pp == req_char || pp == req_char2) { p--; break; }
6916 while (p < end_subject)
6918 if (UCHAR21INCTEST(p) == req_char) { p--; break; }
6922 /* If we can't find the required character, break the matching loop,
6923 forcing a match failure. */
6925 if (p >= end_subject)
6931 /* If we have found the required character, save the point where we
6932 found it, so that we don't search again next time round the loop if
6933 the start hasn't passed this character yet. */
6940 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6941 printf(">>>> Match against: ");
6942 pchars(start_match, end_subject - start_match, TRUE, md);
6946 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6947 first starting point for which a partial match was found. */
6949 md->start_match_ptr = start_match;
6950 md->start_used_ptr = start_match;
6951 md->match_call_count = 0;
6952 md->match_function_type = 0;
6953 md->end_offset_top = 0;
6954 md->skip_arg_count = 0;
6955 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
6956 if (md->hitend && start_partial == NULL)
6958 start_partial = md->start_used_ptr;
6959 match_partial = start_match;
6964 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6965 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6966 entirely. The only way we can do that is to re-do the match at the same
6967 point, with a flag to force SKIP with an argument to be ignored. Just
6968 treating this case as NOMATCH does not work because it does not check other
6969 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6971 case MATCH_SKIP_ARG:
6972 new_start_match = start_match;
6973 md->ignore_skip_arg = md->skip_arg_count;
6976 /* SKIP passes back the next starting point explicitly, but if it is no
6977 greater than the match we have just done, treat it as NOMATCH. */
6980 if (md->start_match_ptr > start_match)
6982 new_start_match = md->start_match_ptr;
6987 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6988 exactly like PRUNE. Unset ignore SKIP-with-argument. */
6993 md->ignore_skip_arg = 0;
6994 new_start_match = start_match + 1;
6997 ACROSSCHAR(new_start_match < end_subject, *new_start_match,
7002 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
7008 /* Any other return is either a match, or some kind of error. */
7014 /* Control reaches here for the various types of "no match at this point"
7015 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
7019 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
7020 newline in the subject (though it may continue over the newline). Therefore,
7021 if we have just failed to match, starting at a newline, do not continue. */
7023 if (firstline && IS_NEWLINE(start_match)) break;
7025 /* Advance to new matching position */
7027 start_match = new_start_match;
7029 /* Break the loop if the pattern is anchored or if we have passed the end of
7032 if (anchored || start_match > end_subject) break;
7034 /* If we have just passed a CR and we are now at a LF, and the pattern does
7035 not contain any explicit matches for \r or \n, and the newline option is CRLF
7036 or ANY or ANYCRLF, advance the match position by one more character. In
7037 normal matching start_match will always be greater than the first position at
7038 this stage, but a failed *SKIP can cause a return at the same point, which is
7039 why the first test exists. */
7041 if (start_match > (PCRE_PUCHAR)subject + start_offset &&
7042 start_match[-1] == CHAR_CR &&
7043 start_match < end_subject &&
7044 *start_match == CHAR_NL &&
7045 (re->flags & PCRE_HASCRORLF) == 0 &&
7046 (md->nltype == NLTYPE_ANY ||
7047 md->nltype == NLTYPE_ANYCRLF ||
7051 md->mark = NULL; /* Reset for start of next match attempt */
7052 } /* End of for(;;) "bumpalong" loop */
7054 /* ==========================================================================*/
7056 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
7059 (1) The pattern is anchored or the match was failed by (*COMMIT);
7061 (2) We are past the end of the subject;
7063 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
7064 this option requests that a match occur at or before the first newline in
7067 When we have a match and the offset vector is big enough to deal with any
7068 backreferences, captured substring offsets will already be set up. In the case
7069 where we had to get some local store to hold offsets for backreference
7070 processing, copy those that we can. In this case there need not be overflow if
7071 certain parts of the pattern were not used, even though there are more
7072 capturing parentheses than vector slots. */
7076 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
7078 if (using_temporary_offsets)
7080 if (arg_offset_max >= 4)
7082 memcpy(offsets + 2, md->offset_vector + 2,
7083 (arg_offset_max - 2) * sizeof(int));
7084 DPRINTF(("Copied offsets from temporary memory\n"));
7086 if (md->end_offset_top > arg_offset_max) md->capture_last |= OVFLBIT;
7087 DPRINTF(("Freeing temporary memory\n"));
7088 (PUBL(free))(md->offset_vector);
7091 /* Set the return code to the number of captured strings, or 0 if there were
7092 too many to fit into the vector. */
7094 rc = ((md->capture_last & OVFLBIT) != 0 &&
7095 md->end_offset_top >= arg_offset_max)?
7096 0 : md->end_offset_top/2;
7098 /* If there is space in the offset vector, set any unused pairs at the end of
7099 the pattern to -1 for backwards compatibility. It is documented that this
7100 happens. In earlier versions, the whole set of potential capturing offsets
7101 was set to -1 each time round the loop, but this is handled differently now.
7102 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
7103 those at the end that need unsetting here. We can't just unset them all at
7104 the start of the whole thing because they may get set in one branch that is
7105 not the final matching branch. */
7107 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
7109 register int *iptr, *iend;
7110 int resetcount = 2 + re->top_bracket * 2;
7111 if (resetcount > offsetcount) resetcount = offsetcount;
7112 iptr = offsets + md->end_offset_top;
7113 iend = offsets + resetcount;
7114 while (iptr < iend) *iptr++ = -1;
7117 /* If there is space, set up the whole thing as substring 0. The value of
7118 md->start_match_ptr might be modified if \K was encountered on the success
7121 if (offsetcount < 2) rc = 0; else
7123 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
7124 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
7127 /* Return MARK data if requested */
7129 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
7130 *(extra_data->mark) = (pcre_uchar *)md->mark;
7131 DPRINTF((">>>> returning %d\n", rc));
7133 release_match_heapframes(&frame_zero);
7138 /* Control gets here if there has been an error, or if the overall match
7139 attempt has failed at all permitted starting positions. */
7141 if (using_temporary_offsets)
7143 DPRINTF(("Freeing temporary memory\n"));
7144 (PUBL(free))(md->offset_vector);
7147 /* For anything other than nomatch or partial match, just return the code. */
7149 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
7151 DPRINTF((">>>> error: returning %d\n", rc));
7153 release_match_heapframes(&frame_zero);
7158 /* Handle partial matches - disable any mark data. When there is room in the
vector, offsets[0] is set from start_partial, offsets[1] from end_subject and
offsets[2] from match_partial, each as an offset from subject. */
7160 if (match_partial != NULL)
7162 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
7164 if (offsetcount > 1)
7166 offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject);
7167 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
7168 if (offsetcount > 2)
7169 offsets[2] = (int)(match_partial - (PCRE_PUCHAR)subject);
7171 rc = PCRE_ERROR_PARTIAL;
7174 /* This is the classic nomatch case */
7178 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
7179 rc = PCRE_ERROR_NOMATCH;
7182 /* Return the MARK data if it has been requested. On this path the value
passed back is md->nomatch_mark. */
7184 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
7185 *(extra_data->mark) = (pcre_uchar *)md->nomatch_mark;
7187 release_match_heapframes(&frame_zero);
7192 /* End of pcre_exec.c */