1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Copyright (c) 1997-2014 University of Cambridge
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
52 #include "pcre_internal.h"
54 /* Undefine some potentially clashing cpp symbols */
59 /* The md->capture_last field uses the lower 16 bits for the last captured
60 substring (which can never be greater than 65535) and a bit in the top half
61 to mean "capture vector overflowed". This odd way of doing things was
62 implemented when it was realized that preserving and restoring the overflow bit
63 whenever the last capture number was saved/restored made for a neater
64 interface, and doing it this way saved on (a) another variable, which would
65 have increased the stack frame size (a big NO-NO in PCRE) and (b) another
66 separate set of save/restore instructions. The following defines are used in
69 #define CAPLMASK 0x0000ffff /* The bits used for last_capture */
70 #define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */
71 #define OVFLBIT 0x00010000 /* The bit that is set for overflow */
73 /* Values for setting in md->match_function_type to indicate two special types
74 of call to match(). We do it this way to save on using another stack variable,
75 as stack usage is to be discouraged. */
77 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
78 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
80 /* Non-error returns from the match() function. Error returns are externally
81 defined PCRE_ERROR_xxx codes, which are all negative. */
84 #define MATCH_NOMATCH 0
86 /* Special internal returns from the match() function. Make them sufficiently
87 negative to avoid the external error codes. */
89 #define MATCH_ACCEPT (-999)
90 #define MATCH_KETRPOS (-998)
91 #define MATCH_ONCE (-997)
92 /* The next 5 must be kept together and in sequence so that a test that checks
93 for any one of them can use a range. */
94 #define MATCH_COMMIT (-996)
95 #define MATCH_PRUNE (-995)
96 #define MATCH_SKIP (-994)
97 #define MATCH_SKIP_ARG (-993)
98 #define MATCH_THEN (-992)
99 #define MATCH_BACKTRACK_MAX MATCH_THEN
100 #define MATCH_BACKTRACK_MIN MATCH_COMMIT
102 /* Maximum number of ints of offset to save on the stack for recursive calls.
103 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
104 because the offset vector is always a multiple of 3 long. */
106 #define REC_STACK_SAVE_MAX 30
108 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
110 static const char rep_min[] = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, };
111 static const char rep_max[] = { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, };
114 /*************************************************
115 * Debugging function to print chars *
116 *************************************************/
118 /* Print a sequence of chars in printable format, stopping at the end of the
119 subject if requested.
122 p points to characters
123 length number to print
124 is_subject TRUE if printing from within md->start_subject
125 md pointer to matching data block, if is_subject is TRUE
131 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
135 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
137 if (isprint(c = UCHAR21INCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
143 /*************************************************
144 * Match a back-reference *
145 *************************************************/
147 /* Normally, if a back reference hasn't been set, the length that is passed is
148 negative, so the match always fails. However, in JavaScript compatibility mode,
149 the length passed is zero. Note that in caseless UTF-8 mode, the number of
150 subject bytes matched may be different to the number of reference bytes.
153 offset index into the offset vector
154 eptr pointer into the subject
155 length length of reference to be matched (number of bytes)
156 md points to match data block
157 caseless TRUE if caseless
159 Returns: >= 0 the number of subject bytes matched
161 -2 partial match; always given if at end of subject
165 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
168 PCRE_PUCHAR eptr_start = eptr;
169 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
170 #if defined SUPPORT_UTF && defined SUPPORT_UCP
175 if (eptr >= md->end_subject)
176 printf("matching subject <null>");
179 printf("matching subject ");
180 pchars(eptr, length, TRUE, md);
182 printf(" against backref ");
183 pchars(p, length, FALSE, md);
187 /* Always fail if reference not set (and not JavaScript compatible - in that
188 case the length is passed as zero). */
190 if (length < 0) return -1;
192 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
193 properly if Unicode properties are supported. Otherwise, we can check only
198 #if defined SUPPORT_UTF && defined SUPPORT_UCP
201 /* Match characters up to the end of the reference. NOTE: the number of
202 data units matched may differ, because in UTF-8 there are some characters
203 whose upper and lower case versions have different numbers of bytes.
204 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
205 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
206 sequence of two of the latter. It is important, therefore, to check the
207 length along the reference, not along the subject (earlier code did this
210 PCRE_PUCHAR endptr = p + length;
214 const ucd_record *ur;
215 if (eptr >= md->end_subject) return -2; /* Partial match */
219 if (c != d && c != d + ur->other_case)
221 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
224 if (c < *pp) return -1;
225 if (c == *pp++) break;
233 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
234 is no UCP support. */
239 if (eptr >= md->end_subject) return -2; /* Partial match */
240 cc = UCHAR21TEST(eptr);
242 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
249 /* In the caseful case, we can just compare the bytes, whether or not we
250 are in UTF-8 mode. */
256 if (eptr >= md->end_subject) return -2; /* Partial match */
257 if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1;
261 return (int)(eptr - eptr_start);
266 /***************************************************************************
267 ****************************************************************************
268 RECURSION IN THE match() FUNCTION
270 The match() function is highly recursive, though not every recursive call
271 increases the recursive depth. Nevertheless, some regular expressions can cause
272 it to recurse to a great depth. I was writing for Unix, so I just let it call
273 itself recursively. This uses the stack for saving everything that has to be
274 saved for a recursive call. On Unix, the stack can be large, and this works
277 It turns out that on some non-Unix-like systems there are problems with
278 programs that use a lot of stack. (This despite the fact that every last chip
279 has oodles of memory these days, and techniques for extending the stack have
280 been known for decades.) So....
282 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
283 calls by keeping local variables that need to be preserved in blocks of memory
284 obtained from malloc() instead of on the stack. Macros are used to
285 achieve this so that the actual code doesn't look very different to what it
288 The original heap-recursive code used longjmp(). However, it seems that this
289 can be very slow on some operating systems. Following a suggestion from Stan
290 Switzer, the use of longjmp() has been abolished, at the cost of having to
291 provide a unique number for each call to RMATCH. There is no way of generating
292 a sequence of numbers at compile time in C. I have given them names, to make
293 them stand out more clearly.
295 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
296 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
297 tests. Furthermore, not using longjmp() means that local dynamic variables
298 don't have indeterminate values; this has meant that the frame size can be
299 reduced because the result can be "passed back" by straight setting of the
300 variable instead of being passed in the frame.
301 ****************************************************************************
302 ***************************************************************************/
304 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
305 below must be updated in sync. */
307 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
308 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
309 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
310 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
311 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
312 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
313 RM61, RM62, RM63, RM64, RM65, RM66, RM67 };
315 /* These versions of the macros use the stack, as normal. There are debugging
316 versions and production versions. Note that the "rw" argument of RMATCH isn't
317 actually used in this definition. */
320 #define REGISTER register
323 #define RMATCH(ra,rb,rc,rd,re,rw) \
325 printf("match() called in line %d\n", __LINE__); \
326 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
327 printf("to line %d\n", __LINE__); \
329 #define RRETURN(ra) \
331 printf("match() returned %d from line %d\n", ra, __LINE__); \
335 #define RMATCH(ra,rb,rc,rd,re,rw) \
336 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
337 #define RRETURN(ra) return ra
343 /* These versions of the macros manage a private stack on the heap. Note that
344 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
345 argument of match(), which never changes. */
349 #define RMATCH(ra,rb,rc,rd,re,rw)\
351 heapframe *newframe = frame->Xnextframe;\
352 if (newframe == NULL)\
354 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
355 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
356 newframe->Xnextframe = NULL;\
357 frame->Xnextframe = newframe;\
360 newframe->Xeptr = ra;\
361 newframe->Xecode = rb;\
362 newframe->Xmstart = mstart;\
363 newframe->Xoffset_top = rc;\
364 newframe->Xeptrb = re;\
365 newframe->Xrdepth = frame->Xrdepth + 1;\
366 newframe->Xprevframe = frame;\
368 DPRINTF(("restarting from line %d\n", __LINE__));\
371 DPRINTF(("jumped back to line %d\n", __LINE__));\
376 heapframe *oldframe = frame;\
377 frame = oldframe->Xprevframe;\
387 /* Structure for remembering the local variables in a private frame */
389 typedef struct heapframe {
390 struct heapframe *Xprevframe;
391 struct heapframe *Xnextframe;
393 /* Function arguments that may change */
396 const pcre_uchar *Xecode;
400 unsigned int Xrdepth;
402 /* Function local variables */
404 PCRE_PUCHAR Xcallpat;
406 PCRE_PUCHAR Xcharptr;
412 PCRE_PUCHAR Xsaved_eptr;
414 recursion_info Xnew_recursive;
422 unsigned int Xprop_value;
423 int Xprop_fail_result;
425 pcre_uchar Xocchars[6];
435 unsigned int Xnumber;
438 pcre_int32 Xsave_capture_last;
439 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
440 int Xstacksave[REC_STACK_SAVE_MAX];
444 /* Where to jump back to */
453 /***************************************************************************
454 ***************************************************************************/
458 /*************************************************
459 * Match from current position *
460 *************************************************/
462 /* This function is called recursively in many circumstances. Whenever it
463 returns a negative (error) response, the outer incarnation must also return the
466 /* These macros pack up tests that are used for partial matching, and which
467 appear several times in the code. We set the "hit end" flag if the pointer is
468 at the end of the subject and also past the start of the subject (i.e.
469 something has been matched). For hard partial matching, we then return
470 immediately. The second one is used when we already know we are past the end of
473 #define CHECK_PARTIAL()\
474 if (md->partial != 0 && eptr >= md->end_subject && \
475 eptr > md->start_used_ptr) \
478 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
481 #define SCHECK_PARTIAL()\
482 if (md->partial != 0 && eptr > md->start_used_ptr) \
485 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
489 /* Performance note: It might be tempting to extract commonly used fields from
490 the md structure (e.g. utf, end_subject) into individual variables to improve
491 performance. Tests using gcc on a SPARC disproved this; in the first case, it
492 made performance worse.
495 eptr pointer to current character in subject
496 ecode pointer to current position in compiled code
497 mstart pointer to the current match start position (can be modified
499 offset_top current top pointer
500 md pointer to "static" info for the match
501 eptrb pointer to chain of blocks containing eptr at start of
502 brackets - for testing for empty matches
503 rdepth the recursion depth
505 Returns: MATCH_MATCH if matched ) these values are >= 0
506 MATCH_NOMATCH if failed to match )
507 a negative MATCH_xxx value for PRUNE, SKIP, etc
508 a negative PCRE_ERROR_xxx value if aborted by an error condition
509 (e.g. stopped by repeated call or recursion limit)
513 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
514 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
517 /* These variables do not need to be preserved over recursion in this function,
518 so they can be ordinary variables in all cases. Mark some of them with
519 "register" because they are used a lot in loops. */
521 register int rrc; /* Returns from recursive calls */
522 register int i; /* Used for loops not involving calls to RMATCH() */
523 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
524 register BOOL utf; /* Local copy of UTF flag for speed */
526 BOOL minimize, possessive; /* Quantifier options */
530 /* When recursion is not being used, all "local" variables that have to be
531 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
532 frame on the stack here; subsequent instantiations are obtained from the heap
533 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
534 the top-level on the stack rather than malloc-ing them all gives a performance
535 boost in many cases where there is not much "recursion". */
538 heapframe *frame = (heapframe *)md->match_frames_base;
540 /* Copy in the original argument variables */
543 frame->Xecode = ecode;
544 frame->Xmstart = mstart;
545 frame->Xoffset_top = offset_top;
546 frame->Xeptrb = eptrb;
547 frame->Xrdepth = rdepth;
549 /* This is where control jumps back to in order to effect "recursion" */
553 /* Macros make the argument variables come from the current frame */
555 #define eptr frame->Xeptr
556 #define ecode frame->Xecode
557 #define mstart frame->Xmstart
558 #define offset_top frame->Xoffset_top
559 #define eptrb frame->Xeptrb
560 #define rdepth frame->Xrdepth
562 /* Ditto for the local variables */
565 #define charptr frame->Xcharptr
567 #define callpat frame->Xcallpat
568 #define codelink frame->Xcodelink
569 #define data frame->Xdata
570 #define next frame->Xnext
571 #define pp frame->Xpp
572 #define prev frame->Xprev
573 #define saved_eptr frame->Xsaved_eptr
575 #define new_recursive frame->Xnew_recursive
577 #define cur_is_word frame->Xcur_is_word
578 #define condition frame->Xcondition
579 #define prev_is_word frame->Xprev_is_word
582 #define prop_type frame->Xprop_type
583 #define prop_value frame->Xprop_value
584 #define prop_fail_result frame->Xprop_fail_result
585 #define oclength frame->Xoclength
586 #define occhars frame->Xocchars
589 #define ctype frame->Xctype
590 #define fc frame->Xfc
591 #define fi frame->Xfi
592 #define length frame->Xlength
593 #define max frame->Xmax
594 #define min frame->Xmin
595 #define number frame->Xnumber
596 #define offset frame->Xoffset
597 #define op frame->Xop
598 #define save_capture_last frame->Xsave_capture_last
599 #define save_offset1 frame->Xsave_offset1
600 #define save_offset2 frame->Xsave_offset2
601 #define save_offset3 frame->Xsave_offset3
602 #define stacksave frame->Xstacksave
604 #define newptrb frame->Xnewptrb
606 /* When recursion is being used, local variables are allocated on the stack and
607 get preserved during recursion in the normal way. In this environment, fi and
608 i, and fc and c, can be the same variables. */
610 #else /* NO_RECURSE not defined */
614 /* Many of the following variables are used only in small blocks of the code.
615 My normal style of coding would have declared them within each of those blocks.
616 However, in order to accommodate the version of this code that uses an external
617 "stack" implemented on the heap, it is easier to declare them all here, so the
618 declarations can be cut out in a block. The only declarations within blocks
619 below are for variables that do not have to be preserved over a recursive call
623 const pcre_uchar *charptr;
625 const pcre_uchar *callpat;
626 const pcre_uchar *data;
627 const pcre_uchar *next;
629 const pcre_uchar *prev;
630 PCRE_PUCHAR saved_eptr;
632 recursion_info new_recursive;
640 unsigned int prop_value;
641 int prop_fail_result;
643 pcre_uchar occhars[6];
654 pcre_int32 save_capture_last;
655 int save_offset1, save_offset2, save_offset3;
656 int stacksave[REC_STACK_SAVE_MAX];
660 /* There is a special fudge for calling match() in a way that causes it to
661 measure the size of its basic stack frame when the stack is being used for
662 recursion. The second argument (ecode) being NULL triggers this behaviour. It
663 cannot normally ever be NULL. The return is the negated value of the frame
669 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
672 int len = (char *)&rdepth - (char *)eptr;
673 return (len > 0)? -len : len;
676 #endif /* NO_RECURSE */
678 /* To save space on the stack and in the heap frame, I have doubled up on some
679 of the local variables that are used only in localised parts of the code, but
680 still need to be preserved over recursive calls of match(). These macros define
681 the alternative names that are used. */
683 #define allow_zero cur_is_word
684 #define cbegroup condition
685 #define code_offset codelink
686 #define condassert condition
687 #define matched_once prev_is_word
689 #define save_mark data
691 /* These statements are here to stop the compiler complaining about uninitialized
696 prop_fail_result = 0;
700 /* This label is used for tail recursion, which is used in a few cases even
701 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
702 used. Thanks to Ian Taylor for noticing this possibility and sending the
707 /* OK, now we can get on with the real code of the function. Recursive calls
708 are specified by the macro RMATCH and RRETURN is used to return. When
709 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
710 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
711 defined). However, RMATCH isn't like a function call because it's quite a
712 complicated macro. It has to be used in one particular way. This shouldn't,
713 however, impact performance when true recursion is being used. */
716 utf = md->utf; /* Local copy of the flag */
721 /* First check that we haven't called match() too many times, or that we
722 haven't exceeded the recursive call limit. */
724 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
725 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
727 /* At the start of a group with an unlimited repeat that may match an empty
728 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
729 done this way to save having to use another function argument, which would take
730 up space on the stack. See also MATCH_CONDASSERT below.
732 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
733 such remembered pointers, to be checked when we hit the closing ket, in order
734 to break infinite loops that match no characters. When match() is called in
735 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
736 NOT be used with tail recursion, because the memory block that is used is on
737 the stack, so a new one may be required for each match(). */
739 if (md->match_function_type == MATCH_CBEGROUP)
741 newptrb.epb_saved_eptr = eptr;
742 newptrb.epb_prev = eptrb;
744 md->match_function_type = 0;
747 /* Now start processing the opcodes. */
751 minimize = possessive = FALSE;
757 md->nomatch_mark = ecode + 2;
758 md->mark = NULL; /* In case previously set by assertion */
759 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
761 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
762 md->mark == NULL) md->mark = ecode + 2;
764 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
765 argument, and we must check whether that argument matches this MARK's
766 argument. It is passed back in md->start_match_ptr (an overloading of that
767 variable). If it does match, we reset that variable to the current subject
768 position and return MATCH_SKIP. Otherwise, pass back the return code
771 else if (rrc == MATCH_SKIP_ARG &&
772 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
774 md->start_match_ptr = eptr;
780 RRETURN(MATCH_NOMATCH);
783 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
786 RRETURN(MATCH_COMMIT);
789 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
791 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
792 RRETURN(MATCH_PRUNE);
795 md->nomatch_mark = ecode + 2;
796 md->mark = NULL; /* In case previously set by assertion */
797 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
799 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
800 md->mark == NULL) md->mark = ecode + 2;
801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 RRETURN(MATCH_PRUNE);
805 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
807 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
808 md->start_match_ptr = eptr; /* Pass back current position */
811 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
812 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
813 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
814 that failed and any that precede it (either they also failed, or were not
815 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
816 SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg
817 set to the count of the one that failed. */
820 md->skip_arg_count++;
821 if (md->skip_arg_count <= md->ignore_skip_arg)
823 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
826 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
828 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
830 /* Pass back the current skip name by overloading md->start_match_ptr and
831 returning the special MATCH_SKIP_ARG return code. This will either be
832 caught by a matching MARK, or get to the top, where it causes a rematch
833 with md->ignore_skip_arg set to the value of md->skip_arg_count. */
835 md->start_match_ptr = ecode + 2;
836 RRETURN(MATCH_SKIP_ARG);
838 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
839 the branch in which it occurs can be determined. Overload the start of
840 match pointer to do this. */
843 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
845 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
846 md->start_match_ptr = ecode;
850 md->nomatch_mark = ecode + 2;
851 md->mark = NULL; /* In case previously set by assertion */
852 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
854 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
855 md->mark == NULL) md->mark = ecode + 2;
856 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
857 md->start_match_ptr = ecode;
860 /* Handle an atomic group that does not contain any capturing parentheses.
861 This can be handled like an assertion. Prior to 8.13, all atomic groups
862 were handled this way. In 8.13, the code was changed as below for ONCE, so
863 that backups pass through the group and thereby reset captured values.
864 However, this uses a lot more stack, so in 8.20, atomic groups that do not
865 contain any captures generate OP_ONCE_NC, which can be handled in the old,
866 less stack intensive way.
868 Check the alternative branches in turn - the matching won't pass the KET
869 for this kind of subpattern. If any one branch matches, we carry on as at
870 the end of a normal bracket, leaving the subject pointer, but resetting
871 the start-of-match value in case it was changed by \K. */
876 save_mark = md->mark;
879 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
880 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
882 mstart = md->start_match_ptr;
885 if (rrc == MATCH_THEN)
887 next = ecode + GET(ecode,1);
888 if (md->start_match_ptr < next &&
889 (*ecode == OP_ALT || *next == OP_ALT))
893 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
894 ecode += GET(ecode,1);
895 md->mark = save_mark;
897 while (*ecode == OP_ALT);
899 /* If hit the end of the group (which could be repeated), fail */
901 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
903 /* Continue as from after the group, updating the offsets high water
904 mark, since extracts may have been taken. */
906 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
908 offset_top = md->end_offset_top;
909 eptr = md->end_match_ptr;
911 /* For a non-repeating ket, just continue at this level. This also
912 happens for a repeating ket if no characters were matched in the group.
913 This is the forcible breaking of infinite loops as implemented in Perl
916 if (*ecode == OP_KET || eptr == saved_eptr)
918 ecode += 1+LINK_SIZE;
922 /* The repeating kets try the rest of the pattern or restart from the
923 preceding bracket, in the appropriate order. The second "call" of match()
924 uses tail recursion, to avoid using another stack frame. */
926 if (*ecode == OP_KETRMIN)
928 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
929 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
933 else /* OP_KETRMAX */
935 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
936 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
937 ecode += 1 + LINK_SIZE;
940 /* Control never gets here */
942 /* Handle a capturing bracket, other than those that are possessive with an
943 unlimited repeat. If there is space in the offset vector, save the current
944 subject position in the working slot at the top of the vector. We mustn't
945 change the current values of the data slot, because they may be set from a
946 previous iteration of this group, and be referred to by a reference inside
947 the group. A failure to match might occur after the group has succeeded,
948 if something later on doesn't match. For this reason, we need to restore
949 the working value and also the values of the final offsets, in case they
950 were set by a previous iteration of the same bracket.
952 If there isn't enough space in the offset vector, treat this as if it were
953 a non-capturing bracket. Don't worry about setting the flag for the error
954 case here; that is handled in the code for KET. */
958 number = GET2(ecode, 1+LINK_SIZE);
959 offset = number << 1;
962 printf("start bracket %d\n", number);
964 pchars(eptr, 16, TRUE, md);
968 if (offset < md->offset_max)
970 save_offset1 = md->offset_vector[offset];
971 save_offset2 = md->offset_vector[offset+1];
972 save_offset3 = md->offset_vector[md->offset_end - number];
973 save_capture_last = md->capture_last;
974 save_mark = md->mark;
976 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
977 md->offset_vector[md->offset_end - number] =
978 (int)(eptr - md->start_subject);
982 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
983 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
985 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
987 /* If we backed up to a THEN, check whether it is within the current
988 branch by comparing the address of the THEN that is passed back with
989 the end of the branch. If it is within the current branch, and the
990 branch is one of two or more alternatives (it either starts or ends
991 with OP_ALT), we have reached the limit of THEN's action, so convert
992 the return code to NOMATCH, which will cause normal backtracking to
993 happen from now on. Otherwise, THEN is passed back to an outer
994 alternative. This implements Perl's treatment of parenthesized groups,
995 where a group not containing | does not affect the current alternative,
996 that is, (X) is NOT the same as (X|(*F)). */
998 if (rrc == MATCH_THEN)
1000 next = ecode + GET(ecode,1);
1001 if (md->start_match_ptr < next &&
1002 (*ecode == OP_ALT || *next == OP_ALT))
1003 rrc = MATCH_NOMATCH;
1006 /* Anything other than NOMATCH is passed back. */
1008 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1009 md->capture_last = save_capture_last;
1010 ecode += GET(ecode, 1);
1011 md->mark = save_mark;
1012 if (*ecode != OP_ALT) break;
1015 DPRINTF(("bracket %d failed\n", number));
1016 md->offset_vector[offset] = save_offset1;
1017 md->offset_vector[offset+1] = save_offset2;
1018 md->offset_vector[md->offset_end - number] = save_offset3;
1020 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1025 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1026 as a non-capturing bracket. */
1028 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1029 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1031 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1033 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1034 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1036 /* Non-capturing or atomic group, except for possessive with unlimited
1037 repeat and ONCE group with no captures. Loop for all the alternatives.
1039 When we get to the final alternative within the brackets, we used to return
1040 the result of a recursive call to match() whatever happened so it was
1041 possible to reduce stack usage by turning this into a tail recursion,
1042 except in the case of a possibly empty group. However, now that there is
1043 the possibility of (*THEN) occurring in the final alternative, this
1044 optimization is no longer always possible.
1046 We can optimize if we know there are no (*THEN)s in the pattern; at present
1047 this is the best that can be done.
1049 MATCH_ONCE is returned when the end of an atomic group is successfully
1050 reached, but subsequent matching fails. It passes back up the tree (causing
1051 captured values to be reset) until the original atomic group level is
1052 reached. This is tested by comparing md->once_target with the start of the
1053 group. At this point, the return is converted into MATCH_NOMATCH so that
1054 previous backup points can be taken. */
1059 DPRINTF(("start non-capturing bracket\n"));
1063 if (op >= OP_SBRA || op == OP_ONCE)
1064 md->match_function_type = MATCH_CBEGROUP;
1066 /* If this is not a possibly empty group, and there are no (*THEN)s in
1067 the pattern, and this is the final alternative, optimize as described
1070 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1072 ecode += PRIV(OP_lengths)[*ecode];
1076 /* In all other cases, we have to make another call to match(). */
1078 save_mark = md->mark;
1079 save_capture_last = md->capture_last;
1080 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1083 /* See comment in the code for capturing groups above about handling
1086 if (rrc == MATCH_THEN)
1088 next = ecode + GET(ecode,1);
1089 if (md->start_match_ptr < next &&
1090 (*ecode == OP_ALT || *next == OP_ALT))
1091 rrc = MATCH_NOMATCH;
1094 if (rrc != MATCH_NOMATCH)
1096 if (rrc == MATCH_ONCE)
1098 const pcre_uchar *scode = ecode;
1099 if (*scode != OP_ONCE) /* If not at start, find it */
1101 while (*scode == OP_ALT) scode += GET(scode, 1);
1102 scode -= GET(scode, 1);
1104 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1108 ecode += GET(ecode, 1);
1109 md->mark = save_mark;
1110 if (*ecode != OP_ALT) break;
1111 md->capture_last = save_capture_last;
1114 RRETURN(MATCH_NOMATCH);
1116 /* Handle possessive capturing brackets with an unlimited repeat. We come
1117 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1118 handled similarly to the normal case above. However, the matching is
1119 different. The end of these brackets will always be OP_KETRPOS, which
1120 returns MATCH_KETRPOS without going further in the pattern. By this means
1121 we can handle the group by iteration rather than recursion, thereby
1122 reducing the amount of stack needed. */
1129 number = GET2(ecode, 1+LINK_SIZE);
1130 offset = number << 1;
1133 printf("start possessive bracket %d\n", number);
1135 pchars(eptr, 16, TRUE, md);
1139 if (offset < md->offset_max)
1141 matched_once = FALSE;
1142 code_offset = (int)(ecode - md->start_code);
1144 save_offset1 = md->offset_vector[offset];
1145 save_offset2 = md->offset_vector[offset+1];
1146 save_offset3 = md->offset_vector[md->offset_end - number];
1147 save_capture_last = md->capture_last;
1149 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1151 /* Each time round the loop, save the current subject position for use
1152 when the group matches. For MATCH_MATCH, the group has matched, so we
1153 restart it with a new subject starting position, remembering that we had
1154 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1155 usual. If we haven't matched any alternatives in any iteration, check to
1156 see if a previous iteration matched. If so, the group has matched;
1157 continue from afterwards. Otherwise it has failed; restore the previous
1158 capture values before returning NOMATCH. */
1162 md->offset_vector[md->offset_end - number] =
1163 (int)(eptr - md->start_subject);
1164 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1165 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1167 if (rrc == MATCH_KETRPOS)
1169 offset_top = md->end_offset_top;
1170 eptr = md->end_match_ptr;
1171 ecode = md->start_code + code_offset;
1172 save_capture_last = md->capture_last;
1173 matched_once = TRUE;
1174 mstart = md->start_match_ptr; /* In case \K changed it */
1178 /* See comment in the code for capturing groups above about handling
1181 if (rrc == MATCH_THEN)
1183 next = ecode + GET(ecode,1);
1184 if (md->start_match_ptr < next &&
1185 (*ecode == OP_ALT || *next == OP_ALT))
1186 rrc = MATCH_NOMATCH;
1189 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1190 md->capture_last = save_capture_last;
1191 ecode += GET(ecode, 1);
1192 if (*ecode != OP_ALT) break;
1197 md->offset_vector[offset] = save_offset1;
1198 md->offset_vector[offset+1] = save_offset2;
1199 md->offset_vector[md->offset_end - number] = save_offset3;
1202 if (allow_zero || matched_once)
1204 ecode += 1 + LINK_SIZE;
1208 RRETURN(MATCH_NOMATCH);
1211 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1212 as a non-capturing bracket. */
1214 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1215 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1217 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1219 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1220 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1222 /* Non-capturing possessive bracket with unlimited repeat. We come here
1223 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1224 without the capturing complication. It is written out separately for speed
1231 POSSESSIVE_NON_CAPTURE:
1232 matched_once = FALSE;
1233 code_offset = (int)(ecode - md->start_code);
1234 save_capture_last = md->capture_last;
1238 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1239 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1241 if (rrc == MATCH_KETRPOS)
1243 offset_top = md->end_offset_top;
1244 eptr = md->end_match_ptr;
1245 ecode = md->start_code + code_offset;
1246 matched_once = TRUE;
1247 mstart = md->start_match_ptr; /* In case \K reset it */
1251 /* See comment in the code for capturing groups above about handling
1254 if (rrc == MATCH_THEN)
1256 next = ecode + GET(ecode,1);
1257 if (md->start_match_ptr < next &&
1258 (*ecode == OP_ALT || *next == OP_ALT))
1259 rrc = MATCH_NOMATCH;
1262 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1263 ecode += GET(ecode, 1);
1264 if (*ecode != OP_ALT) break;
1265 md->capture_last = save_capture_last;
1268 if (matched_once || allow_zero)
1270 ecode += 1 + LINK_SIZE;
1273 RRETURN(MATCH_NOMATCH);
1275 /* Control never reaches here. */
1277 /* Conditional group: compilation checked that there are no more than two
1278 branches. If the condition is false, skipping the first branch takes us
1279 past the end of the item if there is only one branch, but that's exactly
1285 /* The variable codelink will be added to ecode when the condition is
1286 false, to get to the second branch. Setting it to the offset to the ALT
1287 or KET, then incrementing ecode achieves this effect. We now have ecode
1288 pointing to the condition or callout. */
1290 codelink = GET(ecode, 1); /* Offset to the second branch */
1291 ecode += 1 + LINK_SIZE; /* From this opcode */
1293 /* Because of the way auto-callout works during compile, a callout item is
1294 inserted between OP_COND and an assertion condition. */
1296 if (*ecode == OP_CALLOUT)
1298 if (PUBL(callout) != NULL)
1300 PUBL(callout_block) cb;
1301 cb.version = 2; /* Version 1 of the callout block */
1302 cb.callout_number = ecode[1];
1303 cb.offset_vector = md->offset_vector;
1304 #if defined COMPILE_PCRE8
1305 cb.subject = (PCRE_SPTR)md->start_subject;
1306 #elif defined COMPILE_PCRE16
1307 cb.subject = (PCRE_SPTR16)md->start_subject;
1308 #elif defined COMPILE_PCRE32
1309 cb.subject = (PCRE_SPTR32)md->start_subject;
1311 cb.subject_length = (int)(md->end_subject - md->start_subject);
1312 cb.start_match = (int)(mstart - md->start_subject);
1313 cb.current_position = (int)(eptr - md->start_subject);
1314 cb.pattern_position = GET(ecode, 2);
1315 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1316 cb.capture_top = offset_top/2;
1317 cb.capture_last = md->capture_last & CAPLMASK;
1318 /* Internal change requires this for API compatibility. */
1319 if (cb.capture_last == 0) cb.capture_last = -1;
1320 cb.callout_data = md->callout_data;
1321 cb.mark = md->nomatch_mark;
1322 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1323 if (rrc < 0) RRETURN(rrc);
1326 /* Advance ecode past the callout, so it now points to the condition. We
1327 must adjust codelink so that the value of ecode+codelink is unchanged. */
1329 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1330 codelink -= PRIV(OP_lengths)[OP_CALLOUT];
1333 /* Test the various possible conditions */
1336 switch(condcode = *ecode)
1338 case OP_RREF: /* Numbered group recursion test */
1339 if (md->recursive != NULL) /* Not recursing => FALSE */
1341 unsigned int recno = GET2(ecode, 1); /* Recursion group number*/
1342 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1346 case OP_DNRREF: /* Duplicate named group recursion test */
1347 if (md->recursive != NULL)
1349 int count = GET2(ecode, 1 + IMM2_SIZE);
1350 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1353 unsigned int recno = GET2(slot, 0);
1354 condition = recno == md->recursive->group_num;
1355 if (condition) break;
1356 slot += md->name_entry_size;
1361 case OP_CREF: /* Numbered group used test */
1362 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1363 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1366 case OP_DNCREF: /* Duplicate named group used test */
1368 int count = GET2(ecode, 1 + IMM2_SIZE);
1369 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1372 offset = GET2(slot, 0) << 1;
1373 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1374 if (condition) break;
1375 slot += md->name_entry_size;
1380 case OP_DEF: /* DEFINE - always false */
1383 /* The condition is an assertion. Call match() to evaluate it - setting
1384 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end
1388 md->match_function_type = MATCH_CONDASSERT;
1389 RMATCH(eptr, ecode, offset_top, md, NULL, RM3);
1390 if (rrc == MATCH_MATCH)
1392 if (md->end_offset_top > offset_top)
1393 offset_top = md->end_offset_top; /* Captures may have happened */
1396 /* Advance ecode past the assertion to the start of the first branch,
1397 but adjust it so that the general choosing code below works. If the
1398 assertion has a quantifier that allows zero repeats we must skip over
1399 the BRAZERO. This is a lunatic thing to do, but somebody did! */
1401 if (*ecode == OP_BRAZERO) ecode++;
1402 ecode += GET(ecode, 1);
1403 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1404 ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode];
1407 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1408 assertion; it is therefore treated as NOMATCH. Any other return is an
1411 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1413 RRETURN(rrc); /* Need braces because of following else */
1418 /* Choose branch according to the condition */
1420 ecode += condition? PRIV(OP_lengths)[condcode] : codelink;
1422 /* We are now at the branch that is to be obeyed. As there is only one, we
1423 can use tail recursion to avoid using another stack frame, except when
1424 there is unlimited repeat of a possibly empty group. In the latter case, a
1425 recursive call to match() is always required, unless the second alternative
1426 doesn't exist, in which case we can just plough on. Note that, for
1427 compatibility with Perl, the | in a conditional group is NOT treated as
1428 creating two alternatives. If a THEN is encountered in the branch, it
1429 propagates out to the enclosing alternative (unless nested in a deeper set
1430 of alternatives, of course). */
1432 if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT)
1439 md->match_function_type = MATCH_CBEGROUP;
1440 RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
1444 /* Condition false & no alternative; continue after the group. */
1452 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1453 to close any currently open capturing brackets. */
1456 number = GET2(ecode, 1); /* Must be less than 65536 */
1457 offset = number << 1;
1460 printf("end bracket %d at *ACCEPT", number);
1464 md->capture_last = (md->capture_last & OVFLMASK) | number;
1465 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1467 md->offset_vector[offset] =
1468 md->offset_vector[md->offset_end - number];
1469 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1470 if (offset_top <= offset) offset_top = offset + 2;
1472 ecode += 1 + IMM2_SIZE;
1476 /* End of the pattern, either real or forced. */
1480 case OP_ASSERT_ACCEPT:
1482 /* If we have matched an empty string, fail if not in an assertion and not
1483 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1484 is set and we have matched at the start of the subject. In both cases,
1485 backtracking will then try other alternatives, if any. */
1487 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1488 md->recursive == NULL &&
1490 (md->notempty_atstart &&
1491 mstart == md->start_subject + md->start_offset)))
1492 RRETURN(MATCH_NOMATCH);
1494 /* Otherwise, we have a match. */
1496 md->end_match_ptr = eptr; /* Record where we ended */
1497 md->end_offset_top = offset_top; /* and how many extracts were taken */
1498 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1500 /* For some reason, the macros don't work properly if an expression is
1501 given as the argument to RRETURN when the heap is in use. */
1503 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1506 /* Assertion brackets. Check the alternative branches in turn - the
1507 matching won't pass the KET for an assertion. If any one branch matches,
1508 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1509 start of each branch to move the current point backwards, so the code at
1510 this level is identical to the lookahead case. When the assertion is part
1511 of a condition, we want to return immediately afterwards. The caller of
1512 this incarnation of the match() function will have set MATCH_CONDASSERT in
1513 md->match_function type, and one of these opcodes will be the first opcode
1514 that is processed. We use a local variable that is preserved over calls to
1515 match() to remember this case. */
1519 save_mark = md->mark;
1520 if (md->match_function_type == MATCH_CONDASSERT)
1523 md->match_function_type = 0;
1525 else condassert = FALSE;
1527 /* Loop for each branch */
1531 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1533 /* A match means that the assertion is true; break out of the loop
1534 that matches its alternatives. */
1536 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1538 mstart = md->start_match_ptr; /* In case \K reset it */
1542 /* If not matched, restore the previous mark setting. */
1544 md->mark = save_mark;
1546 /* See comment in the code for capturing groups above about handling
1549 if (rrc == MATCH_THEN)
1551 next = ecode + GET(ecode,1);
1552 if (md->start_match_ptr < next &&
1553 (*ecode == OP_ALT || *next == OP_ALT))
1554 rrc = MATCH_NOMATCH;
1557 /* Anything other than NOMATCH causes the entire assertion to fail,
1558 passing back the return code. This includes COMMIT, SKIP, PRUNE and an
1559 uncaptured THEN, which means they take their normal effect. This
1560 consistent approach does not always have exactly the same effect as in
1563 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1564 ecode += GET(ecode, 1);
1566 while (*ecode == OP_ALT); /* Continue for next alternative */
1568 /* If we have tried all the alternative branches, the assertion has
1569 failed. If not, we broke out after a match. */
1571 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1573 /* If checking an assertion for a condition, return MATCH_MATCH. */
1575 if (condassert) RRETURN(MATCH_MATCH);
1577 /* Continue from after a successful assertion, updating the offsets high
1578 water mark, since extracts may have been taken during the assertion. */
1580 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1581 ecode += 1 + LINK_SIZE;
1582 offset_top = md->end_offset_top;
1585 /* Negative assertion: all branches must fail to match for the assertion to
1589 case OP_ASSERTBACK_NOT:
1590 save_mark = md->mark;
1591 if (md->match_function_type == MATCH_CONDASSERT)
1594 md->match_function_type = 0;
1596 else condassert = FALSE;
1598 /* Loop for each alternative branch. */
1602 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1603 md->mark = save_mark; /* Always restore the mark setting */
1607 case MATCH_MATCH: /* A successful match means */
1608 case MATCH_ACCEPT: /* the assertion has failed. */
1609 RRETURN(MATCH_NOMATCH);
1611 case MATCH_NOMATCH: /* Carry on with next branch */
1614 /* See comment in the code for capturing groups above about handling
1618 next = ecode + GET(ecode,1);
1619 if (md->start_match_ptr < next &&
1620 (*ecode == OP_ALT || *next == OP_ALT))
1622 rrc = MATCH_NOMATCH;
1625 /* Otherwise fall through. */
1627 /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole
1628 assertion to fail to match, without considering any more alternatives.
1629 Failing to match means the assertion is true. This is a consistent
1630 approach, but does not always have the same effect as in Perl. */
1634 case MATCH_SKIP_ARG:
1636 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1637 goto NEG_ASSERT_TRUE; /* Break out of alternation loop */
1639 /* Anything else is an error */
1645 /* Continue with next branch */
1647 ecode += GET(ecode,1);
1649 while (*ecode == OP_ALT);
1651 /* All branches in the assertion failed to match. */
1654 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1655 ecode += 1 + LINK_SIZE; /* Continue with current branch */
1658 /* Move the subject pointer back. This occurs only at the start of
1659 each branch of a lookbehind assertion. If we are too close to the start to
1660 move back, this match function fails. When working with UTF-8 we move
1661 back a number of characters, not bytes. */
1671 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1678 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1681 eptr -= GET(ecode, 1);
1682 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1685 /* Save the earliest consulted character, then skip to next op code */
1687 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1688 ecode += 1 + LINK_SIZE;
1691 /* The callout item calls an external function, if one is provided, passing
1692 details of the match so far. This is mainly for debugging, though the
1693 function is able to force a failure. */
1696 if (PUBL(callout) != NULL)
1698 PUBL(callout_block) cb;
1699 cb.version = 2; /* Version 1 of the callout block */
1700 cb.callout_number = ecode[1];
1701 cb.offset_vector = md->offset_vector;
1702 #if defined COMPILE_PCRE8
1703 cb.subject = (PCRE_SPTR)md->start_subject;
1704 #elif defined COMPILE_PCRE16
1705 cb.subject = (PCRE_SPTR16)md->start_subject;
1706 #elif defined COMPILE_PCRE32
1707 cb.subject = (PCRE_SPTR32)md->start_subject;
1709 cb.subject_length = (int)(md->end_subject - md->start_subject);
1710 cb.start_match = (int)(mstart - md->start_subject);
1711 cb.current_position = (int)(eptr - md->start_subject);
1712 cb.pattern_position = GET(ecode, 2);
1713 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1714 cb.capture_top = offset_top/2;
1715 cb.capture_last = md->capture_last & CAPLMASK;
1716 /* Internal change requires this for API compatibility. */
1717 if (cb.capture_last == 0) cb.capture_last = -1;
1718 cb.callout_data = md->callout_data;
1719 cb.mark = md->nomatch_mark;
1720 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1721 if (rrc < 0) RRETURN(rrc);
1723 ecode += 2 + 2*LINK_SIZE;
1726 /* Recursion either matches the current regex, or some subexpression. The
1727 offset data is the offset to the starting bracket from the start of the
1728 whole pattern. (This is so that it works from duplicated subpatterns.)
1730 The state of the capturing groups is preserved over recursion, and
1731 re-instated afterwards. We don't know how many are started and not yet
1732 finished (offset_top records the completed total) so we just have to save
1733 all the potential data. There may be up to 65535 such values, which is too
1734 large to put on the stack, but using malloc for small numbers seems
1735 expensive. As a compromise, the stack is used when there are no more than
1736 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1738 There are also other values that have to be saved. We use a chained
1739 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1740 for the original version of this logic. It has, however, been hacked around
1741 a lot, so he is not to blame for the current way it works. */
1748 callpat = md->start_code + GET(ecode, 1);
1749 recno = (callpat == md->start_code)? 0 :
1750 GET2(callpat, 1 + LINK_SIZE);
1752 /* Check for repeating a recursion without advancing the subject pointer.
1753 This should catch convoluted mutual recursions. (Some simple cases are
1754 caught at compile time.) */
1756 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1757 if (recno == ri->group_num && eptr == ri->subject_position)
1758 RRETURN(PCRE_ERROR_RECURSELOOP);
1760 /* Add to "recursing stack" */
1762 new_recursive.group_num = recno;
1763 new_recursive.saved_capture_last = md->capture_last;
1764 new_recursive.subject_position = eptr;
1765 new_recursive.prevrec = md->recursive;
1766 md->recursive = &new_recursive;
1768 /* Where to continue from afterwards */
1770 ecode += 1 + LINK_SIZE;
1772 /* Now save the offset data */
1774 new_recursive.saved_max = md->offset_end;
1775 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1776 new_recursive.offset_save = stacksave;
1779 new_recursive.offset_save =
1780 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1781 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1783 memcpy(new_recursive.offset_save, md->offset_vector,
1784 new_recursive.saved_max * sizeof(int));
1786 /* OK, now we can do the recursion. After processing each alternative,
1787 restore the offset data and the last captured value. If there were nested
1788 recursions, md->recursive might be changed, so reset it before looping.
1791 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1792 cbegroup = (*callpat >= OP_SBRA);
1795 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1796 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1798 memcpy(md->offset_vector, new_recursive.offset_save,
1799 new_recursive.saved_max * sizeof(int));
1800 md->capture_last = new_recursive.saved_capture_last;
1801 md->recursive = new_recursive.prevrec;
1802 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1804 DPRINTF(("Recursion matched\n"));
1805 if (new_recursive.offset_save != stacksave)
1806 (PUBL(free))(new_recursive.offset_save);
1808 /* Set where we got to in the subject, and reset the start in case
1809 it was changed by \K. This *is* propagated back out of a recursion,
1810 for Perl compatibility. */
1812 eptr = md->end_match_ptr;
1813 mstart = md->start_match_ptr;
1814 goto RECURSION_MATCHED; /* Exit loop; end processing */
1817 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
1818 recursion; they cause a NOMATCH for the entire recursion. These codes
1819 are defined in a range that can be tested for. */
1821 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
1822 RRETURN(MATCH_NOMATCH);
1824 /* Any return code other than NOMATCH is an error. */
1826 if (rrc != MATCH_NOMATCH)
1828 DPRINTF(("Recursion gave error %d\n", rrc));
1829 if (new_recursive.offset_save != stacksave)
1830 (PUBL(free))(new_recursive.offset_save);
1834 md->recursive = &new_recursive;
1835 callpat += GET(callpat, 1);
1837 while (*callpat == OP_ALT);
1839 DPRINTF(("Recursion didn't match\n"));
1840 md->recursive = new_recursive.prevrec;
1841 if (new_recursive.offset_save != stacksave)
1842 (PUBL(free))(new_recursive.offset_save);
1843 RRETURN(MATCH_NOMATCH);
1849 /* An alternation is the end of a branch; scan along to find the end of the
1850 bracketed group and go to there. */
1853 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1856 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1857 indicating that it may occur zero times. It may repeat infinitely, or not
1858 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1859 with fixed upper repeat limits are compiled as a number of copies, with the
1860 optional ones preceded by BRAZERO or BRAMINZERO. */
1864 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1865 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1866 do next += GET(next, 1); while (*next == OP_ALT);
1867 ecode = next + 1 + LINK_SIZE;
1872 do next += GET(next, 1); while (*next == OP_ALT);
1873 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1874 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1880 do next += GET(next,1); while (*next == OP_ALT);
1881 ecode = next + 1 + LINK_SIZE;
1884 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1885 here; just jump to the group, with allow_zero set TRUE. */
1890 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1891 goto POSSESSIVE_NON_CAPTURE;
1893 /* End of a group, repeated or non-repeating. */
1899 prev = ecode - GET(ecode, 1);
1901 /* If this was a group that remembered the subject start, in order to break
1902 infinite repeats of empty string matches, retrieve the subject start from
1903 the chain. Otherwise, set it NULL. */
1905 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1907 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1908 eptrb = eptrb->epb_prev; /* Backup to previous group */
1910 else saved_eptr = NULL;
1912 /* If we are at the end of an assertion group or a non-capturing atomic
1913 group, stop matching and return MATCH_MATCH, but record the current high
1914 water mark for use by positive assertions. We also need to record the match
1915 start in case it was changed by \K. */
1917 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1918 *prev == OP_ONCE_NC)
1920 md->end_match_ptr = eptr; /* For ONCE_NC */
1921 md->end_offset_top = offset_top;
1922 md->start_match_ptr = mstart;
1923 RRETURN(MATCH_MATCH); /* Sets md->mark */
1926 /* For capturing groups we have to check the group number back at the start
1927 and if necessary complete handling an extraction by setting the offsets and
1928 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1929 into group 0, so it won't be picked up here. Instead, we catch it when the
1930 OP_END is reached. Other recursion is handled here. We just have to record
1931 the current subject position and start match pointer and give a MATCH
1934 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1935 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1937 number = GET2(prev, 1+LINK_SIZE);
1938 offset = number << 1;
1941 printf("end bracket %d", number);
1945 /* Handle a recursively called group. */
1947 if (md->recursive != NULL && md->recursive->group_num == number)
1949 md->end_match_ptr = eptr;
1950 md->start_match_ptr = mstart;
1951 RRETURN(MATCH_MATCH);
1954 /* Deal with capturing */
1956 md->capture_last = (md->capture_last & OVFLMASK) | number;
1957 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1959 /* If offset is greater than offset_top, it means that we are
1960 "skipping" a capturing group, and that group's offsets must be marked
1961 unset. In earlier versions of PCRE, all the offsets were unset at the
1962 start of matching, but this doesn't work because atomic groups and
1963 assertions can cause a value to be set that should later be unset.
1964 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1965 part of the atomic group, but this is not on the final matching path,
1966 so must be unset when 2 is set. (If there is no group 2, there is no
1967 problem, because offset_top will then be 2, indicating no capture.) */
1969 if (offset > offset_top)
1971 register int *iptr = md->offset_vector + offset_top;
1972 register int *iend = md->offset_vector + offset;
1973 while (iptr < iend) *iptr++ = -1;
1976 /* Now make the extraction */
1978 md->offset_vector[offset] =
1979 md->offset_vector[md->offset_end - number];
1980 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1981 if (offset_top <= offset) offset_top = offset + 2;
1985 /* For an ordinary non-repeating ket, just continue at this level. This
1986 also happens for a repeating ket if no characters were matched in the
1987 group. This is the forcible breaking of infinite loops as implemented in
1988 Perl 5.005. For a non-repeating atomic group that includes captures,
1989 establish a backup point by processing the rest of the pattern at a lower
1990 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1991 original OP_ONCE level, thereby bypassing intermediate backup points, but
1992 resetting any captures that happened along the way. */
1994 if (*ecode == OP_KET || eptr == saved_eptr)
1996 if (*prev == OP_ONCE)
1998 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1999 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2000 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2001 RRETURN(MATCH_ONCE);
2003 ecode += 1 + LINK_SIZE; /* Carry on at this level */
2007 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2008 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2009 at a time from the outer level, thus saving stack. */
2011 if (*ecode == OP_KETRPOS)
2013 md->start_match_ptr = mstart; /* In case \K reset it */
2014 md->end_match_ptr = eptr;
2015 md->end_offset_top = offset_top;
2016 RRETURN(MATCH_KETRPOS);
2019 /* The normal repeating kets try the rest of the pattern or restart from
2020 the preceding bracket, in the appropriate order. In the second case, we can
2021 use tail recursion to avoid using another stack frame, unless we have an
2022 atomic group or an unlimited repeat of a group that can match an empty
2025 if (*ecode == OP_KETRMIN)
2027 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2028 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2029 if (*prev == OP_ONCE)
2031 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2032 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2033 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2034 RRETURN(MATCH_ONCE);
2036 if (*prev >= OP_SBRA) /* Could match an empty string */
2038 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2044 else /* OP_KETRMAX */
2046 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2047 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2048 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2049 if (*prev == OP_ONCE)
2051 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2052 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2053 md->once_target = prev;
2054 RRETURN(MATCH_ONCE);
2056 ecode += 1 + LINK_SIZE;
2059 /* Control never gets here */
2061 /* Not multiline mode: start of subject assertion, unless notbol. */
2064 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2066 /* Start of subject assertion */
2069 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2073 /* Multiline mode: start of subject unless notbol, or after any newline. */
2076 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2077 if (eptr != md->start_subject &&
2078 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2079 RRETURN(MATCH_NOMATCH);
2083 /* Start of match assertion */
2086 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2090 /* Reset the start of match point */
2097 /* Multiline mode: assert before any newline, or before end of subject
2098 unless noteol is set. */
2101 if (eptr < md->end_subject)
2103 if (!IS_NEWLINE(eptr))
2105 if (md->partial != 0 &&
2106 eptr + 1 >= md->end_subject &&
2107 NLBLOCK->nltype == NLTYPE_FIXED &&
2108 NLBLOCK->nllen == 2 &&
2109 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2112 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2114 RRETURN(MATCH_NOMATCH);
2119 if (md->noteol) RRETURN(MATCH_NOMATCH);
2125 /* Not multiline mode: assert before a terminating newline or before end of
2126 subject unless noteol is set. */
2129 if (md->noteol) RRETURN(MATCH_NOMATCH);
2130 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2132 /* ... else fall through for endonly */
2134 /* End of subject assertion (\z) */
2137 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2142 /* End of subject or ending \n assertion (\Z) */
2146 if (eptr < md->end_subject &&
2147 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2149 if (md->partial != 0 &&
2150 eptr + 1 >= md->end_subject &&
2151 NLBLOCK->nltype == NLTYPE_FIXED &&
2152 NLBLOCK->nllen == 2 &&
2153 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2156 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2158 RRETURN(MATCH_NOMATCH);
2161 /* Either at end of string or \n before end. */
2167 /* Word boundary assertions */
2169 case OP_NOT_WORD_BOUNDARY:
2170 case OP_WORD_BOUNDARY:
2173 /* Find out if the previous and current characters are "word" characters.
2174 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2175 be "non-word" characters. Remember the earliest consulted character for
2176 partial matching. */
2181 /* Get status of previous character */
2183 if (eptr == md->start_subject) prev_is_word = FALSE; else
2185 PCRE_PUCHAR lastptr = eptr - 1;
2187 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2188 GETCHAR(c, lastptr);
2192 if (c == '_') prev_is_word = TRUE; else
2194 int cat = UCD_CATEGORY(c);
2195 prev_is_word = (cat == ucp_L || cat == ucp_N);
2200 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2203 /* Get status of next character */
2205 if (eptr >= md->end_subject)
2208 cur_is_word = FALSE;
2216 if (c == '_') cur_is_word = TRUE; else
2218 int cat = UCD_CATEGORY(c);
2219 cur_is_word = (cat == ucp_L || cat == ucp_N);
2224 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2230 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2231 consistency with the behaviour of \w we do use it in this case. */
2234 /* Get status of previous character */
2236 if (eptr == md->start_subject) prev_is_word = FALSE; else
2238 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2243 if (c == '_') prev_is_word = TRUE; else
2245 int cat = UCD_CATEGORY(c);
2246 prev_is_word = (cat == ucp_L || cat == ucp_N);
2251 prev_is_word = MAX_255(eptr[-1])
2252 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2255 /* Get status of next character */
2257 if (eptr >= md->end_subject)
2260 cur_is_word = FALSE;
2267 if (c == '_') cur_is_word = TRUE; else
2269 int cat = UCD_CATEGORY(c);
2270 cur_is_word = (cat == ucp_L || cat == ucp_N);
2275 cur_is_word = MAX_255(*eptr)
2276 && ((md->ctypes[*eptr] & ctype_word) != 0);
2279 /* Now see if the situation is what we want */
2281 if ((*ecode++ == OP_WORD_BOUNDARY)?
2282 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2283 RRETURN(MATCH_NOMATCH);
2287 /* Match any single character type except newline; have to take care with
2288 CRLF newlines and partial matching. */
2291 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2292 if (md->partial != 0 &&
2293 eptr + 1 >= md->end_subject &&
2294 NLBLOCK->nltype == NLTYPE_FIXED &&
2295 NLBLOCK->nllen == 2 &&
2296 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2299 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2304 /* Match any single character whatsoever. */
2307 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2308 { /* not be updated before SCHECK_PARTIAL. */
2310 RRETURN(MATCH_NOMATCH);
2314 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2319 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2320 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2323 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2324 { /* not be updated before SCHECK_PARTIAL. */
2326 RRETURN(MATCH_NOMATCH);
2333 if (eptr >= md->end_subject)
2336 RRETURN(MATCH_NOMATCH);
2338 GETCHARINCTEST(c, eptr);
2340 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2343 (md->ctypes[c] & ctype_digit) != 0
2345 RRETURN(MATCH_NOMATCH);
2350 if (eptr >= md->end_subject)
2353 RRETURN(MATCH_NOMATCH);
2355 GETCHARINCTEST(c, eptr);
2357 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2360 (md->ctypes[c] & ctype_digit) == 0
2362 RRETURN(MATCH_NOMATCH);
2366 case OP_NOT_WHITESPACE:
2367 if (eptr >= md->end_subject)
2370 RRETURN(MATCH_NOMATCH);
2372 GETCHARINCTEST(c, eptr);
2374 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2377 (md->ctypes[c] & ctype_space) != 0
2379 RRETURN(MATCH_NOMATCH);
2384 if (eptr >= md->end_subject)
2387 RRETURN(MATCH_NOMATCH);
2389 GETCHARINCTEST(c, eptr);
2391 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2394 (md->ctypes[c] & ctype_space) == 0
2396 RRETURN(MATCH_NOMATCH);
2400 case OP_NOT_WORDCHAR:
2401 if (eptr >= md->end_subject)
2404 RRETURN(MATCH_NOMATCH);
2406 GETCHARINCTEST(c, eptr);
2408 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2411 (md->ctypes[c] & ctype_word) != 0
2413 RRETURN(MATCH_NOMATCH);
2418 if (eptr >= md->end_subject)
2421 RRETURN(MATCH_NOMATCH);
2423 GETCHARINCTEST(c, eptr);
2425 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2428 (md->ctypes[c] & ctype_word) == 0
2430 RRETURN(MATCH_NOMATCH);
2435 if (eptr >= md->end_subject)
2438 RRETURN(MATCH_NOMATCH);
2440 GETCHARINCTEST(c, eptr);
2443 default: RRETURN(MATCH_NOMATCH);
2446 if (eptr >= md->end_subject)
2450 else if (UCHAR21TEST(eptr) == CHAR_LF) eptr++;
2462 #endif /* Not EBCDIC */
2463 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2470 if (eptr >= md->end_subject)
2473 RRETURN(MATCH_NOMATCH);
2475 GETCHARINCTEST(c, eptr);
2478 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2485 if (eptr >= md->end_subject)
2488 RRETURN(MATCH_NOMATCH);
2490 GETCHARINCTEST(c, eptr);
2493 HSPACE_CASES: break; /* Byte and multibyte cases */
2494 default: RRETURN(MATCH_NOMATCH);
2500 if (eptr >= md->end_subject)
2503 RRETURN(MATCH_NOMATCH);
2505 GETCHARINCTEST(c, eptr);
2508 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2515 if (eptr >= md->end_subject)
2518 RRETURN(MATCH_NOMATCH);
2520 GETCHARINCTEST(c, eptr);
2523 VSPACE_CASES: break;
2524 default: RRETURN(MATCH_NOMATCH);
2530 /* Check the next character by Unicode property. We will get here only
2531 if the support is in the binary; otherwise a compile-time error occurs. */
2535 if (eptr >= md->end_subject)
2538 RRETURN(MATCH_NOMATCH);
2540 GETCHARINCTEST(c, eptr);
2542 const pcre_uint32 *cp;
2543 const ucd_record *prop = GET_UCD(c);
2548 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2552 if ((prop->chartype == ucp_Lu ||
2553 prop->chartype == ucp_Ll ||
2554 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2555 RRETURN(MATCH_NOMATCH);
2559 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2560 RRETURN(MATCH_NOMATCH);
2564 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2565 RRETURN(MATCH_NOMATCH);
2569 if ((ecode[2] != prop->script) == (op == OP_PROP))
2570 RRETURN(MATCH_NOMATCH);
2573 /* These are specials */
2576 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2577 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2578 RRETURN(MATCH_NOMATCH);
2581 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2582 which means that Perl space and POSIX space are now identical. PCRE
2583 was changed at release 8.34. */
2585 case PT_SPACE: /* Perl space */
2586 case PT_PXSPACE: /* POSIX space */
2591 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2595 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
2596 (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
2602 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2603 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2604 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2605 RRETURN(MATCH_NOMATCH);
2609 cp = PRIV(ucd_caseless_sets) + ecode[2];
2613 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2615 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2620 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2621 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2622 c >= 0xe000) == (op == OP_NOTPROP))
2623 RRETURN(MATCH_NOMATCH);
2626 /* This should never occur */
2629 RRETURN(PCRE_ERROR_INTERNAL);
2636 /* Match an extended Unicode sequence. We will get here only if the support
2637 is in the binary; otherwise a compile-time error occurs. */
2640 if (eptr >= md->end_subject)
2643 RRETURN(MATCH_NOMATCH);
2648 GETCHARINCTEST(c, eptr);
2649 lgb = UCD_GRAPHBREAK(c);
2650 while (eptr < md->end_subject)
2653 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2654 rgb = UCD_GRAPHBREAK(c);
2655 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2663 #endif /* SUPPORT_UCP */
2666 /* Match a back reference, possibly repeatedly. Look past the end of the
2667 item to see if there is repeat information following. The code is similar
2668 to that for character classes, but repeated for efficiency. Then obey
2669 similar code to character type repeats - written out again for speed.
2670 However, if the referenced string is the empty string, always treat
2671 it as matched, any number of times (otherwise there could be infinite
2672 loops). If the reference is unset, there are two possibilities:
2674 (a) In the default, Perl-compatible state, set the length negative;
2675 this ensures that every attempt at a match fails. We can't just fail
2676 here, because of the possibility of quantifiers with zero minima.
2678 (b) If the JavaScript compatibility flag is set, set the length to zero
2679 so that the back reference matches an empty string.
2681 Otherwise, set the length to the length of what was matched by the
2682 referenced subpattern.
2684 The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
2685 or to a non-duplicated named group. For a duplicated named group, OP_DNREF
2686 and OP_DNREFI are used. In this case we must scan the list of groups to
2687 which the name refers, and use the first one that is set. */
2691 caseless = op == OP_DNREFI;
2693 int count = GET2(ecode, 1+IMM2_SIZE);
2694 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
2695 ecode += 1 + 2*IMM2_SIZE;
2697 /* Setting the default length first and initializing 'offset' avoids
2698 compiler warnings in the REF_REPEAT code. */
2700 length = (md->jscript_compat)? 0 : -1;
2705 offset = GET2(slot, 0) << 1;
2706 if (offset < offset_top && md->offset_vector[offset] >= 0)
2708 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2711 slot += md->name_entry_size;
2718 caseless = op == OP_REFI;
2719 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2720 ecode += 1 + IMM2_SIZE;
2721 if (offset >= offset_top || md->offset_vector[offset] < 0)
2722 length = (md->jscript_compat)? 0 : -1;
2724 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2726 /* Set up for repetition, or handle the non-repeated case */
2737 c = *ecode++ - OP_CRSTAR;
2738 minimize = (c & 1) != 0;
2739 min = rep_min[c]; /* Pick up values from tables; */
2740 max = rep_max[c]; /* zero for max => infinity */
2741 if (max == 0) max = INT_MAX;
2746 minimize = (*ecode == OP_CRMINRANGE);
2747 min = GET2(ecode, 1);
2748 max = GET2(ecode, 1 + IMM2_SIZE);
2749 if (max == 0) max = INT_MAX;
2750 ecode += 1 + 2 * IMM2_SIZE;
2753 default: /* No repeat follows */
2754 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2756 if (length == -2) eptr = md->end_subject; /* Partial match */
2758 RRETURN(MATCH_NOMATCH);
2761 continue; /* With the main loop */
2764 /* Handle repeated back references. If the length of the reference is
2765 zero, just continue with the main loop. If the length is negative, it
2766 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2767 zero, we can continue at the same level without recursion. For any other
2768 minimum, carrying on will result in NOMATCH. */
2770 if (length == 0) continue;
2771 if (length < 0 && min == 0) continue;
2773 /* First, ensure the minimum number of matches are present. We get back
2774 the length of the reference string explicitly rather than passing the
2775 address of eptr, so that eptr can be a register variable. */
2777 for (i = 1; i <= min; i++)
2780 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2782 if (slength == -2) eptr = md->end_subject; /* Partial match */
2784 RRETURN(MATCH_NOMATCH);
2789 /* If min = max, continue at the same level without recursion.
2790 They are not both allowed to be zero. */
2792 if (min == max) continue;
2794 /* If minimizing, keep trying and advancing the pointer */
2798 for (fi = min;; fi++)
2801 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2802 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2803 if (fi >= max) RRETURN(MATCH_NOMATCH);
2804 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2806 if (slength == -2) eptr = md->end_subject; /* Partial match */
2808 RRETURN(MATCH_NOMATCH);
2812 /* Control never gets here */
2815 /* If maximizing, find the longest string and work backwards */
2820 for (i = min; i < max; i++)
2823 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2825 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2826 the soft partial matching case. */
2828 if (slength == -2 && md->partial != 0 &&
2829 md->end_subject > md->start_used_ptr)
2832 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2841 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2842 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2845 RRETURN(MATCH_NOMATCH);
2847 /* Control never gets here */
2849 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2850 used when all the characters in the class have values in the range 0-255,
2851 and either the matching is caseful, or the characters are in the range
2852 0-127 when UTF-8 processing is enabled. The only difference between
2853 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2856 First, look past the end of the item to see if there is repeat information
2857 following. Then obey similar code to character type repeats - written out
2863 /* The data variable is saved across frames, so the byte map needs to
2865 #define BYTE_MAP ((pcre_uint8 *)data)
2866 data = ecode + 1; /* Save for matching */
2867 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2880 c = *ecode++ - OP_CRSTAR;
2881 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
2882 else possessive = TRUE;
2883 min = rep_min[c]; /* Pick up values from tables; */
2884 max = rep_max[c]; /* zero for max => infinity */
2885 if (max == 0) max = INT_MAX;
2891 minimize = (*ecode == OP_CRMINRANGE);
2892 possessive = (*ecode == OP_CRPOSRANGE);
2893 min = GET2(ecode, 1);
2894 max = GET2(ecode, 1 + IMM2_SIZE);
2895 if (max == 0) max = INT_MAX;
2896 ecode += 1 + 2 * IMM2_SIZE;
2899 default: /* No repeat follows */
2904 /* First, ensure the minimum number of matches are present. */
2909 for (i = 1; i <= min; i++)
2911 if (eptr >= md->end_subject)
2914 RRETURN(MATCH_NOMATCH);
2916 GETCHARINC(c, eptr);
2919 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2922 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2929 for (i = 1; i <= min; i++)
2931 if (eptr >= md->end_subject)
2934 RRETURN(MATCH_NOMATCH);
2937 #ifndef COMPILE_PCRE8
2940 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2944 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2948 /* If max == min we can continue with the main loop without the
2951 if (min == max) continue;
2953 /* If minimizing, keep testing the rest of the expression and advancing
2954 the pointer while it matches the class. */
2961 for (fi = min;; fi++)
2963 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2964 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2965 if (fi >= max) RRETURN(MATCH_NOMATCH);
2966 if (eptr >= md->end_subject)
2969 RRETURN(MATCH_NOMATCH);
2971 GETCHARINC(c, eptr);
2974 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2977 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2984 for (fi = min;; fi++)
2986 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2987 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2988 if (fi >= max) RRETURN(MATCH_NOMATCH);
2989 if (eptr >= md->end_subject)
2992 RRETURN(MATCH_NOMATCH);
2995 #ifndef COMPILE_PCRE8
2998 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3002 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3005 /* Control never gets here */
3008 /* If maximizing, find the longest possible run, then work backwards. */
3017 for (i = min; i < max; i++)
3020 if (eptr >= md->end_subject)
3025 GETCHARLEN(c, eptr, len);
3028 if (op == OP_CLASS) break;
3031 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3035 if (possessive) continue; /* No backtracking */
3039 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3040 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3041 if (eptr-- == pp) break; /* Stop if tried at original pos */
3049 for (i = min; i < max; i++)
3051 if (eptr >= md->end_subject)
3057 #ifndef COMPILE_PCRE8
3060 if (op == OP_CLASS) break;
3064 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3068 if (possessive) continue; /* No backtracking */
3072 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3073 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3078 RRETURN(MATCH_NOMATCH);
3082 /* Control never gets here */
3085 /* Match an extended character class. In the 8-bit library, this opcode is
3086 encountered only when UTF-8 mode is supported. In the 16-bit and
3087 32-bit libraries, codepoints greater than 255 may be encountered even when
3088 UTF is not supported. */
3090 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3093 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3094 ecode += GET(ecode, 1); /* Advance past the item */
3107 c = *ecode++ - OP_CRSTAR;
3108 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
3109 else possessive = TRUE;
3110 min = rep_min[c]; /* Pick up values from tables; */
3111 max = rep_max[c]; /* zero for max => infinity */
3112 if (max == 0) max = INT_MAX;
3118 minimize = (*ecode == OP_CRMINRANGE);
3119 possessive = (*ecode == OP_CRPOSRANGE);
3120 min = GET2(ecode, 1);
3121 max = GET2(ecode, 1 + IMM2_SIZE);
3122 if (max == 0) max = INT_MAX;
3123 ecode += 1 + 2 * IMM2_SIZE;
3126 default: /* No repeat follows */
3131 /* First, ensure the minimum number of matches are present. */
3133 for (i = 1; i <= min; i++)
3135 if (eptr >= md->end_subject)
3138 RRETURN(MATCH_NOMATCH);
3140 GETCHARINCTEST(c, eptr);
3141 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3144 /* If max == min we can continue with the main loop without the
3147 if (min == max) continue;
3149 /* If minimizing, keep testing the rest of the expression and advancing
3150 the pointer while it matches the class. */
3154 for (fi = min;; fi++)
3156 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3157 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3158 if (fi >= max) RRETURN(MATCH_NOMATCH);
3159 if (eptr >= md->end_subject)
3162 RRETURN(MATCH_NOMATCH);
3164 GETCHARINCTEST(c, eptr);
3165 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3167 /* Control never gets here */
3170 /* If maximizing, find the longest possible run, then work backwards. */
3175 for (i = min; i < max; i++)
3178 if (eptr >= md->end_subject)
3184 GETCHARLENTEST(c, eptr, len);
3188 if (!PRIV(xclass)(c, data, utf)) break;
3192 if (possessive) continue; /* No backtracking */
3196 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3197 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3198 if (eptr-- == pp) break; /* Stop if tried at original pos */
3200 if (utf) BACKCHAR(eptr);
3203 RRETURN(MATCH_NOMATCH);
3206 /* Control never gets here */
3208 #endif /* End of XCLASS */
3210 /* Match a single character, casefully */
3218 GETCHARLEN(fc, ecode, length);
3219 if (length > md->end_subject - eptr)
3221 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3222 RRETURN(MATCH_NOMATCH);
3224 while (length-- > 0) if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH);
3230 if (md->end_subject - eptr < 1)
3232 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3233 RRETURN(MATCH_NOMATCH);
3235 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3240 /* Match a single character, caselessly. If we are at the end of the
3241 subject, give up immediately. */
3244 if (eptr >= md->end_subject)
3247 RRETURN(MATCH_NOMATCH);
3255 GETCHARLEN(fc, ecode, length);
3257 /* If the pattern character's value is < 128, we have only one byte, and
3258 we know that its other case must also be one byte long, so we can use the
3259 fast lookup table. We know that there is at least one byte left in the
3264 pcre_uint32 cc = UCHAR21(eptr);
3265 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3270 /* Otherwise we must pick up the subject character. Note that we cannot
3271 use the value of "length" to check for sufficient bytes left, because the
3272 other case of the character may have more or fewer bytes. */
3277 GETCHARINC(dc, eptr);
3280 /* If we have Unicode property support, we can use it to test the other
3281 case of the character, if there is one. */
3286 if (dc != UCD_OTHERCASE(fc))
3288 RRETURN(MATCH_NOMATCH);
3293 #endif /* SUPPORT_UTF */
3297 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3298 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3304 /* Match a single character repeatedly. */
3308 min = max = GET2(ecode, 1);
3309 ecode += 1 + IMM2_SIZE;
3322 max = GET2(ecode, 1);
3323 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3324 ecode += 1 + IMM2_SIZE;
3363 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3364 minimize = (c & 1) != 0;
3365 min = rep_min[c]; /* Pick up values from tables; */
3366 max = rep_max[c]; /* zero for max => infinity */
3367 if (max == 0) max = INT_MAX;
3369 /* Common code for all repeated single-character matches. We first check
3370 for the minimum number of characters. If the minimum equals the maximum, we
3371 are done. Otherwise, if minimizing, check the rest of the pattern for a
3372 match; if there isn't one, advance up to the maximum, one character at a
3375 If maximizing, advance up to the maximum number of matching characters,
3376 until eptr is past the end of the maximum run. If possessive, we are
3377 then done (no backing up). Otherwise, match at this position; anything
3378 other than no match is immediately returned. For nomatch, back up one
3379 character, unless we are matching \R and the last thing matched was
3380 \r\n, in which case, back up two bytes. When we reach the first optional
3381 character position, we can save stack by doing a tail recurse.
3383 The various UTF/non-UTF and caseful/caseless cases are handled separately,
3392 GETCHARLEN(fc, ecode, length);
3395 /* Handle multibyte character matching specially here. There is
3396 support for caseless matching if UCP support is present. */
3401 pcre_uint32 othercase;
3402 if (op >= OP_STARI && /* Caseless */
3403 (othercase = UCD_OTHERCASE(fc)) != fc)
3404 oclength = PRIV(ord2utf)(othercase, occhars);
3406 #endif /* SUPPORT_UCP */
3408 for (i = 1; i <= min; i++)
3410 if (eptr <= md->end_subject - length &&
3411 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3413 else if (oclength > 0 &&
3414 eptr <= md->end_subject - oclength &&
3415 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3416 #endif /* SUPPORT_UCP */
3420 RRETURN(MATCH_NOMATCH);
3424 if (min == max) continue;
3428 for (fi = min;; fi++)
3430 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3431 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3432 if (fi >= max) RRETURN(MATCH_NOMATCH);
3433 if (eptr <= md->end_subject - length &&
3434 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3436 else if (oclength > 0 &&
3437 eptr <= md->end_subject - oclength &&
3438 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3439 #endif /* SUPPORT_UCP */
3443 RRETURN(MATCH_NOMATCH);
3446 /* Control never gets here */
3452 for (i = min; i < max; i++)
3454 if (eptr <= md->end_subject - length &&
3455 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3457 else if (oclength > 0 &&
3458 eptr <= md->end_subject - oclength &&
3459 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3460 #endif /* SUPPORT_UCP */
3468 if (possessive) continue; /* No backtracking */
3471 if (eptr == pp) goto TAIL_RECURSE;
3472 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3473 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3477 #else /* without SUPPORT_UCP */
3479 #endif /* SUPPORT_UCP */
3482 /* Control never gets here */
3485 /* If the length of a UTF-8 character is 1, we fall through here, and
3486 obey the code as for non-UTF-8 characters below, though in this case the
3487 value of fc will always be < 128. */
3490 #endif /* SUPPORT_UTF */
3491 /* When not in UTF-8 mode, load a single-byte character. */
3494 /* The value of fc at this point is always one character, though we may
3495 or may not be in UTF mode. The code is duplicated for the caseless and
3496 caseful cases, for speed, since matching characters is likely to be quite
3497 common. First, ensure the minimum number of matches are present. If min =
3498 max, continue at the same level without recursing. Otherwise, if
3499 minimizing, keep trying the rest of the expression and advancing one
3500 matching character if failing, up to the maximum. Alternatively, if
3501 maximizing, find the maximum number of characters and work backwards. */
3503 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3504 max, (char *)eptr));
3506 if (op >= OP_STARI) /* Caseless */
3508 #ifdef COMPILE_PCRE8
3509 /* fc must be < 128 if UTF is enabled. */
3514 if (utf && fc > 127)
3515 foc = UCD_OTHERCASE(fc);
3517 if (utf && fc > 127)
3519 #endif /* SUPPORT_UCP */
3521 #endif /* SUPPORT_UTF */
3522 foc = TABLE_GET(fc, md->fcc, fc);
3523 #endif /* COMPILE_PCRE8 */
3525 for (i = 1; i <= min; i++)
3527 pcre_uint32 cc; /* Faster than pcre_uchar */
3528 if (eptr >= md->end_subject)
3531 RRETURN(MATCH_NOMATCH);
3533 cc = UCHAR21TEST(eptr);
3534 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3537 if (min == max) continue;
3540 for (fi = min;; fi++)
3542 pcre_uint32 cc; /* Faster than pcre_uchar */
3543 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3544 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3545 if (fi >= max) RRETURN(MATCH_NOMATCH);
3546 if (eptr >= md->end_subject)
3549 RRETURN(MATCH_NOMATCH);
3551 cc = UCHAR21TEST(eptr);
3552 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3555 /* Control never gets here */
3560 for (i = min; i < max; i++)
3562 pcre_uint32 cc; /* Faster than pcre_uchar */
3563 if (eptr >= md->end_subject)
3568 cc = UCHAR21TEST(eptr);
3569 if (fc != cc && foc != cc) break;
3572 if (possessive) continue; /* No backtracking */
3575 if (eptr == pp) goto TAIL_RECURSE;
3576 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3578 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3580 /* Control never gets here */
3584 /* Caseful comparisons (includes all multi-byte characters) */
3588 for (i = 1; i <= min; i++)
3590 if (eptr >= md->end_subject)
3593 RRETURN(MATCH_NOMATCH);
3595 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3598 if (min == max) continue;
3602 for (fi = min;; fi++)
3604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3605 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3606 if (fi >= max) RRETURN(MATCH_NOMATCH);
3607 if (eptr >= md->end_subject)
3610 RRETURN(MATCH_NOMATCH);
3612 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3614 /* Control never gets here */
3619 for (i = min; i < max; i++)
3621 if (eptr >= md->end_subject)
3626 if (fc != UCHAR21TEST(eptr)) break;
3629 if (possessive) continue; /* No backtracking */
3632 if (eptr == pp) goto TAIL_RECURSE;
3633 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3635 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3637 /* Control never gets here */
3640 /* Control never gets here */
3642 /* Match a negated single one-byte character. The character we are
3643 checking can be multibyte. */
3647 if (eptr >= md->end_subject)
3650 RRETURN(MATCH_NOMATCH);
3655 register pcre_uint32 ch, och;
3658 GETCHARINC(ch, ecode);
3659 GETCHARINC(c, eptr);
3663 if (ch == c) RRETURN(MATCH_NOMATCH);
3669 och = UCD_OTHERCASE(ch);
3673 #endif /* SUPPORT_UCP */
3675 och = TABLE_GET(ch, md->fcc, ch);
3676 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3682 register pcre_uint32 ch = ecode[1];
3684 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3685 RRETURN(MATCH_NOMATCH);
3690 /* Match a negated single one-byte character repeatedly. This is almost a
3691 repeat of the code for a repeated single character, but I haven't found a
3692 nice way of commoning these up that doesn't require a test of the
3693 positive/negative option for each character match. Maybe that wouldn't add
3694 very much to the time taken, but character matching *is* what this is all
3699 min = max = GET2(ecode, 1);
3700 ecode += 1 + IMM2_SIZE;
3706 case OP_NOTMINUPTOI:
3708 max = GET2(ecode, 1);
3709 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3710 ecode += 1 + IMM2_SIZE;
3714 case OP_NOTPOSSTARI:
3722 case OP_NOTPOSPLUSI:
3729 case OP_NOTPOSQUERY:
3730 case OP_NOTPOSQUERYI:
3738 case OP_NOTPOSUPTOI:
3741 max = GET2(ecode, 1);
3742 ecode += 1 + IMM2_SIZE;
3748 case OP_NOTMINSTARI:
3752 case OP_NOTMINPLUSI:
3755 case OP_NOTMINQUERY:
3756 case OP_NOTMINQUERYI:
3757 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3758 minimize = (c & 1) != 0;
3759 min = rep_min[c]; /* Pick up values from tables; */
3760 max = rep_max[c]; /* zero for max => infinity */
3761 if (max == 0) max = INT_MAX;
3763 /* Common code for all repeated single-byte matches. */
3766 GETCHARINCTEST(fc, ecode);
3768 /* The code is duplicated for the caseless and caseful cases, for speed,
3769 since matching characters is likely to be quite common. First, ensure the
3770 minimum number of matches are present. If min = max, continue at the same
3771 level without recursing. Otherwise, if minimizing, keep trying the rest of
3772 the expression and advancing one matching character if failing, up to the
3773 maximum. Alternatively, if maximizing, find the maximum number of
3774 characters and work backwards. */
3776 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3777 max, (char *)eptr));
3779 if (op >= OP_NOTSTARI) /* Caseless */
3783 if (utf && fc > 127)
3784 foc = UCD_OTHERCASE(fc);
3786 if (utf && fc > 127)
3788 #endif /* SUPPORT_UCP */
3790 #endif /* SUPPORT_UTF */
3791 foc = TABLE_GET(fc, md->fcc, fc);
3796 register pcre_uint32 d;
3797 for (i = 1; i <= min; i++)
3799 if (eptr >= md->end_subject)
3802 RRETURN(MATCH_NOMATCH);
3804 GETCHARINC(d, eptr);
3805 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3809 #endif /* SUPPORT_UTF */
3812 for (i = 1; i <= min; i++)
3814 if (eptr >= md->end_subject)
3817 RRETURN(MATCH_NOMATCH);
3819 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3824 if (min == max) continue;
3831 register pcre_uint32 d;
3832 for (fi = min;; fi++)
3834 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3835 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3836 if (fi >= max) RRETURN(MATCH_NOMATCH);
3837 if (eptr >= md->end_subject)
3840 RRETURN(MATCH_NOMATCH);
3842 GETCHARINC(d, eptr);
3843 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3847 #endif /*SUPPORT_UTF */
3850 for (fi = min;; fi++)
3852 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3853 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3854 if (fi >= max) RRETURN(MATCH_NOMATCH);
3855 if (eptr >= md->end_subject)
3858 RRETURN(MATCH_NOMATCH);
3860 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3864 /* Control never gets here */
3876 register pcre_uint32 d;
3877 for (i = min; i < max; i++)
3880 if (eptr >= md->end_subject)
3885 GETCHARLEN(d, eptr, len);
3886 if (fc == d || (unsigned int)foc == d) break;
3889 if (possessive) continue; /* No backtracking */
3892 if (eptr == pp) goto TAIL_RECURSE;
3893 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3894 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3900 #endif /* SUPPORT_UTF */
3903 for (i = min; i < max; i++)
3905 if (eptr >= md->end_subject)
3910 if (fc == *eptr || foc == *eptr) break;
3913 if (possessive) continue; /* No backtracking */
3916 if (eptr == pp) goto TAIL_RECURSE;
3917 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3918 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3922 /* Control never gets here */
3926 /* Caseful comparisons */
3933 register pcre_uint32 d;
3934 for (i = 1; i <= min; i++)
3936 if (eptr >= md->end_subject)
3939 RRETURN(MATCH_NOMATCH);
3941 GETCHARINC(d, eptr);
3942 if (fc == d) RRETURN(MATCH_NOMATCH);
3949 for (i = 1; i <= min; i++)
3951 if (eptr >= md->end_subject)
3954 RRETURN(MATCH_NOMATCH);
3956 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3960 if (min == max) continue;
3967 register pcre_uint32 d;
3968 for (fi = min;; fi++)
3970 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3971 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3972 if (fi >= max) RRETURN(MATCH_NOMATCH);
3973 if (eptr >= md->end_subject)
3976 RRETURN(MATCH_NOMATCH);
3978 GETCHARINC(d, eptr);
3979 if (fc == d) RRETURN(MATCH_NOMATCH);
3986 for (fi = min;; fi++)
3988 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3989 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3990 if (fi >= max) RRETURN(MATCH_NOMATCH);
3991 if (eptr >= md->end_subject)
3994 RRETURN(MATCH_NOMATCH);
3996 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3999 /* Control never gets here */
4011 register pcre_uint32 d;
4012 for (i = min; i < max; i++)
4015 if (eptr >= md->end_subject)
4020 GETCHARLEN(d, eptr, len);
4024 if (possessive) continue; /* No backtracking */
4027 if (eptr == pp) goto TAIL_RECURSE;
4028 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
4029 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4038 for (i = min; i < max; i++)
4040 if (eptr >= md->end_subject)
4045 if (fc == *eptr) break;
4048 if (possessive) continue; /* No backtracking */
4051 if (eptr == pp) goto TAIL_RECURSE;
4052 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
4053 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4057 /* Control never gets here */
4060 /* Control never gets here */
4062 /* Match a single character type repeatedly; several different opcodes
4063 share code. This is very similar to the code for single characters, but we
4064 repeat it in the interests of efficiency. */
4067 min = max = GET2(ecode, 1);
4069 ecode += 1 + IMM2_SIZE;
4073 case OP_TYPEMINUPTO:
4075 max = GET2(ecode, 1);
4076 minimize = *ecode == OP_TYPEMINUPTO;
4077 ecode += 1 + IMM2_SIZE;
4080 case OP_TYPEPOSSTAR:
4087 case OP_TYPEPOSPLUS:
4094 case OP_TYPEPOSQUERY:
4101 case OP_TYPEPOSUPTO:
4104 max = GET2(ecode, 1);
4105 ecode += 1 + IMM2_SIZE;
4109 case OP_TYPEMINSTAR:
4111 case OP_TYPEMINPLUS:
4113 case OP_TYPEMINQUERY:
4114 c = *ecode++ - OP_TYPESTAR;
4115 minimize = (c & 1) != 0;
4116 min = rep_min[c]; /* Pick up values from tables; */
4117 max = rep_max[c]; /* zero for max => infinity */
4118 if (max == 0) max = INT_MAX;
4120 /* Common code for all repeated single character type matches. Note that
4121 in UTF-8 mode, '.' matches a character of any length, but for the other
4122 character types, the valid characters are all one-byte long. */
4125 ctype = *ecode++; /* Code for the character type */
4128 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4130 prop_fail_result = ctype == OP_NOTPROP;
4131 prop_type = *ecode++;
4132 prop_value = *ecode++;
4134 else prop_type = -1;
4137 /* First, ensure the minimum number of matches are present. Use inline
4138 code for maximizing the speed, and do the type test once at the start
4139 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4140 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4141 and single-bytes. */
4151 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4152 for (i = 1; i <= min; i++)
4154 if (eptr >= md->end_subject)
4157 RRETURN(MATCH_NOMATCH);
4159 GETCHARINCTEST(c, eptr);
4164 for (i = 1; i <= min; i++)
4167 if (eptr >= md->end_subject)
4170 RRETURN(MATCH_NOMATCH);
4172 GETCHARINCTEST(c, eptr);
4173 chartype = UCD_CHARTYPE(c);
4174 if ((chartype == ucp_Lu ||
4175 chartype == ucp_Ll ||
4176 chartype == ucp_Lt) == prop_fail_result)
4177 RRETURN(MATCH_NOMATCH);
4182 for (i = 1; i <= min; i++)
4184 if (eptr >= md->end_subject)
4187 RRETURN(MATCH_NOMATCH);
4189 GETCHARINCTEST(c, eptr);
4190 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4191 RRETURN(MATCH_NOMATCH);
4196 for (i = 1; i <= min; i++)
4198 if (eptr >= md->end_subject)
4201 RRETURN(MATCH_NOMATCH);
4203 GETCHARINCTEST(c, eptr);
4204 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4205 RRETURN(MATCH_NOMATCH);
4210 for (i = 1; i <= min; i++)
4212 if (eptr >= md->end_subject)
4215 RRETURN(MATCH_NOMATCH);
4217 GETCHARINCTEST(c, eptr);
4218 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4219 RRETURN(MATCH_NOMATCH);
4224 for (i = 1; i <= min; i++)
4227 if (eptr >= md->end_subject)
4230 RRETURN(MATCH_NOMATCH);
4232 GETCHARINCTEST(c, eptr);
4233 category = UCD_CATEGORY(c);
4234 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4235 RRETURN(MATCH_NOMATCH);
4239 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4240 which means that Perl space and POSIX space are now identical. PCRE
4241 was changed at release 8.34. */
4243 case PT_SPACE: /* Perl space */
4244 case PT_PXSPACE: /* POSIX space */
4245 for (i = 1; i <= min; i++)
4247 if (eptr >= md->end_subject)
4250 RRETURN(MATCH_NOMATCH);
4252 GETCHARINCTEST(c, eptr);
4257 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4261 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
4262 RRETURN(MATCH_NOMATCH);
4269 for (i = 1; i <= min; i++)
4272 if (eptr >= md->end_subject)
4275 RRETURN(MATCH_NOMATCH);
4277 GETCHARINCTEST(c, eptr);
4278 category = UCD_CATEGORY(c);
4279 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4280 == prop_fail_result)
4281 RRETURN(MATCH_NOMATCH);
4286 for (i = 1; i <= min; i++)
4288 const pcre_uint32 *cp;
4289 if (eptr >= md->end_subject)
4292 RRETURN(MATCH_NOMATCH);
4294 GETCHARINCTEST(c, eptr);
4295 cp = PRIV(ucd_caseless_sets) + prop_value;
4299 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4301 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4307 for (i = 1; i <= min; i++)
4309 if (eptr >= md->end_subject)
4312 RRETURN(MATCH_NOMATCH);
4314 GETCHARINCTEST(c, eptr);
4315 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4316 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4317 c >= 0xe000) == prop_fail_result)
4318 RRETURN(MATCH_NOMATCH);
4322 /* This should not occur */
4325 RRETURN(PCRE_ERROR_INTERNAL);
4329 /* Match extended Unicode sequences. We will get here only if the
4330 support is in the binary; otherwise a compile-time error occurs. */
4332 else if (ctype == OP_EXTUNI)
4334 for (i = 1; i <= min; i++)
4336 if (eptr >= md->end_subject)
4339 RRETURN(MATCH_NOMATCH);
4344 GETCHARINCTEST(c, eptr);
4345 lgb = UCD_GRAPHBREAK(c);
4346 while (eptr < md->end_subject)
4349 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4350 rgb = UCD_GRAPHBREAK(c);
4351 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4361 #endif /* SUPPORT_UCP */
4363 /* Handle all other cases when the coding is UTF-8 */
4366 if (utf) switch(ctype)
4369 for (i = 1; i <= min; i++)
4371 if (eptr >= md->end_subject)
4374 RRETURN(MATCH_NOMATCH);
4376 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4377 if (md->partial != 0 &&
4378 eptr + 1 >= md->end_subject &&
4379 NLBLOCK->nltype == NLTYPE_FIXED &&
4380 NLBLOCK->nllen == 2 &&
4381 UCHAR21(eptr) == NLBLOCK->nl[0])
4384 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4387 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4392 for (i = 1; i <= min; i++)
4394 if (eptr >= md->end_subject)
4397 RRETURN(MATCH_NOMATCH);
4400 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4405 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4410 for (i = 1; i <= min; i++)
4412 if (eptr >= md->end_subject)
4415 RRETURN(MATCH_NOMATCH);
4417 GETCHARINC(c, eptr);
4420 default: RRETURN(MATCH_NOMATCH);
4423 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
4435 #endif /* Not EBCDIC */
4436 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4443 for (i = 1; i <= min; i++)
4445 if (eptr >= md->end_subject)
4448 RRETURN(MATCH_NOMATCH);
4450 GETCHARINC(c, eptr);
4453 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4460 for (i = 1; i <= min; i++)
4462 if (eptr >= md->end_subject)
4465 RRETURN(MATCH_NOMATCH);
4467 GETCHARINC(c, eptr);
4470 HSPACE_CASES: break; /* Byte and multibyte cases */
4471 default: RRETURN(MATCH_NOMATCH);
4477 for (i = 1; i <= min; i++)
4479 if (eptr >= md->end_subject)
4482 RRETURN(MATCH_NOMATCH);
4484 GETCHARINC(c, eptr);
4487 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4494 for (i = 1; i <= min; i++)
4496 if (eptr >= md->end_subject)
4499 RRETURN(MATCH_NOMATCH);
4501 GETCHARINC(c, eptr);
4504 VSPACE_CASES: break;
4505 default: RRETURN(MATCH_NOMATCH);
4511 for (i = 1; i <= min; i++)
4513 if (eptr >= md->end_subject)
4516 RRETURN(MATCH_NOMATCH);
4518 GETCHARINC(c, eptr);
4519 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4520 RRETURN(MATCH_NOMATCH);
4525 for (i = 1; i <= min; i++)
4528 if (eptr >= md->end_subject)
4531 RRETURN(MATCH_NOMATCH);
4534 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4535 RRETURN(MATCH_NOMATCH);
4537 /* No need to skip more bytes - we know it's a 1-byte character */
4541 case OP_NOT_WHITESPACE:
4542 for (i = 1; i <= min; i++)
4545 if (eptr >= md->end_subject)
4548 RRETURN(MATCH_NOMATCH);
4551 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4552 RRETURN(MATCH_NOMATCH);
4554 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4559 for (i = 1; i <= min; i++)
4562 if (eptr >= md->end_subject)
4565 RRETURN(MATCH_NOMATCH);
4568 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4569 RRETURN(MATCH_NOMATCH);
4571 /* No need to skip more bytes - we know it's a 1-byte character */
4575 case OP_NOT_WORDCHAR:
4576 for (i = 1; i <= min; i++)
4579 if (eptr >= md->end_subject)
4582 RRETURN(MATCH_NOMATCH);
4585 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4586 RRETURN(MATCH_NOMATCH);
4588 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4593 for (i = 1; i <= min; i++)
4596 if (eptr >= md->end_subject)
4599 RRETURN(MATCH_NOMATCH);
4602 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4603 RRETURN(MATCH_NOMATCH);
4605 /* No need to skip more bytes - we know it's a 1-byte character */
4610 RRETURN(PCRE_ERROR_INTERNAL);
4611 } /* End switch(ctype) */
4614 #endif /* SUPPORT_UTF */
4616 /* Code for the non-UTF-8 case for minimum matching of operators other
4617 than OP_PROP and OP_NOTPROP. */
4622 for (i = 1; i <= min; i++)
4624 if (eptr >= md->end_subject)
4627 RRETURN(MATCH_NOMATCH);
4629 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4630 if (md->partial != 0 &&
4631 eptr + 1 >= md->end_subject &&
4632 NLBLOCK->nltype == NLTYPE_FIXED &&
4633 NLBLOCK->nllen == 2 &&
4634 *eptr == NLBLOCK->nl[0])
4637 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4644 if (eptr > md->end_subject - min)
4647 RRETURN(MATCH_NOMATCH);
4653 if (eptr > md->end_subject - min)
4656 RRETURN(MATCH_NOMATCH);
4662 for (i = 1; i <= min; i++)
4664 if (eptr >= md->end_subject)
4667 RRETURN(MATCH_NOMATCH);
4671 default: RRETURN(MATCH_NOMATCH);
4674 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4683 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4687 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4694 for (i = 1; i <= min; i++)
4696 if (eptr >= md->end_subject)
4699 RRETURN(MATCH_NOMATCH);
4705 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4706 HSPACE_MULTIBYTE_CASES:
4708 RRETURN(MATCH_NOMATCH);
4714 for (i = 1; i <= min; i++)
4716 if (eptr >= md->end_subject)
4719 RRETURN(MATCH_NOMATCH);
4723 default: RRETURN(MATCH_NOMATCH);
4725 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4726 HSPACE_MULTIBYTE_CASES:
4734 for (i = 1; i <= min; i++)
4736 if (eptr >= md->end_subject)
4739 RRETURN(MATCH_NOMATCH);
4744 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4745 VSPACE_MULTIBYTE_CASES:
4747 RRETURN(MATCH_NOMATCH);
4754 for (i = 1; i <= min; i++)
4756 if (eptr >= md->end_subject)
4759 RRETURN(MATCH_NOMATCH);
4763 default: RRETURN(MATCH_NOMATCH);
4765 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4766 VSPACE_MULTIBYTE_CASES:
4774 for (i = 1; i <= min; i++)
4776 if (eptr >= md->end_subject)
4779 RRETURN(MATCH_NOMATCH);
4781 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4782 RRETURN(MATCH_NOMATCH);
4788 for (i = 1; i <= min; i++)
4790 if (eptr >= md->end_subject)
4793 RRETURN(MATCH_NOMATCH);
4795 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4796 RRETURN(MATCH_NOMATCH);
4801 case OP_NOT_WHITESPACE:
4802 for (i = 1; i <= min; i++)
4804 if (eptr >= md->end_subject)
4807 RRETURN(MATCH_NOMATCH);
4809 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4810 RRETURN(MATCH_NOMATCH);
4816 for (i = 1; i <= min; i++)
4818 if (eptr >= md->end_subject)
4821 RRETURN(MATCH_NOMATCH);
4823 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4824 RRETURN(MATCH_NOMATCH);
4829 case OP_NOT_WORDCHAR:
4830 for (i = 1; i <= min; i++)
4832 if (eptr >= md->end_subject)
4835 RRETURN(MATCH_NOMATCH);
4837 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4838 RRETURN(MATCH_NOMATCH);
4844 for (i = 1; i <= min; i++)
4846 if (eptr >= md->end_subject)
4849 RRETURN(MATCH_NOMATCH);
4851 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4852 RRETURN(MATCH_NOMATCH);
4858 RRETURN(PCRE_ERROR_INTERNAL);
4862 /* If min = max, continue at the same level without recursing */
4864 if (min == max) continue;
4866 /* If minimizing, we have to test the rest of the pattern before each
4867 subsequent match. Again, separate the UTF-8 case for speed, and also
4868 separate the UCP cases. */
4878 for (fi = min;; fi++)
4880 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4881 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4882 if (fi >= max) RRETURN(MATCH_NOMATCH);
4883 if (eptr >= md->end_subject)
4886 RRETURN(MATCH_NOMATCH);
4888 GETCHARINCTEST(c, eptr);
4889 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4891 /* Control never gets here */
4894 for (fi = min;; fi++)
4897 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4898 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4899 if (fi >= max) RRETURN(MATCH_NOMATCH);
4900 if (eptr >= md->end_subject)
4903 RRETURN(MATCH_NOMATCH);
4905 GETCHARINCTEST(c, eptr);
4906 chartype = UCD_CHARTYPE(c);
4907 if ((chartype == ucp_Lu ||
4908 chartype == ucp_Ll ||
4909 chartype == ucp_Lt) == prop_fail_result)
4910 RRETURN(MATCH_NOMATCH);
4912 /* Control never gets here */
4915 for (fi = min;; fi++)
4917 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4918 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4919 if (fi >= max) RRETURN(MATCH_NOMATCH);
4920 if (eptr >= md->end_subject)
4923 RRETURN(MATCH_NOMATCH);
4925 GETCHARINCTEST(c, eptr);
4926 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4927 RRETURN(MATCH_NOMATCH);
4929 /* Control never gets here */
4932 for (fi = min;; fi++)
4934 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4935 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4936 if (fi >= max) RRETURN(MATCH_NOMATCH);
4937 if (eptr >= md->end_subject)
4940 RRETURN(MATCH_NOMATCH);
4942 GETCHARINCTEST(c, eptr);
4943 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4944 RRETURN(MATCH_NOMATCH);
4946 /* Control never gets here */
4949 for (fi = min;; fi++)
4951 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4952 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4953 if (fi >= max) RRETURN(MATCH_NOMATCH);
4954 if (eptr >= md->end_subject)
4957 RRETURN(MATCH_NOMATCH);
4959 GETCHARINCTEST(c, eptr);
4960 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4961 RRETURN(MATCH_NOMATCH);
4963 /* Control never gets here */
4966 for (fi = min;; fi++)
4969 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4970 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4971 if (fi >= max) RRETURN(MATCH_NOMATCH);
4972 if (eptr >= md->end_subject)
4975 RRETURN(MATCH_NOMATCH);
4977 GETCHARINCTEST(c, eptr);
4978 category = UCD_CATEGORY(c);
4979 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4980 RRETURN(MATCH_NOMATCH);
4982 /* Control never gets here */
4984 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4985 which means that Perl space and POSIX space are now identical. PCRE
4986 was changed at release 8.34. */
4988 case PT_SPACE: /* Perl space */
4989 case PT_PXSPACE: /* POSIX space */
4990 for (fi = min;; fi++)
4992 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4993 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4994 if (fi >= max) RRETURN(MATCH_NOMATCH);
4995 if (eptr >= md->end_subject)
4998 RRETURN(MATCH_NOMATCH);
5000 GETCHARINCTEST(c, eptr);
5005 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
5009 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5010 RRETURN(MATCH_NOMATCH);
5014 /* Control never gets here */
5017 for (fi = min;; fi++)
5020 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
5021 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5022 if (fi >= max) RRETURN(MATCH_NOMATCH);
5023 if (eptr >= md->end_subject)
5026 RRETURN(MATCH_NOMATCH);
5028 GETCHARINCTEST(c, eptr);
5029 category = UCD_CATEGORY(c);
5030 if ((category == ucp_L ||
5031 category == ucp_N ||
5032 c == CHAR_UNDERSCORE)
5033 == prop_fail_result)
5034 RRETURN(MATCH_NOMATCH);
5036 /* Control never gets here */
5039 for (fi = min;; fi++)
5041 const pcre_uint32 *cp;
5042 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
5043 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5044 if (fi >= max) RRETURN(MATCH_NOMATCH);
5045 if (eptr >= md->end_subject)
5048 RRETURN(MATCH_NOMATCH);
5050 GETCHARINCTEST(c, eptr);
5051 cp = PRIV(ucd_caseless_sets) + prop_value;
5055 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5057 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5060 /* Control never gets here */
5063 for (fi = min;; fi++)
5065 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
5066 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5067 if (fi >= max) RRETURN(MATCH_NOMATCH);
5068 if (eptr >= md->end_subject)
5071 RRETURN(MATCH_NOMATCH);
5073 GETCHARINCTEST(c, eptr);
5074 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5075 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5076 c >= 0xe000) == prop_fail_result)
5077 RRETURN(MATCH_NOMATCH);
5079 /* Control never gets here */
5081 /* This should never occur */
5083 RRETURN(PCRE_ERROR_INTERNAL);
5087 /* Match extended Unicode sequences. We will get here only if the
5088 support is in the binary; otherwise a compile-time error occurs. */
5090 else if (ctype == OP_EXTUNI)
5092 for (fi = min;; fi++)
5094 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5095 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5096 if (fi >= max) RRETURN(MATCH_NOMATCH);
5097 if (eptr >= md->end_subject)
5100 RRETURN(MATCH_NOMATCH);
5105 GETCHARINCTEST(c, eptr);
5106 lgb = UCD_GRAPHBREAK(c);
5107 while (eptr < md->end_subject)
5110 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5111 rgb = UCD_GRAPHBREAK(c);
5112 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5121 #endif /* SUPPORT_UCP */
5126 for (fi = min;; fi++)
5128 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5129 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5130 if (fi >= max) RRETURN(MATCH_NOMATCH);
5131 if (eptr >= md->end_subject)
5134 RRETURN(MATCH_NOMATCH);
5136 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5137 RRETURN(MATCH_NOMATCH);
5138 GETCHARINC(c, eptr);
5141 case OP_ANY: /* This is the non-NL case */
5142 if (md->partial != 0 && /* Take care with CRLF partial */
5143 eptr >= md->end_subject &&
5144 NLBLOCK->nltype == NLTYPE_FIXED &&
5145 NLBLOCK->nllen == 2 &&
5146 c == NLBLOCK->nl[0])
5149 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5160 default: RRETURN(MATCH_NOMATCH);
5162 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
5174 #endif /* Not EBCDIC */
5175 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5183 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5191 HSPACE_CASES: break;
5192 default: RRETURN(MATCH_NOMATCH);
5199 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5207 VSPACE_CASES: break;
5208 default: RRETURN(MATCH_NOMATCH);
5213 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5214 RRETURN(MATCH_NOMATCH);
5218 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5219 RRETURN(MATCH_NOMATCH);
5222 case OP_NOT_WHITESPACE:
5223 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5224 RRETURN(MATCH_NOMATCH);
5228 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5229 RRETURN(MATCH_NOMATCH);
5232 case OP_NOT_WORDCHAR:
5233 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5234 RRETURN(MATCH_NOMATCH);
5238 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5239 RRETURN(MATCH_NOMATCH);
5243 RRETURN(PCRE_ERROR_INTERNAL);
5251 for (fi = min;; fi++)
5253 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5254 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5255 if (fi >= max) RRETURN(MATCH_NOMATCH);
5256 if (eptr >= md->end_subject)
5259 RRETURN(MATCH_NOMATCH);
5261 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5262 RRETURN(MATCH_NOMATCH);
5266 case OP_ANY: /* This is the non-NL case */
5267 if (md->partial != 0 && /* Take care with CRLF partial */
5268 eptr >= md->end_subject &&
5269 NLBLOCK->nltype == NLTYPE_FIXED &&
5270 NLBLOCK->nllen == 2 &&
5271 c == NLBLOCK->nl[0])
5274 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5285 default: RRETURN(MATCH_NOMATCH);
5287 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5296 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5300 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5310 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5311 HSPACE_MULTIBYTE_CASES:
5313 RRETURN(MATCH_NOMATCH);
5320 default: RRETURN(MATCH_NOMATCH);
5322 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5323 HSPACE_MULTIBYTE_CASES:
5334 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5335 VSPACE_MULTIBYTE_CASES:
5337 RRETURN(MATCH_NOMATCH);
5344 default: RRETURN(MATCH_NOMATCH);
5346 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5347 VSPACE_MULTIBYTE_CASES:
5354 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5358 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5361 case OP_NOT_WHITESPACE:
5362 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5366 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5369 case OP_NOT_WORDCHAR:
5370 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5374 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5378 RRETURN(PCRE_ERROR_INTERNAL);
5382 /* Control never gets here */
5385 /* If maximizing, it is worth using inline code for speed, doing the type
5386 test once at the start (i.e. keep it out of the loop). Again, keep the
5387 UTF-8 and UCP stuff separate. */
5391 pp = eptr; /* Remember where we started */
5399 for (i = min; i < max; i++)
5402 if (eptr >= md->end_subject)
5407 GETCHARLENTEST(c, eptr, len);
5408 if (prop_fail_result) break;
5414 for (i = min; i < max; i++)
5418 if (eptr >= md->end_subject)
5423 GETCHARLENTEST(c, eptr, len);
5424 chartype = UCD_CHARTYPE(c);
5425 if ((chartype == ucp_Lu ||
5426 chartype == ucp_Ll ||
5427 chartype == ucp_Lt) == prop_fail_result)
5434 for (i = min; i < max; i++)
5437 if (eptr >= md->end_subject)
5442 GETCHARLENTEST(c, eptr, len);
5443 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5449 for (i = min; i < max; i++)
5452 if (eptr >= md->end_subject)
5457 GETCHARLENTEST(c, eptr, len);
5458 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5464 for (i = min; i < max; i++)
5467 if (eptr >= md->end_subject)
5472 GETCHARLENTEST(c, eptr, len);
5473 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5479 for (i = min; i < max; i++)
5483 if (eptr >= md->end_subject)
5488 GETCHARLENTEST(c, eptr, len);
5489 category = UCD_CATEGORY(c);
5490 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5496 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5497 which means that Perl space and POSIX space are now identical. PCRE
5498 was changed at release 8.34. */
5500 case PT_SPACE: /* Perl space */
5501 case PT_PXSPACE: /* POSIX space */
5502 for (i = min; i < max; i++)
5505 if (eptr >= md->end_subject)
5510 GETCHARLENTEST(c, eptr, len);
5515 if (prop_fail_result) goto ENDLOOP99; /* Break the loop */
5519 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5520 goto ENDLOOP99; /* Break the loop */
5529 for (i = min; i < max; i++)
5533 if (eptr >= md->end_subject)
5538 GETCHARLENTEST(c, eptr, len);
5539 category = UCD_CATEGORY(c);
5540 if ((category == ucp_L || category == ucp_N ||
5541 c == CHAR_UNDERSCORE) == prop_fail_result)
5548 for (i = min; i < max; i++)
5550 const pcre_uint32 *cp;
5552 if (eptr >= md->end_subject)
5557 GETCHARLENTEST(c, eptr, len);
5558 cp = PRIV(ucd_caseless_sets) + prop_value;
5562 { if (prop_fail_result) break; else goto GOT_MAX; }
5564 { if (prop_fail_result) goto GOT_MAX; else break; }
5572 for (i = min; i < max; i++)
5575 if (eptr >= md->end_subject)
5580 GETCHARLENTEST(c, eptr, len);
5581 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5582 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5583 c >= 0xe000) == prop_fail_result)
5590 RRETURN(PCRE_ERROR_INTERNAL);
5593 /* eptr is now past the end of the maximum run */
5595 if (possessive) continue; /* No backtracking */
5598 if (eptr == pp) goto TAIL_RECURSE;
5599 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5600 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5602 if (utf) BACKCHAR(eptr);
5606 /* Match extended Unicode grapheme clusters. We will get here only if the
5607 support is in the binary; otherwise a compile-time error occurs. */
5609 else if (ctype == OP_EXTUNI)
5611 for (i = min; i < max; i++)
5613 if (eptr >= md->end_subject)
5621 GETCHARINCTEST(c, eptr);
5622 lgb = UCD_GRAPHBREAK(c);
5623 while (eptr < md->end_subject)
5626 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5627 rgb = UCD_GRAPHBREAK(c);
5628 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5636 /* eptr is now past the end of the maximum run */
5638 if (possessive) continue; /* No backtracking */
5645 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
5646 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5647 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5649 /* Backtracking over an extended grapheme cluster involves inspecting
5650 the previous two characters (if present) to see if a break is
5651 permitted between them. */
5654 if (!utf) c = *eptr; else
5659 rgb = UCD_GRAPHBREAK(c);
5663 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
5665 if (!utf) c = *fptr; else
5670 lgb = UCD_GRAPHBREAK(c);
5671 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5679 #endif /* SUPPORT_UCP */
5689 for (i = min; i < max; i++)
5691 if (eptr >= md->end_subject)
5696 if (IS_NEWLINE(eptr)) break;
5697 if (md->partial != 0 && /* Take care with CRLF partial */
5698 eptr + 1 >= md->end_subject &&
5699 NLBLOCK->nltype == NLTYPE_FIXED &&
5700 NLBLOCK->nllen == 2 &&
5701 UCHAR21(eptr) == NLBLOCK->nl[0])
5704 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5707 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5711 /* Handle unlimited UTF-8 repeat */
5715 for (i = min; i < max; i++)
5717 if (eptr >= md->end_subject)
5722 if (IS_NEWLINE(eptr)) break;
5723 if (md->partial != 0 && /* Take care with CRLF partial */
5724 eptr + 1 >= md->end_subject &&
5725 NLBLOCK->nltype == NLTYPE_FIXED &&
5726 NLBLOCK->nllen == 2 &&
5727 UCHAR21(eptr) == NLBLOCK->nl[0])
5730 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5733 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5741 for (i = min; i < max; i++)
5743 if (eptr >= md->end_subject)
5749 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5754 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5759 /* The byte case is the same as non-UTF8 */
5763 if (c > (unsigned int)(md->end_subject - eptr))
5765 eptr = md->end_subject;
5772 for (i = min; i < max; i++)
5775 if (eptr >= md->end_subject)
5780 GETCHARLEN(c, eptr, len);
5783 if (++eptr >= md->end_subject) break;
5784 if (UCHAR21(eptr) == CHAR_LF) eptr++;
5790 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5792 && c != 0x2028 && c != 0x2029
5793 #endif /* Not EBCDIC */
5803 for (i = min; i < max; i++)
5807 if (eptr >= md->end_subject)
5812 GETCHARLEN(c, eptr, len);
5815 HSPACE_CASES: gotspace = TRUE; break;
5816 default: gotspace = FALSE; break;
5818 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5825 for (i = min; i < max; i++)
5829 if (eptr >= md->end_subject)
5834 GETCHARLEN(c, eptr, len);
5837 VSPACE_CASES: gotspace = TRUE; break;
5838 default: gotspace = FALSE; break;
5840 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5846 for (i = min; i < max; i++)
5849 if (eptr >= md->end_subject)
5854 GETCHARLEN(c, eptr, len);
5855 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5861 for (i = min; i < max; i++)
5864 if (eptr >= md->end_subject)
5869 GETCHARLEN(c, eptr, len);
5870 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5875 case OP_NOT_WHITESPACE:
5876 for (i = min; i < max; i++)
5879 if (eptr >= md->end_subject)
5884 GETCHARLEN(c, eptr, len);
5885 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5891 for (i = min; i < max; i++)
5894 if (eptr >= md->end_subject)
5899 GETCHARLEN(c, eptr, len);
5900 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5905 case OP_NOT_WORDCHAR:
5906 for (i = min; i < max; i++)
5909 if (eptr >= md->end_subject)
5914 GETCHARLEN(c, eptr, len);
5915 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5921 for (i = min; i < max; i++)
5924 if (eptr >= md->end_subject)
5929 GETCHARLEN(c, eptr, len);
5930 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5936 RRETURN(PCRE_ERROR_INTERNAL);
5939 if (possessive) continue; /* No backtracking */
5942 if (eptr == pp) goto TAIL_RECURSE;
5943 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5944 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5947 if (ctype == OP_ANYNL && eptr > pp && UCHAR21(eptr) == CHAR_NL &&
5948 UCHAR21(eptr - 1) == CHAR_CR) eptr--;
5952 #endif /* SUPPORT_UTF */
5958 for (i = min; i < max; i++)
5960 if (eptr >= md->end_subject)
5965 if (IS_NEWLINE(eptr)) break;
5966 if (md->partial != 0 && /* Take care with CRLF partial */
5967 eptr + 1 >= md->end_subject &&
5968 NLBLOCK->nltype == NLTYPE_FIXED &&
5969 NLBLOCK->nllen == 2 &&
5970 *eptr == NLBLOCK->nl[0])
5973 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5982 if (c > (unsigned int)(md->end_subject - eptr))
5984 eptr = md->end_subject;
5991 for (i = min; i < max; i++)
5993 if (eptr >= md->end_subject)
6001 if (++eptr >= md->end_subject) break;
6002 if (*eptr == CHAR_LF) eptr++;
6006 if (c != CHAR_LF && (md->bsr_anycrlf ||
6007 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
6008 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6009 && c != 0x2028 && c != 0x2029
6018 for (i = min; i < max; i++)
6020 if (eptr >= md->end_subject)
6027 default: eptr++; break;
6029 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6030 HSPACE_MULTIBYTE_CASES:
6039 for (i = min; i < max; i++)
6041 if (eptr >= md->end_subject)
6048 default: goto ENDLOOP01;
6050 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6051 HSPACE_MULTIBYTE_CASES:
6060 for (i = min; i < max; i++)
6062 if (eptr >= md->end_subject)
6069 default: eptr++; break;
6071 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6072 VSPACE_MULTIBYTE_CASES:
6081 for (i = min; i < max; i++)
6083 if (eptr >= md->end_subject)
6090 default: goto ENDLOOP03;
6092 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6093 VSPACE_MULTIBYTE_CASES:
6102 for (i = min; i < max; i++)
6104 if (eptr >= md->end_subject)
6109 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6115 for (i = min; i < max; i++)
6117 if (eptr >= md->end_subject)
6122 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6127 case OP_NOT_WHITESPACE:
6128 for (i = min; i < max; i++)
6130 if (eptr >= md->end_subject)
6135 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6141 for (i = min; i < max; i++)
6143 if (eptr >= md->end_subject)
6148 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6153 case OP_NOT_WORDCHAR:
6154 for (i = min; i < max; i++)
6156 if (eptr >= md->end_subject)
6161 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6167 for (i = min; i < max; i++)
6169 if (eptr >= md->end_subject)
6174 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6180 RRETURN(PCRE_ERROR_INTERNAL);
6183 if (possessive) continue; /* No backtracking */
6186 if (eptr == pp) goto TAIL_RECURSE;
6187 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6188 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6190 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6191 eptr[-1] == CHAR_CR) eptr--;
6195 /* Control never gets here */
6198 /* There's been some horrible disaster. Arrival here can only mean there is
6199 something seriously wrong in the code above or the OP_xxx definitions. */
6202 DPRINTF(("Unknown opcode %d\n", *ecode));
6203 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6206 /* Do not stick any code in here without much thought; it is assumed
6207 that "continue" in the code above comes out to here to repeat the main
6210 } /* End of main loop */
6211 /* Control never reaches here */
6214 /* When compiling to use the heap rather than the stack for recursive calls to
6215 match(), the RRETURN() macro jumps here. The number that is saved in
6216 frame->Xwhere indicates which label we actually want to return to. */
6219 #define LBL(val) case val: goto L_RM##val;
6221 switch (frame->Xwhere)
6223 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6224 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6225 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6226 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6227 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6229 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6234 LBL(22) LBL(23) LBL(28) LBL(30)
6235 LBL(32) LBL(34) LBL(42) LBL(46)
6237 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6238 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67)
6239 #endif /* SUPPORT_UCP */
6240 #endif /* SUPPORT_UTF */
6242 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6243 return PCRE_ERROR_INTERNAL;
6246 #endif /* NO_RECURSE */
6250 /***************************************************************************
6251 ****************************************************************************
6252 RECURSION IN THE match() FUNCTION
6254 Undefine all the macros that were defined above to handle this. */
6272 #undef new_recursive
6285 #undef save_capture_last
6295 /* These two are defined as macros in both cases */
6300 /***************************************************************************
6301 ***************************************************************************/
6305 /*************************************************
6306 * Release allocated heap frames *
6307 *************************************************/
6309 /* This function releases all the allocated frames. The base frame is on the
6310 machine stack, and so must not be freed.
6312 Argument: the address of the base frame
/* Walks the chain that starts at frame_base->Xnextframe and passes each frame
in turn to the stack_free hook. frame_base itself is never passed to it. */
6317 release_match_heapframes (heapframe *frame_base)
6319 heapframe *nextframe = frame_base->Xnextframe;
6320 while (nextframe != NULL)
6322 heapframe *oldframe = nextframe;
/* Fetch the next link first: oldframe must not be read after it is freed. */
6323 nextframe = nextframe->Xnextframe;
6324 (PUBL(stack_free))(oldframe);
6330 /*************************************************
6331 * Execute a Regular Expression *
6332 *************************************************/
6334 /* This function applies a compiled re to a subject string and picks out
6335 portions of the string if it matches. Two elements in the vector are set for
6336 each substring: the offsets to the start and end of the substring.
6339 argument_re points to the compiled expression
6340 extra_data points to extra data or is NULL
6341 subject points to the subject string
6342 length length of subject string (may contain binary zeros)
6343 start_offset where to start in the subject string
6345 offsets points to a vector of ints to be filled in with offsets
6346 offsetcount the number of elements in the vector
6348 Returns: > 0 => success; value is the number of elements filled in
6349 = 0 => success, but offsets is not big enough
6350 -1 => failed to match
6351 < -1 => some kind of unexpected problem
/* One definition follows; it is named pcre_exec, pcre16_exec or pcre32_exec
according to which of the COMPILE_PCREnn macros is defined. */
6354 #if defined COMPILE_PCRE8
6355 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6356 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6357 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6359 #elif defined COMPILE_PCRE16
6360 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6361 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6362 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6364 #elif defined COMPILE_PCRE32
6365 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6366 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6367 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6371 int rc, ocount, arg_offset_max;
6373 BOOL using_temporary_offsets = FALSE;
6378 BOOL has_first_char = FALSE;
6379 BOOL has_req_char = FALSE;
6380 pcre_uchar first_char = 0;
6381 pcre_uchar first_char2 = 0;
6382 pcre_uchar req_char = 0;
6383 pcre_uchar req_char2 = 0;
6384 match_data match_block;
6385 match_data *md = &match_block;
6386 const pcre_uint8 *tables;
6387 const pcre_uint8 *start_bits = NULL;
6388 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6389 PCRE_PUCHAR end_subject;
6390 PCRE_PUCHAR start_partial = NULL;
6391 PCRE_PUCHAR match_partial = NULL;
/* Initialised to one before the first match position, so that the first
"p > req_char_ptr" test in the required-character optimization further down
succeeds and a search is actually made. */
6392 PCRE_PUCHAR req_char_ptr = start_match - 1;
6394 const pcre_study_data *study;
6395 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6398 heapframe frame_zero;
6399 frame_zero.Xprevframe = NULL; /* Marks the top level */
6400 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6401 md->match_frames_base = &frame_zero;
6404 /* Check for the special magic call that measures the size of the stack used
6405 per recursive call of match(). Without the funny casting for sizeof, a Windows
6406 compiler gave this error: "unary minus operator applied to unsigned type,
6407 result still unsigned". Hopefully the cast fixes that. */
6409 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6410 start_offset == -999)
6412 return -((int)sizeof(heapframe));
6414 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6417 /* Plausibility checks */
6419 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6420 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6421 return PCRE_ERROR_NULL;
6422 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6423 if (length < 0) return PCRE_ERROR_BADLENGTH;
6424 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6426 /* Check that the first field in the block is the magic number. If it is not,
6427 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6428 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6429 means that the pattern is likely compiled with different endianness. */
6431 if (re->magic_number != MAGIC_NUMBER)
6432 return re->magic_number == REVERSED_MAGIC_NUMBER?
6433 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6434 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6436 /* These two settings are used in the code for checking a UTF-8 string that
6437 follows immediately afterwards. Other values in the md block are used only
6438 during "normal" pcre_exec() processing, not when the JIT support is in use,
6439 so they are set up later. */
6441 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6442 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6443 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6444 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6446 /* Check a UTF-8 string if required. Pass back the character offset and error
6447 code for an invalid string if a results vector is available. */
6450 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6453 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6456 if (offsetcount >= 2)
6458 offsets[0] = erroroffset;
6459 offsets[1] = errorcode;
6461 #if defined COMPILE_PCRE8
6462 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6463 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6464 #elif defined COMPILE_PCRE16
6465 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6466 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6467 #elif defined COMPILE_PCRE32
6468 return PCRE_ERROR_BADUTF32;
6471 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6472 /* Check that a start_offset points to the start of a UTF character. */
6473 if (start_offset > 0 && start_offset < length &&
6474 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6475 return PCRE_ERROR_BADUTF8_OFFSET;
6480 /* If the pattern was successfully studied with JIT support, run the JIT
6481 executable instead of the rest of this function. Most options must be set at
6482 compile time for the JIT code to be usable. Fallback to the normal code path if
6483 an unsupported flag is set. */
6486 if (extra_data != NULL
6487 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6488 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6489 && extra_data->executable_jit != NULL
6490 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6492 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6493 start_offset, options, offsets, offsetcount);
6495 /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial matching
6496 mode is not compiled. In this case we simply fallback to interpreter. */
6498 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6502 /* Carry on with non-JIT matching. This information is for finding all the
6503 numbers associated with a given name, for condition testing. */
6505 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6506 md->name_count = re->name_count;
6507 md->name_entry_size = re->name_entry_size;
6509 /* Fish out the optional data from the extra_data structure, first setting
6510 the default values. */
6513 md->match_limit = MATCH_LIMIT;
6514 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6515 md->callout_data = NULL;
6517 /* The table pointer is always in native byte order. */
6519 tables = re->tables;
6521 /* The two limit values override the defaults, whatever their value. */
6523 if (extra_data != NULL)
6525 register unsigned int flags = extra_data->flags;
6526 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6527 study = (const pcre_study_data *)extra_data->study_data;
6528 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6529 md->match_limit = extra_data->match_limit;
6530 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6531 md->match_limit_recursion = extra_data->match_limit_recursion;
6532 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6533 md->callout_data = extra_data->callout_data;
6534 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6537 /* Limits in the regex override only if they are smaller. */
6539 if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit)
6540 md->match_limit = re->limit_match;
6542 if ((re->flags & PCRE_RLSET) != 0 &&
6543 re->limit_recursion < md->match_limit_recursion)
6544 md->match_limit_recursion = re->limit_recursion;
6546 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6547 is a feature that makes it possible to save compiled regex and re-use them
6548 in other programs later. */
6550 if (tables == NULL) tables = PRIV(default_tables);
6552 /* Set up other data */
6554 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6555 startline = (re->flags & PCRE_STARTLINE) != 0;
6556 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6558 /* The code starts after the real_pcre block and the capture name table. */
6560 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6561 re->name_count * re->name_entry_size;
6563 md->start_subject = (PCRE_PUCHAR)subject;
6564 md->start_offset = start_offset;
6565 md->end_subject = md->start_subject + length;
6566 end_subject = md->end_subject;
6568 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6569 md->use_ucp = (re->options & PCRE_UCP) != 0;
6570 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6571 md->ignore_skip_arg = 0;
6573 /* Some options are unpacked into BOOL variables in the hope that testing
6574 them will be faster than individual option bits. */
6576 md->notbol = (options & PCRE_NOTBOL) != 0;
6577 md->noteol = (options & PCRE_NOTEOL) != 0;
6578 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6579 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6582 md->mark = md->nomatch_mark = NULL; /* In case never set */
6584 md->recursive = NULL; /* No recursion at top level */
6585 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6587 md->lcc = tables + lcc_offset;
6588 md->fcc = tables + fcc_offset;
6589 md->ctypes = tables + ctypes_offset;
6591 /* Handle different \R options. */
6593 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6596 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6597 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6600 md->bsr_anycrlf = TRUE;
6602 md->bsr_anycrlf = FALSE;
6606 case PCRE_BSR_ANYCRLF:
6607 md->bsr_anycrlf = TRUE;
6610 case PCRE_BSR_UNICODE:
6611 md->bsr_anycrlf = FALSE;
6614 default: return PCRE_ERROR_BADNEWLINE;
6617 /* Handle different types of newline. The three bits give eight cases. If
6618 nothing is set at run time, whatever was used at compile time applies. */
6620 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6621 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6623 case 0: newline = NEWLINE; break; /* Compile-time default */
6624 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6625 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6626 case PCRE_NEWLINE_CR+
/* CRLF is packed as two 8-bit fields in one int; the code below that fills
md->nl[0] and md->nl[1] unpacks it again. */
6627 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6628 case PCRE_NEWLINE_ANY: newline = -1; break;
6629 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6630 default: return PCRE_ERROR_BADNEWLINE;
6635 md->nltype = NLTYPE_ANYCRLF;
6637 else if (newline < 0)
6639 md->nltype = NLTYPE_ANY;
6643 md->nltype = NLTYPE_FIXED;
6647 md->nl[0] = (newline >> 8) & 255;
6648 md->nl[1] = newline & 255;
6653 md->nl[0] = newline;
6657 /* Partial matching was originally supported only for a restricted set of
6658 regexes; from release 8.00 there are no restrictions, but the bits are still
6659 defined (though never set). So there's no harm in leaving this code. */
6661 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6662 return PCRE_ERROR_BADPARTIAL;
6664 /* If the expression has got more back references than the offsets supplied can
6665 hold, we get a temporary chunk of working store to use during the matching.
6666 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6669 ocount = offsetcount - (offsetcount % 3);
6670 arg_offset_max = (2*ocount)/3;
6672 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6674 ocount = re->top_backref * 3 + 3;
6675 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6676 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6677 using_temporary_offsets = TRUE;
6678 DPRINTF(("Got memory to hold back references\n"));
6680 else md->offset_vector = offsets;
6681 md->offset_end = ocount;
6682 md->offset_max = (2*ocount)/3;
6683 md->capture_last = 0;
6685 /* Reset the working variable associated with each extraction. These should
6686 never be used unless previously set, but they get saved and restored, and so we
6687 initialize them to avoid reading uninitialized locations. Also, unset the
6688 offsets for the matched string. This is really just for tidiness with callouts,
6689 in case they inspect these fields. */
6691 if (md->offset_vector != NULL)
6693 register int *iptr = md->offset_vector + ocount;
6694 register int *iend = iptr - re->top_bracket;
/* Clamp the lower bound so that the loop stops above slots 0 and 1; those two
are set separately just after the loop. */
6695 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6696 while (--iptr >= iend) *iptr = -1;
6697 md->offset_vector[0] = md->offset_vector[1] = -1;
6700 /* Set up the first character to match, if available. The first_char value is
6701 never set for an anchored regular expression, but the anchoring may be forced
6702 at run time, so we have to test for anchoring. The first char may be unset for
6703 an unanchored pattern, of course. If there's no first char and the pattern was
6704 studied, there may be a bitmap of possible first characters. */
6708 if ((re->flags & PCRE_FIRSTSET) != 0)
6710 has_first_char = TRUE;
6711 first_char = first_char2 = (pcre_uchar)(re->first_char);
6712 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6714 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6715 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6716 if (utf && first_char > 127)
6717 first_char2 = UCD_OTHERCASE(first_char);
6722 if (!startline && study != NULL &&
6723 (study->flags & PCRE_STUDY_MAPPED) != 0)
6724 start_bits = study->start_bits;
6727 /* For anchored or unanchored matches, there may be a "last known required
6730 if ((re->flags & PCRE_REQCHSET) != 0)
6732 has_req_char = TRUE;
6733 req_char = req_char2 = (pcre_uchar)(re->req_char);
6734 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6736 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6737 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6738 if (utf && req_char > 127)
6739 req_char2 = UCD_OTHERCASE(req_char);
6745 /* ==========================================================================*/
6747 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6748 the loop runs just once. */
6752 PCRE_PUCHAR save_end_subject = end_subject;
6753 PCRE_PUCHAR new_start_match;
6755 /* If firstline is TRUE, the start of the match is constrained to the first
6756 line of a multiline string. That is, the match must be before or at the first
6757 newline. Implement this by temporarily adjusting end_subject so that we stop
6758 scanning at a newline. If the match fails at the newline, later code breaks
6763 PCRE_PUCHAR t = start_match;
6767 while (t < md->end_subject && !IS_NEWLINE(t))
6770 ACROSSCHAR(t < end_subject, *t, t++);
6775 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6779 /* There are some optimizations that avoid running the match if a known
6780 starting point is not found, or if a known later character is not present.
6781 However, there is an option that disables these, for testing and for ensuring
6782 that all callouts do actually occur. The option can be set in the regex by
6783 (*NO_START_OPT) or passed in match-time options. */
6785 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6787 /* Advance to a unique first char if there is one. */
6793 if (first_char != first_char2)
6794 while (start_match < end_subject &&
6795 (smc = UCHAR21TEST(start_match)) != first_char && smc != first_char2)
6798 while (start_match < end_subject && UCHAR21TEST(start_match) != first_char)
6802 /* Or to just after a linebreak for a multiline match */
6806 if (start_match > md->start_subject + start_offset)
6811 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6814 ACROSSCHAR(start_match < end_subject, *start_match,
6820 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6823 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6824 and we are now at a LF, advance the match position by one more character.
6827 if (start_match[-1] == CHAR_CR &&
6828 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6829 start_match < end_subject &&
6830 UCHAR21TEST(start_match) == CHAR_NL)
6835 /* Or to a non-unique first byte after study */
6837 else if (start_bits != NULL)
6839 while (start_match < end_subject)
6841 register pcre_uint32 c = UCHAR21TEST(start_match);
6842 #ifndef COMPILE_PCRE8
/* Code units above 255 are folded onto 255, so they all test the same bit in
the start_bits lookup on the following line. */
6843 if (c > 255) c = 255;
6845 if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
6849 } /* Starting optimizations */
6851 /* Restore fudged end_subject */
6853 end_subject = save_end_subject;
6855 /* The following two optimizations are disabled for partial matching or if
6856 disabling is explicitly requested. */
6858 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6860 /* If the pattern was studied, a minimum subject length may be set. This is
6861 a lower bound; no actual string of that length may actually match the
6862 pattern. Although the value is, strictly, in characters, we treat it as
6863 bytes to avoid spending too much time in this optimization. */
6865 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6866 (pcre_uint32)(end_subject - start_match) < study->minlength)
6872 /* If req_char is set, we know that that character must appear in the
6873 subject for the match to succeed. If the first character is set, req_char
6874 must be later in the subject; otherwise the test starts at the match point.
6875 This optimization can save a huge amount of backtracking in patterns with
6876 nested unlimited repeats that aren't going to match. Writing separate code
6877 for cased/caseless versions makes it go faster, as does using an
6878 autoincrement and backing off on a match.
6880 HOWEVER: when the subject string is very, very long, searching to its end
6881 can take a long time, and give bad performance on quite ordinary patterns.
6882 This showed up when somebody was matching something like /^\d+C/ on a
6883 32-megabyte string... so we don't do this when the string is sufficiently
6886 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6888 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6890 /* We don't need to repeat the search if we haven't yet reached the
6891 place we found it at last time. */
6893 if (p > req_char_ptr)
6895 if (req_char != req_char2)
6897 while (p < end_subject)
6899 register pcre_uint32 pp = UCHAR21INCTEST(p);
/* The fetch above autoincrements p (see the comment preceding this block), so
on a hit p is backed off by one before leaving the loop. */
6900 if (pp == req_char || pp == req_char2) { p--; break; }
6905 while (p < end_subject)
6907 if (UCHAR21INCTEST(p) == req_char) { p--; break; }
6911 /* If we can't find the required character, break the matching loop,
6912 forcing a match failure. */
6914 if (p >= end_subject)
6920 /* If we have found the required character, save the point where we
6921 found it, so that we don't search again next time round the loop if
6922 the start hasn't passed this character yet. */
6929 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6930 printf(">>>> Match against: ");
6931 pchars(start_match, end_subject - start_match, TRUE, md);
6935 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6936 first starting point for which a partial match was found. */
6938 md->start_match_ptr = start_match;
6939 md->start_used_ptr = start_match;
6940 md->match_call_count = 0;
6941 md->match_function_type = 0;
6942 md->end_offset_top = 0;
6943 md->skip_arg_count = 0;
6944 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
6945 if (md->hitend && start_partial == NULL)
6947 start_partial = md->start_used_ptr;
6948 match_partial = start_match;
6953 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6954 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6955 entirely. The only way we can do that is to re-do the match at the same
6956 point, with a flag to force SKIP with an argument to be ignored. Just
6957 treating this case as NOMATCH does not work because it does not check other
6958 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6960 case MATCH_SKIP_ARG:
6961 new_start_match = start_match;
6962 md->ignore_skip_arg = md->skip_arg_count;
6965 /* SKIP passes back the next starting point explicitly, but if it is no
6966 greater than the match we have just done, treat it as NOMATCH. */
6969 if (md->start_match_ptr > start_match)
6971 new_start_match = md->start_match_ptr;
6976 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6977 exactly like PRUNE. Unset ignore SKIP-with-argument. */
6982 md->ignore_skip_arg = 0;
6983 new_start_match = start_match + 1;
6986 ACROSSCHAR(new_start_match < end_subject, *new_start_match,
6991 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6997 /* Any other return is either a match, or some kind of error. */
7003 /* Control reaches here for the various types of "no match at this point"
7004 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
7008 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
7009 newline in the subject (though it may continue over the newline). Therefore,
7010 if we have just failed to match, starting at a newline, do not continue. */
7012 if (firstline && IS_NEWLINE(start_match)) break;
7014 /* Advance to new matching position */
7016 start_match = new_start_match;
7018 /* Break the loop if the pattern is anchored or if we have passed the end of
7021 if (anchored || start_match > end_subject) break;
7023 /* If we have just passed a CR and we are now at a LF, and the pattern does
7024 not contain any explicit matches for \r or \n, and the newline option is CRLF
7025 or ANY or ANYCRLF, advance the match position by one more character. In
7026 normal matching start_match will always be greater than the first position at
7027 this stage, but a failed *SKIP can cause a return at the same point, which is
7028 why the first test exists. */
7030 if (start_match > (PCRE_PUCHAR)subject + start_offset &&
7031 start_match[-1] == CHAR_CR &&
7032 start_match < end_subject &&
7033 *start_match == CHAR_NL &&
7034 (re->flags & PCRE_HASCRORLF) == 0 &&
7035 (md->nltype == NLTYPE_ANY ||
7036 md->nltype == NLTYPE_ANYCRLF ||
7040 md->mark = NULL; /* Reset for start of next match attempt */
7041 } /* End of for(;;) "bumpalong" loop */
7043 /* ==========================================================================*/
7045 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
7048 (1) The pattern is anchored or the match was failed by (*COMMIT);
7050 (2) We are past the end of the subject;
7052 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
7053 this option requests that a match occur at or before the first newline in
7056 When we have a match and the offset vector is big enough to deal with any
7057 backreferences, captured substring offsets will already be set up. In the case
7058 where we had to get some local store to hold offsets for backreference
7059 processing, copy those that we can. In this case there need not be overflow if
7060 certain parts of the pattern were not used, even though there are more
7061 capturing parentheses than vector slots. */
7065 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
7067 if (using_temporary_offsets)
7069 if (arg_offset_max >= 4)
7071 memcpy(offsets + 2, md->offset_vector + 2,
7072 (arg_offset_max - 2) * sizeof(int));
7073 DPRINTF(("Copied offsets from temporary memory\n"));
7075 if (md->end_offset_top > arg_offset_max) md->capture_last |= OVFLBIT;
7076 DPRINTF(("Freeing temporary memory\n"));
7077 (PUBL(free))(md->offset_vector);
7080 /* Set the return code to the number of captured strings, or 0 if there were
7081 too many to fit into the vector. */
7083 rc = ((md->capture_last & OVFLBIT) != 0 &&
7084 md->end_offset_top >= arg_offset_max)?
7085 0 : md->end_offset_top/2;
7087 /* If there is space in the offset vector, set any unused pairs at the end of
7088 the pattern to -1 for backwards compatibility. It is documented that this
7089 happens. In earlier versions, the whole set of potential capturing offsets
7090 was set to -1 each time round the loop, but this is handled differently now.
7091 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
7092 those at the end that need unsetting here. We can't just unset them all at
7093 the start of the whole thing because they may get set in one branch that is
7094 not the final matching branch. */
7096 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
7098 register int *iptr, *iend;
7099 int resetcount = 2 + re->top_bracket * 2;
/* Limit the reset to offsetcount entries so that the loop below stays within
the caller's vector. */
7100 if (resetcount > offsetcount) resetcount = offsetcount;
7101 iptr = offsets + md->end_offset_top;
7102 iend = offsets + resetcount;
7103 while (iptr < iend) *iptr++ = -1;
7106 /* If there is space, set up the whole thing as substring 0. The value of
7107 md->start_match_ptr might be modified if \K was encountered on the success
7110 if (offsetcount < 2) rc = 0; else
7112 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
7113 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
7116 /* Return MARK data if requested */
7118 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
7119 *(extra_data->mark) = (pcre_uchar *)md->mark;
7120 DPRINTF((">>>> returning %d\n", rc));
7122 release_match_heapframes(&frame_zero);
7127 /* Control gets here if there has been an error, or if the overall match
7128 attempt has failed at all permitted starting positions. */
7130 if (using_temporary_offsets)
7132 DPRINTF(("Freeing temporary memory\n"));
7133 (PUBL(free))(md->offset_vector);
7136 /* For anything other than nomatch or partial match, just return the code. */
7138 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
7140 DPRINTF((">>>> error: returning %d\n", rc));
7142 release_match_heapframes(&frame_zero);
7147 /* Handle partial matches - disable any mark data */
7149 if (match_partial != NULL)
7151 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
7153 if (offsetcount > 1)
/* start_partial and match_partial were recorded in the bumpalong loop the
first time md->hitend was seen: the former from md->start_used_ptr, the latter
from start_match. offsets[1] is always the end of the subject. */
7155 offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject);
7156 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
7157 if (offsetcount > 2)
7158 offsets[2] = (int)(match_partial - (PCRE_PUCHAR)subject);
7160 rc = PCRE_ERROR_PARTIAL;
7163 /* This is the classic nomatch case */
7167 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
7168 rc = PCRE_ERROR_NOMATCH;
7171 /* Return the MARK data if it has been requested. */
7173 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
7174 *(extra_data->mark) = (pcre_uchar *)md->nomatch_mark;
7176 release_match_heapframes(&frame_zero);
7181 /* End of pcre_exec.c */