blob: aedabdc844f23afc0b2798509c3974fec7388972 [file] [log] [blame]
darind7737ab2005-09-09 00:51:07 +00001/*************************************************
2* Perl-Compatible Regular Expressions *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2005 University of Cambridge
10
darince72b7a2007-02-06 19:42:35 +000011 Copyright (C) 2002, 2004, 2006, 2007 Apple Inc. All rights reserved.
12
darind7737ab2005-09-09 00:51:07 +000013-----------------------------------------------------------------------------
14Redistribution and use in source and binary forms, with or without
15modification, are permitted provided that the following conditions are met:
16
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
19
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
23
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
27
28THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38POSSIBILITY OF SUCH DAMAGE.
39-----------------------------------------------------------------------------
40*/
41
42
43/* This module contains pcre_exec(), the externally visible function that does
44pattern matching using an NFA algorithm, trying to mimic Perl as closely as
45possible. There are also some static supporting functions. */
46
darind7737ab2005-09-09 00:51:07 +000047#include "pcre_internal.h"
48
darinb847b442006-10-27 16:48:28 +000049/* Avoid warnings on Windows. */
50#undef min
51#undef max
darind7737ab2005-09-09 00:51:07 +000052
53/* Structure for building a chain of data that actually lives on the
54stack, for holding the values of the subject pointer at the start of each
55subpattern, so as to detect when an empty string has been matched by a
56subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
57are on the heap, not on the stack. */
58
59typedef struct eptrblock {
60 struct eptrblock *epb_prev;
61 const pcre_uchar *epb_saved_eptr;
62} eptrblock;
63
64/* Flag bits for the match() function */
65
#define match_condassert  (1 << 0)  /* Evaluating the assertion of a conditional group */
#define match_isgroup     (1 << 1)  /* Entered at the start of a bracketed group */
68
69/* Non-error returns from the match() function. Error returns are externally
70defined PCRE_ERROR_xxx codes, which are all negative. */
71
#define MATCH_NOMATCH  0   /* This path through the pattern failed */
#define MATCH_MATCH    1   /* This path through the pattern succeeded */
74
75/* Maximum number of ints of offset to save on the stack for recursive calls.
76If the offset vector is bigger, malloc is used. This should be a multiple of 3,
77because the offset vector is always a multiple of 3 long. */
78
79#define REC_STACK_SAVE_MAX 30
80
81/* Min and max values for the common repeats; for the maxima, 0 => infinity */
82
/* The two tables are parallel: entry k of each describes the same repeat.
NOTE(review): presumably indexed by (repeat opcode - first repeat opcode), in
greedy/minimizing pairs - confirm at the use sites. */

static const char rep_min[] = {
  0, 0,   /* at least zero times */
  1, 1,   /* at least once */
  0, 0    /* at least zero times */
};

static const char rep_max[] = {
  0, 0,   /* 0 => no upper limit */
  0, 0,   /* 0 => no upper limit */
  1, 1    /* at most once */
};
85
86
87
88#ifdef DEBUG
89/*************************************************
90* Debugging function to print chars *
91*************************************************/
92
/* Print a sequence of chars in printable format, stopping at the end of the
subject if requested.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE

Returns:     nothing
*/
104
105static void
106pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
107{
108int c;
109if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
110while (length-- > 0)
111 if (isprint(c = *(p++))) printf("%c", c);
112#if PCRE_UTF16
113 else if (c < 256) printf("\\x%02x", c);
114 else printf("\\x{%x}", c);
115#else
116 else printf("\\x%02x", c);
117#endif
118}
119#endif
120
121
122
123/*************************************************
124* Match a back-reference *
125*************************************************/
126
127/* If a back reference hasn't been set, the length that is passed is greater
128than the number of characters left in the string, so the match fails.
129
130Arguments:
131 offset index into the offset vector
132 eptr points into the subject
133 length length to be matched
134 md points to match data block
135 ims the ims flags
136
137Returns: TRUE if matched
138*/
139
140static BOOL
141match_ref(int offset, register const pcre_uchar *eptr, int length, match_data *md,
142 unsigned long int ims)
143{
144const pcre_uchar *p = md->start_subject + md->offset_vector[offset];
145
146#ifdef DEBUG
147if (eptr >= md->end_subject)
148 printf("matching subject <null>");
149else
150 {
151 printf("matching subject ");
152 pchars(eptr, length, TRUE, md);
153 }
154printf(" against backref ");
155pchars(p, length, FALSE, md);
156printf("\n");
157#endif
158
159/* Always fail if not enough characters left */
160
161if (length > md->end_subject - eptr) return FALSE;
162
163/* Separate the caselesss case for speed */
164
165if ((ims & PCRE_CASELESS) != 0)
166 {
167 while (length-- > 0)
168 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
169 }
170else
171 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
172
173return TRUE;
174}
175
176
177
/***************************************************************************
****************************************************************************
                   RECURSION IN THE match() FUNCTION

The match() function is highly recursive. Some regular expressions can cause
it to recurse thousands of times. I was writing for Unix, so I just let it
call itself recursively. This uses the stack for saving everything that has
to be saved for a recursive call. On Unix, the stack can be large, and this
works fine.

It turns out that on non-Unix systems there are problems with programs that
use a lot of stack. (This despite the fact that every last chip has oodles
of memory these days, and techniques for extending the stack have been known
for decades.) So....

There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
calls by keeping local variables that need to be preserved in blocks of memory
obtained from malloc instead of on the stack. Macros are used to achieve this
so that the actual code doesn't look very different to what it always used to.
****************************************************************************
***************************************************************************/
200
201
202/* These versions of the macros use the stack, as normal */
203
204#ifndef NO_RECURSE
/* True recursion: match() may keep its arguments in registers. */
#define REGISTER register

/* A "recursive call" is literally a call to match(); the site number is not
needed in this build and is dropped. */
#define RMATCH(num,rx,ra,rb,rc,rd,re,rf,rg) \
  rx = match(ra,rb,rc,rd,re,rf,rg)

/* Leaving an incarnation of match() is an ordinary return. */
#define RRETURN(ra) return ra
208#else
209
210
211/* These versions of the macros manage a private stack on the heap. Note
212that the rd argument of RMATCH isn't actually used. It's the md argument of
213match(), which never changes. */
214
/* REGISTER expands to nothing in the heap-frame build. */

#define REGISTER

/* Choose how RRETURN gets back to the RMATCH site that "called" it. Both
variants take the site number, which lets them share code and makes it
noticeable if two sites use conflicting numbers. */

#ifdef __GNUC__

/* GCC's computed goto extension: remember the address of the site's
RRETURN_<num> label and jump through it. For one test case this is more than
40% faster than the switch statement. Local labels would make the number
unnecessary here, but it is kept so both variants look the same. */

#define RMATCH_WHERE(num) &&RRETURN_##num
#define RRETURN_LABEL *frame->Xwhere

#else

/* Other compilers: remember the site number itself and jump to the
RRETURN_SWITCH label, where a switch statement at the bottom of the match
function dispatches on it. */

#define RMATCH_WHERE(num) num
#define RRETURN_LABEL RRETURN_SWITCH

#endif
237
238
/* Heap-frame replacement for a recursive call to match().

  num   number identifying this call site (selects the RRETURN_<num> label)
  rx    lvalue that receives the result of the "call"
  ra    new eptr          rb   new ecode        rc   new offset_top
  rd    md; not used, because md never changes
  re    new ims           rf   new eptrb        rg   new flags

The next frame is the following slot of the on-stack stackframes array while
the current frame lives in that array and a slot is left; otherwise it comes
from pcre_stack_malloc. If that allocation fails, the current incarnation
gives up with PCRE_ERROR_NOMEMORY instead of writing through a NULL pointer.

After the frame has been filled in, control jumps to HEAP_RECURSE. The matching
RRETURN later jumps back to the RRETURN_<num> label, where the current frame is
reloaded from md->thisframe and the result is copied into rx. */

#define RMATCH(num,rx,ra,rb,rc,rd,re,rf,rg)\
  {\
  heapframe *newframe;\
  if (frame >= stackframes && frame + 1 < stackframesend)\
    newframe = frame + 1;\
  else\
    {\
    newframe = (pcre_stack_malloc)(sizeof(heapframe));\
    if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
    }\
  frame->Xwhere = RMATCH_WHERE(num);\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
RRETURN_##num:\
  DPRINTF(("did a goto back to line %d\n", __LINE__));\
  frame = md->thisframe;\
  rx = frame->Xresult;\
  }
262
/* Heap-frame replacement for "return ra" inside match().

The argument is evaluated exactly once, before anything else happens. In this
build most of match()'s "locals" are really fields of *frame, and the code
below switches frame to the caller's frame and may free the old one, so
expanding ra any later could read it from the wrong or an already freed frame.

The finished frame is released with pcre_stack_free unless it is a slot of the
on-stack stackframes array. If there is a calling frame, the result is left in
its Xresult, md->thisframe is pointed at it and control jumps back to the
RMATCH site recorded there; otherwise this was the top level and match()
really returns. */

#define RRETURN(ra)\
  {\
  int rreturn_value = (ra);\
  heapframe *doneframe = frame;\
  frame = doneframe->Xprevframe;\
  if (!(doneframe >= stackframes && doneframe < stackframesend))\
    (pcre_stack_free)(doneframe);\
  if (frame != NULL)\
    {\
    frame->Xresult = rreturn_value;\
    md->thisframe = frame;\
    goto RRETURN_LABEL;\
    }\
  return rreturn_value;\
  }
277
darind7737ab2005-09-09 00:51:07 +0000278/* Structure for remembering the local variables in a private frame */
279
280typedef struct heapframe {
281 struct heapframe *Xprevframe;
282
283 /* Function arguments that may change */
284
285 const pcre_uchar *Xeptr;
286 const uschar *Xecode;
287 int Xoffset_top;
288 long int Xims;
289 eptrblock *Xeptrb;
290 int Xflags;
291
292 /* Function local variables */
293
294 const uschar *Xcallpat;
295 const uschar *Xcharptr;
296 const uschar *Xdata;
297 const uschar *Xnext;
298 const pcre_uchar *Xpp;
299 const uschar *Xprev;
300 const pcre_uchar *Xsaved_eptr;
301
302 recursion_info Xnew_recursive;
303
304 BOOL Xcur_is_word;
305 BOOL Xcondition;
306 BOOL Xminimize;
307 BOOL Xprev_is_word;
308
309 unsigned long int Xoriginal_ims;
310
311#ifdef SUPPORT_UCP
312 int Xprop_type;
313 int Xprop_fail_result;
314 int Xprop_category;
315 int Xprop_chartype;
316 int Xprop_othercase;
317 int Xprop_test_against;
318 int *Xprop_test_variable;
darin8bff71f2007-02-07 20:02:50 +0000319
320 int Xrepeat_othercase;
darind7737ab2005-09-09 00:51:07 +0000321#endif
322
323 int Xctype;
324 int Xfc;
325 int Xfi;
326 int Xlength;
327 int Xmax;
328 int Xmin;
329 int Xnumber;
330 int Xoffset;
331 int Xop;
332 int Xsave_capture_last;
333 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
334 int Xstacksave[REC_STACK_SAVE_MAX];
335
336 eptrblock Xnewptrb;
337
338 /* Place to pass back result, and where to jump back to */
339
darined76fb52007-02-06 21:55:25 +0000340 int Xresult;
darince72b7a2007-02-06 19:42:35 +0000341#ifndef __GNUC__
darined76fb52007-02-06 21:55:25 +0000342 int Xwhere;
darince72b7a2007-02-06 19:42:35 +0000343#else
344 void *Xwhere;
345#endif
darind7737ab2005-09-09 00:51:07 +0000346
347} heapframe;
348
349#endif
350
351
352/***************************************************************************
353***************************************************************************/
354
355
356
357/*************************************************
358* Match from current position *
359*************************************************/
360
361/* On entry ecode points to the first opcode, and eptr to the first character
362in the subject string, while eptrb holds the value of eptr at the start of the
363last bracketed group - used for breaking infinite loops matching zero-length
364strings. This function is called recursively in many circumstances. Whenever it
365returns a negative (error) response, the outer incarnation must also return the
366same response.
367
368Performance note: It might be tempting to extract commonly used fields from the
369md structure (e.g. utf8, end_subject) into individual variables to improve
370performance. Tests using gcc on a SPARC disproved this; in the first case, it
371made performance worse.
372
373Arguments:
374 eptr pointer in subject
375 ecode position in code
376 offset_top current top pointer
377 md pointer to "static" info for the match
378 ims current /i, /m, and /s options
379 eptrb pointer to chain of blocks containing eptr at start of
380 brackets - for testing for empty matches
381 flags can contain
382 match_condassert - this is an assertion condition
383 match_isgroup - this is the start of a bracketed group
384
385Returns: MATCH_MATCH if matched ) these values are >= 0
386 MATCH_NOMATCH if failed to match )
387 a negative PCRE_ERROR_xxx value if aborted by an error condition
388 (e.g. stopped by recursion limit)
389*/
390
391static int
392match(REGISTER const pcre_uchar *eptr, REGISTER const uschar *ecode,
393 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
394 int flags)
395{
396/* These variables do not need to be preserved over recursion in this function,
397so they can be ordinary variables in all cases. Mark them with "register"
398because they are used a lot in loops. */
399
400register int rrc; /* Returns from recursive calls */
401register int i; /* Used for loops not involving calls to RMATCH() */
402register int c; /* Character values not kept over RMATCH() calls */
403register BOOL utf8; /* Local copy of UTF-8 flag for speed */
404
405/* When recursion is not being used, all "local" variables that have to be
406preserved over calls to RMATCH() are part of a "frame" which is obtained from
407heap storage. Set up the top-level frame here; others are obtained from the
408heap whenever RMATCH() does a "recursion". See the macro definitions above. */
409
410#ifdef NO_RECURSE
darince72b7a2007-02-06 19:42:35 +0000411
412/* The value 16 here is large enough that most regular expressions don't require
413any calls to pcre_stack_malloc, yet the amount of stack used for the array is
414modest enough that we don't run out of stack. */
415heapframe stackframes[16];
416heapframe *stackframesend = stackframes + sizeof(stackframes) / sizeof(stackframes[0]);
417
418heapframe *frame = stackframes;
darind7737ab2005-09-09 00:51:07 +0000419frame->Xprevframe = NULL; /* Marks the top level */
420
421/* Copy in the original argument variables */
422
423frame->Xeptr = eptr;
424frame->Xecode = ecode;
425frame->Xoffset_top = offset_top;
426frame->Xims = ims;
427frame->Xeptrb = eptrb;
428frame->Xflags = flags;
429
/* This is where control jumps back to in order to effect "recursion" */
431
432HEAP_RECURSE:
433
434/* Macros make the argument variables come from the current frame */
435
436#define eptr frame->Xeptr
437#define ecode frame->Xecode
438#define offset_top frame->Xoffset_top
439#define ims frame->Xims
440#define eptrb frame->Xeptrb
441#define flags frame->Xflags
442
443/* Ditto for the local variables */
444
445#ifdef SUPPORT_UTF8
446#define charptr frame->Xcharptr
447#endif
448#define callpat frame->Xcallpat
449#define data frame->Xdata
450#define next frame->Xnext
451#define pp frame->Xpp
452#define prev frame->Xprev
453#define saved_eptr frame->Xsaved_eptr
454
455#define new_recursive frame->Xnew_recursive
456
457#define cur_is_word frame->Xcur_is_word
458#define condition frame->Xcondition
459#define minimize frame->Xminimize
460#define prev_is_word frame->Xprev_is_word
461
462#define original_ims frame->Xoriginal_ims
463
464#ifdef SUPPORT_UCP
darin8bff71f2007-02-07 20:02:50 +0000465
darind7737ab2005-09-09 00:51:07 +0000466#define prop_type frame->Xprop_type
467#define prop_fail_result frame->Xprop_fail_result
468#define prop_category frame->Xprop_category
469#define prop_chartype frame->Xprop_chartype
470#define prop_othercase frame->Xprop_othercase
471#define prop_test_against frame->Xprop_test_against
472#define prop_test_variable frame->Xprop_test_variable
darin8bff71f2007-02-07 20:02:50 +0000473
474#define repeat_othercase frame->Xrepeat_othercase
475
darind7737ab2005-09-09 00:51:07 +0000476#endif
477
478#define ctype frame->Xctype
479#define fc frame->Xfc
480#define fi frame->Xfi
481#define length frame->Xlength
482#define max frame->Xmax
483#define min frame->Xmin
484#define number frame->Xnumber
485#define offset frame->Xoffset
486#define op frame->Xop
487#define save_capture_last frame->Xsave_capture_last
488#define save_offset1 frame->Xsave_offset1
489#define save_offset2 frame->Xsave_offset2
490#define save_offset3 frame->Xsave_offset3
491#define stacksave frame->Xstacksave
492
493#define newptrb frame->Xnewptrb
494
495/* When recursion is being used, local variables are allocated on the stack and
496get preserved during recursion in the normal way. In this environment, fi and
497i, and fc and c, can be the same variables. */
498
499#else
500#define fi i
501#define fc c
502
503
504#if !PCRE_UTF16
505#ifdef SUPPORT_UTF8 /* Many of these variables are used ony */
506const uschar *charptr; /* small blocks of the code. My normal */
507#endif /* style of coding would have declared */
508#endif
509const uschar *callpat; /* them within each of those blocks. */
510const uschar *data; /* However, in order to accommodate the */
511const uschar *next; /* version of this code that uses an */
512const pcre_uchar *pp; /* external "stack" implemented on the */
513const uschar *prev; /* heap, it is easier to declare them */
514const pcre_uchar *saved_eptr; /* all here, so the declarations can */
515 /* be cut out in a block. The only */
516recursion_info new_recursive; /* declarations within blocks below are */
517 /* for variables that do not have to */
518BOOL cur_is_word; /* be preserved over a recursive call */
519BOOL condition; /* to RMATCH(). */
520BOOL minimize;
521BOOL prev_is_word;
522
523unsigned long int original_ims;
524
525#ifdef SUPPORT_UCP
darin8bff71f2007-02-07 20:02:50 +0000526
darind7737ab2005-09-09 00:51:07 +0000527int prop_type;
528int prop_fail_result;
529int prop_category;
530int prop_chartype;
531int prop_othercase;
532int prop_test_against;
533int *prop_test_variable;
darin8bff71f2007-02-07 20:02:50 +0000534
535int repeat_othercase;
536
darind7737ab2005-09-09 00:51:07 +0000537#endif
538
539int ctype;
540int length;
541int max;
542int min;
543int number;
544int offset;
545int op;
546int save_capture_last;
547int save_offset1, save_offset2, save_offset3;
548int stacksave[REC_STACK_SAVE_MAX];
549
550eptrblock newptrb;
551#endif
552
/* These statements are here to stop the compiler complaining about
uninitialized variables. */
555
556#ifdef SUPPORT_UCP
557prop_fail_result = 0;
558prop_test_against = 0;
559prop_test_variable = NULL;
560#endif
561
562/* OK, now we can get on with the real code of the function. Recursion is
563specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
564these just turn into a recursive call to match() and a "return", respectively.
565However, RMATCH isn't like a function call because it's quite a complicated
566macro. It has to be used in one particular way. This shouldn't, however, impact
567performance when true recursion is being used. */
568
darince72b7a2007-02-06 19:42:35 +0000569utf8 = md->utf8; /* Local copy of the flag */
570
darind7737ab2005-09-09 00:51:07 +0000571if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
572
573original_ims = ims; /* Save for resetting on ')' */
darind7737ab2005-09-09 00:51:07 +0000574
575/* At the start of a bracketed group, add the current subject pointer to the
576stack of such pointers, to be re-instated at the end of the group when we hit
577the closing ket. When match() is called in other circumstances, we don't add to
578this stack. */
579
580if ((flags & match_isgroup) != 0)
581 {
582 newptrb.epb_prev = eptrb;
583 newptrb.epb_saved_eptr = eptr;
584 eptrb = &newptrb;
585 }
586
587/* Now start processing the operations. */
588
589for (;;)
590 {
591 op = *ecode;
592 minimize = FALSE;
593
594 /* For partial matching, remember if we ever hit the end of the subject after
595 matching at least one subject character. */
596
597 if (md->partial &&
598 eptr >= md->end_subject &&
599 eptr > md->start_match)
600 md->hitend = TRUE;
601
602 /* Opening capturing bracket. If there is space in the offset vector, save
603 the current subject position in the working slot at the top of the vector. We
604 mustn't change the current values of the data slot, because they may be set
605 from a previous iteration of this group, and be referred to by a reference
606 inside the group.
607
608 If the bracket fails to match, we need to restore this value and also the
609 values of the final offsets, in case they were set by a previous iteration of
610 the same bracket.
611
612 If there isn't enough space in the offset vector, treat this as if it were a
613 non-capturing bracket. Don't worry about setting the flag for the error case
614 here; that is handled in the code for KET. */
615
616 if (op > OP_BRA)
617 {
618 number = op - OP_BRA;
619
620 /* For extended extraction brackets (large number), we have to fish out the
621 number from a dummy opcode at the start. */
622
623 if (number > EXTRACT_BASIC_MAX)
624 number = GET2(ecode, 2+LINK_SIZE);
625 offset = number << 1;
626
627#ifdef DEBUG
628 printf("start bracket %d subject=", number);
629 pchars(eptr, 16, TRUE, md);
630 printf("\n");
631#endif
632
633 if (offset < md->offset_max)
634 {
635 save_offset1 = md->offset_vector[offset];
636 save_offset2 = md->offset_vector[offset+1];
637 save_offset3 = md->offset_vector[md->offset_end - number];
638 save_capture_last = md->capture_last;
639
640 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
thatcherdc18a362006-08-31 21:28:29 +0000641 md->offset_vector[md->offset_end - number] = INT_CAST(eptr - md->start_subject);
darind7737ab2005-09-09 00:51:07 +0000642
643 do
644 {
darined76fb52007-02-06 21:55:25 +0000645 RMATCH(1, rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
darind7737ab2005-09-09 00:51:07 +0000646 match_isgroup);
647 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
648 md->capture_last = save_capture_last;
649 ecode += GET(ecode, 1);
650 }
651 while (*ecode == OP_ALT);
652
653 DPRINTF(("bracket %d failed\n", number));
654
655 md->offset_vector[offset] = save_offset1;
656 md->offset_vector[offset+1] = save_offset2;
657 md->offset_vector[md->offset_end - number] = save_offset3;
658
659 RRETURN(MATCH_NOMATCH);
660 }
661
662 /* Insufficient room for saving captured contents */
663
664 else op = OP_BRA;
665 }
666
667 /* Other types of node can be handled by a switch */
668
669 switch(op)
670 {
671 case OP_BRA: /* Non-capturing bracket: optimized */
672 DPRINTF(("start bracket 0\n"));
673 do
674 {
darined76fb52007-02-06 21:55:25 +0000675 RMATCH(2, rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
darind7737ab2005-09-09 00:51:07 +0000676 match_isgroup);
677 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
678 ecode += GET(ecode, 1);
679 }
680 while (*ecode == OP_ALT);
681 DPRINTF(("bracket 0 failed\n"));
682 RRETURN(MATCH_NOMATCH);
683
684 /* Conditional group: compilation checked that there are no more than
685 two branches. If the condition is false, skipping the first branch takes us
686 past the end if there is only one branch, but that's OK because that is
687 exactly what going to the ket would do. */
688
689 case OP_COND:
690 if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
691 {
692 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
693 condition = (offset == CREF_RECURSE * 2)?
694 (md->recursive != NULL) :
695 (offset < offset_top && md->offset_vector[offset] >= 0);
darined76fb52007-02-06 21:55:25 +0000696 RMATCH(3, rrc, eptr, ecode + (condition?
darind7737ab2005-09-09 00:51:07 +0000697 (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
698 offset_top, md, ims, eptrb, match_isgroup);
699 RRETURN(rrc);
700 }
701
702 /* The condition is an assertion. Call match() to evaluate it - setting
703 the final argument TRUE causes it to stop at the end of an assertion. */
704
705 else
706 {
darined76fb52007-02-06 21:55:25 +0000707 RMATCH(4, rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
darind7737ab2005-09-09 00:51:07 +0000708 match_condassert | match_isgroup);
709 if (rrc == MATCH_MATCH)
710 {
711 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
712 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
713 }
714 else if (rrc != MATCH_NOMATCH)
715 {
716 RRETURN(rrc); /* Need braces because of following else */
717 }
718 else ecode += GET(ecode, 1);
darined76fb52007-02-06 21:55:25 +0000719 RMATCH(5, rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
darind7737ab2005-09-09 00:51:07 +0000720 match_isgroup);
721 RRETURN(rrc);
722 }
723 /* Control never reaches here */
724
725 /* Skip over conditional reference or large extraction number data if
726 encountered. */
727
728 case OP_CREF:
729 case OP_BRANUMBER:
730 ecode += 3;
731 break;
732
733 /* End of the pattern. If we are in a recursion, we should restore the
734 offsets appropriately and continue from after the call. */
735
736 case OP_END:
737 if (md->recursive != NULL && md->recursive->group_num == 0)
738 {
739 recursion_info *rec = md->recursive;
740 DPRINTF(("Hit the end in a (?0) recursion\n"));
741 md->recursive = rec->prevrec;
742 memmove(md->offset_vector, rec->offset_save,
743 rec->saved_max * sizeof(int));
744 md->start_match = rec->save_start;
745 ims = original_ims;
746 ecode = rec->after_call;
747 break;
748 }
749
750 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
751 string - backtracking will then try other alternatives, if any. */
752
753 if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
754 md->end_match_ptr = eptr; /* Record where we ended */
755 md->end_offset_top = offset_top; /* and how many extracts were taken */
756 RRETURN(MATCH_MATCH);
757
758 /* Change option settings */
759
760 case OP_OPT:
761 ims = ecode[1];
762 ecode += 2;
763 DPRINTF(("ims set to %02lx\n", ims));
764 break;
765
766 /* Assertion brackets. Check the alternative branches in turn - the
767 matching won't pass the KET for an assertion. If any one branch matches,
768 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
769 start of each branch to move the current point backwards, so the code at
770 this level is identical to the lookahead case. */
771
772 case OP_ASSERT:
773 case OP_ASSERTBACK:
774 do
775 {
darined76fb52007-02-06 21:55:25 +0000776 RMATCH(6, rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
darind7737ab2005-09-09 00:51:07 +0000777 match_isgroup);
778 if (rrc == MATCH_MATCH) break;
779 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
780 ecode += GET(ecode, 1);
781 }
782 while (*ecode == OP_ALT);
783 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
784
785 /* If checking an assertion for a condition, return MATCH_MATCH. */
786
787 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
788
789 /* Continue from after the assertion, updating the offsets high water
790 mark, since extracts may have been taken during the assertion. */
791
792 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
793 ecode += 1 + LINK_SIZE;
794 offset_top = md->end_offset_top;
795 continue;
796
797 /* Negative assertion: all branches must fail to match */
798
799 case OP_ASSERT_NOT:
800 case OP_ASSERTBACK_NOT:
801 do
802 {
darined76fb52007-02-06 21:55:25 +0000803 RMATCH(7, rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
darind7737ab2005-09-09 00:51:07 +0000804 match_isgroup);
805 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
806 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
807 ecode += GET(ecode,1);
808 }
809 while (*ecode == OP_ALT);
810
811 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
812
813 ecode += 1 + LINK_SIZE;
814 continue;
815
816 /* Move the subject pointer back. This occurs only at the start of
817 each branch of a lookbehind assertion. If we are too close to the start to
818 move back, this match function fails. When working with UTF-8 we move
819 back a number of characters, not bytes. */
820
821 case OP_REVERSE:
822#ifdef SUPPORT_UTF8
823 if (utf8)
824 {
825 c = GET(ecode,1);
826 for (i = 0; i < c; i++)
827 {
828 eptr--;
829 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
830 BACKCHAR(eptr)
831 }
832 }
833 else
834#endif
835
836 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
837
838 {
839 eptr -= GET(ecode,1);
840 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
841 }
842
843 /* Skip to next op code */
844
845 ecode += 1 + LINK_SIZE;
846 break;
847
848 /* The callout item calls an external function, if one is provided, passing
849 details of the match so far. This is mainly for debugging, though the
850 function is able to force a failure. */
851
852 case OP_CALLOUT:
853 if (pcre_callout != NULL)
854 {
855 pcre_callout_block cb;
856 cb.version = 1; /* Version 1 of the callout block */
857 cb.callout_number = ecode[1];
858 cb.offset_vector = md->offset_vector;
859 cb.subject = (const pcre_char *)md->start_subject;
thatcherdc18a362006-08-31 21:28:29 +0000860 cb.subject_length = INT_CAST(md->end_subject - md->start_subject);
861 cb.start_match = INT_CAST(md->start_match - md->start_subject);
862 cb.current_position = INT_CAST(eptr - md->start_subject);
darind7737ab2005-09-09 00:51:07 +0000863 cb.pattern_position = GET(ecode, 2);
864 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
865 cb.capture_top = offset_top/2;
866 cb.capture_last = md->capture_last;
867 cb.callout_data = md->callout_data;
868 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
869 if (rrc < 0) RRETURN(rrc);
870 }
871 ecode += 2 + 2*LINK_SIZE;
872 break;
873
874 /* Recursion either matches the current regex, or some subexpression. The
875 offset data is the offset to the starting bracket from the start of the
876 whole pattern. (This is so that it works from duplicated subpatterns.)
877
878 If there are any capturing brackets started but not finished, we have to
879 save their starting points and reinstate them after the recursion. However,
880 we don't know how many such there are (offset_top records the completed
881 total) so we just have to save all the potential data. There may be up to
882 65535 such values, which is too large to put on the stack, but using malloc
883 for small numbers seems expensive. As a compromise, the stack is used when
884 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
885 is used. A problem is what to do if the malloc fails ... there is no way of
886 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
887 values on the stack, and accept that the rest may be wrong.
888
889 There are also other values that have to be saved. We use a chained
890 sequence of blocks that actually live on the stack. Thanks to Robin Houston
891 for the original version of this logic. */
892
893 case OP_RECURSE:
894 {
895 callpat = md->start_code + GET(ecode, 1);
896 new_recursive.group_num = *callpat - OP_BRA;
897
898 /* For extended extraction brackets (large number), we have to fish out
899 the number from a dummy opcode at the start. */
900
901 if (new_recursive.group_num > EXTRACT_BASIC_MAX)
902 new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
903
904 /* Add to "recursing stack" */
905
906 new_recursive.prevrec = md->recursive;
907 md->recursive = &new_recursive;
908
909 /* Find where to continue from afterwards */
910
911 ecode += 1 + LINK_SIZE;
912 new_recursive.after_call = ecode;
913
914 /* Now save the offset data. */
915
916 new_recursive.saved_max = md->offset_end;
917 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
918 new_recursive.offset_save = stacksave;
919 else
920 {
921 new_recursive.offset_save =
922 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
923 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
924 }
925
926 memcpy(new_recursive.offset_save, md->offset_vector,
927 new_recursive.saved_max * sizeof(int));
928 new_recursive.save_start = md->start_match;
929 md->start_match = eptr;
930
931 /* OK, now we can do the recursion. For each top-level alternative we
932 restore the offset and recursion data. */
933
934 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
935 do
936 {
darined76fb52007-02-06 21:55:25 +0000937 RMATCH(8, rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
darind7737ab2005-09-09 00:51:07 +0000938 eptrb, match_isgroup);
939 if (rrc == MATCH_MATCH)
940 {
941 md->recursive = new_recursive.prevrec;
942 if (new_recursive.offset_save != stacksave)
943 (pcre_free)(new_recursive.offset_save);
944 RRETURN(MATCH_MATCH);
945 }
946 else if (rrc != MATCH_NOMATCH) RRETURN(rrc);
947
948 md->recursive = &new_recursive;
949 memcpy(md->offset_vector, new_recursive.offset_save,
950 new_recursive.saved_max * sizeof(int));
951 callpat += GET(callpat, 1);
952 }
953 while (*callpat == OP_ALT);
954
955 DPRINTF(("Recursion didn't match\n"));
956 md->recursive = new_recursive.prevrec;
957 if (new_recursive.offset_save != stacksave)
958 (pcre_free)(new_recursive.offset_save);
959 RRETURN(MATCH_NOMATCH);
960 }
961 /* Control never reaches here */
962
963 /* "Once" brackets are like assertion brackets except that after a match,
964 the point in the subject string is not moved back. Thus there can never be
965 a move back into the brackets. Friedl calls these "atomic" subpatterns.
966 Check the alternative branches in turn - the matching won't pass the KET
967 for this kind of subpattern. If any one branch matches, we carry on as at
968 the end of a normal bracket, leaving the subject pointer. */
969
970 case OP_ONCE:
971 {
972 prev = ecode;
973 saved_eptr = eptr;
974
975 do
976 {
darined76fb52007-02-06 21:55:25 +0000977 RMATCH(9, rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
darind7737ab2005-09-09 00:51:07 +0000978 eptrb, match_isgroup);
979 if (rrc == MATCH_MATCH) break;
980 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
981 ecode += GET(ecode,1);
982 }
983 while (*ecode == OP_ALT);
984
985 /* If we hit the end of the group (which could be repeated), fail */
986
987 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
988
989 /* Continue as from after the assertion, updating the offsets high water
990 mark, since extracts may have been taken. */
991
992 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
993
994 offset_top = md->end_offset_top;
995 eptr = md->end_match_ptr;
996
997 /* For a non-repeating ket, just continue at this level. This also
998 happens for a repeating ket if no characters were matched in the group.
999 This is the forcible breaking of infinite loops as implemented in Perl
1000 5.005. If there is an options reset, it will get obeyed in the normal
1001 course of events. */
1002
1003 if (*ecode == OP_KET || eptr == saved_eptr)
1004 {
1005 ecode += 1+LINK_SIZE;
1006 break;
1007 }
1008
1009 /* The repeating kets try the rest of the pattern or restart from the
1010 preceding bracket, in the appropriate order. We need to reset any options
1011 that changed within the bracket before re-running it, so check the next
1012 opcode. */
1013
1014 if (ecode[1+LINK_SIZE] == OP_OPT)
1015 {
1016 ims = (ims & ~PCRE_IMS) | ecode[4];
1017 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1018 }
1019
1020 if (*ecode == OP_KETRMIN)
1021 {
darined76fb52007-02-06 21:55:25 +00001022 RMATCH(10, rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00001023 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
darined76fb52007-02-06 21:55:25 +00001024 RMATCH(11, rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
darind7737ab2005-09-09 00:51:07 +00001025 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1026 }
1027 else /* OP_KETRMAX */
1028 {
darined76fb52007-02-06 21:55:25 +00001029 RMATCH(12, rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
darind7737ab2005-09-09 00:51:07 +00001030 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
darined76fb52007-02-06 21:55:25 +00001031 RMATCH(13, rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00001032 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1033 }
1034 }
1035 RRETURN(MATCH_NOMATCH);
1036
1037 /* An alternation is the end of a branch; scan along to find the end of the
1038 bracketed group and go to there. */
1039
1040 case OP_ALT:
1041 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1042 break;
1043
1044 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1045 that it may occur zero times. It may repeat infinitely, or not at all -
1046 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1047 repeat limits are compiled as a number of copies, with the optional ones
1048 preceded by BRAZERO or BRAMINZERO. */
1049
1050 case OP_BRAZERO:
1051 {
1052 next = ecode+1;
darined76fb52007-02-06 21:55:25 +00001053 RMATCH(14, rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
darind7737ab2005-09-09 00:51:07 +00001054 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1055 do next += GET(next,1); while (*next == OP_ALT);
1056 ecode = next + 1+LINK_SIZE;
1057 }
1058 break;
1059
1060 case OP_BRAMINZERO:
1061 {
1062 next = ecode+1;
1063 do next += GET(next,1); while (*next == OP_ALT);
darined76fb52007-02-06 21:55:25 +00001064 RMATCH(15, rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
darind7737ab2005-09-09 00:51:07 +00001065 match_isgroup);
1066 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1067 ecode++;
1068 }
1069 break;
1070
1071 /* End of a group, repeated or non-repeating. If we are at the end of
1072 an assertion "group", stop matching and return MATCH_MATCH, but record the
1073 current high water mark for use by positive assertions. Do this also
1074 for the "once" (no backing up) groups. */
1075
1076 case OP_KET:
1077 case OP_KETRMIN:
1078 case OP_KETRMAX:
1079 {
1080 prev = ecode - GET(ecode, 1);
1081 saved_eptr = eptrb->epb_saved_eptr;
1082
1083 /* Back up the stack of bracket start pointers. */
1084
1085 eptrb = eptrb->epb_prev;
1086
1087 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1088 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1089 *prev == OP_ONCE)
1090 {
1091 md->end_match_ptr = eptr; /* For ONCE */
1092 md->end_offset_top = offset_top;
1093 RRETURN(MATCH_MATCH);
1094 }
1095
1096 /* In all other cases except a conditional group we have to check the
1097 group number back at the start and if necessary complete handling an
1098 extraction by setting the offsets and bumping the high water mark. */
1099
1100 if (*prev != OP_COND)
1101 {
1102 number = *prev - OP_BRA;
1103
1104 /* For extended extraction brackets (large number), we have to fish out
1105 the number from a dummy opcode at the start. */
1106
1107 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
1108 offset = number << 1;
1109
1110#ifdef DEBUG
1111 printf("end bracket %d", number);
1112 printf("\n");
1113#endif
1114
1115 /* Test for a numbered group. This includes groups called as a result
1116 of recursion. Note that whole-pattern recursion is coded as a recurse
1117 into group 0, so it won't be picked up here. Instead, we catch it when
1118 the OP_END is reached. */
1119
1120 if (number > 0)
1121 {
1122 md->capture_last = number;
1123 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1124 {
1125 md->offset_vector[offset] =
1126 md->offset_vector[md->offset_end - number];
thatcherdc18a362006-08-31 21:28:29 +00001127 md->offset_vector[offset+1] = INT_CAST(eptr - md->start_subject);
darind7737ab2005-09-09 00:51:07 +00001128 if (offset_top <= offset) offset_top = offset + 2;
1129 }
1130
1131 /* Handle a recursively called group. Restore the offsets
1132 appropriately and continue from after the call. */
1133
1134 if (md->recursive != NULL && md->recursive->group_num == number)
1135 {
1136 recursion_info *rec = md->recursive;
1137 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1138 md->recursive = rec->prevrec;
1139 md->start_match = rec->save_start;
1140 memcpy(md->offset_vector, rec->offset_save,
1141 rec->saved_max * sizeof(int));
1142 ecode = rec->after_call;
1143 ims = original_ims;
1144 break;
1145 }
1146 }
1147 }
1148
1149 /* Reset the value of the ims flags, in case they got changed during
1150 the group. */
1151
1152 ims = original_ims;
1153 DPRINTF(("ims reset to %02lx\n", ims));
1154
1155 /* For a non-repeating ket, just continue at this level. This also
1156 happens for a repeating ket if no characters were matched in the group.
1157 This is the forcible breaking of infinite loops as implemented in Perl
1158 5.005. If there is an options reset, it will get obeyed in the normal
1159 course of events. */
1160
1161 if (*ecode == OP_KET || eptr == saved_eptr)
1162 {
1163 ecode += 1 + LINK_SIZE;
1164 break;
1165 }
1166
1167 /* The repeating kets try the rest of the pattern or restart from the
1168 preceding bracket, in the appropriate order. */
1169
1170 if (*ecode == OP_KETRMIN)
1171 {
darined76fb52007-02-06 21:55:25 +00001172 RMATCH(16, rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00001173 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
darined76fb52007-02-06 21:55:25 +00001174 RMATCH(17, rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
darind7737ab2005-09-09 00:51:07 +00001175 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1176 }
1177 else /* OP_KETRMAX */
1178 {
darined76fb52007-02-06 21:55:25 +00001179 RMATCH(18, rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
darind7737ab2005-09-09 00:51:07 +00001180 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
darined76fb52007-02-06 21:55:25 +00001181 RMATCH(19, rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00001182 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1183 }
1184 }
1185
1186 RRETURN(MATCH_NOMATCH);
1187
1188 /* Start of subject unless notbol, or after internal newline if multiline */
1189
1190 case OP_CIRC:
1191 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1192 if ((ims & PCRE_MULTILINE) != 0)
1193 {
1194 if (eptr != md->start_subject && eptr[-1] != NEWLINE)
1195 RRETURN(MATCH_NOMATCH);
1196 ecode++;
1197 break;
1198 }
1199 /* ... else fall through */
1200
1201 /* Start of subject assertion */
1202
1203 case OP_SOD:
1204 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1205 ecode++;
1206 break;
1207
1208 /* Start of match assertion */
1209
1210 case OP_SOM:
1211 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1212 ecode++;
1213 break;
1214
1215 /* Assert before internal newline if multiline, or before a terminating
1216 newline unless endonly is set, else end of subject unless noteol is set. */
1217
1218 case OP_DOLL:
1219 if ((ims & PCRE_MULTILINE) != 0)
1220 {
1221 if (eptr < md->end_subject)
1222 { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
1223 else
1224 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1225 ecode++;
1226 break;
1227 }
1228 else
1229 {
1230 if (md->noteol) RRETURN(MATCH_NOMATCH);
1231 if (!md->endonly)
1232 {
1233 if (eptr < md->end_subject - 1 ||
1234 (eptr == md->end_subject - 1 && *eptr != NEWLINE))
1235 RRETURN(MATCH_NOMATCH);
1236 ecode++;
1237 break;
1238 }
1239 }
1240 /* ... else fall through */
1241
1242 /* End of subject assertion (\z) */
1243
1244 case OP_EOD:
1245 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1246 ecode++;
1247 break;
1248
1249 /* End of subject or ending \n assertion (\Z) */
1250
1251 case OP_EODN:
1252 if (eptr < md->end_subject - 1 ||
1253 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
1254 ecode++;
1255 break;
1256
1257 /* Word boundary assertions */
1258
1259 case OP_NOT_WORD_BOUNDARY:
1260 case OP_WORD_BOUNDARY:
1261 {
1262
1263 /* Find out if the previous and current characters are "word" characters.
1264 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1265 be "non-word" characters. */
1266
1267#ifdef SUPPORT_UTF8
1268 if (utf8)
1269 {
1270 if (eptr == md->start_subject) prev_is_word = FALSE; else
1271 {
1272 const pcre_uchar *lastptr = eptr - 1;
1273 while(ISMIDCHAR(*lastptr)) lastptr--;
1274 GETCHAR(c, lastptr);
1275 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1276 }
1277 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1278 {
1279 GETCHAR(c, eptr);
1280 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1281 }
1282 }
1283 else
1284#endif
1285
1286 /* More streamlined when not in UTF-8 mode */
1287
1288 {
1289 prev_is_word = (eptr != md->start_subject) &&
1290 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1291 cur_is_word = (eptr < md->end_subject) &&
1292 ((md->ctypes[*eptr] & ctype_word) != 0);
1293 }
1294
1295 /* Now see if the situation is what we want */
1296
1297 if ((*ecode++ == OP_WORD_BOUNDARY)?
1298 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1299 RRETURN(MATCH_NOMATCH);
1300 }
1301 break;
1302
1303 /* Match a single character type; inline for speed */
1304
1305 case OP_ANY:
1306 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
1307 RRETURN(MATCH_NOMATCH);
1308 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1309#ifdef SUPPORT_UTF8
1310 if (utf8)
1311 while (eptr < md->end_subject && ISMIDCHAR(*eptr)) eptr++;
1312#endif
1313 ecode++;
1314 break;
1315
1316 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1317 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1318
1319 case OP_ANYBYTE:
1320 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1321 ecode++;
1322 break;
1323
1324 case OP_NOT_DIGIT:
1325 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1326 GETCHARINCTEST(c, eptr);
1327 if (
1328#ifdef SUPPORT_UTF8
1329 c < 256 &&
1330#endif
1331 (md->ctypes[c] & ctype_digit) != 0
1332 )
1333 RRETURN(MATCH_NOMATCH);
1334 ecode++;
1335 break;
1336
1337 case OP_DIGIT:
1338 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1339 GETCHARINCTEST(c, eptr);
1340 if (
1341#ifdef SUPPORT_UTF8
1342 c >= 256 ||
1343#endif
1344 (md->ctypes[c] & ctype_digit) == 0
1345 )
1346 RRETURN(MATCH_NOMATCH);
1347 ecode++;
1348 break;
1349
1350 case OP_NOT_WHITESPACE:
1351 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1352 GETCHARINCTEST(c, eptr);
1353 if (
1354#ifdef SUPPORT_UTF8
1355 c < 256 &&
1356#endif
1357 (md->ctypes[c] & ctype_space) != 0
1358 )
1359 RRETURN(MATCH_NOMATCH);
1360 ecode++;
1361 break;
1362
1363 case OP_WHITESPACE:
1364 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1365 GETCHARINCTEST(c, eptr);
1366 if (
1367#ifdef SUPPORT_UTF8
1368 c >= 256 ||
1369#endif
1370 (md->ctypes[c] & ctype_space) == 0
1371 )
1372 RRETURN(MATCH_NOMATCH);
1373 ecode++;
1374 break;
1375
1376 case OP_NOT_WORDCHAR:
1377 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1378 GETCHARINCTEST(c, eptr);
1379 if (
1380#ifdef SUPPORT_UTF8
1381 c < 256 &&
1382#endif
1383 (md->ctypes[c] & ctype_word) != 0
1384 )
1385 RRETURN(MATCH_NOMATCH);
1386 ecode++;
1387 break;
1388
1389 case OP_WORDCHAR:
1390 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1391 GETCHARINCTEST(c, eptr);
1392 if (
1393#ifdef SUPPORT_UTF8
1394 c >= 256 ||
1395#endif
1396 (md->ctypes[c] & ctype_word) == 0
1397 )
1398 RRETURN(MATCH_NOMATCH);
1399 ecode++;
1400 break;
1401
1402#ifdef SUPPORT_UCP
1403 /* Check the next character by Unicode property. We will get here only
1404 if the support is in the binary; otherwise a compile-time error occurs. */
1405
1406 case OP_PROP:
1407 case OP_NOTPROP:
1408 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1409 GETCHARINCTEST(c, eptr);
1410 {
1411 int chartype, rqdtype;
1412 int othercase;
ddkilzer60a7a802007-01-01 05:07:40 +00001413 int category = _pcre_ucp_findchar(c, &chartype, &othercase);
darind7737ab2005-09-09 00:51:07 +00001414
1415 rqdtype = *(++ecode);
1416 ecode++;
1417
1418 if (rqdtype >= 128)
1419 {
1420 if ((rqdtype - 128 != category) == (op == OP_PROP))
1421 RRETURN(MATCH_NOMATCH);
1422 }
1423 else
1424 {
1425 if ((rqdtype != chartype) == (op == OP_PROP))
1426 RRETURN(MATCH_NOMATCH);
1427 }
1428 }
1429 break;
1430
1431 /* Match an extended Unicode sequence. We will get here only if the support
1432 is in the binary; otherwise a compile-time error occurs. */
1433
1434 case OP_EXTUNI:
1435 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1436 GETCHARINCTEST(c, eptr);
1437 {
1438 int chartype;
1439 int othercase;
ddkilzer60a7a802007-01-01 05:07:40 +00001440 int category = _pcre_ucp_findchar(c, &chartype, &othercase);
darind7737ab2005-09-09 00:51:07 +00001441 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1442 while (eptr < md->end_subject)
1443 {
1444 int len = 1;
1445 if (!utf8) c = *eptr; else
1446 {
1447 GETCHARLEN(c, eptr, len);
1448 }
ddkilzer60a7a802007-01-01 05:07:40 +00001449 category = _pcre_ucp_findchar(c, &chartype, &othercase);
darind7737ab2005-09-09 00:51:07 +00001450 if (category != ucp_M) break;
1451 eptr += len;
1452 }
1453 }
1454 ecode++;
1455 break;
1456#endif
1457
1458
1459 /* Match a back reference, possibly repeatedly. Look past the end of the
1460 item to see if there is repeat information following. The code is similar
1461 to that for character classes, but repeated for efficiency. Then obey
1462 similar code to character type repeats - written out again for speed.
1463 However, if the referenced string is the empty string, always treat
1464 it as matched, any number of times (otherwise there could be infinite
1465 loops). */
1466
1467 case OP_REF:
1468 {
thatcherdc18a362006-08-31 21:28:29 +00001469 int tmplen;
darind7737ab2005-09-09 00:51:07 +00001470 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1471 ecode += 3; /* Advance past item */
1472
1473 /* If the reference is unset, set the length to be longer than the amount
1474 of subject left; this ensures that every attempt at a match fails. We
1475 can't just fail here, because of the possibility of quantifiers with zero
1476 minima. */
1477
thatcherdc18a362006-08-31 21:28:29 +00001478 tmplen = INT_CAST(md->end_subject - eptr + 1);
darind7737ab2005-09-09 00:51:07 +00001479 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
thatcherdc18a362006-08-31 21:28:29 +00001480 tmplen :
darind7737ab2005-09-09 00:51:07 +00001481 md->offset_vector[offset+1] - md->offset_vector[offset];
1482
1483 /* Set up for repetition, or handle the non-repeated case */
1484
1485 switch (*ecode)
1486 {
1487 case OP_CRSTAR:
1488 case OP_CRMINSTAR:
1489 case OP_CRPLUS:
1490 case OP_CRMINPLUS:
1491 case OP_CRQUERY:
1492 case OP_CRMINQUERY:
1493 c = *ecode++ - OP_CRSTAR;
1494 minimize = (c & 1) != 0;
1495 min = rep_min[c]; /* Pick up values from tables; */
1496 max = rep_max[c]; /* zero for max => infinity */
1497 if (max == 0) max = INT_MAX;
1498 break;
1499
1500 case OP_CRRANGE:
1501 case OP_CRMINRANGE:
1502 minimize = (*ecode == OP_CRMINRANGE);
1503 min = GET2(ecode, 1);
1504 max = GET2(ecode, 3);
1505 if (max == 0) max = INT_MAX;
1506 ecode += 5;
1507 break;
1508
1509 default: /* No repeat follows */
1510 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1511 eptr += length;
1512 continue; /* With the main loop */
1513 }
1514
1515 /* If the length of the reference is zero, just continue with the
1516 main loop. */
1517
1518 if (length == 0) continue;
1519
1520 /* First, ensure the minimum number of matches are present. We get back
1521 the length of the reference string explicitly rather than passing the
1522 address of eptr, so that eptr can be a register variable. */
1523
1524 for (i = 1; i <= min; i++)
1525 {
1526 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1527 eptr += length;
1528 }
1529
1530 /* If min = max, continue at the same level without recursion.
1531 They are not both allowed to be zero. */
1532
1533 if (min == max) continue;
1534
1535 /* If minimizing, keep trying and advancing the pointer */
1536
1537 if (minimize)
1538 {
1539 for (fi = min;; fi++)
1540 {
darined76fb52007-02-06 21:55:25 +00001541 RMATCH(20, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00001542 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1543 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1544 RRETURN(MATCH_NOMATCH);
1545 eptr += length;
1546 }
1547 /* Control never gets here */
1548 }
1549
1550 /* If maximizing, find the longest string and work backwards */
1551
1552 else
1553 {
1554 pp = eptr;
1555 for (i = min; i < max; i++)
1556 {
1557 if (!match_ref(offset, eptr, length, md, ims)) break;
1558 eptr += length;
1559 }
1560 while (eptr >= pp)
1561 {
darined76fb52007-02-06 21:55:25 +00001562 RMATCH(21, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00001563 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1564 eptr -= length;
1565 }
1566 RRETURN(MATCH_NOMATCH);
1567 }
1568 }
1569 /* Control never gets here */
1570
1571
1572
1573 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1574 used when all the characters in the class have values in the range 0-255,
1575 and either the matching is caseful, or the characters are in the range
1576 0-127 when UTF-8 processing is enabled. The only difference between
1577 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1578 encountered.
1579
1580 First, look past the end of the item to see if there is repeat information
1581 following. Then obey similar code to character type repeats - written out
1582 again for speed. */
1583
1584 case OP_NCLASS:
1585 case OP_CLASS:
1586 {
1587 data = ecode + 1; /* Save for matching */
1588 ecode += 33; /* Advance past the item */
1589
1590 switch (*ecode)
1591 {
1592 case OP_CRSTAR:
1593 case OP_CRMINSTAR:
1594 case OP_CRPLUS:
1595 case OP_CRMINPLUS:
1596 case OP_CRQUERY:
1597 case OP_CRMINQUERY:
1598 c = *ecode++ - OP_CRSTAR;
1599 minimize = (c & 1) != 0;
1600 min = rep_min[c]; /* Pick up values from tables; */
1601 max = rep_max[c]; /* zero for max => infinity */
1602 if (max == 0) max = INT_MAX;
1603 break;
1604
1605 case OP_CRRANGE:
1606 case OP_CRMINRANGE:
1607 minimize = (*ecode == OP_CRMINRANGE);
1608 min = GET2(ecode, 1);
1609 max = GET2(ecode, 3);
1610 if (max == 0) max = INT_MAX;
1611 ecode += 5;
1612 break;
1613
1614 default: /* No repeat follows */
1615 min = max = 1;
1616 break;
1617 }
1618
1619 /* First, ensure the minimum number of matches are present. */
1620
1621#ifdef SUPPORT_UTF8
1622 /* UTF-8 mode */
1623 if (utf8)
1624 {
1625 for (i = 1; i <= min; i++)
1626 {
1627 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1628 GETCHARINC(c, eptr);
1629 if (c > 255)
1630 {
1631 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1632 }
1633 else
1634 {
1635 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1636 }
1637 }
1638 }
1639 else
1640#endif
1641 /* Not UTF-8 mode */
1642 {
1643 for (i = 1; i <= min; i++)
1644 {
1645 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1646 c = *eptr++;
1647 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1648 }
1649 }
1650
1651 /* If max == min we can continue with the main loop without the
1652 need to recurse. */
1653
1654 if (min == max) continue;
1655
1656 /* If minimizing, keep testing the rest of the expression and advancing
1657 the pointer while it matches the class. */
1658
1659 if (minimize)
1660 {
1661#ifdef SUPPORT_UTF8
1662 /* UTF-8 mode */
1663 if (utf8)
1664 {
1665 for (fi = min;; fi++)
1666 {
darined76fb52007-02-06 21:55:25 +00001667 RMATCH(22, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00001668 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1669 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1670 GETCHARINC(c, eptr);
1671 if (c > 255)
1672 {
1673 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1674 }
1675 else
1676 {
1677 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1678 }
1679 }
1680 }
1681 else
1682#endif
1683 /* Not UTF-8 mode */
1684 {
1685 for (fi = min;; fi++)
1686 {
darined76fb52007-02-06 21:55:25 +00001687 RMATCH(23, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00001688 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1689 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1690 c = *eptr++;
1691 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1692 }
1693 }
1694 /* Control never gets here */
1695 }
1696
1697 /* If maximizing, find the longest possible run, then work backwards. */
1698
1699 else
1700 {
1701 pp = eptr;
1702
1703#ifdef SUPPORT_UTF8
1704 /* UTF-8 mode */
1705 if (utf8)
1706 {
1707 for (i = min; i < max; i++)
1708 {
1709 int len = 1;
1710 if (eptr >= md->end_subject) break;
1711 GETCHARLEN(c, eptr, len);
1712 if (c > 255)
1713 {
1714 if (op == OP_CLASS) break;
1715 }
1716 else
1717 {
1718 if ((data[c/8] & (1 << (c&7))) == 0) break;
1719 }
1720 eptr += len;
1721 }
1722 for (;;)
1723 {
darined76fb52007-02-06 21:55:25 +00001724 RMATCH(24, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00001725 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1726 if (eptr-- == pp) break; /* Stop if tried at original pos */
1727 BACKCHAR(eptr);
1728 }
1729 }
1730 else
1731#endif
1732 /* Not UTF-8 mode */
1733 {
1734 for (i = min; i < max; i++)
1735 {
1736 if (eptr >= md->end_subject) break;
1737 c = *eptr;
1738 if ((data[c/8] & (1 << (c&7))) == 0) break;
1739 eptr++;
1740 }
1741 while (eptr >= pp)
1742 {
darined76fb52007-02-06 21:55:25 +00001743 RMATCH(25, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00001744 eptr--;
1745 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1746 }
1747 }
1748
1749 RRETURN(MATCH_NOMATCH);
1750 }
1751 }
1752 /* Control never gets here */
1753
1754
1755 /* Match an extended character class. This opcode is encountered only
1756 in UTF-8 mode, because that's the only time it is compiled. */
1757
1758#ifdef SUPPORT_UTF8
1759 case OP_XCLASS:
1760 {
1761 data = ecode + 1 + LINK_SIZE; /* Save for matching */
1762 ecode += GET(ecode, 1); /* Advance past the item */
1763
1764 switch (*ecode)
1765 {
1766 case OP_CRSTAR:
1767 case OP_CRMINSTAR:
1768 case OP_CRPLUS:
1769 case OP_CRMINPLUS:
1770 case OP_CRQUERY:
1771 case OP_CRMINQUERY:
1772 c = *ecode++ - OP_CRSTAR;
1773 minimize = (c & 1) != 0;
1774 min = rep_min[c]; /* Pick up values from tables; */
1775 max = rep_max[c]; /* zero for max => infinity */
1776 if (max == 0) max = INT_MAX;
1777 break;
1778
1779 case OP_CRRANGE:
1780 case OP_CRMINRANGE:
1781 minimize = (*ecode == OP_CRMINRANGE);
1782 min = GET2(ecode, 1);
1783 max = GET2(ecode, 3);
1784 if (max == 0) max = INT_MAX;
1785 ecode += 5;
1786 break;
1787
1788 default: /* No repeat follows */
1789 min = max = 1;
1790 break;
1791 }
1792
1793 /* First, ensure the minimum number of matches are present. */
1794
1795 for (i = 1; i <= min; i++)
1796 {
1797 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1798 GETCHARINC(c, eptr);
1799 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1800 }
1801
1802 /* If max == min we can continue with the main loop without the
1803 need to recurse. */
1804
1805 if (min == max) continue;
1806
1807 /* If minimizing, keep testing the rest of the expression and advancing
1808 the pointer while it matches the class. */
1809
1810 if (minimize)
1811 {
1812 for (fi = min;; fi++)
1813 {
darined76fb52007-02-06 21:55:25 +00001814 RMATCH(26, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00001815 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1816 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1817 GETCHARINC(c, eptr);
1818 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1819 }
1820 /* Control never gets here */
1821 }
1822
1823 /* If maximizing, find the longest possible run, then work backwards. */
1824
1825 else
1826 {
1827 pp = eptr;
1828 for (i = min; i < max; i++)
1829 {
1830 int len = 1;
1831 if (eptr >= md->end_subject) break;
1832 GETCHARLEN(c, eptr, len);
1833 if (!_pcre_xclass(c, data)) break;
1834 eptr += len;
1835 }
1836 for(;;)
1837 {
darined76fb52007-02-06 21:55:25 +00001838 RMATCH(27, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00001839 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1840 if (eptr-- == pp) break; /* Stop if tried at original pos */
1841 BACKCHAR(eptr)
1842 }
1843 RRETURN(MATCH_NOMATCH);
1844 }
1845
1846 /* Control never gets here */
1847 }
1848#endif /* End of XCLASS */
1849
1850 /* Match a single character, casefully */
1851
1852 case OP_CHAR:
1853#ifdef SUPPORT_UTF8
1854 if (utf8)
1855 {
1856 length = 1;
1857 ecode++;
1858 GETUTF8CHARLEN(fc, ecode, length);
1859#if PCRE_UTF16
eseidel67d65af2005-09-29 22:05:12 +00001860 {
darina8702f52006-01-13 09:32:51 +00001861 int dc;
hyatt6c974dd2006-01-06 22:43:44 +00001862 ecode += length;
1863 switch (md->end_subject - eptr)
1864 {
1865 case 0:
eseidel67d65af2005-09-29 22:05:12 +00001866 RRETURN(MATCH_NOMATCH);
hyatt6c974dd2006-01-06 22:43:44 +00001867 case 1:
1868 dc = *eptr++;
1869 if (IS_LEADING_SURROGATE(dc))
1870 RRETURN(MATCH_NOMATCH);
1871 break;
1872 default:
1873 GETCHARINC(dc, eptr);
1874 }
1875 if (fc != dc) RRETURN(MATCH_NOMATCH);
1876 }
darind7737ab2005-09-09 00:51:07 +00001877#else
1878 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1879 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
1880#endif
1881 }
1882 else
1883#endif
1884
1885 /* Non-UTF-8 mode */
1886 {
1887 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1888 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1889 ecode += 2;
1890 }
1891 break;
1892
1893 /* Match a single character, caselessly */
1894
1895 case OP_CHARNC:
1896#ifdef SUPPORT_UTF8
1897 if (utf8)
1898 {
1899 length = 1;
1900 ecode++;
1901 GETUTF8CHARLEN(fc, ecode, length);
1902
eseidel67d65af2005-09-29 22:05:12 +00001903#if PCRE_UTF16
1904 if (md->end_subject - eptr == 0) RRETURN(MATCH_NOMATCH);
1905#else
darind7737ab2005-09-09 00:51:07 +00001906 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1907#endif
1908
1909 /* If the pattern character's value is < 128, we have only one byte, and
1910 can use the fast lookup table. */
1911
1912 if (fc < 128)
1913 {
1914#if PCRE_UTF16
1915 int dc;
1916 ecode++;
1917 dc = *eptr++;
1918 if (dc >= 128 || md->lcc[fc] != md->lcc[dc]) RRETURN(MATCH_NOMATCH);
1919#else
1920 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1921#endif
1922 }
1923
1924 /* Otherwise we must pick up the subject character */
1925
1926 else
1927 {
1928 int dc;
eseidel67d65af2005-09-29 22:05:12 +00001929#if PCRE_UTF16
1930 if (md->end_subject - eptr == 1) {
1931 dc = *eptr++;
1932 if (IS_LEADING_SURROGATE(dc))
1933 RRETURN(MATCH_NOMATCH);
1934 } else
1935#endif
1936 GETCHARINC(dc, eptr);
darind7737ab2005-09-09 00:51:07 +00001937 ecode += length;
1938
1939 /* If we have Unicode property support, we can use it to test the other
ddkilzer60a7a802007-01-01 05:07:40 +00001940 case of the character, if there is one. The result of _pcre_ucp_findchar() is
darind7737ab2005-09-09 00:51:07 +00001941 < 0 if the char isn't found, and othercase is returned as zero if there
1942 isn't one. */
1943
1944 if (fc != dc)
1945 {
1946#ifdef SUPPORT_UCP
1947 int chartype;
1948 int othercase;
ddkilzer60a7a802007-01-01 05:07:40 +00001949 if (_pcre_ucp_findchar(fc, &chartype, &othercase) < 0 || dc != othercase)
darind7737ab2005-09-09 00:51:07 +00001950#endif
1951 RRETURN(MATCH_NOMATCH);
1952 }
1953 }
1954 }
1955 else
1956#endif /* SUPPORT_UTF8 */
1957
1958 /* Non-UTF-8 mode */
1959 {
1960 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1961 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1962 ecode += 2;
1963 }
1964 break;
1965
1966 /* Match a single character repeatedly; different opcodes share code. */
1967
1968 case OP_EXACT:
1969 min = max = GET2(ecode, 1);
1970 ecode += 3;
1971 goto REPEATCHAR;
1972
1973 case OP_UPTO:
1974 case OP_MINUPTO:
1975 min = 0;
1976 max = GET2(ecode, 1);
1977 minimize = *ecode == OP_MINUPTO;
1978 ecode += 3;
1979 goto REPEATCHAR;
1980
1981 case OP_STAR:
1982 case OP_MINSTAR:
1983 case OP_PLUS:
1984 case OP_MINPLUS:
1985 case OP_QUERY:
1986 case OP_MINQUERY:
1987 c = *ecode++ - OP_STAR;
1988 minimize = (c & 1) != 0;
1989 min = rep_min[c]; /* Pick up values from tables; */
1990 max = rep_max[c]; /* zero for max => infinity */
1991 if (max == 0) max = INT_MAX;
1992
1993 /* Common code for all repeated single-character matches. We can give
1994 up quickly if there are fewer than the minimum number of characters left in
1995 the subject. */
1996
1997 REPEATCHAR:
1998#ifdef SUPPORT_UTF8
1999#if PCRE_UTF16
hyatt6c974dd2006-01-06 22:43:44 +00002000
darina8702f52006-01-13 09:32:51 +00002001 length = 1;
darind7737ab2005-09-09 00:51:07 +00002002 GETUTF8CHARLEN(fc, ecode, length);
darina8702f52006-01-13 09:32:51 +00002003 {
darin496882e2006-07-15 15:30:03 +00002004 if (min * (fc > 0xFFFF ? 2 : 1) > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
darind7737ab2005-09-09 00:51:07 +00002005 ecode += length;
2006
darin496882e2006-07-15 15:30:03 +00002007 if (fc <= 0xFFFF)
darind7737ab2005-09-09 00:51:07 +00002008 {
darind7737ab2005-09-09 00:51:07 +00002009 int othercase;
2010 int chartype;
ddkilzer60a7a802007-01-01 05:07:40 +00002011 if ((ims & PCRE_CASELESS) == 0 || _pcre_ucp_findchar(fc, &chartype, &othercase) < 0)
darind7737ab2005-09-09 00:51:07 +00002012 othercase = -1; /* Guaranteed to not match any character */
darind7737ab2005-09-09 00:51:07 +00002013
2014 for (i = 1; i <= min; i++)
2015 {
2016 if (*eptr != fc && *eptr != othercase) RRETURN(MATCH_NOMATCH);
2017 ++eptr;
2018 }
2019
2020 if (min == max) continue;
2021
2022 if (minimize)
2023 {
darin8bff71f2007-02-07 20:02:50 +00002024 repeat_othercase = othercase;
darind7737ab2005-09-09 00:51:07 +00002025 for (fi = min;; fi++)
2026 {
darined76fb52007-02-06 21:55:25 +00002027 RMATCH(28, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00002028 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2029 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
darin8bff71f2007-02-07 20:02:50 +00002030 if (*eptr != fc && *eptr != repeat_othercase) RRETURN(MATCH_NOMATCH);
darind7737ab2005-09-09 00:51:07 +00002031 ++eptr;
2032 }
2033 /* Control never gets here */
2034 }
2035 else
2036 {
2037 pp = eptr;
2038 for (i = min; i < max; i++)
2039 {
darin496882e2006-07-15 15:30:03 +00002040 if (eptr >= md->end_subject) break;
darind7737ab2005-09-09 00:51:07 +00002041 if (*eptr != fc && *eptr != othercase) break;
2042 ++eptr;
2043 }
2044 while (eptr >= pp)
2045 {
darined76fb52007-02-06 21:55:25 +00002046 RMATCH(29, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00002047 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2048 --eptr;
2049 }
2050 RRETURN(MATCH_NOMATCH);
2051 }
2052 /* Control never gets here */
2053 }
2054 else
2055 {
2056 /* No case on surrogate pairs, so no need to bother with "othercase". */
2057
2058 for (i = 1; i <= min; i++)
2059 {
2060 int nc;
2061 GETCHAR(nc, eptr);
2062 if (nc != fc) RRETURN(MATCH_NOMATCH);
2063 eptr += 2;
2064 }
2065
2066 if (min == max) continue;
2067
2068 if (minimize)
2069 {
2070 for (fi = min;; fi++)
2071 {
2072 int nc;
darined76fb52007-02-06 21:55:25 +00002073 RMATCH(30, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00002074 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2075 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2076 GETCHAR(nc, eptr);
2077 if (*eptr != fc) RRETURN(MATCH_NOMATCH);
2078 eptr += 2;
2079 }
2080 /* Control never gets here */
2081 }
2082 else
2083 {
2084 pp = eptr;
2085 for (i = min; i < max; i++)
2086 {
2087 int nc;
darin496882e2006-07-15 15:30:03 +00002088 if (eptr > md->end_subject - 2) break;
darind7737ab2005-09-09 00:51:07 +00002089 GETCHAR(nc, eptr);
2090 if (*eptr != fc) break;
2091 eptr += 2;
2092 }
2093 while (eptr >= pp)
2094 {
darined76fb52007-02-06 21:55:25 +00002095 RMATCH(31, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00002096 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2097 eptr -= 2;
2098 }
2099 RRETURN(MATCH_NOMATCH);
2100 }
2101 /* Control never gets here */
2102 }
2103 /* Control never gets here */
darina8702f52006-01-13 09:32:51 +00002104 }
darind7737ab2005-09-09 00:51:07 +00002105#else
2106 if (utf8)
2107 {
2108 length = 1;
2109 charptr = ecode;
2110 GETCHARLEN(fc, ecode, length);
2111 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2112 ecode += length;
2113
2114 /* Handle multibyte character matching specially here. There is
2115 support for caseless matching if UCP support is present. */
2116
2117 if (length > 1)
2118 {
2119 int oclength = 0;
2120 uschar occhars[8];
2121
2122#ifdef SUPPORT_UCP
2123 int othercase;
2124 int chartype;
2125 if ((ims & PCRE_CASELESS) != 0 &&
ddkilzer60a7a802007-01-01 05:07:40 +00002126 _pcre_ucp_findchar(fc, &chartype, &othercase) >= 0 &&
darind7737ab2005-09-09 00:51:07 +00002127 othercase > 0)
2128 oclength = _pcre_ord2utf8(othercase, occhars);
2129#endif /* SUPPORT_UCP */
2130
2131 for (i = 1; i <= min; i++)
2132 {
2133 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2134 /* Need braces because of following else */
2135 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2136 else
2137 {
2138 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2139 eptr += oclength;
2140 }
2141 }
2142
2143 if (min == max) continue;
2144
2145 if (minimize)
2146 {
2147 for (fi = min;; fi++)
2148 {
darin8bff71f2007-02-07 20:02:50 +00002149 // FIXME: This could blow away occhars and oclength in the NO_RECURSE case.
darined76fb52007-02-06 21:55:25 +00002150 RMATCH(32, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00002151 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2152 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2153 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2154 /* Need braces because of following else */
2155 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2156 else
2157 {
2158 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2159 eptr += oclength;
2160 }
2161 }
2162 /* Control never gets here */
2163 }
2164 else
2165 {
2166 pp = eptr;
2167 for (i = min; i < max; i++)
2168 {
2169 if (eptr > md->end_subject - length) break;
2170 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2171 else if (oclength == 0) break;
2172 else
2173 {
2174 if (memcmp(eptr, occhars, oclength) != 0) break;
2175 eptr += oclength;
2176 }
2177 }
2178 while (eptr >= pp)
2179 {
darined76fb52007-02-06 21:55:25 +00002180 RMATCH(33, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00002181 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2182 eptr -= length;
2183 }
2184 RRETURN(MATCH_NOMATCH);
2185 }
2186 /* Control never gets here */
2187 }
2188
2189 /* If the length of a UTF-8 character is 1, we fall through here, and
2190 obey the code as for non-UTF-8 characters below, though in this case the
2191 value of fc will always be < 128. */
2192 }
2193 else
2194#endif
2195#endif /* SUPPORT_UTF8 */
2196
darinb847b442006-10-27 16:48:28 +00002197#if !PCRE_UTF16
darind7737ab2005-09-09 00:51:07 +00002198 /* When not in UTF-8 mode, load a single-byte character. */
2199 {
2200 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2201 fc = *ecode++;
2202 }
2203
2204 /* The value of fc at this point is always less than 256, though we may or
2205 may not be in UTF-8 mode. The code is duplicated for the caseless and
2206 caseful cases, for speed, since matching characters is likely to be quite
2207 common. First, ensure the minimum number of matches are present. If min =
2208 max, continue at the same level without recursing. Otherwise, if
2209 minimizing, keep trying the rest of the expression and advancing one
2210 matching character if failing, up to the maximum. Alternatively, if
2211 maximizing, find the maximum number of characters and work backwards. */
2212
2213 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2214 max, eptr));
2215
2216 if ((ims & PCRE_CASELESS) != 0)
2217 {
2218 fc = md->lcc[fc];
2219 for (i = 1; i <= min; i++)
2220 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2221 if (min == max) continue;
2222 if (minimize)
2223 {
2224 for (fi = min;; fi++)
2225 {
darined76fb52007-02-06 21:55:25 +00002226 RMATCH(34, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00002227 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2228 if (fi >= max || eptr >= md->end_subject ||
2229 fc != md->lcc[*eptr++])
2230 RRETURN(MATCH_NOMATCH);
2231 }
2232 /* Control never gets here */
2233 }
2234 else
2235 {
2236 pp = eptr;
2237 for (i = min; i < max; i++)
2238 {
2239 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2240 eptr++;
2241 }
2242 while (eptr >= pp)
2243 {
darined76fb52007-02-06 21:55:25 +00002244 RMATCH(35, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00002245 eptr--;
2246 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2247 }
2248 RRETURN(MATCH_NOMATCH);
2249 }
2250 /* Control never gets here */
2251 }
2252
2253 /* Caseful comparisons (includes all multi-byte characters) */
2254
2255 else
2256 {
2257 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2258 if (min == max) continue;
2259 if (minimize)
2260 {
2261 for (fi = min;; fi++)
2262 {
darined76fb52007-02-06 21:55:25 +00002263 RMATCH(36, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00002264 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2265 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2266 RRETURN(MATCH_NOMATCH);
2267 }
2268 /* Control never gets here */
2269 }
2270 else
2271 {
2272 pp = eptr;
2273 for (i = min; i < max; i++)
2274 {
2275 if (eptr >= md->end_subject || fc != *eptr) break;
2276 eptr++;
2277 }
2278 while (eptr >= pp)
2279 {
darined76fb52007-02-06 21:55:25 +00002280 RMATCH(37, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00002281 eptr--;
2282 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2283 }
2284 RRETURN(MATCH_NOMATCH);
2285 }
2286 }
2287 /* Control never gets here */
darinb847b442006-10-27 16:48:28 +00002288#endif
darind7737ab2005-09-09 00:51:07 +00002289
2290 /* Match a negated single one-byte character. The character we are
2291 checking can be multibyte. */
2292
2293 case OP_NOT:
2294 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2295 ecode++;
2296 GETCHARINCTEST(c, eptr);
2297 if ((ims & PCRE_CASELESS) != 0)
2298 {
2299#ifdef SUPPORT_UTF8
2300 if (c < 256)
2301#endif
2302 c = md->lcc[c];
2303 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2304 }
2305 else
2306 {
2307 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2308 }
2309 break;
2310
2311 /* Match a negated single one-byte character repeatedly. This is almost a
2312 repeat of the code for a repeated single character, but I haven't found a
2313 nice way of commoning these up that doesn't require a test of the
2314 positive/negative option for each character match. Maybe that wouldn't add
2315 very much to the time taken, but character matching *is* what this is all
2316 about... */
2317
2318 case OP_NOTEXACT:
2319 min = max = GET2(ecode, 1);
2320 ecode += 3;
2321 goto REPEATNOTCHAR;
2322
2323 case OP_NOTUPTO:
2324 case OP_NOTMINUPTO:
2325 min = 0;
2326 max = GET2(ecode, 1);
2327 minimize = *ecode == OP_NOTMINUPTO;
2328 ecode += 3;
2329 goto REPEATNOTCHAR;
2330
2331 case OP_NOTSTAR:
2332 case OP_NOTMINSTAR:
2333 case OP_NOTPLUS:
2334 case OP_NOTMINPLUS:
2335 case OP_NOTQUERY:
2336 case OP_NOTMINQUERY:
2337 c = *ecode++ - OP_NOTSTAR;
2338 minimize = (c & 1) != 0;
2339 min = rep_min[c]; /* Pick up values from tables; */
2340 max = rep_max[c]; /* zero for max => infinity */
2341 if (max == 0) max = INT_MAX;
2342
2343 /* Common code for all repeated single-byte matches. We can give up quickly
2344 if there are fewer than the minimum number of bytes left in the
2345 subject. */
2346
2347 REPEATNOTCHAR:
2348 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2349 fc = *ecode++;
2350
2351 /* The code is duplicated for the caseless and caseful cases, for speed,
2352 since matching characters is likely to be quite common. First, ensure the
2353 minimum number of matches are present. If min = max, continue at the same
2354 level without recursing. Otherwise, if minimizing, keep trying the rest of
2355 the expression and advancing one matching character if failing, up to the
2356 maximum. Alternatively, if maximizing, find the maximum number of
2357 characters and work backwards. */
2358
darin496882e2006-07-15 15:30:03 +00002359#if PCRE_UTF16
2360 DPRINTF(("negative matching %c{%d,%d}\n", fc, min, max));
2361#else
darind7737ab2005-09-09 00:51:07 +00002362 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2363 max, eptr));
darin496882e2006-07-15 15:30:03 +00002364#endif
darind7737ab2005-09-09 00:51:07 +00002365
2366 if ((ims & PCRE_CASELESS) != 0)
2367 {
2368 fc = md->lcc[fc];
2369
2370#ifdef SUPPORT_UTF8
2371 /* UTF-8 mode */
2372 if (utf8)
2373 {
2374 register int d;
2375 for (i = 1; i <= min; i++)
2376 {
2377 GETCHARINC(d, eptr);
2378 if (d < 256) d = md->lcc[d];
2379 if (fc == d) RRETURN(MATCH_NOMATCH);
2380 }
2381 }
2382 else
2383#endif
2384
2385 /* Not UTF-8 mode */
2386 {
2387 for (i = 1; i <= min; i++)
2388 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2389 }
2390
2391 if (min == max) continue;
2392
2393 if (minimize)
2394 {
2395#ifdef SUPPORT_UTF8
2396 /* UTF-8 mode */
2397 if (utf8)
2398 {
2399 register int d;
2400 for (fi = min;; fi++)
2401 {
darined76fb52007-02-06 21:55:25 +00002402 RMATCH(38, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00002403 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2404 GETCHARINC(d, eptr);
2405 if (d < 256) d = md->lcc[d];
2406 if (fi >= max || eptr >= md->end_subject || fc == d)
2407 RRETURN(MATCH_NOMATCH);
2408 }
2409 }
2410 else
2411#endif
2412 /* Not UTF-8 mode */
2413 {
2414 for (fi = min;; fi++)
2415 {
darined76fb52007-02-06 21:55:25 +00002416 RMATCH(39, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00002417 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2418 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2419 RRETURN(MATCH_NOMATCH);
2420 }
2421 }
2422 /* Control never gets here */
2423 }
2424
2425 /* Maximize case */
2426
2427 else
2428 {
2429 pp = eptr;
2430
2431#ifdef SUPPORT_UTF8
2432 /* UTF-8 mode */
2433 if (utf8)
2434 {
2435 register int d;
2436 for (i = min; i < max; i++)
2437 {
2438 int len = 1;
2439 if (eptr >= md->end_subject) break;
2440 GETCHARLEN(d, eptr, len);
2441 if (d < 256) d = md->lcc[d];
2442 if (fc == d) break;
2443 eptr += len;
2444 }
2445 for(;;)
2446 {
darined76fb52007-02-06 21:55:25 +00002447 RMATCH(40, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00002448 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2449 if (eptr-- == pp) break; /* Stop if tried at original pos */
2450 BACKCHAR(eptr);
2451 }
2452 }
2453 else
2454#endif
2455 /* Not UTF-8 mode */
2456 {
2457 for (i = min; i < max; i++)
2458 {
2459 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2460 eptr++;
2461 }
2462 while (eptr >= pp)
2463 {
darined76fb52007-02-06 21:55:25 +00002464 RMATCH(41, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00002465 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2466 eptr--;
2467 }
2468 }
2469
2470 RRETURN(MATCH_NOMATCH);
2471 }
2472 /* Control never gets here */
2473 }
2474
2475 /* Caseful comparisons */
2476
2477 else
2478 {
2479#ifdef SUPPORT_UTF8
2480 /* UTF-8 mode */
2481 if (utf8)
2482 {
2483 register int d;
2484 for (i = 1; i <= min; i++)
2485 {
2486 GETCHARINC(d, eptr);
2487 if (fc == d) RRETURN(MATCH_NOMATCH);
2488 }
2489 }
2490 else
2491#endif
2492 /* Not UTF-8 mode */
2493 {
2494 for (i = 1; i <= min; i++)
2495 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2496 }
2497
2498 if (min == max) continue;
2499
2500 if (minimize)
2501 {
2502#ifdef SUPPORT_UTF8
2503 /* UTF-8 mode */
2504 if (utf8)
2505 {
2506 register int d;
2507 for (fi = min;; fi++)
2508 {
darined76fb52007-02-06 21:55:25 +00002509 RMATCH(42, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00002510 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2511 GETCHARINC(d, eptr);
2512 if (fi >= max || eptr >= md->end_subject || fc == d)
2513 RRETURN(MATCH_NOMATCH);
2514 }
2515 }
2516 else
2517#endif
2518 /* Not UTF-8 mode */
2519 {
2520 for (fi = min;; fi++)
2521 {
darined76fb52007-02-06 21:55:25 +00002522 RMATCH(43, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00002523 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2524 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2525 RRETURN(MATCH_NOMATCH);
2526 }
2527 }
2528 /* Control never gets here */
2529 }
2530
2531 /* Maximize case */
2532
2533 else
2534 {
2535 pp = eptr;
2536
2537#ifdef SUPPORT_UTF8
2538 /* UTF-8 mode */
2539 if (utf8)
2540 {
2541 register int d;
2542 for (i = min; i < max; i++)
2543 {
2544 int len = 1;
2545 if (eptr >= md->end_subject) break;
2546 GETCHARLEN(d, eptr, len);
2547 if (fc == d) break;
2548 eptr += len;
2549 }
2550 for(;;)
2551 {
darined76fb52007-02-06 21:55:25 +00002552 RMATCH(44, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00002553 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2554 if (eptr-- == pp) break; /* Stop if tried at original pos */
2555 BACKCHAR(eptr);
2556 }
2557 }
2558 else
2559#endif
2560 /* Not UTF-8 mode */
2561 {
2562 for (i = min; i < max; i++)
2563 {
2564 if (eptr >= md->end_subject || fc == *eptr) break;
2565 eptr++;
2566 }
2567 while (eptr >= pp)
2568 {
darined76fb52007-02-06 21:55:25 +00002569 RMATCH(45, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00002570 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2571 eptr--;
2572 }
2573 }
2574
2575 RRETURN(MATCH_NOMATCH);
2576 }
2577 }
2578 /* Control never gets here */
2579
2580 /* Match a single character type repeatedly; several different opcodes
2581 share code. This is very similar to the code for single characters, but we
2582 repeat it in the interests of efficiency. */
2583
2584 case OP_TYPEEXACT:
2585 min = max = GET2(ecode, 1);
2586 minimize = TRUE;
2587 ecode += 3;
2588 goto REPEATTYPE;
2589
2590 case OP_TYPEUPTO:
2591 case OP_TYPEMINUPTO:
2592 min = 0;
2593 max = GET2(ecode, 1);
2594 minimize = *ecode == OP_TYPEMINUPTO;
2595 ecode += 3;
2596 goto REPEATTYPE;
2597
2598 case OP_TYPESTAR:
2599 case OP_TYPEMINSTAR:
2600 case OP_TYPEPLUS:
2601 case OP_TYPEMINPLUS:
2602 case OP_TYPEQUERY:
2603 case OP_TYPEMINQUERY:
2604 c = *ecode++ - OP_TYPESTAR;
2605 minimize = (c & 1) != 0;
2606 min = rep_min[c]; /* Pick up values from tables; */
2607 max = rep_max[c]; /* zero for max => infinity */
2608 if (max == 0) max = INT_MAX;
2609
2610 /* Common code for all repeated single character type matches. Note that
2611 in UTF-8 mode, '.' matches a character of any length, but for the other
2612 character types, the valid characters are all one-byte long. */
2613
2614 REPEATTYPE:
2615 ctype = *ecode++; /* Code for the character type */
2616
2617#ifdef SUPPORT_UCP
2618 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2619 {
2620 prop_fail_result = ctype == OP_NOTPROP;
2621 prop_type = *ecode++;
2622 if (prop_type >= 128)
2623 {
2624 prop_test_against = prop_type - 128;
2625 prop_test_variable = &prop_category;
2626 }
2627 else
2628 {
2629 prop_test_against = prop_type;
2630 prop_test_variable = &prop_chartype;
2631 }
2632 }
2633 else prop_type = -1;
2634#endif
2635
2636 /* First, ensure the minimum number of matches are present. Use inline
2637 code for maximizing the speed, and do the type test once at the start
2638 (i.e. keep it out of the loop). Also we can test that there are at least
2639 the minimum number of bytes before we start. This isn't as effective in
2640 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2641 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2642 and single-bytes. */
2643
2644 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2645 if (min > 0)
2646 {
2647#ifdef SUPPORT_UCP
2648 if (prop_type > 0)
2649 {
2650 for (i = 1; i <= min; i++)
2651 {
2652 GETCHARINC(c, eptr);
ddkilzer60a7a802007-01-01 05:07:40 +00002653 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
darind7737ab2005-09-09 00:51:07 +00002654 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
2655 RRETURN(MATCH_NOMATCH);
2656 }
2657 }
2658
2659 /* Match extended Unicode sequences. We will get here only if the
2660 support is in the binary; otherwise a compile-time error occurs. */
2661
2662 else if (ctype == OP_EXTUNI)
2663 {
2664 for (i = 1; i <= min; i++)
2665 {
2666 GETCHARINCTEST(c, eptr);
ddkilzer60a7a802007-01-01 05:07:40 +00002667 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
darind7737ab2005-09-09 00:51:07 +00002668 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2669 while (eptr < md->end_subject)
2670 {
2671 int len = 1;
2672 if (!utf8) c = *eptr; else
2673 {
2674 GETCHARLEN(c, eptr, len);
2675 }
ddkilzer60a7a802007-01-01 05:07:40 +00002676 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
darind7737ab2005-09-09 00:51:07 +00002677 if (prop_category != ucp_M) break;
2678 eptr += len;
2679 }
2680 }
2681 }
2682
2683 else
2684#endif /* SUPPORT_UCP */
2685
2686/* Handle all other cases when the coding is UTF-8 */
2687
2688#ifdef SUPPORT_UTF8
2689 if (utf8) switch(ctype)
2690 {
2691 case OP_ANY:
2692 for (i = 1; i <= min; i++)
2693 {
2694 if (eptr >= md->end_subject ||
2695 (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
2696 RRETURN(MATCH_NOMATCH);
2697 while (eptr < md->end_subject && ISMIDCHAR(*eptr)) eptr++;
2698 }
2699 break;
2700
2701 case OP_ANYBYTE:
2702 eptr += min;
2703 break;
2704
2705 case OP_NOT_DIGIT:
2706 for (i = 1; i <= min; i++)
2707 {
2708 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2709 GETCHARINC(c, eptr);
2710 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
2711 RRETURN(MATCH_NOMATCH);
2712 }
2713 break;
2714
2715 case OP_DIGIT:
2716 for (i = 1; i <= min; i++)
2717 {
2718 if (eptr >= md->end_subject ||
2719 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
2720 RRETURN(MATCH_NOMATCH);
2721 /* No need to skip more bytes - we know it's a 1-byte character */
2722 }
2723 break;
2724
2725 case OP_NOT_WHITESPACE:
2726 for (i = 1; i <= min; i++)
2727 {
2728 if (eptr >= md->end_subject ||
2729 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
2730 RRETURN(MATCH_NOMATCH);
2731 while (eptr < md->end_subject && ISMIDCHAR(*eptr)) eptr++;
2732 }
2733 break;
2734
2735 case OP_WHITESPACE:
2736 for (i = 1; i <= min; i++)
2737 {
2738 if (eptr >= md->end_subject ||
2739 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
2740 RRETURN(MATCH_NOMATCH);
2741 /* No need to skip more bytes - we know it's a 1-byte character */
2742 }
2743 break;
2744
2745 case OP_NOT_WORDCHAR:
2746 for (i = 1; i <= min; i++)
2747 {
2748 if (eptr >= md->end_subject ||
2749 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
2750 RRETURN(MATCH_NOMATCH);
2751 while (eptr < md->end_subject && ISMIDCHAR(*eptr)) eptr++;
2752 }
2753 break;
2754
2755 case OP_WORDCHAR:
2756 for (i = 1; i <= min; i++)
2757 {
2758 if (eptr >= md->end_subject ||
2759 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
2760 RRETURN(MATCH_NOMATCH);
2761 /* No need to skip more bytes - we know it's a 1-byte character */
2762 }
2763 break;
2764
2765 default:
2766 RRETURN(PCRE_ERROR_INTERNAL);
2767 } /* End switch(ctype) */
2768
2769 else
2770#endif /* SUPPORT_UTF8 */
2771
2772 /* Code for the non-UTF-8 case for minimum matching of operators other
2773 than OP_PROP and OP_NOTPROP. */
2774
2775 switch(ctype)
2776 {
2777 case OP_ANY:
2778 if ((ims & PCRE_DOTALL) == 0)
2779 {
2780 for (i = 1; i <= min; i++)
2781 if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
2782 }
2783 else eptr += min;
2784 break;
2785
2786 case OP_ANYBYTE:
2787 eptr += min;
2788 break;
2789
2790 case OP_NOT_DIGIT:
2791 for (i = 1; i <= min; i++)
2792 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2793 break;
2794
2795 case OP_DIGIT:
2796 for (i = 1; i <= min; i++)
2797 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2798 break;
2799
2800 case OP_NOT_WHITESPACE:
2801 for (i = 1; i <= min; i++)
2802 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2803 break;
2804
2805 case OP_WHITESPACE:
2806 for (i = 1; i <= min; i++)
2807 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2808 break;
2809
2810 case OP_NOT_WORDCHAR:
2811 for (i = 1; i <= min; i++)
2812 if ((md->ctypes[*eptr++] & ctype_word) != 0)
2813 RRETURN(MATCH_NOMATCH);
2814 break;
2815
2816 case OP_WORDCHAR:
2817 for (i = 1; i <= min; i++)
2818 if ((md->ctypes[*eptr++] & ctype_word) == 0)
2819 RRETURN(MATCH_NOMATCH);
2820 break;
2821
2822 default:
2823 RRETURN(PCRE_ERROR_INTERNAL);
2824 }
2825 }
2826
2827 /* If min = max, continue at the same level without recursing */
2828
2829 if (min == max) continue;
2830
2831 /* If minimizing, we have to test the rest of the pattern before each
2832 subsequent match. Again, separate the UTF-8 case for speed, and also
2833 separate the UCP cases. */
2834
2835 if (minimize)
2836 {
2837#ifdef SUPPORT_UCP
2838 if (prop_type > 0)
2839 {
2840 for (fi = min;; fi++)
2841 {
darined76fb52007-02-06 21:55:25 +00002842 RMATCH(46, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00002843 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2844 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2845 GETCHARINC(c, eptr);
ddkilzer60a7a802007-01-01 05:07:40 +00002846 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
darind7737ab2005-09-09 00:51:07 +00002847 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
2848 RRETURN(MATCH_NOMATCH);
2849 }
2850 }
2851
2852 /* Match extended Unicode sequences. We will get here only if the
2853 support is in the binary; otherwise a compile-time error occurs. */
2854
2855 else if (ctype == OP_EXTUNI)
2856 {
2857 for (fi = min;; fi++)
2858 {
darined76fb52007-02-06 21:55:25 +00002859 RMATCH(47, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00002860 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2861 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2862 GETCHARINCTEST(c, eptr);
ddkilzer60a7a802007-01-01 05:07:40 +00002863 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
darind7737ab2005-09-09 00:51:07 +00002864 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2865 while (eptr < md->end_subject)
2866 {
2867 int len = 1;
2868 if (!utf8) c = *eptr; else
2869 {
2870 GETCHARLEN(c, eptr, len);
2871 }
ddkilzer60a7a802007-01-01 05:07:40 +00002872 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
darind7737ab2005-09-09 00:51:07 +00002873 if (prop_category != ucp_M) break;
2874 eptr += len;
2875 }
2876 }
2877 }
2878
2879 else
2880#endif /* SUPPORT_UCP */
2881
2882#ifdef SUPPORT_UTF8
2883 /* UTF-8 mode */
2884 if (utf8)
2885 {
2886 for (fi = min;; fi++)
2887 {
darined76fb52007-02-06 21:55:25 +00002888 RMATCH(48, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00002889 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2890 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2891
2892 GETCHARINC(c, eptr);
2893 switch(ctype)
2894 {
2895 case OP_ANY:
2896 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
2897 break;
2898
2899 case OP_ANYBYTE:
2900 break;
2901
2902 case OP_NOT_DIGIT:
2903 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
2904 RRETURN(MATCH_NOMATCH);
2905 break;
2906
2907 case OP_DIGIT:
2908 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
2909 RRETURN(MATCH_NOMATCH);
2910 break;
2911
2912 case OP_NOT_WHITESPACE:
2913 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
2914 RRETURN(MATCH_NOMATCH);
2915 break;
2916
2917 case OP_WHITESPACE:
2918 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
2919 RRETURN(MATCH_NOMATCH);
2920 break;
2921
2922 case OP_NOT_WORDCHAR:
2923 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
2924 RRETURN(MATCH_NOMATCH);
2925 break;
2926
2927 case OP_WORDCHAR:
2928 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
2929 RRETURN(MATCH_NOMATCH);
2930 break;
2931
2932 default:
2933 RRETURN(PCRE_ERROR_INTERNAL);
2934 }
2935 }
2936 }
2937 else
2938#endif
2939 /* Not UTF-8 mode */
2940 {
2941 for (fi = min;; fi++)
2942 {
darined76fb52007-02-06 21:55:25 +00002943 RMATCH(49, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00002944 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2945 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2946 c = *eptr++;
2947 switch(ctype)
2948 {
2949 case OP_ANY:
2950 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
2951 break;
2952
2953 case OP_ANYBYTE:
2954 break;
2955
2956 case OP_NOT_DIGIT:
2957 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2958 break;
2959
2960 case OP_DIGIT:
2961 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2962 break;
2963
2964 case OP_NOT_WHITESPACE:
2965 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2966 break;
2967
2968 case OP_WHITESPACE:
2969 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2970 break;
2971
2972 case OP_NOT_WORDCHAR:
2973 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
2974 break;
2975
2976 case OP_WORDCHAR:
2977 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
2978 break;
2979
2980 default:
2981 RRETURN(PCRE_ERROR_INTERNAL);
2982 }
2983 }
2984 }
2985 /* Control never gets here */
2986 }
2987
2988 /* If maximizing it is worth using inline code for speed, doing the type
2989 test once at the start (i.e. keep it out of the loop). Again, keep the
2990 UTF-8 and UCP stuff separate. */
2991
2992 else
2993 {
2994 pp = eptr; /* Remember where we started */
2995
2996#ifdef SUPPORT_UCP
2997 if (prop_type > 0)
2998 {
2999 for (i = min; i < max; i++)
3000 {
3001 int len = 1;
3002 if (eptr >= md->end_subject) break;
3003 GETCHARLEN(c, eptr, len);
ddkilzer60a7a802007-01-01 05:07:40 +00003004 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
darind7737ab2005-09-09 00:51:07 +00003005 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
3006 break;
3007 eptr+= len;
3008 }
3009
3010 /* eptr is now past the end of the maximum run */
3011
3012 for(;;)
3013 {
darined76fb52007-02-06 21:55:25 +00003014 RMATCH(50, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00003015 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3016 if (eptr-- == pp) break; /* Stop if tried at original pos */
3017 BACKCHAR(eptr);
3018 }
3019 }
3020
3021 /* Match extended Unicode sequences. We will get here only if the
3022 support is in the binary; otherwise a compile-time error occurs. */
3023
3024 else if (ctype == OP_EXTUNI)
3025 {
3026 for (i = min; i < max; i++)
3027 {
3028 if (eptr >= md->end_subject) break;
3029 GETCHARINCTEST(c, eptr);
ddkilzer60a7a802007-01-01 05:07:40 +00003030 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
darind7737ab2005-09-09 00:51:07 +00003031 if (prop_category == ucp_M) break;
3032 while (eptr < md->end_subject)
3033 {
3034 int len = 1;
3035 if (!utf8) c = *eptr; else
3036 {
3037 GETCHARLEN(c, eptr, len);
3038 }
ddkilzer60a7a802007-01-01 05:07:40 +00003039 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
darind7737ab2005-09-09 00:51:07 +00003040 if (prop_category != ucp_M) break;
3041 eptr += len;
3042 }
3043 }
3044
3045 /* eptr is now past the end of the maximum run */
3046
3047 for(;;)
3048 {
darined76fb52007-02-06 21:55:25 +00003049 RMATCH(51, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00003050 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3051 if (eptr-- == pp) break; /* Stop if tried at original pos */
3052 for (;;) /* Move back over one extended */
3053 {
3054 int len = 1;
3055 BACKCHAR(eptr);
3056 if (!utf8) c = *eptr; else
3057 {
3058 GETCHARLEN(c, eptr, len);
3059 }
ddkilzer60a7a802007-01-01 05:07:40 +00003060 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
darind7737ab2005-09-09 00:51:07 +00003061 if (prop_category != ucp_M) break;
3062 eptr--;
3063 }
3064 }
3065 }
3066
3067 else
3068#endif /* SUPPORT_UCP */
3069
3070#ifdef SUPPORT_UTF8
3071 /* UTF-8 mode */
3072
3073 if (utf8)
3074 {
3075 switch(ctype)
3076 {
3077 case OP_ANY:
3078
3079 /* Special code is required for UTF8, but when the maximum is unlimited
3080 we don't need it, so we repeat the non-UTF8 code. This is probably
3081 worth it, because .* is quite a common idiom. */
3082
3083 if (max < INT_MAX)
3084 {
3085 if ((ims & PCRE_DOTALL) == 0)
3086 {
3087 for (i = min; i < max; i++)
3088 {
3089 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
3090 eptr++;
3091 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3092 }
3093 }
3094 else
3095 {
3096 for (i = min; i < max; i++)
3097 {
3098 eptr++;
3099 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3100 }
3101 }
3102 }
3103
3104 /* Handle unlimited UTF-8 repeat */
3105
3106 else
3107 {
3108 if ((ims & PCRE_DOTALL) == 0)
3109 {
3110 for (i = min; i < max; i++)
3111 {
3112 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
3113 eptr++;
3114 }
3115 break;
3116 }
3117 else
3118 {
3119 c = max - min;
thatcherdc18a362006-08-31 21:28:29 +00003120 if (c > md->end_subject - eptr) c = INT_CAST(md->end_subject - eptr);
darind7737ab2005-09-09 00:51:07 +00003121 eptr += c;
3122 }
3123 }
3124 break;
3125
3126 /* The byte case is the same as non-UTF8 */
3127
3128 case OP_ANYBYTE:
3129 c = max - min;
thatcherdc18a362006-08-31 21:28:29 +00003130 if (c > md->end_subject - eptr) c = INT_CAST(md->end_subject - eptr);
darind7737ab2005-09-09 00:51:07 +00003131 eptr += c;
3132 break;
3133
3134 case OP_NOT_DIGIT:
3135 for (i = min; i < max; i++)
3136 {
3137 int len = 1;
3138 if (eptr >= md->end_subject) break;
3139 GETCHARLEN(c, eptr, len);
3140 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3141 eptr+= len;
3142 }
3143 break;
3144
3145 case OP_DIGIT:
3146 for (i = min; i < max; i++)
3147 {
3148 int len = 1;
3149 if (eptr >= md->end_subject) break;
3150 GETCHARLEN(c, eptr, len);
3151 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3152 eptr+= len;
3153 }
3154 break;
3155
3156 case OP_NOT_WHITESPACE:
3157 for (i = min; i < max; i++)
3158 {
3159 int len = 1;
3160 if (eptr >= md->end_subject) break;
3161 GETCHARLEN(c, eptr, len);
3162 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3163 eptr+= len;
3164 }
3165 break;
3166
3167 case OP_WHITESPACE:
3168 for (i = min; i < max; i++)
3169 {
3170 int len = 1;
3171 if (eptr >= md->end_subject) break;
3172 GETCHARLEN(c, eptr, len);
3173 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3174 eptr+= len;
3175 }
3176 break;
3177
3178 case OP_NOT_WORDCHAR:
3179 for (i = min; i < max; i++)
3180 {
3181 int len = 1;
3182 if (eptr >= md->end_subject) break;
3183 GETCHARLEN(c, eptr, len);
3184 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
3185 eptr+= len;
3186 }
3187 break;
3188
3189 case OP_WORDCHAR:
3190 for (i = min; i < max; i++)
3191 {
3192 int len = 1;
3193 if (eptr >= md->end_subject) break;
3194 GETCHARLEN(c, eptr, len);
3195 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
3196 eptr+= len;
3197 }
3198 break;
3199
3200 default:
3201 RRETURN(PCRE_ERROR_INTERNAL);
3202 }
3203
3204 /* eptr is now past the end of the maximum run */
3205
3206 for(;;)
3207 {
darined76fb52007-02-06 21:55:25 +00003208 RMATCH(52, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00003209 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3210 if (eptr-- == pp) break; /* Stop if tried at original pos */
3211 BACKCHAR(eptr);
3212 }
3213 }
3214 else
3215#endif
3216
3217 /* Not UTF-8 mode */
3218 {
3219 switch(ctype)
3220 {
3221 case OP_ANY:
3222 if ((ims & PCRE_DOTALL) == 0)
3223 {
3224 for (i = min; i < max; i++)
3225 {
3226 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
3227 eptr++;
3228 }
3229 break;
3230 }
3231 /* For DOTALL case, fall through and treat as \C */
3232
3233 case OP_ANYBYTE:
3234 c = max - min;
thatcherdc18a362006-08-31 21:28:29 +00003235 if (c > md->end_subject - eptr) c = INT_CAST(md->end_subject - eptr);
darind7737ab2005-09-09 00:51:07 +00003236 eptr += c;
3237 break;
3238
3239 case OP_NOT_DIGIT:
3240 for (i = min; i < max; i++)
3241 {
3242 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3243 break;
3244 eptr++;
3245 }
3246 break;
3247
3248 case OP_DIGIT:
3249 for (i = min; i < max; i++)
3250 {
3251 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3252 break;
3253 eptr++;
3254 }
3255 break;
3256
3257 case OP_NOT_WHITESPACE:
3258 for (i = min; i < max; i++)
3259 {
3260 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3261 break;
3262 eptr++;
3263 }
3264 break;
3265
3266 case OP_WHITESPACE:
3267 for (i = min; i < max; i++)
3268 {
3269 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3270 break;
3271 eptr++;
3272 }
3273 break;
3274
3275 case OP_NOT_WORDCHAR:
3276 for (i = min; i < max; i++)
3277 {
3278 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3279 break;
3280 eptr++;
3281 }
3282 break;
3283
3284 case OP_WORDCHAR:
3285 for (i = min; i < max; i++)
3286 {
3287 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3288 break;
3289 eptr++;
3290 }
3291 break;
3292
3293 default:
3294 RRETURN(PCRE_ERROR_INTERNAL);
3295 }
3296
3297 /* eptr is now past the end of the maximum run */
3298
3299 while (eptr >= pp)
3300 {
darined76fb52007-02-06 21:55:25 +00003301 RMATCH(53, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darind7737ab2005-09-09 00:51:07 +00003302 eptr--;
3303 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3304 }
3305 }
3306
3307 /* Get here if we can't make it match with any permitted repetitions */
3308
3309 RRETURN(MATCH_NOMATCH);
3310 }
3311 /* Control never gets here */
3312
3313 /* There's been some horrible disaster. Since all codes > OP_BRA are
3314 for capturing brackets, and there shouldn't be any gaps between 0 and
3315 OP_BRA, arrival here can only mean there is something seriously wrong
3316 in the code above or the OP_xxx definitions. */
3317
3318 default:
3319 DPRINTF(("Unknown opcode %d\n", *ecode));
3320 RRETURN(PCRE_ERROR_UNKNOWN_NODE);
3321 }
3322
3323 /* Do not stick any code in here without much thought; it is assumed
3324 that "continue" in the code above comes out to here to repeat the main
3325 loop. */
3326
3327 } /* End of main loop */
3328/* Control never reaches here */
darined76fb52007-02-06 21:55:25 +00003329
3330#ifdef NO_RECURSE
3331#ifndef __GNUC__
3332
3333RRETURN_SWITCH:
3334switch (frame->Xwhere)
3335 {
3336 case 1: goto RRETURN_1;
3337 case 2: goto RRETURN_2;
3338 case 3: goto RRETURN_3;
3339 case 4: goto RRETURN_4;
3340 case 5: goto RRETURN_5;
3341 case 6: goto RRETURN_6;
3342 case 7: goto RRETURN_7;
3343 case 8: goto RRETURN_8;
3344 case 9: goto RRETURN_9;
3345 case 10: goto RRETURN_10;
3346 case 11: goto RRETURN_11;
3347 case 12: goto RRETURN_12;
3348 case 13: goto RRETURN_13;
3349 case 14: goto RRETURN_14;
3350 case 15: goto RRETURN_15;
3351 case 16: goto RRETURN_16;
3352 case 17: goto RRETURN_17;
3353 case 18: goto RRETURN_18;
3354 case 19: goto RRETURN_19;
3355 case 20: goto RRETURN_20;
3356 case 21: goto RRETURN_21;
3357 case 22: goto RRETURN_22;
3358 case 23: goto RRETURN_23;
3359 case 24: goto RRETURN_24;
3360 case 25: goto RRETURN_25;
3361 case 26: goto RRETURN_26;
3362 case 27: goto RRETURN_27;
3363 case 28: goto RRETURN_28;
3364 case 29: goto RRETURN_29;
3365 case 30: goto RRETURN_30;
3366 case 31: goto RRETURN_31;
3367 case 32: goto RRETURN_32;
3368 case 33: goto RRETURN_33;
3369 case 34: goto RRETURN_34;
3370 case 35: goto RRETURN_35;
3371 case 36: goto RRETURN_36;
3372 case 37: goto RRETURN_37;
3373 case 38: goto RRETURN_38;
3374 case 39: goto RRETURN_39;
3375 case 40: goto RRETURN_40;
3376 case 41: goto RRETURN_41;
3377 case 42: goto RRETURN_42;
3378 case 43: goto RRETURN_43;
3379 case 44: goto RRETURN_44;
3380 case 45: goto RRETURN_45;
3381 case 46: goto RRETURN_46;
3382 case 47: goto RRETURN_47;
3383 case 48: goto RRETURN_48;
3384 case 49: goto RRETURN_49;
3385 case 50: goto RRETURN_50;
3386 case 51: goto RRETURN_51;
3387 case 52: goto RRETURN_52;
3388 case 53: goto RRETURN_53;
3389 }
3390
3391#if PCRE_UTF16
3392/* It's safer to have the extra symbols here than to try to ifdef the switch statement above,
3393because we'll get warnings or errors if we have multiply defined symbols but a runtime failure
3394if we leave something out of the switch statement. */
3395RRETURN_32:
3396RRETURN_33:
3397RRETURN_34:
3398RRETURN_35:
3399RRETURN_36:
3400RRETURN_37:
3401#endif
3402
3403abort();
sfalken38c99b42007-02-06 22:38:04 +00003404return 0;
darined76fb52007-02-06 21:55:25 +00003405
3406#endif
3407#endif
3408
darind7737ab2005-09-09 00:51:07 +00003409}
3410
3411
/***************************************************************************
****************************************************************************
                   RECURSION IN THE match() FUNCTION

The macros that were defined above to support match() are removed again here,
so that none of these short names can leak into the code that follows.
pcre_exec(), for example, declares its own "length", "ims", "flags" and "pp".

The names are listed alphabetically; the order of #undef directives has no
effect, and #undef of a name that is not currently a macro is harmless. */

#ifdef NO_RECURSE

#undef callpat
#undef charptr
#undef condition
#undef ctype
#undef cur_is_word
#undef data
#undef ecode
#undef eptr
#undef eptrb
#undef flags
#undef ims
#undef length
#undef max
#undef min
#undef minimize
#undef new_recursive
#undef newptrb
#undef next
#undef number
#undef offset
#undef offset_top
#undef op
#undef original_ims
#undef pp
#undef prev
#undef prev_is_word
#undef save_capture_last
#undef save_offset1
#undef save_offset2
#undef save_offset3
#undef saved_eptr
#undef stacksave

#endif  /* NO_RECURSE */

/* These two are macros whether or not NO_RECURSE is defined, so they are
removed unconditionally. */

#undef fi
#undef fc

/***************************************************************************
***************************************************************************/
3467
3468
3469
3470/*************************************************
3471* Execute a Regular Expression *
3472*************************************************/
3473
3474/* This function applies a compiled re to a subject string and picks out
3475portions of the string if it matches. Two elements in the vector are set for
3476each substring: the offsets to the start and end of the substring.
3477
3478Arguments:
3479 argument_re points to the compiled expression
3480 extra_data points to extra data or is NULL
3481 subject points to the subject string
3482 length length of subject string (may contain binary zeros)
3483 start_offset where to start in the subject string
3484 options option bits
3485 offsets points to a vector of ints to be filled in with offsets
3486 offsetcount the number of elements in the vector
3487
3488Returns: > 0 => success; value is the number of elements filled in
3489 = 0 => success, but offsets is not big enough
3490 -1 => failed to match
3491 < -1 => some kind of unexpected problem
3492*/
3493
ddkilzer60a7a802007-01-01 05:07:40 +00003494PCRE_EXPORT int
darind7737ab2005-09-09 00:51:07 +00003495pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
3496 const pcre_char *subject, int length, int start_offset, int options, int *offsets,
3497 int offsetcount)
3498{
3499int rc, resetcount, ocount;
3500int first_byte = -1;
3501int req_byte = -1;
3502int req_byte2 = -1;
3503unsigned long int ims = 0;
3504BOOL using_temporary_offsets = FALSE;
3505BOOL anchored;
3506BOOL startline;
3507BOOL firstline;
3508BOOL first_byte_caseless = FALSE;
3509BOOL req_byte_caseless = FALSE;
3510match_data match_block;
3511const uschar *tables;
3512const uschar *start_bits = NULL;
3513const pcre_uchar *start_match = (const pcre_uchar *)subject + start_offset;
3514const pcre_uchar *end_subject;
3515const pcre_uchar *req_byte_ptr = start_match - 1;
3516
3517pcre_study_data internal_study;
3518const pcre_study_data *study;
3519
3520real_pcre internal_re;
3521const real_pcre *external_re = (const real_pcre *)argument_re;
3522const real_pcre *re = external_re;
3523
3524/* Plausibility checks */
3525
3526if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3527if (re == NULL || subject == NULL ||
3528 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3529if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3530
3531/* Fish out the optional data from the extra_data structure, first setting
3532the default values. */
3533
3534study = NULL;
3535match_block.match_limit = MATCH_LIMIT;
3536match_block.callout_data = NULL;
3537
3538/* The table pointer is always in native byte order. */
3539
3540tables = external_re->tables;
3541
3542if (extra_data != NULL)
3543 {
thatcherdc18a362006-08-31 21:28:29 +00003544 register unsigned long flags = extra_data->flags;
darind7737ab2005-09-09 00:51:07 +00003545 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3546 study = (const pcre_study_data *)extra_data->study_data;
3547 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
3548 match_block.match_limit = extra_data->match_limit;
3549 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3550 match_block.callout_data = extra_data->callout_data;
3551 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
3552 }
3553
3554/* If the exec call supplied NULL for tables, use the inbuilt ones. This
3555is a feature that makes it possible to save compiled regex and re-use them
3556in other programs later. */
3557
3558if (tables == NULL) tables = _pcre_default_tables;
3559
3560/* Check that the first field in the block is the magic number. If it is not,
3561test for a regex that was compiled on a host of opposite endianness. If this is
3562the case, flipped values are put in internal_re and internal_study if there was
3563study data too. */
3564
3565if (re->magic_number != MAGIC_NUMBER)
3566 {
3567 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3568 if (re == NULL) return PCRE_ERROR_BADMAGIC;
3569 if (study != NULL) study = &internal_study;
3570 }
3571
3572/* Set up other data */
3573
3574anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3575startline = (re->options & PCRE_STARTLINE) != 0;
3576firstline = (re->options & PCRE_FIRSTLINE) != 0;
3577
3578/* The code starts after the real_pcre block and the capture name table. */
3579
3580match_block.start_code = (const uschar *)external_re + re->name_table_offset +
3581 re->name_count * re->name_entry_size;
3582
3583match_block.start_subject = (const pcre_uchar *)subject;
3584match_block.start_offset = start_offset;
3585match_block.end_subject = match_block.start_subject + length;
3586end_subject = match_block.end_subject;
3587
3588match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3589match_block.utf8 = (re->options & PCRE_UTF8) != 0;
3590
3591match_block.notbol = (options & PCRE_NOTBOL) != 0;
3592match_block.noteol = (options & PCRE_NOTEOL) != 0;
3593match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
3594match_block.partial = (options & PCRE_PARTIAL) != 0;
3595match_block.hitend = FALSE;
3596
3597match_block.recursive = NULL; /* No recursion at top level */
3598
3599match_block.lcc = tables + lcc_offset;
3600match_block.ctypes = tables + ctypes_offset;
3601
3602/* Partial matching is supported only for a restricted set of regexes at the
3603moment. */
3604
3605if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
3606 return PCRE_ERROR_BADPARTIAL;
3607
3608/* Check a UTF-8 string if required. Unfortunately there's no way of passing
3609back the character offset. */
3610
3611#if !PCRE_UTF16
3612#ifdef SUPPORT_UTF8
3613if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3614 {
3615 if (_pcre_valid_utf8((pcre_uchar *)subject, length) >= 0)
3616 return PCRE_ERROR_BADUTF8;
3617 if (start_offset > 0 && start_offset < length)
3618 {
3619 int tb = ((pcre_uchar *)subject)[start_offset];
3620 if (tb > 127)
3621 {
3622 tb &= 0xc0;
3623 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
3624 }
3625 }
3626 }
3627#endif
3628#endif
3629
3630/* The ims options can vary during the matching as a result of the presence
3631of (?ims) items in the pattern. They are kept in a local variable so that
3632restoring at the exit of a group is easy. */
3633
3634ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
3635
3636/* If the expression has got more back references than the offsets supplied can
3637hold, we get a temporary chunk of working store to use during the matching.
3638Otherwise, we can use the vector supplied, rounding down its size to a multiple
3639of 3. */
3640
3641ocount = offsetcount - (offsetcount % 3);
3642
3643if (re->top_backref > 0 && re->top_backref >= ocount/3)
3644 {
3645 ocount = re->top_backref * 3 + 3;
3646 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3647 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3648 using_temporary_offsets = TRUE;
3649 DPRINTF(("Got memory to hold back references\n"));
3650 }
3651else match_block.offset_vector = offsets;
3652
3653match_block.offset_end = ocount;
3654match_block.offset_max = (2*ocount)/3;
3655match_block.offset_overflow = FALSE;
3656match_block.capture_last = -1;
3657
3658/* Compute the minimum number of offsets that we need to reset each time. Doing
3659this makes a huge difference to execution time when there aren't many brackets
3660in the pattern. */
3661
3662resetcount = 2 + re->top_bracket * 2;
3663if (resetcount > offsetcount) resetcount = ocount;
3664
3665/* Reset the working variable associated with each extraction. These should
3666never be used unless previously set, but they get saved and restored, and so we
3667initialize them to avoid reading uninitialized locations. */
3668
3669if (match_block.offset_vector != NULL)
3670 {
3671 register int *iptr = match_block.offset_vector + ocount;
3672 register int *iend = iptr - resetcount/2 + 1;
3673 while (--iptr >= iend) *iptr = -1;
3674 }
3675
3676/* Set up the first character to match, if available. The first_byte value is
3677never set for an anchored regular expression, but the anchoring may be forced
3678at run time, so we have to test for anchoring. The first char may be unset for
3679an unanchored pattern, of course. If there's no first char and the pattern was
3680studied, there may be a bitmap of possible first characters. */
3681
3682if (!anchored)
3683 {
3684 if ((re->options & PCRE_FIRSTSET) != 0)
3685 {
3686 first_byte = re->first_byte & 255;
3687 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3688 first_byte = match_block.lcc[first_byte];
3689 }
3690 else
3691 if (!startline && study != NULL &&
3692 (study->options & PCRE_STUDY_MAPPED) != 0)
3693 start_bits = study->start_bits;
3694 }
3695
3696/* For anchored or unanchored matches, there may be a "last known required
3697character" set. */
3698
3699if ((re->options & PCRE_REQCHSET) != 0)
3700 {
3701 req_byte = re->req_byte & 255;
3702 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3703 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
3704 }
3705
3706/* Loop for handling unanchored repeated matching attempts; for anchored regexs
3707the loop runs just once. */
3708
3709do
3710 {
3711 const pcre_uchar *save_end_subject = end_subject;
3712
3713 /* Reset the maximum number of extractions we might see. */
3714
3715 if (match_block.offset_vector != NULL)
3716 {
3717 register int *iptr = match_block.offset_vector;
3718 register int *iend = iptr + resetcount;
3719 while (iptr < iend) *iptr++ = -1;
3720 }
3721
3722 /* Advance to a unique first char if possible. If firstline is TRUE, the
3723 start of the match is constrained to the first line of a multiline string.
3724 Implement this by temporarily adjusting end_subject so that we stop scanning
3725 at a newline. If the match fails at the newline, later code breaks this loop.
3726 */
3727
3728 if (firstline)
3729 {
3730 const pcre_uchar *t = start_match;
3731 while (t < save_end_subject && *t != '\n') t++;
3732 end_subject = t;
3733 }
3734
3735 /* Now test for a unique first byte */
3736
3737 if (first_byte >= 0)
3738 {
darince72b7a2007-02-06 19:42:35 +00003739 pcre_uchar first_char = first_byte;
darind7737ab2005-09-09 00:51:07 +00003740 if (first_byte_caseless)
3741 while (start_match < end_subject)
3742 {
3743 int sm = *start_match;
3744#if PCRE_UTF16
3745 if (sm > 127)
3746 break;
3747#endif
darince72b7a2007-02-06 19:42:35 +00003748 if (match_block.lcc[sm] == first_char)
darind7737ab2005-09-09 00:51:07 +00003749 break;
3750 start_match++;
3751 }
3752 else
darince72b7a2007-02-06 19:42:35 +00003753 while (start_match < end_subject && *start_match != first_char)
darind7737ab2005-09-09 00:51:07 +00003754 start_match++;
3755 }
3756
3757 /* Or to just after \n for a multiline match if possible */
3758
3759 else if (startline)
3760 {
3761 if (start_match > match_block.start_subject + start_offset)
3762 {
3763 while (start_match < end_subject && start_match[-1] != NEWLINE)
3764 start_match++;
3765 }
3766 }
3767
3768 /* Or to a non-unique first char after study */
3769
3770 else if (start_bits != NULL)
3771 {
3772 while (start_match < end_subject)
3773 {
3774 register unsigned int c = *start_match;
3775 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3776 }
3777 }
3778
3779 /* Restore fudged end_subject */
3780
3781 end_subject = save_end_subject;
3782
3783#ifdef DEBUG /* Sigh. Some compilers never learn. */
3784 printf(">>>> Match against: ");
3785 pchars(start_match, end_subject - start_match, TRUE, &match_block);
3786 printf("\n");
3787#endif
3788
3789 /* If req_byte is set, we know that that character must appear in the subject
3790 for the match to succeed. If the first character is set, req_byte must be
3791 later in the subject; otherwise the test starts at the match point. This
3792 optimization can save a huge amount of backtracking in patterns with nested
3793 unlimited repeats that aren't going to match. Writing separate code for
3794 cased/caseless versions makes it go faster, as does using an autoincrement
3795 and backing off on a match.
3796
3797 HOWEVER: when the subject string is very, very long, searching to its end can
3798 take a long time, and give bad performance on quite ordinary patterns. This
3799 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
3800 don't do this when the string is sufficiently long.
3801
3802 ALSO: this processing is disabled when partial matching is requested.
3803 */
3804
3805 if (req_byte >= 0 &&
3806 end_subject - start_match < REQ_BYTE_MAX &&
3807 !match_block.partial)
3808 {
3809 register const pcre_uchar *p = start_match + ((first_byte >= 0)? 1 : 0);
3810
3811 /* We don't need to repeat the search if we haven't yet reached the
3812 place we found it at last time. */
3813
3814 if (p > req_byte_ptr)
3815 {
3816 if (req_byte_caseless)
3817 {
3818 while (p < end_subject)
3819 {
3820 register int pp = *p++;
3821 if (pp == req_byte || pp == req_byte2) { p--; break; }
3822 }
3823 }
3824 else
3825 {
3826 while (p < end_subject)
3827 {
3828 if (*p++ == req_byte) { p--; break; }
3829 }
3830 }
3831
3832 /* If we can't find the required character, break the matching loop */
3833
3834 if (p >= end_subject) break;
3835
3836 /* If we have found the required character, save the point where we
3837 found it, so that we don't search again next time round the loop if
3838 the start hasn't passed this character yet. */
3839
3840 req_byte_ptr = p;
3841 }
3842 }
3843
3844 /* When a match occurs, substrings will be set for all internal extractions;
3845 we just need to set up the whole thing as substring 0 before returning. If
3846 there were too many extractions, set the return code to zero. In the case
3847 where we had to get some local store to hold offsets for backreferences, copy
3848 those back references that we can. In this case there need not be overflow
3849 if certain parts of the pattern were not used. */
3850
3851 match_block.start_match = start_match;
3852 match_block.match_call_count = 0;
3853
3854 rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
3855 match_isgroup);
3856
3857 /* When the result is no match, if the subject's first character was a
3858 newline and the PCRE_FIRSTLINE option is set, break (which will return
3859 PCRE_ERROR_NOMATCH). The option requests that a match occur before the first
3860 newline in the subject. Otherwise, advance the pointer to the next character
3861 and continue - but the continuation will actually happen only when the
3862 pattern is not anchored. */
3863
3864 if (rc == MATCH_NOMATCH)
3865 {
3866 if (firstline && *start_match == NEWLINE) break;
3867 start_match++;
3868#ifdef SUPPORT_UTF8
3869 if (match_block.utf8)
darin496882e2006-07-15 15:30:03 +00003870 while(start_match < end_subject && ISMIDCHAR(*start_match))
darind7737ab2005-09-09 00:51:07 +00003871 start_match++;
3872#endif
3873 continue;
3874 }
3875
3876 if (rc != MATCH_MATCH)
3877 {
3878 DPRINTF((">>>> error: returning %d\n", rc));
3879 return rc;
3880 }
3881
3882 /* We have a match! Copy the offset information from temporary store if
3883 necessary */
3884
3885 if (using_temporary_offsets)
3886 {
3887 if (offsetcount >= 4)
3888 {
3889 memcpy(offsets + 2, match_block.offset_vector + 2,
3890 (offsetcount - 2) * sizeof(int));
3891 DPRINTF(("Copied offsets from temporary memory\n"));
3892 }
3893 if (match_block.end_offset_top > offsetcount)
3894 match_block.offset_overflow = TRUE;
3895
3896 DPRINTF(("Freeing temporary memory\n"));
3897 (pcre_free)(match_block.offset_vector);
3898 }
3899
3900 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
3901
3902 if (offsetcount < 2) rc = 0; else
3903 {
thatcherdc18a362006-08-31 21:28:29 +00003904 offsets[0] = INT_CAST(start_match - match_block.start_subject);
3905 offsets[1] = INT_CAST(match_block.end_match_ptr - match_block.start_subject);
darind7737ab2005-09-09 00:51:07 +00003906 }
3907
3908 DPRINTF((">>>> returning %d\n", rc));
3909 return rc;
3910 }
3911
3912/* This "while" is the end of the "do" above */
3913
3914while (!anchored && start_match <= end_subject);
3915
3916if (using_temporary_offsets)
3917 {
3918 DPRINTF(("Freeing temporary memory\n"));
3919 (pcre_free)(match_block.offset_vector);
3920 }
3921
3922if (match_block.partial && match_block.hitend)
3923 {
3924 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
3925 return PCRE_ERROR_PARTIAL;
3926 }
3927else
3928 {
3929 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
3930 return PCRE_ERROR_NOMATCH;
3931 }
3932}
3933
3934/* End of pcre_exec.c */