Blame - JavaScriptCore/pcre/pcre_exec.c - WebKit

blob: aedabdc844f23afc0b2798509c3974fec7388972 [file] [log] [blame]

darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1	/*************************************************
				2	* Perl-Compatible Regular Expressions *
				3	*************************************************/
				4
				5	/* PCRE is a library of functions to support regular expressions whose syntax
				6	and semantics are as close as possible to those of the Perl 5 language.
				7
				8	Written by Philip Hazel
				9	Copyright (c) 1997-2005 University of Cambridge
				10
darin	ce72b7a	2007-02-06 19:42:35 +0000	[diff] [blame]	11	Copyright (C) 2002, 2004, 2006, 2007 Apple Inc. All rights reserved.
				12
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	13	-----------------------------------------------------------------------------
				14	Redistribution and use in source and binary forms, with or without
				15	modification, are permitted provided that the following conditions are met:
				16
				17	* Redistributions of source code must retain the above copyright notice,
				18	this list of conditions and the following disclaimer.
				19
				20	* Redistributions in binary form must reproduce the above copyright
				21	notice, this list of conditions and the following disclaimer in the
				22	documentation and/or other materials provided with the distribution.
				23
				24	* Neither the name of the University of Cambridge nor the names of its
				25	contributors may be used to endorse or promote products derived from
				26	this software without specific prior written permission.
				27
				28	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
				29	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
				30	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
				31	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
				32	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
				33	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
				34	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
				35	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
				36	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
				37	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
				38	POSSIBILITY OF SUCH DAMAGE.
				39	-----------------------------------------------------------------------------
				40	*/
				41
				42
				43	/* This module contains pcre_exec(), the externally visible function that does
				44	pattern matching using an NFA algorithm, trying to mimic Perl as closely as
				45	possible. There are also some static supporting functions. */
				46
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	47	#include "pcre_internal.h"
				48
darin	b847b44	2006-10-27 16:48:28 +0000	[diff] [blame]	49	/* Avoid warnings on Windows. */
				50	#undef min
				51	#undef max
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	52
				53	/* Structure for building a chain of data that actually lives on the
				54	stack, for holding the values of the subject pointer at the start of each
				55	subpattern, so as to detect when an empty string has been matched by a
				56	subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
				57	are on the heap, not on the stack. */
				58
				59	typedef struct eptrblock {
				60	struct eptrblock *epb_prev; /* previous block in the chain: the eptrb value in force when this block was added */
				61	const pcre_uchar *epb_saved_eptr; /* subject pointer recorded when the bracketed group was entered */
				62	} eptrblock;
				63
				64	/* Flag bits for the match() function */
				65
				66	#define match_condassert 0x01 /* Called to check a condition assertion */
				67	#define match_isgroup 0x02 /* Set if start of bracketed group */
				68
				69	/* Non-error returns from the match() function. Error returns are externally
				70	defined PCRE_ERROR_xxx codes, which are all negative. */
				71
				72	#define MATCH_MATCH 1 /* matched; callers pass this straight back up */
				73	#define MATCH_NOMATCH 0 /* failed here; callers go on to try the next alternative */
				74
				75	/* Maximum number of ints of offset to save on the stack for recursive calls.
				76	If the offset vector is bigger, malloc is used. This should be a multiple of 3,
				77	because the offset vector is always a multiple of 3 long. */
				78
				79	#define REC_STACK_SAVE_MAX 30 /* also the length of the stacksave array kept in every frame */
				80
				81	/* Min and max values for the common repeats; for the maxima, 0 => infinity */
				82
				83	static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; /* read with rep_max: (0,inf) (0,inf) (1,inf) (1,inf) (0,1) (0,1) */
				84	static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; /* NOTE(review): presumably indexed by repeat opcode, in pairs for *, + and ? - confirm against the opcode order */
				85
				86
				87
				88	#ifdef DEBUG
				89	/*************************************************
				90	* Debugging function to print chars *
				91	*************************************************/
				92
				93	/* Print a sequence of chars in printable format, stopping at the end of the
				94	subject if requested. Characters that are not printable are written as hex escapes.
				95
				96	Arguments:
				97	p points to characters
				98	length number to print
				99	is_subject TRUE if printing from within md->start_subject
				100	md pointer to matching data block, if is_subject is TRUE
				101
				102	Returns: nothing
				103	*/
				104
				105	static void
				106	pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
				107	{
				108	int c;
				109	if (is_subject && length > md->end_subject - p) length = (int)(md->end_subject - p); /* clamp to what is left of the subject */
				110	while (length-- > 0)
				111	if ((c = *(p++)) >= 0 && c < 256 && isprint(c)) printf("%c", c); /* isprint() is only defined for unsigned char values */
				112	#if PCRE_UTF16
				113	else if (c < 256) printf("\\x%02x", c);
				114	else printf("\\x{%x}", c);
				115	#else
				116	else printf("\\x%02x", c);
				117	#endif
				118	}
				119	#endif
				120
				121
				122
				123	/*************************************************
				124	* Match a back-reference *
				125	*************************************************/
				126
				127	/* If a back reference hasn't been set, the length that is passed is greater
				128	than the number of characters left in the string, so the match fails.
				129
				130	Arguments:
				131	offset index into the offset vector
				132	eptr points into the subject
				133	length length to be matched
				134	md points to match data block
				135	ims the ims flags
				136
				137	Returns: TRUE if matched
				138	*/
				139
				140	static BOOL
				141	match_ref(int offset, register const pcre_uchar *eptr, int length, match_data *md,
				142	unsigned long int ims)
				143	{
				144	const pcre_uchar *p = md->start_subject + md->offset_vector[offset]; /* start of the text the reference refers to */
				145
				146	#ifdef DEBUG
				147	if (eptr >= md->end_subject)
				148	printf("matching subject <null>");
				149	else
				150	{
				151	printf("matching subject ");
				152	pchars(eptr, length, TRUE, md);
				153	}
				154	printf(" against backref ");
				155	pchars(p, length, FALSE, md);
				156	printf("\n");
				157	#endif
				158
				159	/* Always fail if not enough characters left */
				160
				161	if (length > md->end_subject - eptr) return FALSE;
				162
				163	/* Separate the caseless case for speed */
				164
				165	if ((ims & PCRE_CASELESS) != 0)
				166	{
				167	while (length-- > 0)
				168	if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; /* compare the characters via the lcc table; NOTE(review): assumes every character is a valid lcc index - confirm for PCRE_UTF16 */
				169	}
				170	else
				171	{ while (length-- > 0) if (*p++ != *eptr++) return FALSE; } /* compare the characters themselves, not the pointers */
				172
				173	return TRUE;
				174	}
				175
				176
				177
				178	/***************************************************************************
				179	****************************************************************************
				180	RECURSION IN THE match() FUNCTION
				181
				182	The match() function is highly recursive. Some regular expressions can cause
				183	it to recurse thousands of times. I was writing for Unix, so I just let it
				184	call itself recursively. This uses the stack for saving everything that has
				185	to be saved for a recursive call. On Unix, the stack can be large, and this
				186	works fine.
				187
				188	It turns out that on non-Unix systems there are problems with programs that
				189	use a lot of stack. (This despite the fact that every last chip has oodles
				190	of memory these days, and techniques for extending the stack have been known
				191	for decades.) So....
				192
				193	There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
				194	calls by keeping local variables that need to be preserved in blocks of memory
				195	obtained from malloc instead of on the stack. Macros are used to
				196	achieve this so that the actual code doesn't look very different to what it
				197	always used to.
				198	****************************************************************************
				199	***************************************************************************/
				200
				201
				202	/* These versions of the macros use the stack, as normal */
				203
				204	#ifndef NO_RECURSE
				205	#define REGISTER register
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	206	#define RMATCH(num,rx,ra,rb,rc,rd,re,rf,rg) rx = match(ra,rb,rc,rd,re,rf,rg) /* a genuine recursive call; num is not used in this variant */
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	207	#define RRETURN(ra) return ra /* an ordinary function return */
				208	#else
				209
				210
				211	/* These versions of the macros manage a private stack on the heap. Note
				212	that the rd argument of RMATCH isn't actually used. It's the md argument of
				213	match(), which never changes. */
				214
				215	#define REGISTER
				216
darin	ce72b7a	2007-02-06 19:42:35 +0000	[diff] [blame]	217	#ifndef __GNUC__
				218
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	219	/* Use numbered labels and switch statement at the bottom of the match function. */
darin	ce72b7a	2007-02-06 19:42:35 +0000	[diff] [blame]	220
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	221	#define RMATCH_WHERE(num) num /* the resume point is stored as the plain number */
				222	#define RRETURN_LABEL RRETURN_SWITCH /* RRETURN jumps to the RRETURN_SWITCH label */
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	223
darin	ce72b7a	2007-02-06 19:42:35 +0000	[diff] [blame]	224	#else
				225
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	226	/* Use GCC's computed goto extension. */
darin	ce72b7a	2007-02-06 19:42:35 +0000	[diff] [blame]	227
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	228	/* For one test case this is more than 40% faster than the switch statement.
				229	We could avoid the use of the num argument entirely by using local labels,
				230	but using it for the GCC case as well as the non-GCC case allows us to share
				231	a bit more code and notice if we use conflicting numbers.*/
				232
				233	#define RMATCH_WHERE(num) &&RRETURN_##num /* address of the RRETURN_<num> label that RMATCH(num, ...) defines */
				234	#define RRETURN_LABEL *frame->Xwhere /* RRETURN jumps straight to the label address saved in the frame */
				235
				236	#endif
				237
				238
				239	#define RMATCH(num,rx,ra,rb,rc,rd,re,rf,rg)\
darin	ce72b7a	2007-02-06 19:42:35 +0000	[diff] [blame]	240	{\
darin	ce72b7a	2007-02-06 19:42:35 +0000	[diff] [blame]	241	heapframe *newframe;\
				242	if (frame >= stackframes && frame + 1 < stackframesend)\
				243	newframe = frame + 1; /* use the next slot of the stackframes array while one is left */\
				244	else if ((newframe = (pcre_stack_malloc)(sizeof(heapframe))) == NULL)\
				245	RRETURN(PCRE_ERROR_NOMEMORY); /* allocation failed: unwind with an error instead of writing through NULL */\
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	246	frame->Xwhere = RMATCH_WHERE(num); /* where the current frame resumes after the call */\
darin	ce72b7a	2007-02-06 19:42:35 +0000	[diff] [blame]	247	newframe->Xeptr = ra;\
				248	newframe->Xecode = rb;\
				249	newframe->Xoffset_top = rc;\
				250	newframe->Xims = re; /* rd (md) is deliberately not copied: it never changes */\
				251	newframe->Xeptrb = rf;\
				252	newframe->Xflags = rg;\
				253	newframe->Xprevframe = frame;\
				254	frame = newframe;\
				255	DPRINTF(("restarting from line %d\n", __LINE__));\
				256	goto HEAP_RECURSE;\
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	257	RRETURN_##num:\
darin	ce72b7a	2007-02-06 19:42:35 +0000	[diff] [blame]	258	DPRINTF(("did a goto back to line %d\n", __LINE__));\
				259	frame = md->thisframe; /* RRETURN left the resumed frame here */\
				260	rx = frame->Xresult; /* result of the simulated call */\
				261	}
				262
				263	#define RRETURN(ra)\
				264	{\
				265	heapframe *newframe = frame; /* the frame that is being left */\
				266	frame = newframe->Xprevframe; /* back to the frame that did the RMATCH */\
				267	if (!(newframe >= stackframes && newframe < stackframesend))\
				268	(pcre_stack_free)(newframe); /* only frames outside the stackframes array are freed */\
				269	if (frame != NULL)\
				270	{\
				271	frame->Xresult = ra; /* hand the result to the resumed frame */\
				272	md->thisframe = frame; /* RMATCH reloads frame from here after the jump */\
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	273	goto RRETURN_LABEL;\
darin	ce72b7a	2007-02-06 19:42:35 +0000	[diff] [blame]	274	}\
				275	return ra; /* Xprevframe was NULL: this was the top level, so really return */\
				276	}
				277
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	278	/* Structure for remembering the local variables in a private frame */
				279
				280	typedef struct heapframe {
				281	struct heapframe *Xprevframe; /* frame that issued the RMATCH; NULL marks the top level */
				282
				283	/* Function arguments that may change */
				284
				285	const pcre_uchar *Xeptr;
				286	const uschar *Xecode;
				287	int Xoffset_top;
				288	unsigned long int Xims; /* same type as the ims argument of match() and as Xoriginal_ims */
				289	eptrblock *Xeptrb;
				290	int Xflags;
				291
				292	/* Function local variables */
				293
				294	const uschar *Xcallpat;
				295	const uschar *Xcharptr;
				296	const uschar *Xdata;
				297	const uschar *Xnext;
				298	const pcre_uchar *Xpp;
				299	const uschar *Xprev;
				300	const pcre_uchar *Xsaved_eptr;
				301
				302	recursion_info Xnew_recursive;
				303
				304	BOOL Xcur_is_word;
				305	BOOL Xcondition;
				306	BOOL Xminimize;
				307	BOOL Xprev_is_word;
				308
				309	unsigned long int Xoriginal_ims; /* value of ims on entry, saved for resetting on ')' */
				310
				311	#ifdef SUPPORT_UCP
				312	int Xprop_type;
				313	int Xprop_fail_result;
				314	int Xprop_category;
				315	int Xprop_chartype;
				316	int Xprop_othercase;
				317	int Xprop_test_against;
				318	int *Xprop_test_variable;
darin	8bff71f	2007-02-07 20:02:50 +0000	[diff] [blame]	319
				320	int Xrepeat_othercase;
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	321	#endif
				322
				323	int Xctype;
				324	int Xfc;
				325	int Xfi;
				326	int Xlength;
				327	int Xmax;
				328	int Xmin;
				329	int Xnumber;
				330	int Xoffset;
				331	int Xop;
				332	int Xsave_capture_last;
				333	int Xsave_offset1, Xsave_offset2, Xsave_offset3;
				334	int Xstacksave[REC_STACK_SAVE_MAX];
				335
				336	eptrblock Xnewptrb; /* block linked onto the eptrb chain when the frame starts a bracketed group */
				337
				338	/* Place to pass back result, and where to jump back to */
				339
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	340	int Xresult; /* written by RRETURN, read by RMATCH after the jump back */
darin	ce72b7a	2007-02-06 19:42:35 +0000	[diff] [blame]	341	#ifndef __GNUC__
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	342	int Xwhere; /* the num given to RMATCH */
darin	ce72b7a	2007-02-06 19:42:35 +0000	[diff] [blame]	343	#else
				344	void *Xwhere; /* address of the RRETURN_<num> label */
				345	#endif
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	346
				347	} heapframe;
				348
				349	#endif
				350
				351
				352	/***************************************************************************
				353	***************************************************************************/
				354
				355
				356
				357	/*************************************************
				358	* Match from current position *
				359	*************************************************/
				360
				361	/* On entry ecode points to the first opcode, and eptr to the first character
				362	in the subject string, while eptrb holds the value of eptr at the start of the
				363	last bracketed group - used for breaking infinite loops matching zero-length
				364	strings. This function is called recursively in many circumstances. Whenever it
				365	returns a negative (error) response, the outer incarnation must also return the
				366	same response.
				367
				368	Performance note: It might be tempting to extract commonly used fields from the
				369	md structure (e.g. utf8, end_subject) into individual variables to improve
				370	performance. Tests using gcc on a SPARC disproved this; in the first case, it
				371	made performance worse.
				372
				373	Arguments:
				374	eptr pointer in subject
				375	ecode position in code
				376	offset_top current top pointer
				377	md pointer to "static" info for the match
				378	ims current /i, /m, and /s options
				379	eptrb pointer to chain of blocks containing eptr at start of
				380	brackets - for testing for empty matches
				381	flags can contain
				382	match_condassert - this is an assertion condition
				383	match_isgroup - this is the start of a bracketed group
				384
				385	Returns: MATCH_MATCH if matched ) these values are >= 0
				386	MATCH_NOMATCH if failed to match )
				387	a negative PCRE_ERROR_xxx value if aborted by an error condition
				388	(e.g. stopped by recursion limit)
				389	*/
				390
				391	static int
				392	match(REGISTER const pcre_uchar eptr, REGISTER const uschar ecode,
				393	int offset_top, match_data md, unsigned long int ims, eptrblock eptrb,
				394	int flags)
				395	{
				396	/* These variables do not need to be preserved over recursion in this function,
				397	so they can be ordinary variables in all cases. Mark them with "register"
				398	because they are used a lot in loops. */
				399
				400	register int rrc; /* Returns from recursive calls */
				401	register int i; /* Used for loops not involving calls to RMATCH() */
				402	register int c; /* Character values not kept over RMATCH() calls */
				403	register BOOL utf8; /* Local copy of UTF-8 flag for speed */
				404
				405	/* When recursion is not being used, all "local" variables that have to be
				406	preserved over calls to RMATCH() are part of a "frame" which is obtained from
				407	heap storage. Set up the top-level frame here; others are obtained from the
				408	heap whenever RMATCH() does a "recursion". See the macro definitions above. */
				409
				410	#ifdef NO_RECURSE
darin	ce72b7a	2007-02-06 19:42:35 +0000	[diff] [blame]	411
				412	/* The value 16 here is large enough that most regular expressions don't require
				413	any calls to pcre_stack_malloc, yet the amount of stack used for the array is
				414	modest enough that we don't run out of stack. */
				415	heapframe stackframes[16];
				416	heapframe *stackframesend = stackframes + sizeof(stackframes) / sizeof(stackframes[0]);
				417
				418	heapframe *frame = stackframes;
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	419	frame->Xprevframe = NULL; /* Marks the top level */
				420
				421	/* Copy in the original argument variables */
				422
				423	frame->Xeptr = eptr;
				424	frame->Xecode = ecode;
				425	frame->Xoffset_top = offset_top;
				426	frame->Xims = ims;
				427	frame->Xeptrb = eptrb;
				428	frame->Xflags = flags;
				429
				430	/* This is where control jumps back in order to effect "recursion" */
				431
				432	HEAP_RECURSE:
				433
				434	/* Macros make the argument variables come from the current frame */
				435
				436	#define eptr frame->Xeptr
				437	#define ecode frame->Xecode
				438	#define offset_top frame->Xoffset_top
				439	#define ims frame->Xims
				440	#define eptrb frame->Xeptrb
				441	#define flags frame->Xflags
				442
				443	/* Ditto for the local variables */
				444
				445	#ifdef SUPPORT_UTF8
				446	#define charptr frame->Xcharptr
				447	#endif
				448	#define callpat frame->Xcallpat
				449	#define data frame->Xdata
				450	#define next frame->Xnext
				451	#define pp frame->Xpp
				452	#define prev frame->Xprev
				453	#define saved_eptr frame->Xsaved_eptr
				454
				455	#define new_recursive frame->Xnew_recursive
				456
				457	#define cur_is_word frame->Xcur_is_word
				458	#define condition frame->Xcondition
				459	#define minimize frame->Xminimize
				460	#define prev_is_word frame->Xprev_is_word
				461
				462	#define original_ims frame->Xoriginal_ims
				463
				464	#ifdef SUPPORT_UCP
darin	8bff71f	2007-02-07 20:02:50 +0000	[diff] [blame]	465
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	466	#define prop_type frame->Xprop_type
				467	#define prop_fail_result frame->Xprop_fail_result
				468	#define prop_category frame->Xprop_category
				469	#define prop_chartype frame->Xprop_chartype
				470	#define prop_othercase frame->Xprop_othercase
				471	#define prop_test_against frame->Xprop_test_against
				472	#define prop_test_variable frame->Xprop_test_variable
darin	8bff71f	2007-02-07 20:02:50 +0000	[diff] [blame]	473
				474	#define repeat_othercase frame->Xrepeat_othercase
				475
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	476	#endif
				477
				478	#define ctype frame->Xctype
				479	#define fc frame->Xfc
				480	#define fi frame->Xfi
				481	#define length frame->Xlength
				482	#define max frame->Xmax
				483	#define min frame->Xmin
				484	#define number frame->Xnumber
				485	#define offset frame->Xoffset
				486	#define op frame->Xop
				487	#define save_capture_last frame->Xsave_capture_last
				488	#define save_offset1 frame->Xsave_offset1
				489	#define save_offset2 frame->Xsave_offset2
				490	#define save_offset3 frame->Xsave_offset3
				491	#define stacksave frame->Xstacksave
				492
				493	#define newptrb frame->Xnewptrb
				494
				495	/* When recursion is being used, local variables are allocated on the stack and
				496	get preserved during recursion in the normal way. In this environment, fi and
				497	i, and fc and c, can be the same variables. */
				498
				499	#else
				500	#define fi i
				501	#define fc c
				502
				503
				504	#if !PCRE_UTF16
				505	#ifdef SUPPORT_UTF8 /* Many of these variables are used ony */
				506	const uschar charptr; / small blocks of the code. My normal */
				507	#endif /* style of coding would have declared */
				508	#endif
				509	const uschar callpat; / them within each of those blocks. */
				510	const uschar data; / However, in order to accommodate the */
				511	const uschar next; / version of this code that uses an */
				512	const pcre_uchar pp; / external "stack" implemented on the */
				513	const uschar prev; / heap, it is easier to declare them */
				514	const pcre_uchar saved_eptr; / all here, so the declarations can */
				515	/* be cut out in a block. The only */
				516	recursion_info new_recursive; /* declarations within blocks below are */
				517	/* for variables that do not have to */
				518	BOOL cur_is_word; /* be preserved over a recursive call */
				519	BOOL condition; /* to RMATCH(). */
				520	BOOL minimize;
				521	BOOL prev_is_word;
				522
				523	unsigned long int original_ims;
				524
				525	#ifdef SUPPORT_UCP
darin	8bff71f	2007-02-07 20:02:50 +0000	[diff] [blame]	526
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	527	int prop_type;
				528	int prop_fail_result;
				529	int prop_category;
				530	int prop_chartype;
				531	int prop_othercase;
				532	int prop_test_against;
				533	int *prop_test_variable;
darin	8bff71f	2007-02-07 20:02:50 +0000	[diff] [blame]	534
				535	int repeat_othercase;
				536
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	537	#endif
				538
				539	int ctype;
				540	int length;
				541	int max;
				542	int min;
				543	int number;
				544	int offset;
				545	int op;
				546	int save_capture_last;
				547	int save_offset1, save_offset2, save_offset3;
				548	int stacksave[REC_STACK_SAVE_MAX];
				549
				550	eptrblock newptrb;
				551	#endif
				552
				553	/* These statements are here to stop the compiler complaining about uninitialized
				554	variables. */
				555
				556	#ifdef SUPPORT_UCP
				557	prop_fail_result = 0;
				558	prop_test_against = 0;
				559	prop_test_variable = NULL;
				560	#endif
				561
				562	/* OK, now we can get on with the real code of the function. Recursion is
				563	specified by the macros RMATCH and RRETURN. When NO_RECURSE is not defined,
				564	these just turn into a recursive call to match() and a "return", respectively.
				565	However, RMATCH isn't like a function call because it's quite a complicated
				566	macro. It has to be used in one particular way. This shouldn't, however, impact
				567	performance when true recursion is being used. */
				568
darin	ce72b7a	2007-02-06 19:42:35 +0000	[diff] [blame]	569	utf8 = md->utf8; /* Local copy of the flag */
				570
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	571	if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
				572
				573	original_ims = ims; /* Save for resetting on ')' */
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	574
				575	/* At the start of a bracketed group, add the current subject pointer to the
				576	stack of such pointers, to be re-instated at the end of the group when we hit
				577	the closing ket. When match() is called in other circumstances, we don't add to
				578	this stack. */
				579
				580	if ((flags & match_isgroup) != 0)
				581	{
				582	newptrb.epb_prev = eptrb;
				583	newptrb.epb_saved_eptr = eptr;
				584	eptrb = &newptrb;
				585	}
				586
				587	/* Now start processing the operations. */
				588
				589	for (;;)
				590	{
				591	op = *ecode;
				592	minimize = FALSE;
				593
				594	/* For partial matching, remember if we ever hit the end of the subject after
				595	matching at least one subject character. */
				596
				597	if (md->partial &&
				598	eptr >= md->end_subject &&
				599	eptr > md->start_match)
				600	md->hitend = TRUE;
				601
				602	/* Opening capturing bracket. If there is space in the offset vector, save
				603	the current subject position in the working slot at the top of the vector. We
				604	mustn't change the current values of the data slot, because they may be set
				605	from a previous iteration of this group, and be referred to by a reference
				606	inside the group.
				607
				608	If the bracket fails to match, we need to restore this value and also the
				609	values of the final offsets, in case they were set by a previous iteration of
				610	the same bracket.
				611
				612	If there isn't enough space in the offset vector, treat this as if it were a
				613	non-capturing bracket. Don't worry about setting the flag for the error case
				614	here; that is handled in the code for KET. */
				615
				616	if (op > OP_BRA)
				617	{
				618	number = op - OP_BRA;
				619
				620	/* For extended extraction brackets (large number), we have to fish out the
				621	number from a dummy opcode at the start. */
				622
				623	if (number > EXTRACT_BASIC_MAX)
				624	number = GET2(ecode, 2+LINK_SIZE);
				625	offset = number << 1;
				626
				627	#ifdef DEBUG
				628	printf("start bracket %d subject=", number);
				629	pchars(eptr, 16, TRUE, md);
				630	printf("\n");
				631	#endif
				632
				633	if (offset < md->offset_max)
				634	{
				635	save_offset1 = md->offset_vector[offset];
				636	save_offset2 = md->offset_vector[offset+1];
				637	save_offset3 = md->offset_vector[md->offset_end - number];
				638	save_capture_last = md->capture_last;
				639
				640	DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
thatcher	dc18a36	2006-08-31 21:28:29 +0000	[diff] [blame]	641	md->offset_vector[md->offset_end - number] = INT_CAST(eptr - md->start_subject);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	642
				643	do
				644	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	645	RMATCH(1, rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	646	match_isgroup);
				647	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				648	md->capture_last = save_capture_last;
				649	ecode += GET(ecode, 1);
				650	}
				651	while (*ecode == OP_ALT);
				652
				653	DPRINTF(("bracket %d failed\n", number));
				654
				655	md->offset_vector[offset] = save_offset1;
				656	md->offset_vector[offset+1] = save_offset2;
				657	md->offset_vector[md->offset_end - number] = save_offset3;
				658
				659	RRETURN(MATCH_NOMATCH);
				660	}
				661
				662	/* Insufficient room for saving captured contents */
				663
				664	else op = OP_BRA;
				665	}
				666
				667	/* Other types of node can be handled by a switch */
				668
				669	switch(op)
				670	{
				671	case OP_BRA: /* Non-capturing bracket: optimized */
				672	DPRINTF(("start bracket 0\n"));
				673	do
				674	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	675	RMATCH(2, rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	676	match_isgroup);
				677	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				678	ecode += GET(ecode, 1);
				679	}
				680	while (*ecode == OP_ALT);
				681	DPRINTF(("bracket 0 failed\n"));
				682	RRETURN(MATCH_NOMATCH);
				683
				684	/* Conditional group: compilation checked that there are no more than
				685	two branches. If the condition is false, skipping the first branch takes us
				686	past the end if there is only one branch, but that's OK because that is
				687	exactly what going to the ket would do. */
				688
				689	case OP_COND:
				690	if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
				691	{
				692	offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
				693	condition = (offset == CREF_RECURSE * 2)?
				694	(md->recursive != NULL) :
				695	(offset < offset_top && md->offset_vector[offset] >= 0);
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	696	RMATCH(3, rrc, eptr, ecode + (condition?
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	697	(LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
				698	offset_top, md, ims, eptrb, match_isgroup);
				699	RRETURN(rrc);
				700	}
				701
				702	/* The condition is an assertion. Call match() to evaluate it - setting
				703	the final argument TRUE causes it to stop at the end of an assertion. */
				704
				705	else
				706	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	707	RMATCH(4, rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	708	match_condassert \| match_isgroup);
				709	if (rrc == MATCH_MATCH)
				710	{
				711	ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
				712	while (*ecode == OP_ALT) ecode += GET(ecode, 1);
				713	}
				714	else if (rrc != MATCH_NOMATCH)
				715	{
				716	RRETURN(rrc); /* Need braces because of following else */
				717	}
				718	else ecode += GET(ecode, 1);
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	719	RMATCH(5, rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	720	match_isgroup);
				721	RRETURN(rrc);
				722	}
				723	/* Control never reaches here */
				724
				725	/* Skip over conditional reference or large extraction number data if
				726	encountered. */
				727
				728	case OP_CREF:
				729	case OP_BRANUMBER:
				730	ecode += 3;
				731	break;
				732
				733	/* End of the pattern. If we are in a recursion, we should restore the
				734	offsets appropriately and continue from after the call. */
				735
				736	case OP_END:
				737	if (md->recursive != NULL && md->recursive->group_num == 0)
				738	{
				739	recursion_info *rec = md->recursive;
				740	DPRINTF(("Hit the end in a (?0) recursion\n"));
				741	md->recursive = rec->prevrec;
				742	memmove(md->offset_vector, rec->offset_save,
				743	rec->saved_max * sizeof(int));
				744	md->start_match = rec->save_start;
				745	ims = original_ims;
				746	ecode = rec->after_call;
				747	break;
				748	}
				749
				750	/* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
				751	string - backtracking will then try other alternatives, if any. */
				752
				753	if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
				754	md->end_match_ptr = eptr; /* Record where we ended */
				755	md->end_offset_top = offset_top; /* and how many extracts were taken */
				756	RRETURN(MATCH_MATCH);
				757
				758	/* Change option settings */
				759
				760	case OP_OPT:
				761	ims = ecode[1];
				762	ecode += 2;
				763	DPRINTF(("ims set to %02lx\n", ims));
				764	break;
				765
				766	/* Assertion brackets. Check the alternative branches in turn - the
				767	matching won't pass the KET for an assertion. If any one branch matches,
				768	the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
				769	start of each branch to move the current point backwards, so the code at
				770	this level is identical to the lookahead case. */
				771
				772	case OP_ASSERT:
				773	case OP_ASSERTBACK:
				774	do
				775	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	776	RMATCH(6, rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	777	match_isgroup);
				778	if (rrc == MATCH_MATCH) break;
				779	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				780	ecode += GET(ecode, 1);
				781	}
				782	while (*ecode == OP_ALT);
				783	if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
				784
				785	/* If checking an assertion for a condition, return MATCH_MATCH. */
				786
				787	if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
				788
				789	/* Continue from after the assertion, updating the offsets high water
				790	mark, since extracts may have been taken during the assertion. */
				791
				792	do ecode += GET(ecode,1); while (*ecode == OP_ALT);
				793	ecode += 1 + LINK_SIZE;
				794	offset_top = md->end_offset_top;
				795	continue;
				796
				797	/* Negative assertion: all branches must fail to match */
				798
				799	case OP_ASSERT_NOT:
				800	case OP_ASSERTBACK_NOT:
				801	do
				802	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	803	RMATCH(7, rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	804	match_isgroup);
				805	if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
				806	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				807	ecode += GET(ecode,1);
				808	}
				809	while (*ecode == OP_ALT);
				810
				811	if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
				812
				813	ecode += 1 + LINK_SIZE;
				814	continue;
				815
				816	/* Move the subject pointer back. This occurs only at the start of
				817	each branch of a lookbehind assertion. If we are too close to the start to
				818	move back, this match function fails. When working with UTF-8 we move
				819	back a number of characters, not bytes. */
				820
				821	case OP_REVERSE:
				822	#ifdef SUPPORT_UTF8
				823	if (utf8)
				824	{
				825	c = GET(ecode,1);
				826	for (i = 0; i < c; i++)
				827	{
				828	eptr--;
				829	if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
				830	BACKCHAR(eptr)
				831	}
				832	}
				833	else
				834	#endif
				835
				836	/* No UTF-8 support, or not in UTF-8 mode: count is byte count */
				837
				838	{
				839	eptr -= GET(ecode,1);
				840	if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
				841	}
				842
				843	/* Skip to next op code */
				844
				845	ecode += 1 + LINK_SIZE;
				846	break;
				847
				848	/* The callout item calls an external function, if one is provided, passing
				849	details of the match so far. This is mainly for debugging, though the
				850	function is able to force a failure. */
				851
				852	case OP_CALLOUT:
				853	if (pcre_callout != NULL)
				854	{
				855	pcre_callout_block cb;
				856	cb.version = 1; /* Version 1 of the callout block */
				857	cb.callout_number = ecode[1];
				858	cb.offset_vector = md->offset_vector;
				859	cb.subject = (const pcre_char *)md->start_subject;
thatcher	dc18a36	2006-08-31 21:28:29 +0000	[diff] [blame]	860	cb.subject_length = INT_CAST(md->end_subject - md->start_subject);
				861	cb.start_match = INT_CAST(md->start_match - md->start_subject);
				862	cb.current_position = INT_CAST(eptr - md->start_subject);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	863	cb.pattern_position = GET(ecode, 2);
				864	cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
				865	cb.capture_top = offset_top/2;
				866	cb.capture_last = md->capture_last;
				867	cb.callout_data = md->callout_data;
				868	if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
				869	if (rrc < 0) RRETURN(rrc);
				870	}
				871	ecode += 2 + 2*LINK_SIZE;
				872	break;
				873
				874	/* Recursion either matches the current regex, or some subexpression. The
				875	offset data is the offset to the starting bracket from the start of the
				876	whole pattern. (This is so that it works from duplicated subpatterns.)
				877
				878	If there are any capturing brackets started but not finished, we have to
				879	save their starting points and reinstate them after the recursion. However,
				880	we don't know how many such there are (offset_top records the completed
				881	total) so we just have to save all the potential data. There may be up to
				882	65535 such values, which is too large to put on the stack, but using malloc
				883	for small numbers seems expensive. As a compromise, the stack is used when
				884	there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
				885	is used. A problem is what to do if the malloc fails ... there is no way of
				886	returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
				887	values on the stack, and accept that the rest may be wrong.
				888
				889	There are also other values that have to be saved. We use a chained
				890	sequence of blocks that actually live on the stack. Thanks to Robin Houston
				891	for the original version of this logic. */
				892
				893	case OP_RECURSE:
				894	{
				895	callpat = md->start_code + GET(ecode, 1);
				896	new_recursive.group_num = *callpat - OP_BRA;
				897
				898	/* For extended extraction brackets (large number), we have to fish out
				899	the number from a dummy opcode at the start. */
				900
				901	if (new_recursive.group_num > EXTRACT_BASIC_MAX)
				902	new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
				903
				904	/* Add to "recursing stack" */
				905
				906	new_recursive.prevrec = md->recursive;
				907	md->recursive = &new_recursive;
				908
				909	/* Find where to continue from afterwards */
				910
				911	ecode += 1 + LINK_SIZE;
				912	new_recursive.after_call = ecode;
				913
				914	/* Now save the offset data. */
				915
				916	new_recursive.saved_max = md->offset_end;
				917	if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
				918	new_recursive.offset_save = stacksave;
				919	else
				920	{
				921	new_recursive.offset_save =
				922	(int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
				923	if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
				924	}
				925
				926	memcpy(new_recursive.offset_save, md->offset_vector,
				927	new_recursive.saved_max * sizeof(int));
				928	new_recursive.save_start = md->start_match;
				929	md->start_match = eptr;
				930
				931	/* OK, now we can do the recursion. For each top-level alternative we
				932	restore the offset and recursion data. */
				933
				934	DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
				935	do
				936	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	937	RMATCH(8, rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	938	eptrb, match_isgroup);
				939	if (rrc == MATCH_MATCH)
				940	{
				941	md->recursive = new_recursive.prevrec;
				942	if (new_recursive.offset_save != stacksave)
				943	(pcre_free)(new_recursive.offset_save);
				944	RRETURN(MATCH_MATCH);
				945	}
				946	else if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				947
				948	md->recursive = &new_recursive;
				949	memcpy(md->offset_vector, new_recursive.offset_save,
				950	new_recursive.saved_max * sizeof(int));
				951	callpat += GET(callpat, 1);
				952	}
				953	while (*callpat == OP_ALT);
				954
				955	DPRINTF(("Recursion didn't match\n"));
				956	md->recursive = new_recursive.prevrec;
				957	if (new_recursive.offset_save != stacksave)
				958	(pcre_free)(new_recursive.offset_save);
				959	RRETURN(MATCH_NOMATCH);
				960	}
				961	/* Control never reaches here */
				962
				963	/* "Once" brackets are like assertion brackets except that after a match,
				964	the point in the subject string is not moved back. Thus there can never be
				965	a move back into the brackets. Friedl calls these "atomic" subpatterns.
				966	Check the alternative branches in turn - the matching won't pass the KET
				967	for this kind of subpattern. If any one branch matches, we carry on as at
				968	the end of a normal bracket, leaving the subject pointer. */
				969
				970	case OP_ONCE:
				971	{
				972	prev = ecode;
				973	saved_eptr = eptr;
				974
				975	do
				976	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	977	RMATCH(9, rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	978	eptrb, match_isgroup);
				979	if (rrc == MATCH_MATCH) break;
				980	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				981	ecode += GET(ecode,1);
				982	}
				983	while (*ecode == OP_ALT);
				984
				985	/* If hit the end of the group (which could be repeated), fail */
				986
				987	if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
				988
				989	/* Continue as from after the assertion, updating the offsets high water
				990	mark, since extracts may have been taken. */
				991
				992	do ecode += GET(ecode,1); while (*ecode == OP_ALT);
				993
				994	offset_top = md->end_offset_top;
				995	eptr = md->end_match_ptr;
				996
				997	/* For a non-repeating ket, just continue at this level. This also
				998	happens for a repeating ket if no characters were matched in the group.
				999	This is the forcible breaking of infinite loops as implemented in Perl
				1000	5.005. If there is an options reset, it will get obeyed in the normal
				1001	course of events. */
				1002
				1003	if (*ecode == OP_KET || eptr == saved_eptr)
				1004	{
				1005	ecode += 1+LINK_SIZE;
				1006	break;
				1007	}
				1008
				1009	/* The repeating kets try the rest of the pattern or restart from the
				1010	preceding bracket, in the appropriate order. We need to reset any options
				1011	that changed within the bracket before re-running it, so check the next
				1012	opcode. */
				1013
				1014	if (ecode[1+LINK_SIZE] == OP_OPT)
				1015	{
				1016	ims = (ims & ~PCRE_IMS) | ecode[4];
				1017	DPRINTF(("ims set to %02lx at group repeat\n", ims));
				1018	}
				1019
				1020	if (*ecode == OP_KETRMIN)
				1021	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	1022	RMATCH(10, rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1023	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	1024	RMATCH(11, rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1025	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				1026	}
				1027	else /* OP_KETRMAX */
				1028	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	1029	RMATCH(12, rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1030	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	1031	RMATCH(13, rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1032	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				1033	}
				1034	}
				1035	RRETURN(MATCH_NOMATCH);
				1036
				1037	/* An alternation is the end of a branch; scan along to find the end of the
				1038	bracketed group and go to there. */
				1039
				1040	case OP_ALT:
				1041	do ecode += GET(ecode,1); while (*ecode == OP_ALT);
				1042	break;
				1043
				1044	/* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
				1045	that it may occur zero times. It may repeat infinitely, or not at all -
				1046	i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
				1047	repeat limits are compiled as a number of copies, with the optional ones
				1048	preceded by BRAZERO or BRAMINZERO. */
				1049
				1050	case OP_BRAZERO:
				1051	{
				1052	next = ecode+1;
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	1053	RMATCH(14, rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1054	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				1055	do next += GET(next,1); while (*next == OP_ALT);
				1056	ecode = next + 1+LINK_SIZE;
				1057	}
				1058	break;
				1059
				1060	case OP_BRAMINZERO:
				1061	{
				1062	next = ecode+1;
				1063	do next += GET(next,1); while (*next == OP_ALT);
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	1064	RMATCH(15, rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1065	match_isgroup);
				1066	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				1067	ecode++;
				1068	}
				1069	break;
				1070
				1071	/* End of a group, repeated or non-repeating. If we are at the end of
				1072	an assertion "group", stop matching and return MATCH_MATCH, but record the
				1073	current high water mark for use by positive assertions. Do this also
				1074	for the "once" (non-backtracking) groups. */
				1075
				1076	case OP_KET:
				1077	case OP_KETRMIN:
				1078	case OP_KETRMAX:
				1079	{
				1080	prev = ecode - GET(ecode, 1);
				1081	saved_eptr = eptrb->epb_saved_eptr;
				1082
				1083	/* Back up the stack of bracket start pointers. */
				1084
				1085	eptrb = eptrb->epb_prev;
				1086
				1087	if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
				1088	*prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
				1089	*prev == OP_ONCE)
				1090	{
				1091	md->end_match_ptr = eptr; /* For ONCE */
				1092	md->end_offset_top = offset_top;
				1093	RRETURN(MATCH_MATCH);
				1094	}
				1095
				1096	/* In all other cases except a conditional group we have to check the
				1097	group number back at the start and if necessary complete handling an
				1098	extraction by setting the offsets and bumping the high water mark. */
				1099
				1100	if (*prev != OP_COND)
				1101	{
				1102	number = *prev - OP_BRA;
				1103
				1104	/* For extended extraction brackets (large number), we have to fish out
				1105	the number from a dummy opcode at the start. */
				1106
				1107	if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
				1108	offset = number << 1;
				1109
				1110	#ifdef DEBUG
				1111	printf("end bracket %d", number);
				1112	printf("\n");
				1113	#endif
				1114
				1115	/* Test for a numbered group. This includes groups called as a result
				1116	of recursion. Note that whole-pattern recursion is coded as a recurse
				1117	into group 0, so it won't be picked up here. Instead, we catch it when
				1118	the OP_END is reached. */
				1119
				1120	if (number > 0)
				1121	{
				1122	md->capture_last = number;
				1123	if (offset >= md->offset_max) md->offset_overflow = TRUE; else
				1124	{
				1125	md->offset_vector[offset] =
				1126	md->offset_vector[md->offset_end - number];
thatcher	dc18a36	2006-08-31 21:28:29 +0000	[diff] [blame]	1127	md->offset_vector[offset+1] = INT_CAST(eptr - md->start_subject);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1128	if (offset_top <= offset) offset_top = offset + 2;
				1129	}
				1130
				1131	/* Handle a recursively called group. Restore the offsets
				1132	appropriately and continue from after the call. */
				1133
				1134	if (md->recursive != NULL && md->recursive->group_num == number)
				1135	{
				1136	recursion_info *rec = md->recursive;
				1137	DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
				1138	md->recursive = rec->prevrec;
				1139	md->start_match = rec->save_start;
				1140	memcpy(md->offset_vector, rec->offset_save,
				1141	rec->saved_max * sizeof(int));
				1142	ecode = rec->after_call;
				1143	ims = original_ims;
				1144	break;
				1145	}
				1146	}
				1147	}
				1148
				1149	/* Reset the value of the ims flags, in case they got changed during
				1150	the group. */
				1151
				1152	ims = original_ims;
				1153	DPRINTF(("ims reset to %02lx\n", ims));
				1154
				1155	/* For a non-repeating ket, just continue at this level. This also
				1156	happens for a repeating ket if no characters were matched in the group.
				1157	This is the forcible breaking of infinite loops as implemented in Perl
				1158	5.005. If there is an options reset, it will get obeyed in the normal
				1159	course of events. */
				1160
				1161	if (*ecode == OP_KET || eptr == saved_eptr)
				1162	{
				1163	ecode += 1 + LINK_SIZE;
				1164	break;
				1165	}
				1166
				1167	/* The repeating kets try the rest of the pattern or restart from the
				1168	preceding bracket, in the appropriate order. */
				1169
				1170	if (*ecode == OP_KETRMIN)
				1171	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	1172	RMATCH(16, rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1173	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	1174	RMATCH(17, rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1175	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				1176	}
				1177	else /* OP_KETRMAX */
				1178	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	1179	RMATCH(18, rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1180	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	1181	RMATCH(19, rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1182	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				1183	}
				1184	}
				1185
				1186	RRETURN(MATCH_NOMATCH);
				1187
				1188	/* Start of subject unless notbol, or after internal newline if multiline */
				1189
				1190	case OP_CIRC:
				1191	if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
				1192	if ((ims & PCRE_MULTILINE) != 0)
				1193	{
				1194	if (eptr != md->start_subject && eptr[-1] != NEWLINE)
				1195	RRETURN(MATCH_NOMATCH);
				1196	ecode++;
				1197	break;
				1198	}
				1199	/* ... else fall through */
				1200
				1201	/* Start of subject assertion */
				1202
				1203	case OP_SOD:
				1204	if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
				1205	ecode++;
				1206	break;
				1207
				1208	/* Start of match assertion */
				1209
				1210	case OP_SOM:
				1211	if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
				1212	ecode++;
				1213	break;
				1214
				1215	/* Assert before internal newline if multiline, or before a terminating
				1216	newline unless endonly is set, else end of subject unless noteol is set. */
				1217
				1218	case OP_DOLL:
				1219	if ((ims & PCRE_MULTILINE) != 0)
				1220	{
				1221	if (eptr < md->end_subject)
				1222	{ if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
				1223	else
				1224	{ if (md->noteol) RRETURN(MATCH_NOMATCH); }
				1225	ecode++;
				1226	break;
				1227	}
				1228	else
				1229	{
				1230	if (md->noteol) RRETURN(MATCH_NOMATCH);
				1231	if (!md->endonly)
				1232	{
				1233	if (eptr < md->end_subject - 1 ||
				1234	(eptr == md->end_subject - 1 && *eptr != NEWLINE))
				1235	RRETURN(MATCH_NOMATCH);
				1236	ecode++;
				1237	break;
				1238	}
				1239	}
				1240	/* ... else fall through */
				1241
				1242	/* End of subject assertion (\z) */
				1243
				1244	case OP_EOD:
				1245	if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
				1246	ecode++;
				1247	break;
				1248
				1249	/* End of subject or ending \n assertion (\Z) */
				1250
				1251	case OP_EODN:
				1252	if (eptr < md->end_subject - 1 ||
				1253	(eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
				1254	ecode++;
				1255	break;
				1256
				1257	/* Word boundary assertions */
				1258
				1259	case OP_NOT_WORD_BOUNDARY:
				1260	case OP_WORD_BOUNDARY:
				1261	{
				1262
				1263	/* Find out if the previous and current characters are "word" characters.
				1264	It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
				1265	be "non-word" characters. */
				1266
				1267	#ifdef SUPPORT_UTF8
				1268	if (utf8)
				1269	{
				1270	if (eptr == md->start_subject) prev_is_word = FALSE; else
				1271	{
				1272	const pcre_uchar *lastptr = eptr - 1;
				1273	while(ISMIDCHAR(*lastptr)) lastptr--;
				1274	GETCHAR(c, lastptr);
				1275	prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
				1276	}
				1277	if (eptr >= md->end_subject) cur_is_word = FALSE; else
				1278	{
				1279	GETCHAR(c, eptr);
				1280	cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
				1281	}
				1282	}
				1283	else
				1284	#endif
				1285
				1286	/* More streamlined when not in UTF-8 mode */
				1287
				1288	{
				1289	prev_is_word = (eptr != md->start_subject) &&
				1290	((md->ctypes[eptr[-1]] & ctype_word) != 0);
				1291	cur_is_word = (eptr < md->end_subject) &&
				1292	((md->ctypes[*eptr] & ctype_word) != 0);
				1293	}
				1294
				1295	/* Now see if the situation is what we want */
				1296
				1297	if ((*ecode++ == OP_WORD_BOUNDARY)?
				1298	cur_is_word == prev_is_word : cur_is_word != prev_is_word)
				1299	RRETURN(MATCH_NOMATCH);
				1300	}
				1301	break;
				1302
				1303	/* Match a single character type; inline for speed */
				1304
				1305	case OP_ANY:
				1306	if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
				1307	RRETURN(MATCH_NOMATCH);
				1308	if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
				1309	#ifdef SUPPORT_UTF8
				1310	if (utf8)
				1311	while (eptr < md->end_subject && ISMIDCHAR(*eptr)) eptr++;
				1312	#endif
				1313	ecode++;
				1314	break;
				1315
				1316	/* Match a single byte, even in UTF-8 mode. This opcode really does match
				1317	any byte, even newline, independent of the setting of PCRE_DOTALL. */
				1318
				1319	case OP_ANYBYTE:
				1320	if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
				1321	ecode++;
				1322	break;
				1323
				1324	case OP_NOT_DIGIT:
				1325	if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
				1326	GETCHARINCTEST(c, eptr);
				1327	if (
				1328	#ifdef SUPPORT_UTF8
				1329	c < 256 &&
				1330	#endif
				1331	(md->ctypes[c] & ctype_digit) != 0
				1332	)
				1333	RRETURN(MATCH_NOMATCH);
				1334	ecode++;
				1335	break;
				1336
				1337	case OP_DIGIT:
				1338	if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
				1339	GETCHARINCTEST(c, eptr);
				1340	if (
				1341	#ifdef SUPPORT_UTF8
				1342	c >= 256 ||
				1343	#endif
				1344	(md->ctypes[c] & ctype_digit) == 0
				1345	)
				1346	RRETURN(MATCH_NOMATCH);
				1347	ecode++;
				1348	break;
				1349
				1350	case OP_NOT_WHITESPACE:
				1351	if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
				1352	GETCHARINCTEST(c, eptr);
				1353	if (
				1354	#ifdef SUPPORT_UTF8
				1355	c < 256 &&
				1356	#endif
				1357	(md->ctypes[c] & ctype_space) != 0
				1358	)
				1359	RRETURN(MATCH_NOMATCH);
				1360	ecode++;
				1361	break;
				1362
				1363	case OP_WHITESPACE:
				1364	if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
				1365	GETCHARINCTEST(c, eptr);
				1366	if (
				1367	#ifdef SUPPORT_UTF8
				1368	c >= 256 ||
				1369	#endif
				1370	(md->ctypes[c] & ctype_space) == 0
				1371	)
				1372	RRETURN(MATCH_NOMATCH);
				1373	ecode++;
				1374	break;
				1375
				1376	case OP_NOT_WORDCHAR:
				1377	if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
				1378	GETCHARINCTEST(c, eptr);
				1379	if (
				1380	#ifdef SUPPORT_UTF8
				1381	c < 256 &&
				1382	#endif
				1383	(md->ctypes[c] & ctype_word) != 0
				1384	)
				1385	RRETURN(MATCH_NOMATCH);
				1386	ecode++;
				1387	break;
				1388
				1389	case OP_WORDCHAR:
				1390	if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
				1391	GETCHARINCTEST(c, eptr);
				1392	if (
				1393	#ifdef SUPPORT_UTF8
				1394	c >= 256 ||
				1395	#endif
				1396	(md->ctypes[c] & ctype_word) == 0
				1397	)
				1398	RRETURN(MATCH_NOMATCH);
				1399	ecode++;
				1400	break;
				1401
				1402	#ifdef SUPPORT_UCP
				1403	/* Check the next character by Unicode property. We will get here only
				1404	if the support is in the binary; otherwise a compile-time error occurs. */
				1405
				1406	case OP_PROP:
				1407	case OP_NOTPROP:
				1408	if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
				1409	GETCHARINCTEST(c, eptr);
				1410	{
				1411	int chartype, rqdtype;
				1412	int othercase;
ddkilzer	60a7a80	2007-01-01 05:07:40 +0000	[diff] [blame]	1413	int category = _pcre_ucp_findchar(c, &chartype, &othercase);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1414
				1415	rqdtype = *(++ecode);
				1416	ecode++;
				1417
				1418	if (rqdtype >= 128)
				1419	{
				1420	if ((rqdtype - 128 != category) == (op == OP_PROP))
				1421	RRETURN(MATCH_NOMATCH);
				1422	}
				1423	else
				1424	{
				1425	if ((rqdtype != chartype) == (op == OP_PROP))
				1426	RRETURN(MATCH_NOMATCH);
				1427	}
				1428	}
				1429	break;
				1430
				1431	/* Match an extended Unicode sequence. We will get here only if the support
				1432	is in the binary; otherwise a compile-time error occurs. */
				1433
				1434	case OP_EXTUNI:
				1435	if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
				1436	GETCHARINCTEST(c, eptr);
				1437	{
				1438	int chartype;
				1439	int othercase;
ddkilzer	60a7a80	2007-01-01 05:07:40 +0000	[diff] [blame]	1440	int category = _pcre_ucp_findchar(c, &chartype, &othercase);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1441	if (category == ucp_M) RRETURN(MATCH_NOMATCH);
				1442	while (eptr < md->end_subject)
				1443	{
				1444	int len = 1;
				1445	if (!utf8) c = *eptr; else
				1446	{
				1447	GETCHARLEN(c, eptr, len);
				1448	}
ddkilzer	60a7a80	2007-01-01 05:07:40 +0000	[diff] [blame]	1449	category = _pcre_ucp_findchar(c, &chartype, &othercase);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1450	if (category != ucp_M) break;
				1451	eptr += len;
				1452	}
				1453	}
				1454	ecode++;
				1455	break;
				1456	#endif
				1457
				1458
				1459	/* Match a back reference, possibly repeatedly. Look past the end of the
				1460	item to see if there is repeat information following. The code is similar
				1461	to that for character classes, but repeated for efficiency. Then obey
				1462	similar code to character type repeats - written out again for speed.
				1463	However, if the referenced string is the empty string, always treat
				1464	it as matched, any number of times (otherwise there could be infinite
				1465	loops). */
				1466
				1467	case OP_REF:
				1468	{
thatcher	dc18a36	2006-08-31 21:28:29 +0000	[diff] [blame]	1469	int tmplen;
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1470	offset = GET2(ecode, 1) << 1; /* Doubled ref number */
				1471	ecode += 3; /* Advance past item */
				1472
				1473	/* If the reference is unset, set the length to be longer than the amount
				1474	of subject left; this ensures that every attempt at a match fails. We
				1475	can't just fail here, because of the possibility of quantifiers with zero
				1476	minima. */
				1477
thatcher	dc18a36	2006-08-31 21:28:29 +0000	[diff] [blame]	1478	tmplen = INT_CAST(md->end_subject - eptr + 1);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1479	length = (offset >= offset_top || md->offset_vector[offset] < 0)?
thatcher	dc18a36	2006-08-31 21:28:29 +0000	[diff] [blame]	1480	tmplen :
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1481	md->offset_vector[offset+1] - md->offset_vector[offset];
				1482
				1483	/* Set up for repetition, or handle the non-repeated case */
				1484
				1485	switch (*ecode)
				1486	{
				1487	case OP_CRSTAR:
				1488	case OP_CRMINSTAR:
				1489	case OP_CRPLUS:
				1490	case OP_CRMINPLUS:
				1491	case OP_CRQUERY:
				1492	case OP_CRMINQUERY:
				1493	c = *ecode++ - OP_CRSTAR;
				1494	minimize = (c & 1) != 0;
				1495	min = rep_min[c]; /* Pick up values from tables; */
				1496	max = rep_max[c]; /* zero for max => infinity */
				1497	if (max == 0) max = INT_MAX;
				1498	break;
				1499
				1500	case OP_CRRANGE:
				1501	case OP_CRMINRANGE:
				1502	minimize = (*ecode == OP_CRMINRANGE);
				1503	min = GET2(ecode, 1);
				1504	max = GET2(ecode, 3);
				1505	if (max == 0) max = INT_MAX;
				1506	ecode += 5;
				1507	break;
				1508
				1509	default: /* No repeat follows */
				1510	if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
				1511	eptr += length;
				1512	continue; /* With the main loop */
				1513	}
				1514
				1515	/* If the length of the reference is zero, just continue with the
				1516	main loop. */
				1517
				1518	if (length == 0) continue;
				1519
				1520	/* First, ensure the minimum number of matches are present. We get back
				1521	the length of the reference string explicitly rather than passing the
				1522	address of eptr, so that eptr can be a register variable. */
				1523
				1524	for (i = 1; i <= min; i++)
				1525	{
				1526	if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
				1527	eptr += length;
				1528	}
				1529
				1530	/* If min = max, continue at the same level without recursion.
				1531	They are not both allowed to be zero. */
				1532
				1533	if (min == max) continue;
				1534
				1535	/* If minimizing, keep trying and advancing the pointer */
				1536
				1537	if (minimize)
				1538	{
				1539	for (fi = min;; fi++)
				1540	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	1541	RMATCH(20, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1542	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				1543	if (fi >= max || !match_ref(offset, eptr, length, md, ims))
				1544	RRETURN(MATCH_NOMATCH);
				1545	eptr += length;
				1546	}
				1547	/* Control never gets here */
				1548	}
				1549
				1550	/* If maximizing, find the longest string and work backwards */
				1551
				1552	else
				1553	{
				1554	pp = eptr;
				1555	for (i = min; i < max; i++)
				1556	{
				1557	if (!match_ref(offset, eptr, length, md, ims)) break;
				1558	eptr += length;
				1559	}
				1560	while (eptr >= pp)
				1561	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	1562	RMATCH(21, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1563	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				1564	eptr -= length;
				1565	}
				1566	RRETURN(MATCH_NOMATCH);
				1567	}
				1568	}
				1569	/* Control never gets here */
				1570
				1571
				1572
				1573	/* Match a bit-mapped character class, possibly repeatedly. This op code is
				1574	used when all the characters in the class have values in the range 0-255,
				1575	and either the matching is caseful, or the characters are in the range
				1576	0-127 when UTF-8 processing is enabled. The only difference between
				1577	OP_CLASS and OP_NCLASS occurs when a data character outside the range is
				1578	encountered.
				1579
				1580	First, look past the end of the item to see if there is repeat information
				1581	following. Then obey similar code to character type repeats - written out
				1582	again for speed. */
				1583
				1584	case OP_NCLASS:
				1585	case OP_CLASS:
				1586	{
				1587	data = ecode + 1; /* Save for matching */
				1588	ecode += 33; /* Advance past the item */
				1589
				1590	switch (*ecode)
				1591	{
				1592	case OP_CRSTAR:
				1593	case OP_CRMINSTAR:
				1594	case OP_CRPLUS:
				1595	case OP_CRMINPLUS:
				1596	case OP_CRQUERY:
				1597	case OP_CRMINQUERY:
				1598	c = *ecode++ - OP_CRSTAR;
				1599	minimize = (c & 1) != 0;
				1600	min = rep_min[c]; /* Pick up values from tables; */
				1601	max = rep_max[c]; /* zero for max => infinity */
				1602	if (max == 0) max = INT_MAX;
				1603	break;
				1604
				1605	case OP_CRRANGE:
				1606	case OP_CRMINRANGE:
				1607	minimize = (*ecode == OP_CRMINRANGE);
				1608	min = GET2(ecode, 1);
				1609	max = GET2(ecode, 3);
				1610	if (max == 0) max = INT_MAX;
				1611	ecode += 5;
				1612	break;
				1613
				1614	default: /* No repeat follows */
				1615	min = max = 1;
				1616	break;
				1617	}
				1618
				1619	/* First, ensure the minimum number of matches are present. */
				1620
				1621	#ifdef SUPPORT_UTF8
				1622	/* UTF-8 mode */
				1623	if (utf8)
				1624	{
				1625	for (i = 1; i <= min; i++)
				1626	{
				1627	if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
				1628	GETCHARINC(c, eptr);
				1629	if (c > 255)
				1630	{
				1631	if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
				1632	}
				1633	else
				1634	{
				1635	if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
				1636	}
				1637	}
				1638	}
				1639	else
				1640	#endif
				1641	/* Not UTF-8 mode */
				1642	{
				1643	for (i = 1; i <= min; i++)
				1644	{
				1645	if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
				1646	c = *eptr++;
				1647	if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
				1648	}
				1649	}
				1650
				1651	/* If max == min we can continue with the main loop without the
				1652	need to recurse. */
				1653
				1654	if (min == max) continue;
				1655
				1656	/* If minimizing, keep testing the rest of the expression and advancing
				1657	the pointer while it matches the class. */
				1658
				1659	if (minimize)
				1660	{
				1661	#ifdef SUPPORT_UTF8
				1662	/* UTF-8 mode */
				1663	if (utf8)
				1664	{
				1665	for (fi = min;; fi++)
				1666	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	1667	RMATCH(22, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1668	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				1669	if (fi >= max \|\| eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
				1670	GETCHARINC(c, eptr);
				1671	if (c > 255)
				1672	{
				1673	if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
				1674	}
				1675	else
				1676	{
				1677	if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
				1678	}
				1679	}
				1680	}
				1681	else
				1682	#endif
				1683	/* Not UTF-8 mode */
				1684	{
				1685	for (fi = min;; fi++)
				1686	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	1687	RMATCH(23, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1688	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				1689	if (fi >= max \|\| eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
				1690	c = *eptr++;
				1691	if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
				1692	}
				1693	}
				1694	/* Control never gets here */
				1695	}
				1696
				1697	/* If maximizing, find the longest possible run, then work backwards. */
				1698
				1699	else
				1700	{
				1701	pp = eptr;
				1702
				1703	#ifdef SUPPORT_UTF8
				1704	/* UTF-8 mode */
				1705	if (utf8)
				1706	{
				1707	for (i = min; i < max; i++)
				1708	{
				1709	int len = 1;
				1710	if (eptr >= md->end_subject) break;
				1711	GETCHARLEN(c, eptr, len);
				1712	if (c > 255)
				1713	{
				1714	if (op == OP_CLASS) break;
				1715	}
				1716	else
				1717	{
				1718	if ((data[c/8] & (1 << (c&7))) == 0) break;
				1719	}
				1720	eptr += len;
				1721	}
				1722	for (;;)
				1723	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	1724	RMATCH(24, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1725	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				1726	if (eptr-- == pp) break; /* Stop if tried at original pos */
				1727	BACKCHAR(eptr);
				1728	}
				1729	}
				1730	else
				1731	#endif
				1732	/* Not UTF-8 mode */
				1733	{
				1734	for (i = min; i < max; i++)
				1735	{
				1736	if (eptr >= md->end_subject) break;
				1737	c = *eptr;
				1738	if ((data[c/8] & (1 << (c&7))) == 0) break;
				1739	eptr++;
				1740	}
				1741	while (eptr >= pp)
				1742	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	1743	RMATCH(25, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1744	eptr--;
				1745	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				1746	}
				1747	}
				1748
				1749	RRETURN(MATCH_NOMATCH);
				1750	}
				1751	}
				1752	/* Control never gets here */
				1753
				1754
				1755	/* Match an extended character class. This opcode is encountered only
				1756	in UTF-8 mode, because that's the only time it is compiled. */
				1757
				1758	#ifdef SUPPORT_UTF8
				1759	case OP_XCLASS:
				1760	{
				1761	data = ecode + 1 + LINK_SIZE; /* Save for matching */
				1762	ecode += GET(ecode, 1); /* Advance past the item */
				1763
				1764	switch (*ecode)
				1765	{
				1766	case OP_CRSTAR:
				1767	case OP_CRMINSTAR:
				1768	case OP_CRPLUS:
				1769	case OP_CRMINPLUS:
				1770	case OP_CRQUERY:
				1771	case OP_CRMINQUERY:
				1772	c = *ecode++ - OP_CRSTAR;
				1773	minimize = (c & 1) != 0;
				1774	min = rep_min[c]; /* Pick up values from tables; */
				1775	max = rep_max[c]; /* zero for max => infinity */
				1776	if (max == 0) max = INT_MAX;
				1777	break;
				1778
				1779	case OP_CRRANGE:
				1780	case OP_CRMINRANGE:
				1781	minimize = (*ecode == OP_CRMINRANGE);
				1782	min = GET2(ecode, 1);
				1783	max = GET2(ecode, 3);
				1784	if (max == 0) max = INT_MAX;
				1785	ecode += 5;
				1786	break;
				1787
				1788	default: /* No repeat follows */
				1789	min = max = 1;
				1790	break;
				1791	}
				1792
				1793	/* First, ensure the minimum number of matches are present. */
				1794
				1795	for (i = 1; i <= min; i++)
				1796	{
				1797	if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
				1798	GETCHARINC(c, eptr);
				1799	if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
				1800	}
				1801
				1802	/* If max == min we can continue with the main loop without the
				1803	need to recurse. */
				1804
				1805	if (min == max) continue;
				1806
				1807	/* If minimizing, keep testing the rest of the expression and advancing
				1808	the pointer while it matches the class. */
				1809
				1810	if (minimize)
				1811	{
				1812	for (fi = min;; fi++)
				1813	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	1814	RMATCH(26, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1815	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				1816	if (fi >= max \|\| eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
				1817	GETCHARINC(c, eptr);
				1818	if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
				1819	}
				1820	/* Control never gets here */
				1821	}
				1822
				1823	/* If maximizing, find the longest possible run, then work backwards. */
				1824
				1825	else
				1826	{
				1827	pp = eptr;
				1828	for (i = min; i < max; i++)
				1829	{
				1830	int len = 1;
				1831	if (eptr >= md->end_subject) break;
				1832	GETCHARLEN(c, eptr, len);
				1833	if (!_pcre_xclass(c, data)) break;
				1834	eptr += len;
				1835	}
				1836	for(;;)
				1837	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	1838	RMATCH(27, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1839	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				1840	if (eptr-- == pp) break; /* Stop if tried at original pos */
				1841	BACKCHAR(eptr)
				1842	}
				1843	RRETURN(MATCH_NOMATCH);
				1844	}
				1845
				1846	/* Control never gets here */
				1847	}
				1848	#endif /* End of XCLASS */
				1849
				1850	/* Match a single character, casefully */
				1851
				1852	case OP_CHAR:
				1853	#ifdef SUPPORT_UTF8
				1854	if (utf8)
				1855	{
				1856	length = 1;
				1857	ecode++;
				1858	GETUTF8CHARLEN(fc, ecode, length);
				1859	#if PCRE_UTF16
eseidel	67d65af	2005-09-29 22:05:12 +0000	[diff] [blame]	1860	{
darin	a8702f5	2006-01-13 09:32:51 +0000	[diff] [blame]	1861	int dc;
hyatt	6c974dd	2006-01-06 22:43:44 +0000	[diff] [blame]	1862	ecode += length;
				1863	switch (md->end_subject - eptr)
				1864	{
				1865	case 0:
eseidel	67d65af	2005-09-29 22:05:12 +0000	[diff] [blame]	1866	RRETURN(MATCH_NOMATCH);
hyatt	6c974dd	2006-01-06 22:43:44 +0000	[diff] [blame]	1867	case 1:
				1868	dc = *eptr++;
				1869	if (IS_LEADING_SURROGATE(dc))
				1870	RRETURN(MATCH_NOMATCH);
				1871	break;
				1872	default:
				1873	GETCHARINC(dc, eptr);
				1874	}
				1875	if (fc != dc) RRETURN(MATCH_NOMATCH);
				1876	}
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1877	#else
				1878	if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
				1879	while (length-- > 0) if (ecode++ != eptr++) RRETURN(MATCH_NOMATCH);
				1880	#endif
				1881	}
				1882	else
				1883	#endif
				1884
				1885	/* Non-UTF-8 mode */
				1886	{
				1887	if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
				1888	if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
				1889	ecode += 2;
				1890	}
				1891	break;
				1892
				1893	/* Match a single character, caselessly */
				1894
				1895	case OP_CHARNC:
				1896	#ifdef SUPPORT_UTF8
				1897	if (utf8)
				1898	{
				1899	length = 1;
				1900	ecode++;
				1901	GETUTF8CHARLEN(fc, ecode, length);
				1902
eseidel	67d65af	2005-09-29 22:05:12 +0000	[diff] [blame]	1903	#if PCRE_UTF16
				1904	if (md->end_subject - eptr == 0) RRETURN(MATCH_NOMATCH);
				1905	#else
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1906	if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
				1907	#endif
				1908
				1909	/* If the pattern character's value is < 128, we have only one byte, and
				1910	can use the fast lookup table. */
				1911
				1912	if (fc < 128)
				1913	{
				1914	#if PCRE_UTF16
				1915	int dc;
				1916	ecode++;
				1917	dc = *eptr++;
				1918	if (dc >= 128 \|\| md->lcc[fc] != md->lcc[dc]) RRETURN(MATCH_NOMATCH);
				1919	#else
				1920	if (md->lcc[ecode++] != md->lcc[eptr++]) RRETURN(MATCH_NOMATCH);
				1921	#endif
				1922	}
				1923
				1924	/* Otherwise we must pick up the subject character */
				1925
				1926	else
				1927	{
				1928	int dc;
eseidel	67d65af	2005-09-29 22:05:12 +0000	[diff] [blame]	1929	#if PCRE_UTF16
				1930	if (md->end_subject - eptr == 1) {
				1931	dc = *eptr++;
				1932	if (IS_LEADING_SURROGATE(dc))
				1933	RRETURN(MATCH_NOMATCH);
				1934	} else
				1935	#endif
				1936	GETCHARINC(dc, eptr);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1937	ecode += length;
				1938
				1939	/* If we have Unicode property support, we can use it to test the other
ddkilzer	60a7a80	2007-01-01 05:07:40 +0000	[diff] [blame]	1940	case of the character, if there is one. The result of _pcre_ucp_findchar() is
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1941	< 0 if the char isn't found, and othercase is returned as zero if there
				1942	isn't one. */
				1943
				1944	if (fc != dc)
				1945	{
				1946	#ifdef SUPPORT_UCP
				1947	int chartype;
				1948	int othercase;
ddkilzer	60a7a80	2007-01-01 05:07:40 +0000	[diff] [blame]	1949	if (_pcre_ucp_findchar(fc, &chartype, &othercase) < 0 \|\| dc != othercase)
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	1950	#endif
				1951	RRETURN(MATCH_NOMATCH);
				1952	}
				1953	}
				1954	}
				1955	else
				1956	#endif /* SUPPORT_UTF8 */
				1957
				1958	/* Non-UTF-8 mode */
				1959	{
				1960	if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
				1961	if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
				1962	ecode += 2;
				1963	}
				1964	break;
				1965
				1966	/* Match a single character repeatedly; different opcodes share code. */
				1967
				1968	case OP_EXACT:
				1969	min = max = GET2(ecode, 1);
				1970	ecode += 3;
				1971	goto REPEATCHAR;
				1972
				1973	case OP_UPTO:
				1974	case OP_MINUPTO:
				1975	min = 0;
				1976	max = GET2(ecode, 1);
				1977	minimize = *ecode == OP_MINUPTO;
				1978	ecode += 3;
				1979	goto REPEATCHAR;
				1980
				1981	case OP_STAR:
				1982	case OP_MINSTAR:
				1983	case OP_PLUS:
				1984	case OP_MINPLUS:
				1985	case OP_QUERY:
				1986	case OP_MINQUERY:
				1987	c = *ecode++ - OP_STAR;
				1988	minimize = (c & 1) != 0;
				1989	min = rep_min[c]; /* Pick up values from tables; */
				1990	max = rep_max[c]; /* zero for max => infinity */
				1991	if (max == 0) max = INT_MAX;
				1992
				1993	/* Common code for all repeated single-character matches. We can give
				1994	up quickly if there are fewer than the minimum number of characters left in
				1995	the subject. */
				1996
				1997	REPEATCHAR:
				1998	#ifdef SUPPORT_UTF8
				1999	#if PCRE_UTF16
hyatt	6c974dd	2006-01-06 22:43:44 +0000	[diff] [blame]	2000
darin	a8702f5	2006-01-13 09:32:51 +0000	[diff] [blame]	2001	length = 1;
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2002	GETUTF8CHARLEN(fc, ecode, length);
darin	a8702f5	2006-01-13 09:32:51 +0000	[diff] [blame]	2003	{
darin	496882e	2006-07-15 15:30:03 +0000	[diff] [blame]	2004	if (min * (fc > 0xFFFF ? 2 : 1) > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2005	ecode += length;
				2006
darin	496882e	2006-07-15 15:30:03 +0000	[diff] [blame]	2007	if (fc <= 0xFFFF)
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2008	{
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2009	int othercase;
				2010	int chartype;
ddkilzer	60a7a80	2007-01-01 05:07:40 +0000	[diff] [blame]	2011	if ((ims & PCRE_CASELESS) == 0 \|\| _pcre_ucp_findchar(fc, &chartype, &othercase) < 0)
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2012	othercase = -1; /* Guaranteed to not match any character */
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2013
				2014	for (i = 1; i <= min; i++)
				2015	{
				2016	if (eptr != fc && eptr != othercase) RRETURN(MATCH_NOMATCH);
				2017	++eptr;
				2018	}
				2019
				2020	if (min == max) continue;
				2021
				2022	if (minimize)
				2023	{
darin	8bff71f	2007-02-07 20:02:50 +0000	[diff] [blame]	2024	repeat_othercase = othercase;
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2025	for (fi = min;; fi++)
				2026	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	2027	RMATCH(28, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2028	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				2029	if (fi >= max \|\| eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
darin	8bff71f	2007-02-07 20:02:50 +0000	[diff] [blame]	2030	if (eptr != fc && eptr != repeat_othercase) RRETURN(MATCH_NOMATCH);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2031	++eptr;
				2032	}
				2033	/* Control never gets here */
				2034	}
				2035	else
				2036	{
				2037	pp = eptr;
				2038	for (i = min; i < max; i++)
				2039	{
darin	496882e	2006-07-15 15:30:03 +0000	[diff] [blame]	2040	if (eptr >= md->end_subject) break;
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2041	if (eptr != fc && eptr != othercase) break;
				2042	++eptr;
				2043	}
				2044	while (eptr >= pp)
				2045	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	2046	RMATCH(29, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2047	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				2048	--eptr;
				2049	}
				2050	RRETURN(MATCH_NOMATCH);
				2051	}
				2052	/* Control never gets here */
				2053	}
				2054	else
				2055	{
				2056	/* No case on surrogate pairs, so no need to bother with "othercase". */
				2057
				2058	for (i = 1; i <= min; i++)
				2059	{
				2060	int nc;
				2061	GETCHAR(nc, eptr);
				2062	if (nc != fc) RRETURN(MATCH_NOMATCH);
				2063	eptr += 2;
				2064	}
				2065
				2066	if (min == max) continue;
				2067
				2068	if (minimize)
				2069	{
				2070	for (fi = min;; fi++)
				2071	{
				2072	int nc;
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	2073	RMATCH(30, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2074	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				2075	if (fi >= max \|\| eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
				2076	GETCHAR(nc, eptr);
				2077	if (*eptr != fc) RRETURN(MATCH_NOMATCH);
				2078	eptr += 2;
				2079	}
				2080	/* Control never gets here */
				2081	}
				2082	else
				2083	{
				2084	pp = eptr;
				2085	for (i = min; i < max; i++)
				2086	{
				2087	int nc;
darin	496882e	2006-07-15 15:30:03 +0000	[diff] [blame]	2088	if (eptr > md->end_subject - 2) break;
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2089	GETCHAR(nc, eptr);
				2090	if (*eptr != fc) break;
				2091	eptr += 2;
				2092	}
				2093	while (eptr >= pp)
				2094	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	2095	RMATCH(31, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2096	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				2097	eptr -= 2;
				2098	}
				2099	RRETURN(MATCH_NOMATCH);
				2100	}
				2101	/* Control never gets here */
				2102	}
				2103	/* Control never gets here */
darin	a8702f5	2006-01-13 09:32:51 +0000	[diff] [blame]	2104	}
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2105	#else
				2106	if (utf8)
				2107	{
				2108	length = 1;
				2109	charptr = ecode;
				2110	GETCHARLEN(fc, ecode, length);
				2111	if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
				2112	ecode += length;
				2113
				2114	/* Handle multibyte character matching specially here. There is
				2115	support for caseless matching if UCP support is present. */
				2116
				2117	if (length > 1)
				2118	{
				2119	int oclength = 0;
				2120	uschar occhars[8];
				2121
				2122	#ifdef SUPPORT_UCP
				2123	int othercase;
				2124	int chartype;
				2125	if ((ims & PCRE_CASELESS) != 0 &&
ddkilzer	60a7a80	2007-01-01 05:07:40 +0000	[diff] [blame]	2126	_pcre_ucp_findchar(fc, &chartype, &othercase) >= 0 &&
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2127	othercase > 0)
				2128	oclength = _pcre_ord2utf8(othercase, occhars);
				2129	#endif /* SUPPORT_UCP */
				2130
				2131	for (i = 1; i <= min; i++)
				2132	{
				2133	if (memcmp(eptr, charptr, length) == 0) eptr += length;
				2134	/* Need braces because of following else */
				2135	else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
				2136	else
				2137	{
				2138	if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
				2139	eptr += oclength;
				2140	}
				2141	}
				2142
				2143	if (min == max) continue;
				2144
				2145	if (minimize)
				2146	{
				2147	for (fi = min;; fi++)
				2148	{
darin	8bff71f	2007-02-07 20:02:50 +0000	[diff] [blame]	2149	// FIXME: This could blow away occhars and oclength in the NO_RECURSE case.
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	2150	RMATCH(32, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2151	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				2152	if (fi >= max \|\| eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
				2153	if (memcmp(eptr, charptr, length) == 0) eptr += length;
				2154	/* Need braces because of following else */
				2155	else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
				2156	else
				2157	{
				2158	if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
				2159	eptr += oclength;
				2160	}
				2161	}
				2162	/* Control never gets here */
				2163	}
				2164	else
				2165	{
				2166	pp = eptr;
				2167	for (i = min; i < max; i++)
				2168	{
				2169	if (eptr > md->end_subject - length) break;
				2170	if (memcmp(eptr, charptr, length) == 0) eptr += length;
				2171	else if (oclength == 0) break;
				2172	else
				2173	{
				2174	if (memcmp(eptr, occhars, oclength) != 0) break;
				2175	eptr += oclength;
				2176	}
				2177	}
				2178	while (eptr >= pp)
				2179	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	2180	RMATCH(33, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2181	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				2182	eptr -= length;
				2183	}
				2184	RRETURN(MATCH_NOMATCH);
				2185	}
				2186	/* Control never gets here */
				2187	}
				2188
				2189	/* If the length of a UTF-8 character is 1, we fall through here, and
				2190	obey the code as for non-UTF-8 characters below, though in this case the
				2191	value of fc will always be < 128. */
				2192	}
				2193	else
				2194	#endif
				2195	#endif /* SUPPORT_UTF8 */
				2196
darin	b847b44	2006-10-27 16:48:28 +0000	[diff] [blame]	2197	#if !PCRE_UTF16
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2198	/* When not in UTF-8 mode, load a single-byte character. */
				2199	{
				2200	if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
				2201	fc = *ecode++;
				2202	}
				2203
				2204	/* The value of fc at this point is always less than 256, though we may or
				2205	may not be in UTF-8 mode. The code is duplicated for the caseless and
				2206	caseful cases, for speed, since matching characters is likely to be quite
				2207	common. First, ensure the minimum number of matches are present. If min =
				2208	max, continue at the same level without recursing. Otherwise, if
				2209	minimizing, keep trying the rest of the expression and advancing one
				2210	matching character if failing, up to the maximum. Alternatively, if
				2211	maximizing, find the maximum number of characters and work backwards. */
				2212
				2213	DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
				2214	max, eptr));
				2215
				2216	if ((ims & PCRE_CASELESS) != 0)
				2217	{
				2218	fc = md->lcc[fc];
				2219	for (i = 1; i <= min; i++)
				2220	if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
				2221	if (min == max) continue;
				2222	if (minimize)
				2223	{
				2224	for (fi = min;; fi++)
				2225	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	2226	RMATCH(34, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2227	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				2228	if (fi >= max \|\| eptr >= md->end_subject \|\|
				2229	fc != md->lcc[*eptr++])
				2230	RRETURN(MATCH_NOMATCH);
				2231	}
				2232	/* Control never gets here */
				2233	}
				2234	else
				2235	{
				2236	pp = eptr;
				2237	for (i = min; i < max; i++)
				2238	{
				2239	if (eptr >= md->end_subject \|\| fc != md->lcc[*eptr]) break;
				2240	eptr++;
				2241	}
				2242	while (eptr >= pp)
				2243	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	2244	RMATCH(35, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2245	eptr--;
				2246	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				2247	}
				2248	RRETURN(MATCH_NOMATCH);
				2249	}
				2250	/* Control never gets here */
				2251	}
				2252
				2253	/* Caseful comparisons (includes all multi-byte characters) */
				2254
				2255	else
				2256	{
				2257	for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
				2258	if (min == max) continue;
				2259	if (minimize)
				2260	{
				2261	for (fi = min;; fi++)
				2262	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	2263	RMATCH(36, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2264	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				2265	if (fi >= max \|\| eptr >= md->end_subject \|\| fc != *eptr++)
				2266	RRETURN(MATCH_NOMATCH);
				2267	}
				2268	/* Control never gets here */
				2269	}
				2270	else
				2271	{
				2272	pp = eptr;
				2273	for (i = min; i < max; i++)
				2274	{
				2275	if (eptr >= md->end_subject \|\| fc != *eptr) break;
				2276	eptr++;
				2277	}
				2278	while (eptr >= pp)
				2279	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	2280	RMATCH(37, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2281	eptr--;
				2282	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				2283	}
				2284	RRETURN(MATCH_NOMATCH);
				2285	}
				2286	}
				2287	/* Control never gets here */
darin	b847b44	2006-10-27 16:48:28 +0000	[diff] [blame]	2288	#endif
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2289
				2290	/* Match a negated single one-byte character. The pattern character is one
				2291	byte, but the subject character we are checking can be multibyte. */
				2292
				2293	case OP_NOT:
				2294	if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
				2295	ecode++;
				2296	GETCHARINCTEST(c, eptr);
				2297	if ((ims & PCRE_CASELESS) != 0)
				2298	{
				2299	#ifdef SUPPORT_UTF8
				2300	if (c < 256)
				2301	#endif
				2302	c = md->lcc[c];
				2303	if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
				2304	}
				2305	else
				2306	{
				2307	if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
				2308	}
				2309	break;
				2310
				2311	/* Match a negated single one-byte character repeatedly. This is almost a
				2312	repeat of the code for a repeated single character, but I haven't found a
				2313	nice way of commoning these up that doesn't require a test of the
				2314	positive/negative option for each character match. Maybe that wouldn't add
				2315	very much to the time taken, but character matching is what this is all
				2316	about... */
				2317
				2318	case OP_NOTEXACT:
				2319	min = max = GET2(ecode, 1);
				2320	ecode += 3;
				2321	goto REPEATNOTCHAR;
				2322
				2323	case OP_NOTUPTO:
				2324	case OP_NOTMINUPTO:
				2325	min = 0;
				2326	max = GET2(ecode, 1);
				2327	minimize = *ecode == OP_NOTMINUPTO;
				2328	ecode += 3;
				2329	goto REPEATNOTCHAR;
				2330
				2331	case OP_NOTSTAR:
				2332	case OP_NOTMINSTAR:
				2333	case OP_NOTPLUS:
				2334	case OP_NOTMINPLUS:
				2335	case OP_NOTQUERY:
				2336	case OP_NOTMINQUERY:
				2337	c = *ecode++ - OP_NOTSTAR;
				2338	minimize = (c & 1) != 0;
				2339	min = rep_min[c]; /* Pick up values from tables; */
				2340	max = rep_max[c]; /* zero for max => infinity */
				2341	if (max == 0) max = INT_MAX;
				2342
				2343	/* Common code for all repeated single-byte matches. We can give up quickly
				2344	if there are fewer than the minimum number of bytes left in the
				2345	subject. */
				2346
				2347	REPEATNOTCHAR:
				2348	if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
				2349	fc = *ecode++;
				2350
				2351	/* The code is duplicated for the caseless and caseful cases, for speed,
				2352	since matching characters is likely to be quite common. First, ensure the
				2353	minimum number of matches are present. If min = max, continue at the same
				2354	level without recursing. Otherwise, if minimizing, keep trying the rest of
				2355	the expression and advancing one matching character if failing, up to the
				2356	maximum. Alternatively, if maximizing, find the maximum number of
				2357	characters and work backwards. */
				2358
darin	496882e	2006-07-15 15:30:03 +0000	[diff] [blame]	2359	#if PCRE_UTF16
				2360	DPRINTF(("negative matching %c{%d,%d}\n", fc, min, max));
				2361	#else
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2362	DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
				2363	max, eptr));
darin	496882e	2006-07-15 15:30:03 +0000	[diff] [blame]	2364	#endif
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2365
				2366	if ((ims & PCRE_CASELESS) != 0)
				2367	{
				2368	fc = md->lcc[fc];
				2369
				2370	#ifdef SUPPORT_UTF8
				2371	/* UTF-8 mode */
				2372	if (utf8)
				2373	{
				2374	register int d;
				2375	for (i = 1; i <= min; i++)
				2376	{
				2377	GETCHARINC(d, eptr);
				2378	if (d < 256) d = md->lcc[d];
				2379	if (fc == d) RRETURN(MATCH_NOMATCH);
				2380	}
				2381	}
				2382	else
				2383	#endif
				2384
				2385	/* Not UTF-8 mode */
				2386	{
				2387	for (i = 1; i <= min; i++)
				2388	if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
				2389	}
				2390
				2391	if (min == max) continue;
				2392
				2393	if (minimize)
				2394	{
				2395	#ifdef SUPPORT_UTF8
				2396	/* UTF-8 mode */
				2397	if (utf8)
				2398	{
				2399	register int d;
				2400	for (fi = min;; fi++)
				2401	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	2402	RMATCH(38, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2403	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				2404	GETCHARINC(d, eptr);
				2405	if (d < 256) d = md->lcc[d];
				2406	if (fi >= max \|\| eptr >= md->end_subject \|\| fc == d)
				2407	RRETURN(MATCH_NOMATCH);
				2408	}
				2409	}
				2410	else
				2411	#endif
				2412	/* Not UTF-8 mode */
				2413	{
				2414	for (fi = min;; fi++)
				2415	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	2416	RMATCH(39, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2417	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				2418	if (fi >= max \|\| eptr >= md->end_subject \|\| fc == md->lcc[*eptr++])
				2419	RRETURN(MATCH_NOMATCH);
				2420	}
				2421	}
				2422	/* Control never gets here */
				2423	}
				2424
				2425	/* Maximize case */
				2426
				2427	else
				2428	{
				2429	pp = eptr;
				2430
				2431	#ifdef SUPPORT_UTF8
				2432	/* UTF-8 mode */
				2433	if (utf8)
				2434	{
				2435	register int d;
				2436	for (i = min; i < max; i++)
				2437	{
				2438	int len = 1;
				2439	if (eptr >= md->end_subject) break;
				2440	GETCHARLEN(d, eptr, len);
				2441	if (d < 256) d = md->lcc[d];
				2442	if (fc == d) break;
				2443	eptr += len;
				2444	}
				2445	for(;;)
				2446	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	2447	RMATCH(40, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2448	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				2449	if (eptr-- == pp) break; /* Stop if tried at original pos */
				2450	BACKCHAR(eptr);
				2451	}
				2452	}
				2453	else
				2454	#endif
				2455	/* Not UTF-8 mode */
				2456	{
				2457	for (i = min; i < max; i++)
				2458	{
				2459	if (eptr >= md->end_subject \|\| fc == md->lcc[*eptr]) break;
				2460	eptr++;
				2461	}
				2462	while (eptr >= pp)
				2463	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	2464	RMATCH(41, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2465	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				2466	eptr--;
				2467	}
				2468	}
				2469
				2470	RRETURN(MATCH_NOMATCH);
				2471	}
				2472	/* Control never gets here */
				2473	}
				2474
				2475	/* Caseful comparisons */
				2476
				2477	else
				2478	{
				2479	#ifdef SUPPORT_UTF8
				2480	/* UTF-8 mode */
				2481	if (utf8)
				2482	{
				2483	register int d;
				2484	for (i = 1; i <= min; i++)
				2485	{
				2486	GETCHARINC(d, eptr);
				2487	if (fc == d) RRETURN(MATCH_NOMATCH);
				2488	}
				2489	}
				2490	else
				2491	#endif
				2492	/* Not UTF-8 mode */
				2493	{
				2494	for (i = 1; i <= min; i++)
				2495	if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
				2496	}
				2497
				2498	if (min == max) continue;
				2499
				2500	if (minimize)
				2501	{
				2502	#ifdef SUPPORT_UTF8
				2503	/* UTF-8 mode */
				2504	if (utf8)
				2505	{
				2506	register int d;
				2507	for (fi = min;; fi++)
				2508	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	2509	RMATCH(42, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2510	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				2511	GETCHARINC(d, eptr);
				2512	if (fi >= max \|\| eptr >= md->end_subject \|\| fc == d)
				2513	RRETURN(MATCH_NOMATCH);
				2514	}
				2515	}
				2516	else
				2517	#endif
				2518	/* Not UTF-8 mode */
				2519	{
				2520	for (fi = min;; fi++)
				2521	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	2522	RMATCH(43, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2523	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				2524	if (fi >= max \|\| eptr >= md->end_subject \|\| fc == *eptr++)
				2525	RRETURN(MATCH_NOMATCH);
				2526	}
				2527	}
				2528	/* Control never gets here */
				2529	}
				2530
				2531	/* Maximize case */
				2532
				2533	else
				2534	{
				2535	pp = eptr;
				2536
				2537	#ifdef SUPPORT_UTF8
				2538	/* UTF-8 mode */
				2539	if (utf8)
				2540	{
				2541	register int d;
				2542	for (i = min; i < max; i++)
				2543	{
				2544	int len = 1;
				2545	if (eptr >= md->end_subject) break;
				2546	GETCHARLEN(d, eptr, len);
				2547	if (fc == d) break;
				2548	eptr += len;
				2549	}
				2550	for(;;)
				2551	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	2552	RMATCH(44, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2553	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				2554	if (eptr-- == pp) break; /* Stop if tried at original pos */
				2555	BACKCHAR(eptr);
				2556	}
				2557	}
				2558	else
				2559	#endif
				2560	/* Not UTF-8 mode */
				2561	{
				2562	for (i = min; i < max; i++)
				2563	{
				2564	if (eptr >= md->end_subject \|\| fc == *eptr) break;
				2565	eptr++;
				2566	}
				2567	while (eptr >= pp)
				2568	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	2569	RMATCH(45, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2570	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				2571	eptr--;
				2572	}
				2573	}
				2574
				2575	RRETURN(MATCH_NOMATCH);
				2576	}
				2577	}
				2578	/* Control never gets here */
				2579
				2580	/* Match a single character type repeatedly; several different opcodes
				2581	share code. This is very similar to the code for single characters, but we
				2582	repeat it in the interests of efficiency. */
				2583
				2584	case OP_TYPEEXACT:
				2585	min = max = GET2(ecode, 1);
				2586	minimize = TRUE;
				2587	ecode += 3;
				2588	goto REPEATTYPE;
				2589
				2590	case OP_TYPEUPTO:
				2591	case OP_TYPEMINUPTO:
				2592	min = 0;
				2593	max = GET2(ecode, 1);
				2594	minimize = *ecode == OP_TYPEMINUPTO;
				2595	ecode += 3;
				2596	goto REPEATTYPE;
				2597
				2598	case OP_TYPESTAR:
				2599	case OP_TYPEMINSTAR:
				2600	case OP_TYPEPLUS:
				2601	case OP_TYPEMINPLUS:
				2602	case OP_TYPEQUERY:
				2603	case OP_TYPEMINQUERY:
				2604	c = *ecode++ - OP_TYPESTAR;
				2605	minimize = (c & 1) != 0;
				2606	min = rep_min[c]; /* Pick up values from tables; */
				2607	max = rep_max[c]; /* zero for max => infinity */
				2608	if (max == 0) max = INT_MAX;
				2609
				2610	/* Common code for all repeated single character type matches. Note that
				2611	in UTF-8 mode, '.' matches a character of any length, but for the other
				2612	character types, the valid characters are all one-byte long. */
				2613
				2614	REPEATTYPE:
				2615	ctype = ecode++; / Code for the character type */
				2616
				2617	#ifdef SUPPORT_UCP
				2618	if (ctype == OP_PROP \|\| ctype == OP_NOTPROP)
				2619	{
				2620	prop_fail_result = ctype == OP_NOTPROP;
				2621	prop_type = *ecode++;
				2622	if (prop_type >= 128)
				2623	{
				2624	prop_test_against = prop_type - 128;
				2625	prop_test_variable = &prop_category;
				2626	}
				2627	else
				2628	{
				2629	prop_test_against = prop_type;
				2630	prop_test_variable = &prop_chartype;
				2631	}
				2632	}
				2633	else prop_type = -1;
				2634	#endif
				2635
				2636	/* First, ensure the minimum number of matches are present. Use inline
				2637	code for maximizing the speed, and do the type test once at the start
				2638	(i.e. keep it out of the loop). Also we can test that there are at least
				2639	the minimum number of bytes before we start. This isn't as effective in
				2640	UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
				2641	is tidier. Also separate the UCP code, which can be the same for both UTF-8
				2642	and single-bytes. */
				2643
				2644	if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
				2645	if (min > 0)
				2646	{
				2647	#ifdef SUPPORT_UCP
				2648	if (prop_type > 0)
				2649	{
				2650	for (i = 1; i <= min; i++)
				2651	{
				2652	GETCHARINC(c, eptr);
ddkilzer	60a7a80	2007-01-01 05:07:40 +0000	[diff] [blame]	2653	prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2654	if ((*prop_test_variable == prop_test_against) == prop_fail_result)
				2655	RRETURN(MATCH_NOMATCH);
				2656	}
				2657	}
				2658
				2659	/* Match extended Unicode sequences. We will get here only if the
				2660	support is in the binary; otherwise a compile-time error occurs. */
				2661
				2662	else if (ctype == OP_EXTUNI)
				2663	{
				2664	for (i = 1; i <= min; i++)
				2665	{
				2666	GETCHARINCTEST(c, eptr);
ddkilzer	60a7a80	2007-01-01 05:07:40 +0000	[diff] [blame]	2667	prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2668	if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
				2669	while (eptr < md->end_subject)
				2670	{
				2671	int len = 1;
				2672	if (!utf8) c = *eptr; else
				2673	{
				2674	GETCHARLEN(c, eptr, len);
				2675	}
ddkilzer	60a7a80	2007-01-01 05:07:40 +0000	[diff] [blame]	2676	prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2677	if (prop_category != ucp_M) break;
				2678	eptr += len;
				2679	}
				2680	}
				2681	}
				2682
				2683	else
				2684	#endif /* SUPPORT_UCP */
				2685
				2686	/* Handle all other cases when the coding is UTF-8 */
				2687
				2688	#ifdef SUPPORT_UTF8
				2689	if (utf8) switch(ctype)
				2690	{
				2691	case OP_ANY:
				2692	for (i = 1; i <= min; i++)
				2693	{
				2694	if (eptr >= md->end_subject \|\|
				2695	(*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
				2696	RRETURN(MATCH_NOMATCH);
				2697	while (eptr < md->end_subject && ISMIDCHAR(*eptr)) eptr++;
				2698	}
				2699	break;
				2700
				2701	case OP_ANYBYTE:
				2702	eptr += min;
				2703	break;
				2704
				2705	case OP_NOT_DIGIT:
				2706	for (i = 1; i <= min; i++)
				2707	{
				2708	if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
				2709	GETCHARINC(c, eptr);
				2710	if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
				2711	RRETURN(MATCH_NOMATCH);
				2712	}
				2713	break;
				2714
				2715	case OP_DIGIT:
				2716	for (i = 1; i <= min; i++)
				2717	{
				2718	if (eptr >= md->end_subject \|\|
				2719	eptr >= 128 \|\| (md->ctypes[eptr++] & ctype_digit) == 0)
				2720	RRETURN(MATCH_NOMATCH);
				2721	/* No need to skip more bytes - we know it's a 1-byte character */
				2722	}
				2723	break;
				2724
				2725	case OP_NOT_WHITESPACE:
				2726	for (i = 1; i <= min; i++)
				2727	{
				2728	if (eptr >= md->end_subject \|\|
				2729	(eptr < 128 && (md->ctypes[eptr++] & ctype_space) != 0))
				2730	RRETURN(MATCH_NOMATCH);
				2731	while (eptr < md->end_subject && ISMIDCHAR(*eptr)) eptr++;
				2732	}
				2733	break;
				2734
				2735	case OP_WHITESPACE:
				2736	for (i = 1; i <= min; i++)
				2737	{
				2738	if (eptr >= md->end_subject \|\|
				2739	eptr >= 128 \|\| (md->ctypes[eptr++] & ctype_space) == 0)
				2740	RRETURN(MATCH_NOMATCH);
				2741	/* No need to skip more bytes - we know it's a 1-byte character */
				2742	}
				2743	break;
				2744
				2745	case OP_NOT_WORDCHAR:
				2746	for (i = 1; i <= min; i++)
				2747	{
				2748	if (eptr >= md->end_subject \|\|
				2749	(eptr < 128 && (md->ctypes[eptr++] & ctype_word) != 0))
				2750	RRETURN(MATCH_NOMATCH);
				2751	while (eptr < md->end_subject && ISMIDCHAR(*eptr)) eptr++;
				2752	}
				2753	break;
				2754
				2755	case OP_WORDCHAR:
				2756	for (i = 1; i <= min; i++)
				2757	{
				2758	if (eptr >= md->end_subject \|\|
				2759	eptr >= 128 \|\| (md->ctypes[eptr++] & ctype_word) == 0)
				2760	RRETURN(MATCH_NOMATCH);
				2761	/* No need to skip more bytes - we know it's a 1-byte character */
				2762	}
				2763	break;
				2764
				2765	default:
				2766	RRETURN(PCRE_ERROR_INTERNAL);
				2767	} /* End switch(ctype) */
				2768
				2769	else
				2770	#endif /* SUPPORT_UTF8 */
				2771
				2772	/* Code for the non-UTF-8 case for minimum matching of operators other
				2773	than OP_PROP and OP_NOTPROP. */
				2774
				2775	switch(ctype)
				2776	{
				2777	case OP_ANY:
				2778	if ((ims & PCRE_DOTALL) == 0)
				2779	{
				2780	for (i = 1; i <= min; i++)
				2781	if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
				2782	}
				2783	else eptr += min;
				2784	break;
				2785
				2786	case OP_ANYBYTE:
				2787	eptr += min;
				2788	break;
				2789
				2790	case OP_NOT_DIGIT:
				2791	for (i = 1; i <= min; i++)
				2792	if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
				2793	break;
				2794
				2795	case OP_DIGIT:
				2796	for (i = 1; i <= min; i++)
				2797	if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
				2798	break;
				2799
				2800	case OP_NOT_WHITESPACE:
				2801	for (i = 1; i <= min; i++)
				2802	if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
				2803	break;
				2804
				2805	case OP_WHITESPACE:
				2806	for (i = 1; i <= min; i++)
				2807	if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
				2808	break;
				2809
				2810	case OP_NOT_WORDCHAR:
				2811	for (i = 1; i <= min; i++)
				2812	if ((md->ctypes[*eptr++] & ctype_word) != 0)
				2813	RRETURN(MATCH_NOMATCH);
				2814	break;
				2815
				2816	case OP_WORDCHAR:
				2817	for (i = 1; i <= min; i++)
				2818	if ((md->ctypes[*eptr++] & ctype_word) == 0)
				2819	RRETURN(MATCH_NOMATCH);
				2820	break;
				2821
				2822	default:
				2823	RRETURN(PCRE_ERROR_INTERNAL);
				2824	}
				2825	}
				2826
				2827	/* If min = max, continue at the same level without recursing */
				2828
				2829	if (min == max) continue;
				2830
				2831	/* If minimizing, we have to test the rest of the pattern before each
				2832	subsequent match. Again, separate the UTF-8 case for speed, and also
				2833	separate the UCP cases. */
				2834
				2835	if (minimize)
				2836	{
				2837	#ifdef SUPPORT_UCP
				2838	if (prop_type > 0)
				2839	{
				2840	for (fi = min;; fi++)
				2841	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	2842	RMATCH(46, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2843	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				2844	if (fi >= max \|\| eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
				2845	GETCHARINC(c, eptr);
ddkilzer	60a7a80	2007-01-01 05:07:40 +0000	[diff] [blame]	2846	prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2847	if ((*prop_test_variable == prop_test_against) == prop_fail_result)
				2848	RRETURN(MATCH_NOMATCH);
				2849	}
				2850	}
				2851
				2852	/* Match extended Unicode sequences. We will get here only if the
				2853	support is in the binary; otherwise a compile-time error occurs. */
				2854
				2855	else if (ctype == OP_EXTUNI)
				2856	{
				2857	for (fi = min;; fi++)
				2858	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	2859	RMATCH(47, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2860	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				2861	if (fi >= max \|\| eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
				2862	GETCHARINCTEST(c, eptr);
ddkilzer	60a7a80	2007-01-01 05:07:40 +0000	[diff] [blame]	2863	prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2864	if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
				2865	while (eptr < md->end_subject)
				2866	{
				2867	int len = 1;
				2868	if (!utf8) c = *eptr; else
				2869	{
				2870	GETCHARLEN(c, eptr, len);
				2871	}
ddkilzer	60a7a80	2007-01-01 05:07:40 +0000	[diff] [blame]	2872	prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2873	if (prop_category != ucp_M) break;
				2874	eptr += len;
				2875	}
				2876	}
				2877	}
				2878
				2879	else
				2880	#endif /* SUPPORT_UCP */
				2881
				2882	#ifdef SUPPORT_UTF8
				2883	/* UTF-8 mode */
				2884	if (utf8)
				2885	{
				2886	for (fi = min;; fi++)
				2887	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	2888	RMATCH(48, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2889	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				2890	if (fi >= max \|\| eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
				2891
				2892	GETCHARINC(c, eptr);
				2893	switch(ctype)
				2894	{
				2895	case OP_ANY:
				2896	if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
				2897	break;
				2898
				2899	case OP_ANYBYTE:
				2900	break;
				2901
				2902	case OP_NOT_DIGIT:
				2903	if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
				2904	RRETURN(MATCH_NOMATCH);
				2905	break;
				2906
				2907	case OP_DIGIT:
				2908	if (c >= 256 \|\| (md->ctypes[c] & ctype_digit) == 0)
				2909	RRETURN(MATCH_NOMATCH);
				2910	break;
				2911
				2912	case OP_NOT_WHITESPACE:
				2913	if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
				2914	RRETURN(MATCH_NOMATCH);
				2915	break;
				2916
				2917	case OP_WHITESPACE:
				2918	if (c >= 256 \|\| (md->ctypes[c] & ctype_space) == 0)
				2919	RRETURN(MATCH_NOMATCH);
				2920	break;
				2921
				2922	case OP_NOT_WORDCHAR:
				2923	if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
				2924	RRETURN(MATCH_NOMATCH);
				2925	break;
				2926
				2927	case OP_WORDCHAR:
				2928	if (c >= 256 \|\| (md->ctypes[c] & ctype_word) == 0)
				2929	RRETURN(MATCH_NOMATCH);
				2930	break;
				2931
				2932	default:
				2933	RRETURN(PCRE_ERROR_INTERNAL);
				2934	}
				2935	}
				2936	}
				2937	else
				2938	#endif
				2939	/* Not UTF-8 mode */
				2940	{
				2941	for (fi = min;; fi++)
				2942	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	2943	RMATCH(49, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	2944	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				2945	if (fi >= max \|\| eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
				2946	c = *eptr++;
				2947	switch(ctype)
				2948	{
				2949	case OP_ANY:
				2950	if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
				2951	break;
				2952
				2953	case OP_ANYBYTE:
				2954	break;
				2955
				2956	case OP_NOT_DIGIT:
				2957	if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
				2958	break;
				2959
				2960	case OP_DIGIT:
				2961	if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
				2962	break;
				2963
				2964	case OP_NOT_WHITESPACE:
				2965	if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
				2966	break;
				2967
				2968	case OP_WHITESPACE:
				2969	if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
				2970	break;
				2971
				2972	case OP_NOT_WORDCHAR:
				2973	if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
				2974	break;
				2975
				2976	case OP_WORDCHAR:
				2977	if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
				2978	break;
				2979
				2980	default:
				2981	RRETURN(PCRE_ERROR_INTERNAL);
				2982	}
				2983	}
				2984	}
				2985	/* Control never gets here */
				2986	}
				2987
				2988	/* If maximizing it is worth using inline code for speed, doing the type
				2989	test once at the start (i.e. keep it out of the loop). Again, keep the
				2990	UTF-8 and UCP stuff separate. */
				2991
				2992	else
				2993	{
				2994	pp = eptr; /* Remember where we started */
				2995
				2996	#ifdef SUPPORT_UCP
				2997	if (prop_type > 0)
				2998	{
				2999	for (i = min; i < max; i++)
				3000	{
				3001	int len = 1;
				3002	if (eptr >= md->end_subject) break;
				3003	GETCHARLEN(c, eptr, len);
ddkilzer	60a7a80	2007-01-01 05:07:40 +0000	[diff] [blame]	3004	prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	3005	if ((*prop_test_variable == prop_test_against) == prop_fail_result)
				3006	break;
				3007	eptr+= len;
				3008	}
				3009
				3010	/* eptr is now past the end of the maximum run */
				3011
				3012	for(;;)
				3013	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	3014	RMATCH(50, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	3015	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				3016	if (eptr-- == pp) break; /* Stop if tried at original pos */
				3017	BACKCHAR(eptr);
				3018	}
				3019	}
				3020
				3021	/* Match extended Unicode sequences. We will get here only if the
				3022	support is in the binary; otherwise a compile-time error occurs. */
				3023
				3024	else if (ctype == OP_EXTUNI)
				3025	{
				3026	for (i = min; i < max; i++)
				3027	{
				3028	if (eptr >= md->end_subject) break;
				3029	GETCHARINCTEST(c, eptr);
ddkilzer	60a7a80	2007-01-01 05:07:40 +0000	[diff] [blame]	3030	prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	3031	if (prop_category == ucp_M) break;
				3032	while (eptr < md->end_subject)
				3033	{
				3034	int len = 1;
				3035	if (!utf8) c = *eptr; else
				3036	{
				3037	GETCHARLEN(c, eptr, len);
				3038	}
ddkilzer	60a7a80	2007-01-01 05:07:40 +0000	[diff] [blame]	3039	prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	3040	if (prop_category != ucp_M) break;
				3041	eptr += len;
				3042	}
				3043	}
				3044
				3045	/* eptr is now past the end of the maximum run */
				3046
				3047	for(;;)
				3048	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	3049	RMATCH(51, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	3050	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				3051	if (eptr-- == pp) break; /* Stop if tried at original pos */
				3052	for (;;) /* Move back over one extended */
				3053	{
				3054	int len = 1;
				3055	BACKCHAR(eptr);
				3056	if (!utf8) c = *eptr; else
				3057	{
				3058	GETCHARLEN(c, eptr, len);
				3059	}
ddkilzer	60a7a80	2007-01-01 05:07:40 +0000	[diff] [blame]	3060	prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	3061	if (prop_category != ucp_M) break;
				3062	eptr--;
				3063	}
				3064	}
				3065	}
				3066
				3067	else
				3068	#endif /* SUPPORT_UCP */
				3069
				3070	#ifdef SUPPORT_UTF8
				3071	/* UTF-8 mode */
				3072
				3073	if (utf8)
				3074	{
				3075	switch(ctype)
				3076	{
				3077	case OP_ANY:
				3078
				3079	/* Special code is required for UTF8, but when the maximum is unlimited
				3080	we don't need it, so we repeat the non-UTF8 code. This is probably
				3081	worth it, because .* is quite a common idiom. */
				3082
				3083	if (max < INT_MAX)
				3084	{
				3085	if ((ims & PCRE_DOTALL) == 0)
				3086	{
				3087	for (i = min; i < max; i++)
				3088	{
				3089	if (eptr >= md->end_subject \|\| *eptr == NEWLINE) break;
				3090	eptr++;
				3091	while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
				3092	}
				3093	}
				3094	else
				3095	{
				3096	for (i = min; i < max; i++)
				3097	{
				3098	eptr++;
				3099	while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
				3100	}
				3101	}
				3102	}
				3103
				3104	/* Handle unlimited UTF-8 repeat */
				3105
				3106	else
				3107	{
				3108	if ((ims & PCRE_DOTALL) == 0)
				3109	{
				3110	for (i = min; i < max; i++)
				3111	{
				3112	if (eptr >= md->end_subject \|\| *eptr == NEWLINE) break;
				3113	eptr++;
				3114	}
				3115	break;
				3116	}
				3117	else
				3118	{
				3119	c = max - min;
thatcher	dc18a36	2006-08-31 21:28:29 +0000	[diff] [blame]	3120	if (c > md->end_subject - eptr) c = INT_CAST(md->end_subject - eptr);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	3121	eptr += c;
				3122	}
				3123	}
				3124	break;
				3125
				3126	/* The byte case is the same as non-UTF8 */
				3127
				3128	case OP_ANYBYTE:
				3129	c = max - min;
thatcher	dc18a36	2006-08-31 21:28:29 +0000	[diff] [blame]	3130	if (c > md->end_subject - eptr) c = INT_CAST(md->end_subject - eptr);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	3131	eptr += c;
				3132	break;
				3133
				3134	case OP_NOT_DIGIT:
				3135	for (i = min; i < max; i++)
				3136	{
				3137	int len = 1;
				3138	if (eptr >= md->end_subject) break;
				3139	GETCHARLEN(c, eptr, len);
				3140	if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
				3141	eptr+= len;
				3142	}
				3143	break;
				3144
				3145	case OP_DIGIT:
				3146	for (i = min; i < max; i++)
				3147	{
				3148	int len = 1;
				3149	if (eptr >= md->end_subject) break;
				3150	GETCHARLEN(c, eptr, len);
				3151	if (c >= 256 \|\|(md->ctypes[c] & ctype_digit) == 0) break;
				3152	eptr+= len;
				3153	}
				3154	break;
				3155
				3156	case OP_NOT_WHITESPACE:
				3157	for (i = min; i < max; i++)
				3158	{
				3159	int len = 1;
				3160	if (eptr >= md->end_subject) break;
				3161	GETCHARLEN(c, eptr, len);
				3162	if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
				3163	eptr+= len;
				3164	}
				3165	break;
				3166
				3167	case OP_WHITESPACE:
				3168	for (i = min; i < max; i++)
				3169	{
				3170	int len = 1;
				3171	if (eptr >= md->end_subject) break;
				3172	GETCHARLEN(c, eptr, len);
				3173	if (c >= 256 \|\|(md->ctypes[c] & ctype_space) == 0) break;
				3174	eptr+= len;
				3175	}
				3176	break;
				3177
				3178	case OP_NOT_WORDCHAR:
				3179	for (i = min; i < max; i++)
				3180	{
				3181	int len = 1;
				3182	if (eptr >= md->end_subject) break;
				3183	GETCHARLEN(c, eptr, len);
				3184	if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
				3185	eptr+= len;
				3186	}
				3187	break;
				3188
				3189	case OP_WORDCHAR:
				3190	for (i = min; i < max; i++)
				3191	{
				3192	int len = 1;
				3193	if (eptr >= md->end_subject) break;
				3194	GETCHARLEN(c, eptr, len);
				3195	if (c >= 256 \|\| (md->ctypes[c] & ctype_word) == 0) break;
				3196	eptr+= len;
				3197	}
				3198	break;
				3199
				3200	default:
				3201	RRETURN(PCRE_ERROR_INTERNAL);
				3202	}
				3203
				3204	/* eptr is now past the end of the maximum run */
				3205
				3206	for(;;)
				3207	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	3208	RMATCH(52, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	3209	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				3210	if (eptr-- == pp) break; /* Stop if tried at original pos */
				3211	BACKCHAR(eptr);
				3212	}
				3213	}
				3214	else
				3215	#endif
				3216
				3217	/* Not UTF-8 mode */
				3218	{
				3219	switch(ctype)
				3220	{
				3221	case OP_ANY:
				3222	if ((ims & PCRE_DOTALL) == 0)
				3223	{
				3224	for (i = min; i < max; i++)
				3225	{
				3226	if (eptr >= md->end_subject \|\| *eptr == NEWLINE) break;
				3227	eptr++;
				3228	}
				3229	break;
				3230	}
				3231	/* For DOTALL case, fall through and treat as \C */
				3232
				3233	case OP_ANYBYTE:
				3234	c = max - min;
thatcher	dc18a36	2006-08-31 21:28:29 +0000	[diff] [blame]	3235	if (c > md->end_subject - eptr) c = INT_CAST(md->end_subject - eptr);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	3236	eptr += c;
				3237	break;
				3238
				3239	case OP_NOT_DIGIT:
				3240	for (i = min; i < max; i++)
				3241	{
				3242	if (eptr >= md->end_subject \|\| (md->ctypes[*eptr] & ctype_digit) != 0)
				3243	break;
				3244	eptr++;
				3245	}
				3246	break;
				3247
				3248	case OP_DIGIT:
				3249	for (i = min; i < max; i++)
				3250	{
				3251	if (eptr >= md->end_subject \|\| (md->ctypes[*eptr] & ctype_digit) == 0)
				3252	break;
				3253	eptr++;
				3254	}
				3255	break;
				3256
				3257	case OP_NOT_WHITESPACE:
				3258	for (i = min; i < max; i++)
				3259	{
				3260	if (eptr >= md->end_subject \|\| (md->ctypes[*eptr] & ctype_space) != 0)
				3261	break;
				3262	eptr++;
				3263	}
				3264	break;
				3265
				3266	case OP_WHITESPACE:
				3267	for (i = min; i < max; i++)
				3268	{
				3269	if (eptr >= md->end_subject \|\| (md->ctypes[*eptr] & ctype_space) == 0)
				3270	break;
				3271	eptr++;
				3272	}
				3273	break;
				3274
				3275	case OP_NOT_WORDCHAR:
				3276	for (i = min; i < max; i++)
				3277	{
				3278	if (eptr >= md->end_subject \|\| (md->ctypes[*eptr] & ctype_word) != 0)
				3279	break;
				3280	eptr++;
				3281	}
				3282	break;
				3283
				3284	case OP_WORDCHAR:
				3285	for (i = min; i < max; i++)
				3286	{
				3287	if (eptr >= md->end_subject \|\| (md->ctypes[*eptr] & ctype_word) == 0)
				3288	break;
				3289	eptr++;
				3290	}
				3291	break;
				3292
				3293	default:
				3294	RRETURN(PCRE_ERROR_INTERNAL);
				3295	}
				3296
				3297	/* eptr is now past the end of the maximum run */
				3298
				3299	while (eptr >= pp)
				3300	{
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	3301	RMATCH(53, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	3302	eptr--;
				3303	if (rrc != MATCH_NOMATCH) RRETURN(rrc);
				3304	}
				3305	}
				3306
				3307	/* Get here if we can't make it match with any permitted repetitions */
				3308
				3309	RRETURN(MATCH_NOMATCH);
				3310	}
				3311	/* Control never gets here */
				3312
				3313	/* There's been some horrible disaster. Since all codes > OP_BRA are
				3314	for capturing brackets, and there shouldn't be any gaps between 0 and
				3315	OP_BRA, arrival here can only mean there is something seriously wrong
				3316	in the code above or the OP_xxx definitions. */
				3317
				3318	default:
				3319	DPRINTF(("Unknown opcode %d\n", *ecode));
				3320	RRETURN(PCRE_ERROR_UNKNOWN_NODE);
				3321	}
				3322
				3323	/* Do not stick any code in here without much thought; it is assumed
				3324	that "continue" in the code above comes out to here to repeat the main
				3325	loop. */
				3326
				3327	} /* End of main loop */
				3328	/* Control never reaches here */
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	3329
				3330	#ifdef NO_RECURSE
				3331	#ifndef __GNUC__
				3332
				3333	RRETURN_SWITCH:
				3334	switch (frame->Xwhere)
				3335	{
				3336	case 1: goto RRETURN_1;
				3337	case 2: goto RRETURN_2;
				3338	case 3: goto RRETURN_3;
				3339	case 4: goto RRETURN_4;
				3340	case 5: goto RRETURN_5;
				3341	case 6: goto RRETURN_6;
				3342	case 7: goto RRETURN_7;
				3343	case 8: goto RRETURN_8;
				3344	case 9: goto RRETURN_9;
				3345	case 10: goto RRETURN_10;
				3346	case 11: goto RRETURN_11;
				3347	case 12: goto RRETURN_12;
				3348	case 13: goto RRETURN_13;
				3349	case 14: goto RRETURN_14;
				3350	case 15: goto RRETURN_15;
				3351	case 16: goto RRETURN_16;
				3352	case 17: goto RRETURN_17;
				3353	case 18: goto RRETURN_18;
				3354	case 19: goto RRETURN_19;
				3355	case 20: goto RRETURN_20;
				3356	case 21: goto RRETURN_21;
				3357	case 22: goto RRETURN_22;
				3358	case 23: goto RRETURN_23;
				3359	case 24: goto RRETURN_24;
				3360	case 25: goto RRETURN_25;
				3361	case 26: goto RRETURN_26;
				3362	case 27: goto RRETURN_27;
				3363	case 28: goto RRETURN_28;
				3364	case 29: goto RRETURN_29;
				3365	case 30: goto RRETURN_30;
				3366	case 31: goto RRETURN_31;
				3367	case 32: goto RRETURN_32;
				3368	case 33: goto RRETURN_33;
				3369	case 34: goto RRETURN_34;
				3370	case 35: goto RRETURN_35;
				3371	case 36: goto RRETURN_36;
				3372	case 37: goto RRETURN_37;
				3373	case 38: goto RRETURN_38;
				3374	case 39: goto RRETURN_39;
				3375	case 40: goto RRETURN_40;
				3376	case 41: goto RRETURN_41;
				3377	case 42: goto RRETURN_42;
				3378	case 43: goto RRETURN_43;
				3379	case 44: goto RRETURN_44;
				3380	case 45: goto RRETURN_45;
				3381	case 46: goto RRETURN_46;
				3382	case 47: goto RRETURN_47;
				3383	case 48: goto RRETURN_48;
				3384	case 49: goto RRETURN_49;
				3385	case 50: goto RRETURN_50;
				3386	case 51: goto RRETURN_51;
				3387	case 52: goto RRETURN_52;
				3388	case 53: goto RRETURN_53;
				3389	}
				3390
				3391	#if PCRE_UTF16
				3392	/* It's safer to have the extra symbols here than to try to ifdef the switch statement above,
				3393	because we'll get warnings or errors if we have multiply defined symbols but a runtime failure
				3394	if we leave something out of the switch statement. */
				3395	RRETURN_32:
				3396	RRETURN_33:
				3397	RRETURN_34:
				3398	RRETURN_35:
				3399	RRETURN_36:
				3400	RRETURN_37:
				3401	#endif
				3402
				3403	abort();
sfalken	38c99b4	2007-02-06 22:38:04 +0000	[diff] [blame]	3404	return 0;
darin	ed76fb5	2007-02-06 21:55:25 +0000	[diff] [blame]	3405
				3406	#endif
				3407	#endif
				3408
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	3409	}
				3410
				3411
				3412	/***************************************************************************
				3413	****************************************************************************
				3414	RECURSION IN THE match() FUNCTION
				3415
				3416	Undefine all the macros that were defined above to handle this, so that the
					names revert to ordinary identifiers for the rest of the file. pcre_exec()
					below, for example, takes a parameter called length and declares locals
					called ims and flags. */
				3417
				3418	#ifdef NO_RECURSE
				3419	#undef eptr
				3420	#undef ecode
				3421	#undef offset_top
				3422	#undef ims
				3423	#undef eptrb
				3424	#undef flags
				3425
				3426	#undef callpat
				3427	#undef charptr
				3428	#undef data
				3429	#undef next
				3430	#undef pp
				3431	#undef prev
				3432	#undef saved_eptr
				3433
				3434	#undef new_recursive
				3435
				3436	#undef cur_is_word
				3437	#undef condition
				3438	#undef minimize
				3439	#undef prev_is_word
				3440
				3441	#undef original_ims
				3442
				3443	#undef ctype
				3444	#undef length
				3445	#undef max
				3446	#undef min
				3447	#undef number
				3448	#undef offset
				3449	#undef op
				3450	#undef save_capture_last
				3451	#undef save_offset1
				3452	#undef save_offset2
				3453	#undef save_offset3
				3454	#undef stacksave
				3455
				3456	#undef newptrb
				3457
				3458	#endif /* NO_RECURSE */
				3459
				3460	/* These two are defined as macros in both cases (with and without
					NO_RECURSE), so they are undefined unconditionally, outside the #ifdef. */
				3461
				3462	#undef fc
				3463	#undef fi
				3464
				3465	/***************************************************************************
				3466	***************************************************************************/
				3467
				3468
				3469
				3470	/*************************************************
				3471	* Execute a Regular Expression *
				3472	*************************************************/
				3473
				3474	/* This function applies a compiled re to a subject string and picks out
				3475	portions of the string if it matches. Two elements in the vector are set for
				3476	each substring: the offsets to the start and end of the substring.
				3477
				3478	Arguments:
				3479	  argument_re     points to the compiled expression
				3480	  extra_data      points to extra data or is NULL
				3481	  subject         points to the subject string
				3482	  length          length of subject string (may contain binary zeros)
				3483	  start_offset    where to start in the subject string
				3484	  options         option bits
				3485	  offsets         points to a vector of ints to be filled in with offsets
				3486	  offsetcount     the number of elements in the vector
				3487
				3488	Returns:          > 0 => success; value is the number of elements filled in
				3489	                  = 0 => success, but offsets is not big enough
				3490	                   -1 => failed to match
				3491	                 < -1 => some kind of unexpected problem
				3492	*/
				3493
ddkilzer	60a7a80	2007-01-01 05:07:40 +0000	[diff] [blame]	3494	PCRE_EXPORT int
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	3495	pcre_exec(const pcre argument_re, const pcre_extra extra_data,
				3496	const pcre_char subject, int length, int start_offset, int options, int offsets,
				3497	int offsetcount)
				3498	{
				3499	int rc, resetcount, ocount;
				3500	int first_byte = -1;
				3501	int req_byte = -1;
				3502	int req_byte2 = -1;
				3503	unsigned long int ims = 0;
				3504	BOOL using_temporary_offsets = FALSE;
				3505	BOOL anchored;
				3506	BOOL startline;
				3507	BOOL firstline;
				3508	BOOL first_byte_caseless = FALSE;
				3509	BOOL req_byte_caseless = FALSE;
				3510	match_data match_block;
				3511	const uschar *tables;
				3512	const uschar *start_bits = NULL;
				3513	const pcre_uchar start_match = (const pcre_uchar )subject + start_offset;
				3514	const pcre_uchar *end_subject;
				3515	const pcre_uchar *req_byte_ptr = start_match - 1;
				3516
				3517	pcre_study_data internal_study;
				3518	const pcre_study_data *study;
				3519
				3520	real_pcre internal_re;
				3521	const real_pcre external_re = (const real_pcre )argument_re;
				3522	const real_pcre *re = external_re;
				3523
				3524	/* Plausibility checks */
				3525
				3526	if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
				3527	if (re == NULL \|\| subject == NULL \|\|
				3528	(offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
				3529	if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
				3530
				3531	/* Fish out the optional data from the extra_data structure, first setting
				3532	the default values. */
				3533
				3534	study = NULL;
				3535	match_block.match_limit = MATCH_LIMIT;
				3536	match_block.callout_data = NULL;
				3537
				3538	/* The table pointer is always in native byte order. */
				3539
				3540	tables = external_re->tables;
				3541
				3542	if (extra_data != NULL)
				3543	{
thatcher	dc18a36	2006-08-31 21:28:29 +0000	[diff] [blame]	3544	register unsigned long flags = extra_data->flags;
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	3545	if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
				3546	study = (const pcre_study_data *)extra_data->study_data;
				3547	if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
				3548	match_block.match_limit = extra_data->match_limit;
				3549	if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
				3550	match_block.callout_data = extra_data->callout_data;
				3551	if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
				3552	}
				3553
				3554	/* If the exec call supplied NULL for tables, use the inbuilt ones. This
				3555	is a feature that makes it possible to save compiled regex and re-use them
				3556	in other programs later. */
				3557
				3558	if (tables == NULL) tables = _pcre_default_tables;
				3559
				3560	/* Check that the first field in the block is the magic number. If it is not,
				3561	test for a regex that was compiled on a host of opposite endianness. If this is
				3562	the case, flipped values are put in internal_re and internal_study if there was
				3563	study data too. */
				3564
				3565	if (re->magic_number != MAGIC_NUMBER)
				3566	{
				3567	re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
				3568	if (re == NULL) return PCRE_ERROR_BADMAGIC;
				3569	if (study != NULL) study = &internal_study;
				3570	}
				3571
				3572	/* Set up other data */
				3573
				3574	anchored = ((re->options \| options) & PCRE_ANCHORED) != 0;
				3575	startline = (re->options & PCRE_STARTLINE) != 0;
				3576	firstline = (re->options & PCRE_FIRSTLINE) != 0;
				3577
				3578	/* The code starts after the real_pcre block and the capture name table. */
				3579
				3580	match_block.start_code = (const uschar *)external_re + re->name_table_offset +
				3581	re->name_count * re->name_entry_size;
				3582
				3583	match_block.start_subject = (const pcre_uchar *)subject;
				3584	match_block.start_offset = start_offset;
				3585	match_block.end_subject = match_block.start_subject + length;
				3586	end_subject = match_block.end_subject;
				3587
				3588	match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
				3589	match_block.utf8 = (re->options & PCRE_UTF8) != 0;
				3590
				3591	match_block.notbol = (options & PCRE_NOTBOL) != 0;
				3592	match_block.noteol = (options & PCRE_NOTEOL) != 0;
				3593	match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
				3594	match_block.partial = (options & PCRE_PARTIAL) != 0;
				3595	match_block.hitend = FALSE;
				3596
				3597	match_block.recursive = NULL; /* No recursion at top level */
				3598
				3599	match_block.lcc = tables + lcc_offset;
				3600	match_block.ctypes = tables + ctypes_offset;
				3601
				3602	/* Partial matching is supported only for a restricted set of regexes at the
				3603	moment. */
				3604
				3605	if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
				3606	return PCRE_ERROR_BADPARTIAL;
				3607
				3608	/* Check a UTF-8 string if required. Unfortunately there's no way of passing
				3609	back the character offset. */
				3610
				3611	#if !PCRE_UTF16
				3612	#ifdef SUPPORT_UTF8
				3613	if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
				3614	{
				3615	if (_pcre_valid_utf8((pcre_uchar *)subject, length) >= 0)
				3616	return PCRE_ERROR_BADUTF8;
				3617	if (start_offset > 0 && start_offset < length)
				3618	{
				3619	int tb = ((pcre_uchar *)subject)[start_offset];
				3620	if (tb > 127)
				3621	{
				3622	tb &= 0xc0;
				3623	if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
				3624	}
				3625	}
				3626	}
				3627	#endif
				3628	#endif
				3629
				3630	/* The ims options can vary during the matching as a result of the presence
				3631	of (?ims) items in the pattern. They are kept in a local variable so that
				3632	restoring at the exit of a group is easy. */
				3633
				3634	ims = re->options & (PCRE_CASELESS\|PCRE_MULTILINE\|PCRE_DOTALL);
				3635
				3636	/* If the expression has got more back references than the offsets supplied can
				3637	hold, we get a temporary chunk of working store to use during the matching.
				3638	Otherwise, we can use the vector supplied, rounding down its size to a multiple
				3639	of 3. */
				3640
				3641	ocount = offsetcount - (offsetcount % 3);
				3642
				3643	if (re->top_backref > 0 && re->top_backref >= ocount/3)
				3644	{
				3645	ocount = re->top_backref * 3 + 3;
				3646	match_block.offset_vector = (int )(pcre_malloc)(ocount sizeof(int));
				3647	if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
				3648	using_temporary_offsets = TRUE;
				3649	DPRINTF(("Got memory to hold back references\n"));
				3650	}
				3651	else match_block.offset_vector = offsets;
				3652
				3653	match_block.offset_end = ocount;
				3654	match_block.offset_max = (2*ocount)/3;
				3655	match_block.offset_overflow = FALSE;
				3656	match_block.capture_last = -1;
				3657
				3658	/* Compute the minimum number of offsets that we need to reset each time. Doing
				3659	this makes a huge difference to execution time when there aren't many brackets
				3660	in the pattern. */
				3661
				3662	resetcount = 2 + re->top_bracket * 2;
				3663	if (resetcount > offsetcount) resetcount = ocount;
				3664
				3665	/* Reset the working variable associated with each extraction. These should
				3666	never be used unless previously set, but they get saved and restored, and so we
				3667	initialize them to avoid reading uninitialized locations. */
				3668
				3669	if (match_block.offset_vector != NULL)
				3670	{
				3671	register int *iptr = match_block.offset_vector + ocount;
				3672	register int *iend = iptr - resetcount/2 + 1;
				3673	while (--iptr >= iend) *iptr = -1;
				3674	}
				3675
				3676	/* Set up the first character to match, if available. The first_byte value is
				3677	never set for an anchored regular expression, but the anchoring may be forced
				3678	at run time, so we have to test for anchoring. The first char may be unset for
				3679	an unanchored pattern, of course. If there's no first char and the pattern was
				3680	studied, there may be a bitmap of possible first characters. */
				3681
				3682	if (!anchored)
				3683	{
				3684	if ((re->options & PCRE_FIRSTSET) != 0)
				3685	{
				3686	first_byte = re->first_byte & 255;
				3687	if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
				3688	first_byte = match_block.lcc[first_byte];
				3689	}
				3690	else
				3691	if (!startline && study != NULL &&
				3692	(study->options & PCRE_STUDY_MAPPED) != 0)
				3693	start_bits = study->start_bits;
				3694	}
				3695
				3696	/* For anchored or unanchored matches, there may be a "last known required
				3697	character" set. */
				3698
				3699	if ((re->options & PCRE_REQCHSET) != 0)
				3700	{
				3701	req_byte = re->req_byte & 255;
				3702	req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
				3703	req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
				3704	}
				3705
				3706	/* Loop for handling unanchored repeated matching attempts; for anchored regexs
				3707	the loop runs just once. */
				3708
				3709	do
				3710	{
				3711	const pcre_uchar *save_end_subject = end_subject;
				3712
				3713	/* Reset the maximum number of extractions we might see. */
				3714
				3715	if (match_block.offset_vector != NULL)
				3716	{
				3717	register int *iptr = match_block.offset_vector;
				3718	register int *iend = iptr + resetcount;
				3719	while (iptr < iend) *iptr++ = -1;
				3720	}
				3721
				3722	/* Advance to a unique first char if possible. If firstline is TRUE, the
				3723	start of the match is constrained to the first line of a multiline string.
				3724	Implement this by temporarily adjusting end_subject so that we stop scanning
				3725	at a newline. If the match fails at the newline, later code breaks this loop.
				3726	*/
				3727
				3728	if (firstline)
				3729	{
				3730	const pcre_uchar *t = start_match;
				3731	while (t < save_end_subject && *t != '\n') t++;
				3732	end_subject = t;
				3733	}
				3734
				3735	/* Now test for a unique first byte */
				3736
				3737	if (first_byte >= 0)
				3738	{
darin	ce72b7a	2007-02-06 19:42:35 +0000	[diff] [blame]	3739	pcre_uchar first_char = first_byte;
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	3740	if (first_byte_caseless)
				3741	while (start_match < end_subject)
				3742	{
				3743	int sm = *start_match;
				3744	#if PCRE_UTF16
				3745	if (sm > 127)
				3746	break;
				3747	#endif
darin	ce72b7a	2007-02-06 19:42:35 +0000	[diff] [blame]	3748	if (match_block.lcc[sm] == first_char)
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	3749	break;
				3750	start_match++;
				3751	}
				3752	else
darin	ce72b7a	2007-02-06 19:42:35 +0000	[diff] [blame]	3753	while (start_match < end_subject && *start_match != first_char)
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	3754	start_match++;
				3755	}
				3756
				3757	/* Or to just after \n for a multiline match if possible */
				3758
				3759	else if (startline)
				3760	{
				3761	if (start_match > match_block.start_subject + start_offset)
				3762	{
				3763	while (start_match < end_subject && start_match[-1] != NEWLINE)
				3764	start_match++;
				3765	}
				3766	}
				3767
				3768	/* Or to a non-unique first char after study */
				3769
				3770	else if (start_bits != NULL)
				3771	{
				3772	while (start_match < end_subject)
				3773	{
				3774	register unsigned int c = *start_match;
				3775	if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
				3776	}
				3777	}
				3778
				3779	/* Restore fudged end_subject */
				3780
				3781	end_subject = save_end_subject;
				3782
				3783	#ifdef DEBUG /* Sigh. Some compilers never learn. */
				3784	printf(">>>> Match against: ");
				3785	pchars(start_match, end_subject - start_match, TRUE, &match_block);
				3786	printf("\n");
				3787	#endif
				3788
				3789	/* If req_byte is set, we know that that character must appear in the subject
				3790	for the match to succeed. If the first character is set, req_byte must be
				3791	later in the subject; otherwise the test starts at the match point. This
				3792	optimization can save a huge amount of backtracking in patterns with nested
				3793	unlimited repeats that aren't going to match. Writing separate code for
				3794	cased/caseless versions makes it go faster, as does using an autoincrement
				3795	and backing off on a match.
				3796
				3797	HOWEVER: when the subject string is very, very long, searching to its end can
				3798	take a long time, and give bad performance on quite ordinary patterns. This
				3799	showed up when somebody was matching /^C/ on a 32-megabyte string... so we
				3800	don't do this when the string is sufficiently long.
				3801
				3802	ALSO: this processing is disabled when partial matching is requested.
				3803	*/
				3804
				3805	if (req_byte >= 0 &&
				3806	end_subject - start_match < REQ_BYTE_MAX &&
				3807	!match_block.partial)
				3808	{
				3809	register const pcre_uchar *p = start_match + ((first_byte >= 0)? 1 : 0);
				3810
				3811	/* We don't need to repeat the search if we haven't yet reached the
				3812	place we found it at last time. */
				3813
				3814	if (p > req_byte_ptr)
				3815	{
				3816	if (req_byte_caseless)
				3817	{
				3818	while (p < end_subject)
				3819	{
				3820	register int pp = *p++;
				3821	if (pp == req_byte \|\| pp == req_byte2) { p--; break; }
				3822	}
				3823	}
				3824	else
				3825	{
				3826	while (p < end_subject)
				3827	{
				3828	if (*p++ == req_byte) { p--; break; }
				3829	}
				3830	}
				3831
				3832	/* If we can't find the required character, break the matching loop */
				3833
				3834	if (p >= end_subject) break;
				3835
				3836	/* If we have found the required character, save the point where we
				3837	found it, so that we don't search again next time round the loop if
				3838	the start hasn't passed this character yet. */
				3839
				3840	req_byte_ptr = p;
				3841	}
				3842	}
				3843
				3844	/* When a match occurs, substrings will be set for all internal extractions;
				3845	we just need to set up the whole thing as substring 0 before returning. If
				3846	there were too many extractions, set the return code to zero. In the case
				3847	where we had to get some local store to hold offsets for backreferences, copy
				3848	those back references that we can. In this case there need not be overflow
				3849	if certain parts of the pattern were not used. */
				3850
				3851	match_block.start_match = start_match;
				3852	match_block.match_call_count = 0;
				3853
				3854	rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
				3855	match_isgroup);
				3856
				3857	/* When the result is no match, if the subject's first character was a
				3858	newline and the PCRE_FIRSTLINE option is set, break (which will return
				3859	PCRE_ERROR_NOMATCH). The option requests that a match occur before the first
				3860	newline in the subject. Otherwise, advance the pointer to the next character
				3861	and continue - but the continuation will actually happen only when the
				3862	pattern is not anchored. */
				3863
				3864	if (rc == MATCH_NOMATCH)
				3865	{
				3866	if (firstline && *start_match == NEWLINE) break;
				3867	start_match++;
				3868	#ifdef SUPPORT_UTF8
				3869	if (match_block.utf8)
darin	496882e	2006-07-15 15:30:03 +0000	[diff] [blame]	3870	while(start_match < end_subject && ISMIDCHAR(*start_match))
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	3871	start_match++;
				3872	#endif
				3873	continue;
				3874	}
				3875
				3876	if (rc != MATCH_MATCH)
				3877	{
				3878	DPRINTF((">>>> error: returning %d\n", rc));
				3879	return rc;
				3880	}
				3881
				3882	/* We have a match! Copy the offset information from temporary store if
				3883	necessary */
				3884
				3885	if (using_temporary_offsets)
				3886	{
				3887	if (offsetcount >= 4)
				3888	{
				3889	memcpy(offsets + 2, match_block.offset_vector + 2,
				3890	(offsetcount - 2) * sizeof(int));
				3891	DPRINTF(("Copied offsets from temporary memory\n"));
				3892	}
				3893	if (match_block.end_offset_top > offsetcount)
				3894	match_block.offset_overflow = TRUE;
				3895
				3896	DPRINTF(("Freeing temporary memory\n"));
				3897	(pcre_free)(match_block.offset_vector);
				3898	}
				3899
				3900	rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
				3901
				3902	if (offsetcount < 2) rc = 0; else
				3903	{
thatcher	dc18a36	2006-08-31 21:28:29 +0000	[diff] [blame]	3904	offsets[0] = INT_CAST(start_match - match_block.start_subject);
				3905	offsets[1] = INT_CAST(match_block.end_match_ptr - match_block.start_subject);
darin	d7737ab	2005-09-09 00:51:07 +0000	[diff] [blame]	3906	}
				3907
				3908	DPRINTF((">>>> returning %d\n", rc));
				3909	return rc;
				3910	}
				3911
				3912	/* This "while" is the end of the "do" above */
				3913
				3914	while (!anchored && start_match <= end_subject);
				3915
				3916	if (using_temporary_offsets)
				3917	{
				3918	DPRINTF(("Freeing temporary memory\n"));
				3919	(pcre_free)(match_block.offset_vector);
				3920	}
				3921
				3922	if (match_block.partial && match_block.hitend)
				3923	{
				3924	DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
				3925	return PCRE_ERROR_PARTIAL;
				3926	}
				3927	else
				3928	{
				3929	DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
				3930	return PCRE_ERROR_NOMATCH;
				3931	}
				3932	}
				3933
				3934	/* End of pcre_exec.c */