blob: fdd90af67961bb359b2c4d625cb8d3cae43a17d1 [file] [log] [blame]
darin@apple.comee752e72007-11-11 18:56:13 +00001/* This is JavaScriptCore's variant of the PCRE library. While this library
2started out as a copy of PCRE, many of the features of PCRE have been
3removed. This library now supports only the regular expression features
4required by the JavaScript language specification, and has only the functions
5needed by JavaScriptCore and the rest of WebKit.
darind7737ab2005-09-09 00:51:07 +00006
darin@apple.comee752e72007-11-11 18:56:13 +00007 Originally written by Philip Hazel
darinae790da2007-10-17 05:38:39 +00008 Copyright (c) 1997-2006 University of Cambridge
darin@apple.comee752e72007-11-11 18:56:13 +00009 Copyright (C) 2002, 2004, 2006, 2007 Apple Inc. All rights reserved.
eric@webkit.org4cacbe42007-11-29 11:26:33 +000010 Copyright (C) 2007 Eric Seidel <eric@webkit.org>
darind7737ab2005-09-09 00:51:07 +000011
12-----------------------------------------------------------------------------
13Redistribution and use in source and binary forms, with or without
14modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE.
38-----------------------------------------------------------------------------
39*/
40
darin@apple.comee752e72007-11-11 18:56:13 +000041/* This module contains the external function jsRegExpExecute(), along with
darind7737ab2005-09-09 00:51:07 +000042supporting internal functions that are not used by other modules. */
43
darin@apple.com7f984872007-11-12 23:04:41 +000044#include "config.h"
45
darind7737ab2005-09-09 00:51:07 +000046#include "pcre_internal.h"
47
mrowe@apple.com6be89412007-12-17 06:01:22 +000048#include <string.h>
darin@apple.comee752e72007-11-11 18:56:13 +000049#include <wtf/ASCIICType.h>
50#include <wtf/FastMalloc.h>
darind7737ab2005-09-09 00:51:07 +000051
darin@apple.comee752e72007-11-11 18:56:13 +000052using namespace WTF;
53
darin@apple.com28e8eb92007-12-17 04:19:25 +000054/* Negative values for the firstchar and reqchar variables */
55
56#define REQ_UNSET (-2)
57#define REQ_NONE (-1)
58
darind7737ab2005-09-09 00:51:07 +000059/*************************************************
60* Code parameters and static tables *
61*************************************************/
62
63/* Maximum number of items on the nested bracket stacks at compile time. This
64applies to the nesting of all kinds of parentheses. It does not limit
65un-nested, non-capturing parentheses. This number can be made bigger if
66necessary - it is used to dimension one int and one unsigned char vector at
67compile time. */
68
69#define BRASTACK_SIZE 200
70
darind7737ab2005-09-09 00:51:07 +000071/* Table for handling escaped characters in the range '0'-'z'. Positive returns
72are simple data values; negative values are for special things like \d and so
73on. Zero means further processing is needed (for things like \x), or the escape
74is invalid. */
75
darin@apple.comee752e72007-11-11 18:56:13 +000076static const short escapes[] = {
darin74feca62007-07-19 21:10:40 +000077 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
78 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
79 '@', 0, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
80 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
81 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
82 0, 0, 0, '[', '\\', ']', '^', '_', /* X - _ */
darin@apple.com7ecf0c32007-11-04 06:18:31 +000083 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
84 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
85 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
darin74feca62007-07-19 21:10:40 +000086 0, 0, 0 /* x - z */
87};
darinae790da2007-10-17 05:38:39 +000088
darin@apple.comee752e72007-11-11 18:56:13 +000089/* Error code numbers. They are given names so that they can more easily be
90tracked. */
91
eric@webkit.orgf2e7c2d2007-11-29 11:18:16 +000092enum ErrorCode {
darin@apple.comee752e72007-11-11 18:56:13 +000093 ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
94 ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17
eric@webkit.orgf2e7c2d2007-11-29 11:18:16 +000095};
darin@apple.comee752e72007-11-11 18:56:13 +000096
darind7737ab2005-09-09 00:51:07 +000097/* The texts of compile-time error messages. These are "char *" because they
98are passed to the outside world. */
99
darin@apple.com28e8eb92007-12-17 04:19:25 +0000100static const char* errorText(ErrorCode code)
darin@apple.comee752e72007-11-11 18:56:13 +0000101{
darin@apple.com28e8eb92007-12-17 04:19:25 +0000102 static const char errorTexts[] =
darin@apple.comee752e72007-11-11 18:56:13 +0000103 /* 1 */
104 "\\ at end of pattern\0"
105 "\\c at end of pattern\0"
106 "character value in \\x{...} sequence is too large\0"
107 "numbers out of order in {} quantifier\0"
108 /* 5 */
109 "number too big in {} quantifier\0"
110 "missing terminating ] for character class\0"
111 "internal error: code overflow\0"
112 "range out of order in character class\0"
113 "nothing to repeat\0"
114 /* 10 */
115 "unmatched parentheses\0"
116 "internal error: unexpected repeat\0"
117 "unrecognized character after (?\0"
118 "failed to get memory\0"
119 "missing )\0"
120 /* 15 */
121 "reference to non-existent subpattern\0"
122 "regular expression too large\0"
123 "parentheses nested too deeply"
124 ;
darind7737ab2005-09-09 00:51:07 +0000125
darin@apple.comee752e72007-11-11 18:56:13 +0000126 int i = code;
darin@apple.com28e8eb92007-12-17 04:19:25 +0000127 const char* text = errorTexts;
darin@apple.comee752e72007-11-11 18:56:13 +0000128 while (i > 1)
129 i -= !*text++;
130 return text;
131}
darind7737ab2005-09-09 00:51:07 +0000132
darin@apple.com8ad18ea2007-12-07 19:59:58 +0000133/* Structure for passing "static" information around between the functions
134doing the compiling. */
135
136struct CompileData {
137 CompileData() {
darin@apple.com51d6b262008-06-15 06:23:50 +0000138 topBackref = 0;
darin@apple.comd5cbb632007-12-10 06:22:04 +0000139 backrefMap = 0;
darin@apple.com51d6b262008-06-15 06:23:50 +0000140 reqVaryOpt = 0;
darin@apple.com8ad18ea2007-12-07 19:59:58 +0000141 needOuterBracket = false;
darin@apple.comf9c8dd52008-01-03 06:39:50 +0000142 numCapturingBrackets = 0;
darin@apple.com8ad18ea2007-12-07 19:59:58 +0000143 }
darin@apple.com51d6b262008-06-15 06:23:50 +0000144 int topBackref; /* Maximum back reference */
darin@apple.comd5cbb632007-12-10 06:22:04 +0000145 unsigned backrefMap; /* Bitmap of low back refs */
darin@apple.com51d6b262008-06-15 06:23:50 +0000146 int reqVaryOpt; /* "After variable item" flag for reqByte */
darin@apple.com8ad18ea2007-12-07 19:59:58 +0000147 bool needOuterBracket;
darin@apple.comf9c8dd52008-01-03 06:39:50 +0000148 int numCapturingBrackets;
darin@apple.com8ad18ea2007-12-07 19:59:58 +0000149};
150
darin@apple.comd5cbb632007-12-10 06:22:04 +0000151/* Definitions to allow mutual recursion */
darind7737ab2005-09-09 00:51:07 +0000152
darin@apple.com28e8eb92007-12-17 04:19:25 +0000153static bool compileBracket(int, int*, unsigned char**, const UChar**, const UChar*, ErrorCode*, int, int*, int*, CompileData&);
154static bool bracketIsAnchored(const unsigned char* code);
155static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap);
156static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert);
darind7737ab2005-09-09 00:51:07 +0000157
darind7737ab2005-09-09 00:51:07 +0000158/*************************************************
159* Handle escapes *
160*************************************************/
161
162/* This function is called when a \ has been encountered. It either returns a
163positive value for a simple escape such as \n, or a negative value which
164encodes one of the more complicated things such as \d. When UTF-8 is enabled,
165a positive value greater than 255 may be returned. On entry, ptr is pointing at
166the \. On exit, it is on the final character of the escape sequence.
167
168Arguments:
darin@apple.com51d6b262008-06-15 06:23:50 +0000169 ptrPtr points to the pattern position pointer
170 errorCodePtr points to the errorcode variable
darind7737ab2005-09-09 00:51:07 +0000171 bracount number of previous extracting brackets
172 options the options bits
darin@apple.com51d6b262008-06-15 06:23:50 +0000173 isClass true if inside a character class
darind7737ab2005-09-09 00:51:07 +0000174
175Returns: zero or positive => a data character
176 negative => a special escape sequence
darin@apple.com51d6b262008-06-15 06:23:50 +0000177 on error, errorPtr is set
darind7737ab2005-09-09 00:51:07 +0000178*/
179
darin@apple.com51d6b262008-06-15 06:23:50 +0000180static int checkEscape(const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int bracount, bool isClass)
darind7737ab2005-09-09 00:51:07 +0000181{
darin@apple.com51d6b262008-06-15 06:23:50 +0000182 const UChar* ptr = *ptrPtr + 1;
darind7737ab2005-09-09 00:51:07 +0000183
eric@webkit.org3029ab82007-11-29 11:17:32 +0000184 /* If backslash is at the end of the pattern, it's an error. */
185 if (ptr == patternEnd) {
darin@apple.com51d6b262008-06-15 06:23:50 +0000186 *errorCodePtr = ERR1;
187 *ptrPtr = ptr;
eric@webkit.org3029ab82007-11-29 11:17:32 +0000188 return 0;
darind7737ab2005-09-09 00:51:07 +0000189 }
eric@webkit.org3029ab82007-11-29 11:17:32 +0000190
191 int c = *ptr;
192
193 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
194 a table. A non-zero result is something that can be returned immediately.
195 Otherwise further processing may be required. */
196
197 if (c < '0' || c > 'z') { /* Not alphameric */
darin@apple.com9be07b62007-12-17 01:26:54 +0000198 } else if (int escapeValue = escapes[c - '0']) {
eric@webkit.org3029ab82007-11-29 11:17:32 +0000199 c = escapeValue;
darin@apple.com51d6b262008-06-15 06:23:50 +0000200 if (isClass) {
darin@apple.com9be07b62007-12-17 01:26:54 +0000201 if (-c == ESC_b)
202 c = '\b'; /* \b is backslash in a class */
203 else if (-c == ESC_B)
204 c = 'B'; /* and \B is a capital B in a class (in browsers event though ECMAScript 15.10.2.19 says it raises an error) */
205 }
eric@webkit.org3029ab82007-11-29 11:17:32 +0000206 /* Escapes that need further processing, or are illegal. */
207
darin@apple.com9be07b62007-12-17 01:26:54 +0000208 } else {
eric@webkit.org3029ab82007-11-29 11:17:32 +0000209 switch (c) {
darin@apple.com28e8eb92007-12-17 04:19:25 +0000210 case '1':
211 case '2':
212 case '3':
213 case '4':
214 case '5':
215 case '6':
216 case '7':
217 case '8':
218 case '9':
219 /* Escape sequences starting with a non-zero digit are backreferences,
220 unless there are insufficient brackets, in which case they are octal
221 escape sequences. Those sequences end on the first non-octal character
222 or when we overflow 0-255, whichever comes first. */
223
darin@apple.com51d6b262008-06-15 06:23:50 +0000224 if (!isClass) {
darin@apple.com28e8eb92007-12-17 04:19:25 +0000225 const UChar* oldptr = ptr;
226 c -= '0';
227 while ((ptr + 1 < patternEnd) && isASCIIDigit(ptr[1]) && c <= bracount)
228 c = c * 10 + *(++ptr) - '0';
229 if (c <= bracount) {
230 c = -(ESC_REF + c);
231 break;
232 }
233 ptr = oldptr; /* Put the pointer back and fall through */
eric@webkit.org3029ab82007-11-29 11:17:32 +0000234 }
darin@apple.com28e8eb92007-12-17 04:19:25 +0000235
236 /* Handle an octal number following \. If the first digit is 8 or 9,
237 this is not octal. */
238
239 if ((c = *ptr) >= '8')
240 break;
241
eric@webkit.org3029ab82007-11-29 11:17:32 +0000242 /* \0 always starts an octal number, but we may drop through to here with a
243 larger first octal digit. */
darin@apple.com28e8eb92007-12-17 04:19:25 +0000244
245 case '0': {
246 c -= '0';
247 int i;
248 for (i = 1; i <= 2; ++i) {
249 if (ptr + i >= patternEnd || ptr[i] < '0' || ptr[i] > '7')
250 break;
251 int cc = c * 8 + ptr[i] - '0';
252 if (cc > 255)
253 break;
254 c = cc;
eric@webkit.org3029ab82007-11-29 11:17:32 +0000255 }
darin@apple.com28e8eb92007-12-17 04:19:25 +0000256 ptr += i - 1;
257 break;
eric@webkit.org3029ab82007-11-29 11:17:32 +0000258 }
darin@apple.com28e8eb92007-12-17 04:19:25 +0000259
260 case 'x': {
261 c = 0;
262 int i;
263 for (i = 1; i <= 2; ++i) {
264 if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) {
265 c = 'x';
266 i = 1;
267 break;
268 }
269 int cc = ptr[i];
270 if (cc >= 'a')
271 cc -= 32; /* Convert to upper case */
272 c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));
eric@webkit.org3029ab82007-11-29 11:17:32 +0000273 }
darin@apple.com28e8eb92007-12-17 04:19:25 +0000274 ptr += i - 1;
275 break;
eric@webkit.org3029ab82007-11-29 11:17:32 +0000276 }
darin@apple.com28e8eb92007-12-17 04:19:25 +0000277
278 case 'u': {
279 c = 0;
280 int i;
281 for (i = 1; i <= 4; ++i) {
282 if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) {
283 c = 'u';
284 i = 1;
285 break;
286 }
287 int cc = ptr[i];
288 if (cc >= 'a')
289 cc -= 32; /* Convert to upper case */
290 c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));
291 }
292 ptr += i - 1;
293 break;
eric@webkit.org3029ab82007-11-29 11:17:32 +0000294 }
darin@apple.com28e8eb92007-12-17 04:19:25 +0000295
296 case 'c':
297 if (++ptr == patternEnd) {
darin@apple.com51d6b262008-06-15 06:23:50 +0000298 *errorCodePtr = ERR2;
darin@apple.com28e8eb92007-12-17 04:19:25 +0000299 return 0;
300 }
301 c = *ptr;
302
303 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
304 is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */
305 c = toASCIIUpper(c) ^ 0x40;
306 break;
307 }
eric@webkit.org3029ab82007-11-29 11:17:32 +0000308 }
309
darin@apple.com51d6b262008-06-15 06:23:50 +0000310 *ptrPtr = ptr;
eric@webkit.org3029ab82007-11-29 11:17:32 +0000311 return c;
darind7737ab2005-09-09 00:51:07 +0000312}
313
darind7737ab2005-09-09 00:51:07 +0000314/*************************************************
315* Check for counted repeat *
316*************************************************/
317
318/* This function is called when a '{' is encountered in a place where it might
319start a quantifier. It looks ahead to see if it really is a quantifier or not.
320It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
321where the ddds are digits.
322
323Arguments:
324 p pointer to the first char after '{'
325
darin@apple.com7f984872007-11-12 23:04:41 +0000326Returns: true or false
darind7737ab2005-09-09 00:51:07 +0000327*/
328
darin@apple.com28e8eb92007-12-17 04:19:25 +0000329static bool isCountedRepeat(const UChar* p, const UChar* patternEnd)
darind7737ab2005-09-09 00:51:07 +0000330{
eric@webkit.orgf2e7c2d2007-11-29 11:18:16 +0000331 if (p >= patternEnd || !isASCIIDigit(*p))
332 return false;
ddkilzere1714632007-01-02 05:13:00 +0000333 p++;
eric@webkit.orgf2e7c2d2007-11-29 11:18:16 +0000334 while (p < patternEnd && isASCIIDigit(*p))
335 p++;
336 if (p < patternEnd && *p == '}')
337 return true;
338
339 if (p >= patternEnd || *p++ != ',')
340 return false;
341 if (p < patternEnd && *p == '}')
342 return true;
343
344 if (p >= patternEnd || !isASCIIDigit(*p))
345 return false;
ddkilzere1714632007-01-02 05:13:00 +0000346 p++;
eric@webkit.orgf2e7c2d2007-11-29 11:18:16 +0000347 while (p < patternEnd && isASCIIDigit(*p))
348 p++;
349
350 return (p < patternEnd && *p == '}');
darind7737ab2005-09-09 00:51:07 +0000351}
352
darind7737ab2005-09-09 00:51:07 +0000353/*************************************************
354* Read repeat counts *
355*************************************************/
356
357/* Read an item of the form {n,m} and return the values. This is called only
darin@apple.com28e8eb92007-12-17 04:19:25 +0000358after isCountedRepeat() has confirmed that a repeat-count quantifier exists,
darind7737ab2005-09-09 00:51:07 +0000359so the syntax is guaranteed to be correct, but we need to check the values.
360
361Arguments:
362 p pointer to first char after '{'
363 minp pointer to int for min
364 maxp pointer to int for max
365 returned as -1 if no max
darin@apple.com51d6b262008-06-15 06:23:50 +0000366 errorCodePtr points to error code variable
darind7737ab2005-09-09 00:51:07 +0000367
368Returns: pointer to '}' on success;
darin@apple.com51d6b262008-06-15 06:23:50 +0000369 current ptr on error, with errorCodePtr set non-zero
darind7737ab2005-09-09 00:51:07 +0000370*/
371
darin@apple.com51d6b262008-06-15 06:23:50 +0000372static const UChar* readRepeatCounts(const UChar* p, int* minp, int* maxp, ErrorCode* errorCodePtr)
darind7737ab2005-09-09 00:51:07 +0000373{
eric@webkit.org3c8d15cd2007-11-29 11:07:57 +0000374 int min = 0;
375 int max = -1;
376
377 /* Read the minimum value and do a paranoid check: a negative value indicates
378 an integer overflow. */
379
380 while (isASCIIDigit(*p))
381 min = min * 10 + *p++ - '0';
382 if (min < 0 || min > 65535) {
darin@apple.com51d6b262008-06-15 06:23:50 +0000383 *errorCodePtr = ERR5;
eric@webkit.org3c8d15cd2007-11-29 11:07:57 +0000384 return p;
darind7737ab2005-09-09 00:51:07 +0000385 }
eric@webkit.org3c8d15cd2007-11-29 11:07:57 +0000386
387 /* Read the maximum value if there is one, and again do a paranoid on its size.
388 Also, max must not be less than min. */
389
390 if (*p == '}')
391 max = min;
392 else {
393 if (*(++p) != '}') {
394 max = 0;
395 while (isASCIIDigit(*p))
396 max = max * 10 + *p++ - '0';
397 if (max < 0 || max > 65535) {
darin@apple.com51d6b262008-06-15 06:23:50 +0000398 *errorCodePtr = ERR5;
eric@webkit.org3c8d15cd2007-11-29 11:07:57 +0000399 return p;
400 }
401 if (max < min) {
darin@apple.com51d6b262008-06-15 06:23:50 +0000402 *errorCodePtr = ERR4;
eric@webkit.org3c8d15cd2007-11-29 11:07:57 +0000403 return p;
404 }
405 }
406 }
407
408 /* Fill in the required variables, and pass back the pointer to the terminating
409 '}'. */
410
411 *minp = min;
412 *maxp = max;
413 return p;
darind7737ab2005-09-09 00:51:07 +0000414}
415
darind7737ab2005-09-09 00:51:07 +0000416/*************************************************
417* Find first significant op code *
418*************************************************/
419
420/* This is called by several functions that scan a compiled expression looking
421for a fixed first character, or an anchoring op code etc. It skips over things
darin@apple.com28e8eb92007-12-17 04:19:25 +0000422that do not influence this.
darind7737ab2005-09-09 00:51:07 +0000423
424Arguments:
425 code pointer to the start of the group
darind7737ab2005-09-09 00:51:07 +0000426Returns: pointer to the first significant opcode
427*/
428
darin@apple.com28e8eb92007-12-17 04:19:25 +0000429static const unsigned char* firstSignificantOpcode(const unsigned char* code)
darind7737ab2005-09-09 00:51:07 +0000430{
eric@webkit.orgd0cc3fd2007-11-29 11:18:59 +0000431 while (*code == OP_BRANUMBER)
darin@apple.com28e8eb92007-12-17 04:19:25 +0000432 code += 3;
darind7737ab2005-09-09 00:51:07 +0000433 return code;
darind7737ab2005-09-09 00:51:07 +0000434}
435
darin@apple.com28e8eb92007-12-17 04:19:25 +0000436static const unsigned char* firstSignificantOpcodeSkippingAssertions(const unsigned char* code)
eric@webkit.orgd0cc3fd2007-11-29 11:18:59 +0000437{
438 while (true) {
439 switch (*code) {
eric@webkit.org740cc732007-11-29 11:16:48 +0000440 case OP_ASSERT_NOT:
darin@apple.com28e8eb92007-12-17 04:19:25 +0000441 advanceToEndOfBracket(code);
442 code += 1 + LINK_SIZE;
443 break;
eric@webkit.org740cc732007-11-29 11:16:48 +0000444 case OP_WORD_BOUNDARY:
darin@apple.com28e8eb92007-12-17 04:19:25 +0000445 case OP_NOT_WORD_BOUNDARY:
446 ++code;
eric@webkit.org740cc732007-11-29 11:16:48 +0000447 break;
darin@apple.com28e8eb92007-12-17 04:19:25 +0000448 case OP_BRANUMBER:
449 code += 3;
eric@webkit.org740cc732007-11-29 11:16:48 +0000450 break;
eric@webkit.org740cc732007-11-29 11:16:48 +0000451 default:
darin@apple.com28e8eb92007-12-17 04:19:25 +0000452 return code;
eric@webkit.org740cc732007-11-29 11:16:48 +0000453 }
darind7737ab2005-09-09 00:51:07 +0000454 }
darind7737ab2005-09-09 00:51:07 +0000455}
456
darind7737ab2005-09-09 00:51:07 +0000457/*************************************************
458* Get othercase range *
459*************************************************/
460
461/* This function is passed the start and end of a class range, in UTF-8 mode
462with UCP support. It searches up the characters, looking for internal ranges of
463characters in the "other" case. Each call returns the next one, updating the
464start address.
465
466Arguments:
467 cptr points to starting character value; updated
468 d end value
469 ocptr where to put start of othercase range
470 odptr where to put end of othercase range
471
darin@apple.com7f984872007-11-12 23:04:41 +0000472Yield: true when range returned; false when no more
darind7737ab2005-09-09 00:51:07 +0000473*/
474
darin@apple.com28e8eb92007-12-17 04:19:25 +0000475static bool getOthercaseRange(int* cptr, int d, int* ocptr, int* odptr)
darind7737ab2005-09-09 00:51:07 +0000476{
eric@webkit.org32f7c432007-11-29 11:12:11 +0000477 int c, othercase = 0;
eric@webkit.org5fa98322007-11-29 11:06:02 +0000478
479 for (c = *cptr; c <= d; c++) {
darin@apple.com28e8eb92007-12-17 04:19:25 +0000480 if ((othercase = kjs_pcre_ucp_othercase(c)) >= 0)
eric@webkit.org5fa98322007-11-29 11:06:02 +0000481 break;
482 }
483
484 if (c > d)
485 return false;
486
487 *ocptr = othercase;
eric@webkit.org32f7c432007-11-29 11:12:11 +0000488 int next = othercase + 1;
eric@webkit.org5fa98322007-11-29 11:06:02 +0000489
490 for (++c; c <= d; c++) {
darin@apple.com28e8eb92007-12-17 04:19:25 +0000491 if (kjs_pcre_ucp_othercase(c) != next)
eric@webkit.org5fa98322007-11-29 11:06:02 +0000492 break;
493 next++;
494 }
495
496 *odptr = next - 1;
497 *cptr = c;
498
499 return true;
darind7737ab2005-09-09 00:51:07 +0000500}
darind7737ab2005-09-09 00:51:07 +0000501
eric@webkit.org90a7a132007-11-29 11:10:33 +0000502/*************************************************
503 * Convert character value to UTF-8 *
504 *************************************************/
505
506/* This function takes an integer value in the range 0 - 0x7fffffff
507 and encodes it as a UTF-8 character in 0 to 6 bytes.
508
509 Arguments:
510 cvalue the character value
511 buffer pointer to buffer for result - at least 6 bytes long
512
513 Returns: number of characters placed in the buffer
514 */
515
darin@apple.com28e8eb92007-12-17 04:19:25 +0000516static int encodeUTF8(int cvalue, unsigned char *buffer)
eric@webkit.org90a7a132007-11-29 11:10:33 +0000517{
518 int i;
darin@apple.com28e8eb92007-12-17 04:19:25 +0000519 for (i = 0; i < kjs_pcre_utf8_table1_size; i++)
520 if (cvalue <= kjs_pcre_utf8_table1[i])
eric@webkit.org90a7a132007-11-29 11:10:33 +0000521 break;
522 buffer += i;
523 for (int j = i; j > 0; j--) {
524 *buffer-- = 0x80 | (cvalue & 0x3f);
525 cvalue >>= 6;
526 }
darin@apple.com28e8eb92007-12-17 04:19:25 +0000527 *buffer = kjs_pcre_utf8_table2[i] | cvalue;
eric@webkit.org90a7a132007-11-29 11:10:33 +0000528 return i + 1;
529}
darind7737ab2005-09-09 00:51:07 +0000530
531/*************************************************
532* Compile one branch *
533*************************************************/
534
darin@apple.com8ad18ea2007-12-07 19:59:58 +0000535/* Scan the pattern, compiling it into the code vector.
darind7737ab2005-09-09 00:51:07 +0000536
537Arguments:
darin@apple.comee752e72007-11-11 18:56:13 +0000538 options the option bits
darind7737ab2005-09-09 00:51:07 +0000539 brackets points to number of extracting brackets used
darin@apple.com51d6b262008-06-15 06:23:50 +0000540 codePtr points to the pointer to the current code point
541 ptrPtr points to the current pattern pointer
542 errorCodePtr points to error code variable
darind7737ab2005-09-09 00:51:07 +0000543 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
544 reqbyteptr set to the last literal character required, else < 0
darind7737ab2005-09-09 00:51:07 +0000545 cd contains pointers to tables etc.
546
darin@apple.com7f984872007-11-12 23:04:41 +0000547Returns: true on success
darin@apple.com51d6b262008-06-15 06:23:50 +0000548 false, with *errorCodePtr set non-zero on error
darind7737ab2005-09-09 00:51:07 +0000549*/
550
eric@webkit.org61e47bf2007-11-30 23:43:45 +0000551static inline bool safelyCheckNextChar(const UChar* ptr, const UChar* patternEnd, UChar expected)
552{
553 return ((ptr + 1 < patternEnd) && ptr[1] == expected);
554}
555
eric@webkit.orgc8d4c7d2007-11-29 11:09:42 +0000556static bool
darin@apple.com51d6b262008-06-15 06:23:50 +0000557compileBranch(int options, int* brackets, unsigned char** codePtr,
558 const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int *firstbyteptr,
eric@webkit.org32f7c432007-11-29 11:12:11 +0000559 int* reqbyteptr, CompileData& cd)
darind7737ab2005-09-09 00:51:07 +0000560{
darin@apple.com51d6b262008-06-15 06:23:50 +0000561 int repeatType, opType;
562 int repeatMin = 0, repeat_max = 0; /* To please picky compilers */
eric@webkit.org5c35e222007-11-29 11:08:50 +0000563 int bravalue = 0;
564 int reqvary, tempreqvary;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000565 int c;
darin@apple.com51d6b262008-06-15 06:23:50 +0000566 unsigned char* code = *codePtr;
darin@apple.com28e8eb92007-12-17 04:19:25 +0000567 unsigned char* tempcode;
darin@apple.com51d6b262008-06-15 06:23:50 +0000568 bool didGroupSetFirstByte = false;
569 const UChar* ptr = *ptrPtr;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000570 const UChar* tempptr;
darin@apple.com28e8eb92007-12-17 04:19:25 +0000571 unsigned char* previous = NULL;
572 unsigned char classbits[32];
eric@webkit.org5c35e222007-11-29 11:08:50 +0000573
eric@webkit.orgc8d4c7d2007-11-29 11:09:42 +0000574 bool class_utf8;
darin@apple.com28e8eb92007-12-17 04:19:25 +0000575 unsigned char* class_utf8data;
576 unsigned char utf8_char[6];
eric@webkit.org5c35e222007-11-29 11:08:50 +0000577
578 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
579 matching encountered yet". It gets changed to REQ_NONE if we hit something that
darin@apple.com51d6b262008-06-15 06:23:50 +0000580 matches a non-fixed char first char; reqByte just remains unset if we never
eric@webkit.org5c35e222007-11-29 11:08:50 +0000581 find one.
582
583 When we hit a repeat whose minimum is zero, we may have to adjust these values
584 to take the zero repeat into account. This is implemented by setting them to
darin@apple.com51d6b262008-06-15 06:23:50 +0000585 zeroFirstByte and zeroReqByte when such a repeat is encountered. The individual
eric@webkit.org5c35e222007-11-29 11:08:50 +0000586 item types that can be repeated set these backoff variables appropriately. */
587
darin@apple.com51d6b262008-06-15 06:23:50 +0000588 int firstByte = REQ_UNSET;
589 int reqByte = REQ_UNSET;
590 int zeroReqByte = REQ_UNSET;
591 int zeroFirstByte = REQ_UNSET;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000592
darin@apple.com51d6b262008-06-15 06:23:50 +0000593 /* The variable reqCaseOpt contains either the REQ_IGNORE_CASE value or zero,
eric@webkit.org81716d82007-11-29 11:19:42 +0000594 according to the current setting of the ignores-case flag. REQ_IGNORE_CASE is a bit
darin@apple.com51d6b262008-06-15 06:23:50 +0000595 value > 255. It is added into the firstByte or reqByte variables to record the
eric@webkit.org5c35e222007-11-29 11:08:50 +0000596 case status of the value. This is used only for ASCII characters. */
597
darin@apple.com51d6b262008-06-15 06:23:50 +0000598 int reqCaseOpt = (options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000599
600 /* Switch on next character until the end of the branch */
601
eric@webkit.org61e47bf2007-11-30 23:43:45 +0000602 for (;; ptr++) {
darin@apple.com51d6b262008-06-15 06:23:50 +0000603 bool negateClass;
604 bool shouldFlipNegation; /* If a negative special such as \S is used, we should negate the whole class to properly support Unicode. */
605 int classCharCount;
606 int classLastChar;
607 int skipBytes;
608 int subReqByte;
609 int subFirstByte;
610 int mcLength;
darin@apple.com28e8eb92007-12-17 04:19:25 +0000611 unsigned char mcbuffer[8];
eric@webkit.org5c35e222007-11-29 11:08:50 +0000612
613 /* Next byte in the pattern */
614
615 c = ptr < patternEnd ? *ptr : 0;
616
617 /* Fill in length of a previous callout, except when the next thing is
618 a quantifier. */
619
darin@apple.com51d6b262008-06-15 06:23:50 +0000620 bool isQuantifier = c == '*' || c == '+' || c == '?' || (c == '{' && isCountedRepeat(ptr + 1, patternEnd));
eric@webkit.org5c35e222007-11-29 11:08:50 +0000621
darin@apple.com6b652e82007-11-30 18:54:34 +0000622 switch (c) {
623 /* The branch terminates at end of string, |, or ). */
eric@webkit.org5c35e222007-11-29 11:08:50 +0000624
625 case 0:
626 if (ptr < patternEnd)
627 goto NORMAL_CHAR;
628 // End of string; fall through
629 case '|':
630 case ')':
darin@apple.com51d6b262008-06-15 06:23:50 +0000631 *firstbyteptr = firstByte;
632 *reqbyteptr = reqByte;
633 *codePtr = code;
634 *ptrPtr = ptr;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000635 return true;
636
darin@apple.com6b652e82007-11-30 18:54:34 +0000637 /* Handle single-character metacharacters. In multiline mode, ^ disables
638 the setting of any following char as a first character. */
eric@webkit.org7fbd8932008-03-31 08:01:09 +0000639
eric@webkit.org5c35e222007-11-29 11:08:50 +0000640 case '^':
eric@webkit.org8a344672007-11-29 11:38:26 +0000641 if (options & MatchAcrossMultipleLinesOption) {
darin@apple.com51d6b262008-06-15 06:23:50 +0000642 if (firstByte == REQ_UNSET)
643 firstByte = REQ_NONE;
eric@webkit.org7fbd8932008-03-31 08:01:09 +0000644 *code++ = OP_BOL;
645 } else
646 *code++ = OP_CIRC;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000647 previous = NULL;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000648 break;
eric@webkit.org7fbd8932008-03-31 08:01:09 +0000649
eric@webkit.org5c35e222007-11-29 11:08:50 +0000650 case '$':
651 previous = NULL;
eric@webkit.org7fbd8932008-03-31 08:01:09 +0000652 if (options & MatchAcrossMultipleLinesOption)
653 *code++ = OP_EOL;
654 else
655 *code++ = OP_DOLL;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000656 break;
eric@webkit.org7fbd8932008-03-31 08:01:09 +0000657
darin@apple.com6b652e82007-11-30 18:54:34 +0000658 /* There can never be a first char if '.' is first, whatever happens about
darin@apple.com51d6b262008-06-15 06:23:50 +0000659 repeats. The value of reqByte doesn't change either. */
eric@webkit.org7fbd8932008-03-31 08:01:09 +0000660
eric@webkit.org5c35e222007-11-29 11:08:50 +0000661 case '.':
darin@apple.com51d6b262008-06-15 06:23:50 +0000662 if (firstByte == REQ_UNSET)
663 firstByte = REQ_NONE;
664 zeroFirstByte = firstByte;
665 zeroReqByte = reqByte;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000666 previous = code;
darin@apple.com6b652e82007-11-30 18:54:34 +0000667 *code++ = OP_NOT_NEWLINE;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000668 break;
669
darin@apple.com6b652e82007-11-30 18:54:34 +0000670 /* Character classes. If the included characters are all < 256, we build a
671 32-byte bitmap of the permitted characters, except in the special case
672 where there is only one such character. For negated classes, we build the
673 map as usual, then invert it at the end. However, we use a different opcode
674 so that data characters > 255 can be handled correctly.
675
676 If the class contains characters outside the 0-255 range, a different
677 opcode is compiled. It may optionally have a bit map for characters < 256,
678 but those above are are explicitly listed afterwards. A flag byte tells
679 whether the bitmap is present, and whether this is a negated class or not.
680 */
eric@webkit.org5c35e222007-11-29 11:08:50 +0000681
darin@apple.com6b652e82007-11-30 18:54:34 +0000682 case '[': {
eric@webkit.org5c35e222007-11-29 11:08:50 +0000683 previous = code;
darin@apple.com51d6b262008-06-15 06:23:50 +0000684 shouldFlipNegation = false;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000685
686 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
687 they are encountered at the top level, so we'll do that too. */
688
689 /* If the first character is '^', set the negation flag and skip it. */
eric@webkit.org61e47bf2007-11-30 23:43:45 +0000690
aroben@apple.com98467532007-12-01 02:25:07 +0000691 if (ptr + 1 >= patternEnd) {
darin@apple.com51d6b262008-06-15 06:23:50 +0000692 *errorCodePtr = ERR6;
aroben@apple.com98467532007-12-01 02:25:07 +0000693 return false;
694 }
eric@webkit.org61e47bf2007-11-30 23:43:45 +0000695
eric@webkit.org5c35e222007-11-29 11:08:50 +0000696 if (ptr[1] == '^') {
darin@apple.com51d6b262008-06-15 06:23:50 +0000697 negateClass = true;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000698 ++ptr;
699 } else
darin@apple.com51d6b262008-06-15 06:23:50 +0000700 negateClass = false;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000701
702 /* Keep a count of chars with values < 256 so that we can optimize the case
703 of just a single character (as long as it's < 256). For higher valued UTF-8
704 characters, we don't yet do any optimization. */
705
darin@apple.com51d6b262008-06-15 06:23:50 +0000706 classCharCount = 0;
707 classLastChar = -1;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000708
709 class_utf8 = false; /* No chars >= 256 */
710 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
711
712 /* Initialize the 32-char bit map to all zeros. We have to build the
713 map in a temporary bit of store, in case the class contains only 1
714 character (< 256), because in that case the compiled code doesn't use the
715 bit map. */
716
darin@apple.com28e8eb92007-12-17 04:19:25 +0000717 memset(classbits, 0, 32 * sizeof(unsigned char));
eric@webkit.org5c35e222007-11-29 11:08:50 +0000718
719 /* Process characters until ] is reached. The first pass
720 through the regex checked the overall syntax, so we don't need to be very
721 strict here. At the start of the loop, c contains the first byte of the
722 character. */
eric@webkit.org61e47bf2007-11-30 23:43:45 +0000723
724 while ((++ptr < patternEnd) && (c = *ptr) != ']') {
eric@webkit.org5c35e222007-11-29 11:08:50 +0000725 /* Backslash may introduce a single character, or it may introduce one
726 of the specials, which just set a flag. Escaped items are checked for
727 validity in the pre-compiling pass. The sequence \b is a special case.
728 Inside a class (and only there) it is treated as backspace. Elsewhere
729 it marks a word boundary. Other escapes have preset maps ready to
730 or into the one we are building. We assume they have more than one
darin@apple.com51d6b262008-06-15 06:23:50 +0000731 character in them, so set classCharCount bigger than one. */
eric@webkit.org5c35e222007-11-29 11:08:50 +0000732
733 if (c == '\\') {
darin@apple.com51d6b262008-06-15 06:23:50 +0000734 c = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, true);
eric@webkit.org5c35e222007-11-29 11:08:50 +0000735 if (c < 0) {
darin@apple.com51d6b262008-06-15 06:23:50 +0000736 classCharCount += 2; /* Greater than 1 is what matters */
eric@webkit.org5c35e222007-11-29 11:08:50 +0000737 switch (-c) {
738 case ESC_d:
739 for (c = 0; c < 32; c++)
eric@webkit.org4c023352007-11-29 11:22:30 +0000740 classbits[c] |= classBitmapForChar(c + cbit_digit);
eric@webkit.org5c35e222007-11-29 11:08:50 +0000741 continue;
742
743 case ESC_D:
darin@apple.com51d6b262008-06-15 06:23:50 +0000744 shouldFlipNegation = true;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000745 for (c = 0; c < 32; c++)
eric@webkit.org4c023352007-11-29 11:22:30 +0000746 classbits[c] |= ~classBitmapForChar(c + cbit_digit);
eric@webkit.org5c35e222007-11-29 11:08:50 +0000747 continue;
748
749 case ESC_w:
750 for (c = 0; c < 32; c++)
eric@webkit.org4c023352007-11-29 11:22:30 +0000751 classbits[c] |= classBitmapForChar(c + cbit_word);
eric@webkit.org5c35e222007-11-29 11:08:50 +0000752 continue;
753
754 case ESC_W:
darin@apple.com51d6b262008-06-15 06:23:50 +0000755 shouldFlipNegation = true;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000756 for (c = 0; c < 32; c++)
eric@webkit.org4c023352007-11-29 11:22:30 +0000757 classbits[c] |= ~classBitmapForChar(c + cbit_word);
eric@webkit.org5c35e222007-11-29 11:08:50 +0000758 continue;
759
760 case ESC_s:
761 for (c = 0; c < 32; c++)
eric@webkit.org4c023352007-11-29 11:22:30 +0000762 classbits[c] |= classBitmapForChar(c + cbit_space);
eric@webkit.org5c35e222007-11-29 11:08:50 +0000763 continue;
764
765 case ESC_S:
darin@apple.com51d6b262008-06-15 06:23:50 +0000766 shouldFlipNegation = true;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000767 for (c = 0; c < 32; c++)
eric@webkit.org4c023352007-11-29 11:22:30 +0000768 classbits[c] |= ~classBitmapForChar(c + cbit_space);
eric@webkit.org5c35e222007-11-29 11:08:50 +0000769 continue;
770
771 /* Unrecognized escapes are faulted if PCRE is running in its
772 strict mode. By default, for compatibility with Perl, they are
773 treated as literals. */
774
775 default:
776 c = *ptr; /* The final character */
darin@apple.com51d6b262008-06-15 06:23:50 +0000777 classCharCount -= 2; /* Undo the default count from above */
eric@webkit.org5c35e222007-11-29 11:08:50 +0000778 }
779 }
780
781 /* Fall through if we have a single character (c >= 0). This may be
782 > 256 in UTF-8 mode. */
783
784 } /* End of backslash handling */
785
786 /* A single character may be followed by '-' to form a range. However,
787 Perl does not permit ']' to be the end of the range. A '-' character
788 here is treated as a literal. */
789
eric@webkit.org61e47bf2007-11-30 23:43:45 +0000790 if ((ptr + 2 < patternEnd) && ptr[1] == '-' && ptr[2] != ']') {
eric@webkit.org5c35e222007-11-29 11:08:50 +0000791 ptr += 2;
792
darin@apple.com6b652e82007-11-30 18:54:34 +0000793 int d = *ptr;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000794
795 /* The second part of a range can be a single-character escape, but
796 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
797 in such circumstances. */
798
799 if (d == '\\') {
800 const UChar* oldptr = ptr;
darin@apple.com51d6b262008-06-15 06:23:50 +0000801 d = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, true);
eric@webkit.org5c35e222007-11-29 11:08:50 +0000802
darin@apple.com9be07b62007-12-17 01:26:54 +0000803 /* \X is literal X; any other special means the '-' was literal */
eric@webkit.org5c35e222007-11-29 11:08:50 +0000804 if (d < 0) {
darin@apple.com9be07b62007-12-17 01:26:54 +0000805 ptr = oldptr - 2;
806 goto LONE_SINGLE_CHARACTER; /* A few lines below */
eric@webkit.org5c35e222007-11-29 11:08:50 +0000807 }
808 }
809
810 /* The check that the two values are in the correct order happens in
811 the pre-pass. Optimize one-character ranges */
812
813 if (d == c)
814 goto LONE_SINGLE_CHARACTER; /* A few lines below */
815
816 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
817 matching, we have to use an XCLASS with extra data items. Caseless
818 matching for characters > 127 is available only if UCP support is
819 available. */
820
eric@webkit.org8a344672007-11-29 11:38:26 +0000821 if ((d > 255 || ((options & IgnoreCaseOption) && d > 127))) {
eric@webkit.org5c35e222007-11-29 11:08:50 +0000822 class_utf8 = true;
823
824 /* With UCP support, we can find the other case equivalents of
825 the relevant characters. There may be several ranges. Optimize how
826 they fit with the basic range. */
827
eric@webkit.org8a344672007-11-29 11:38:26 +0000828 if (options & IgnoreCaseOption) {
eric@webkit.org5c35e222007-11-29 11:08:50 +0000829 int occ, ocd;
830 int cc = c;
831 int origd = d;
darin@apple.com28e8eb92007-12-17 04:19:25 +0000832 while (getOthercaseRange(&cc, origd, &occ, &ocd)) {
eric@webkit.org5c35e222007-11-29 11:08:50 +0000833 if (occ >= c && ocd <= d)
834 continue; /* Skip embedded ranges */
835
836 if (occ < c && ocd >= c - 1) /* Extend the basic range */
837 { /* if there is overlap, */
838 c = occ; /* noting that if occ < c */
839 continue; /* we can't have ocd > d */
840 } /* because a subrange is */
841 if (ocd > d && occ <= d + 1) /* always shorter than */
842 { /* the basic range. */
843 d = ocd;
844 continue;
845 }
846
847 if (occ == ocd)
848 *class_utf8data++ = XCL_SINGLE;
849 else {
850 *class_utf8data++ = XCL_RANGE;
darin@apple.com28e8eb92007-12-17 04:19:25 +0000851 class_utf8data += encodeUTF8(occ, class_utf8data);
eric@webkit.org5c35e222007-11-29 11:08:50 +0000852 }
darin@apple.com28e8eb92007-12-17 04:19:25 +0000853 class_utf8data += encodeUTF8(ocd, class_utf8data);
eric@webkit.org5c35e222007-11-29 11:08:50 +0000854 }
855 }
856
857 /* Now record the original range, possibly modified for UCP caseless
858 overlapping ranges. */
859
860 *class_utf8data++ = XCL_RANGE;
darin@apple.com28e8eb92007-12-17 04:19:25 +0000861 class_utf8data += encodeUTF8(c, class_utf8data);
862 class_utf8data += encodeUTF8(d, class_utf8data);
eric@webkit.org5c35e222007-11-29 11:08:50 +0000863
864 /* With UCP support, we are done. Without UCP support, there is no
865 caseless matching for UTF-8 characters > 127; we can use the bit map
866 for the smaller ones. */
867
868 continue; /* With next character in the class */
869 }
870
871 /* We use the bit map for all cases when not in UTF-8 mode; else
872 ranges that lie entirely within 0-127 when there is UCP support; else
873 for partial ranges without UCP support. */
874
875 for (; c <= d; c++) {
876 classbits[c/8] |= (1 << (c&7));
eric@webkit.org8a344672007-11-29 11:38:26 +0000877 if (options & IgnoreCaseOption) {
eric@webkit.org4c023352007-11-29 11:22:30 +0000878 int uc = flipCase(c);
eric@webkit.org5c35e222007-11-29 11:08:50 +0000879 classbits[uc/8] |= (1 << (uc&7));
880 }
darin@apple.com51d6b262008-06-15 06:23:50 +0000881 classCharCount++; /* in case a one-char range */
882 classLastChar = c;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000883 }
884
885 continue; /* Go get the next char in the class */
886 }
887
888 /* Handle a lone single character - we can get here for a normal
889 non-escape char, or after \ that introduces a single character or for an
890 apparent range that isn't. */
891
892 LONE_SINGLE_CHARACTER:
893
894 /* Handle a character that cannot go in the bit map */
895
eric@webkit.org8a344672007-11-29 11:38:26 +0000896 if ((c > 255 || ((options & IgnoreCaseOption) && c > 127))) {
eric@webkit.org5c35e222007-11-29 11:08:50 +0000897 class_utf8 = true;
898 *class_utf8data++ = XCL_SINGLE;
darin@apple.com28e8eb92007-12-17 04:19:25 +0000899 class_utf8data += encodeUTF8(c, class_utf8data);
eric@webkit.org5c35e222007-11-29 11:08:50 +0000900
eric@webkit.org8a344672007-11-29 11:38:26 +0000901 if (options & IgnoreCaseOption) {
eric@webkit.org5c35e222007-11-29 11:08:50 +0000902 int othercase;
darin@apple.com28e8eb92007-12-17 04:19:25 +0000903 if ((othercase = kjs_pcre_ucp_othercase(c)) >= 0) {
eric@webkit.org5c35e222007-11-29 11:08:50 +0000904 *class_utf8data++ = XCL_SINGLE;
darin@apple.com28e8eb92007-12-17 04:19:25 +0000905 class_utf8data += encodeUTF8(othercase, class_utf8data);
eric@webkit.org5c35e222007-11-29 11:08:50 +0000906 }
907 }
eric@webkit.org61e47bf2007-11-30 23:43:45 +0000908 } else {
909 /* Handle a single-byte character */
eric@webkit.org5c35e222007-11-29 11:08:50 +0000910 classbits[c/8] |= (1 << (c&7));
eric@webkit.org8a344672007-11-29 11:38:26 +0000911 if (options & IgnoreCaseOption) {
eric@webkit.org4c023352007-11-29 11:22:30 +0000912 c = flipCase(c);
eric@webkit.org5c35e222007-11-29 11:08:50 +0000913 classbits[c/8] |= (1 << (c&7));
914 }
darin@apple.com51d6b262008-06-15 06:23:50 +0000915 classCharCount++;
916 classLastChar = c;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000917 }
918 }
919
darin@apple.com51d6b262008-06-15 06:23:50 +0000920 /* If classCharCount is 1, we saw precisely one character whose value is
eric@webkit.org5c35e222007-11-29 11:08:50 +0000921 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
922 can optimize the negative case only if there were no characters >= 128
923 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
924 single-bytes only. This is an historical hangover. Maybe one day we can
925 tidy these opcodes to handle multi-byte characters.
926
927 The optimization throws away the bit map. We turn the item into a
928 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
929 that OP_NOT does not support multibyte characters. In the positive case, it
darin@apple.com51d6b262008-06-15 06:23:50 +0000930 can cause firstByte to be set. Otherwise, there can be no first char if
eric@webkit.org5c35e222007-11-29 11:08:50 +0000931 this item is first, whatever repeat count may follow. In the case of
darin@apple.com51d6b262008-06-15 06:23:50 +0000932 reqByte, save the previous value for reinstating. */
eric@webkit.org5c35e222007-11-29 11:08:50 +0000933
darin@apple.com51d6b262008-06-15 06:23:50 +0000934 if (classCharCount == 1 && (!class_utf8 && (!negateClass || classLastChar < 128))) {
935 zeroReqByte = reqByte;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000936
937 /* The OP_NOT opcode works on one-byte characters only. */
938
darin@apple.com51d6b262008-06-15 06:23:50 +0000939 if (negateClass) {
940 if (firstByte == REQ_UNSET)
941 firstByte = REQ_NONE;
942 zeroFirstByte = firstByte;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000943 *code++ = OP_NOT;
darin@apple.com51d6b262008-06-15 06:23:50 +0000944 *code++ = classLastChar;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000945 break;
946 }
947
948 /* For a single, positive character, get the value into c, and
949 then we can handle this with the normal one-character code. */
950
darin@apple.com51d6b262008-06-15 06:23:50 +0000951 c = classLastChar;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000952 goto NORMAL_CHAR;
953 } /* End of 1-char optimization */
954
955 /* The general case - not the one-char optimization. If this is the first
956 thing in the branch, there can be no first char setting, whatever the
darin@apple.com51d6b262008-06-15 06:23:50 +0000957 repeat count. Any reqByte setting must remain unchanged after any kind of
eric@webkit.org5c35e222007-11-29 11:08:50 +0000958 repeat. */
959
darin@apple.com51d6b262008-06-15 06:23:50 +0000960 if (firstByte == REQ_UNSET) firstByte = REQ_NONE;
961 zeroFirstByte = firstByte;
962 zeroReqByte = reqByte;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000963
964 /* If there are characters with values > 255, we have to compile an
965 extended class, with its own opcode. If there are no characters < 256,
966 we can omit the bitmap. */
967
darin@apple.com51d6b262008-06-15 06:23:50 +0000968 if (class_utf8 && !shouldFlipNegation) {
eric@webkit.org5c35e222007-11-29 11:08:50 +0000969 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
970 *code++ = OP_XCLASS;
971 code += LINK_SIZE;
darin@apple.com51d6b262008-06-15 06:23:50 +0000972 *code = negateClass? XCL_NOT : 0;
eric@webkit.org5c35e222007-11-29 11:08:50 +0000973
974 /* If the map is required, install it, and move on to the end of
975 the extra data */
976
darin@apple.com51d6b262008-06-15 06:23:50 +0000977 if (classCharCount > 0) {
eric@webkit.org5c35e222007-11-29 11:08:50 +0000978 *code++ |= XCL_MAP;
979 memcpy(code, classbits, 32);
980 code = class_utf8data;
981 }
982
983 /* If the map is not required, slide down the extra data. */
984
eric@webkit.org43ea6d72007-11-29 11:11:23 +0000985 else {
eric@webkit.org5c35e222007-11-29 11:08:50 +0000986 int len = class_utf8data - (code + 33);
987 memmove(code + 1, code + 33, len);
988 code += len + 1;
989 }
990
991 /* Now fill in the complete length of the item */
992
darin@apple.com28e8eb92007-12-17 04:19:25 +0000993 putLinkValue(previous + 1, code - previous);
eric@webkit.org5c35e222007-11-29 11:08:50 +0000994 break; /* End of class handling */
darind7737ab2005-09-09 00:51:07 +0000995 }
eric@webkit.org5c35e222007-11-29 11:08:50 +0000996
997 /* If there are no characters > 255, negate the 32-byte map if necessary,
998 and copy it into the code vector. If this is the first thing in the branch,
darin@apple.com51d6b262008-06-15 06:23:50 +0000999 there can be no first char setting, whatever the repeat count. Any reqByte
eric@webkit.org5c35e222007-11-29 11:08:50 +00001000 setting must remain unchanged after any kind of repeat. */
1001
darin@apple.com51d6b262008-06-15 06:23:50 +00001002 *code++ = (negateClass == shouldFlipNegation) ? OP_CLASS : OP_NCLASS;
1003 if (negateClass)
eric@webkit.org43ea6d72007-11-29 11:11:23 +00001004 for (c = 0; c < 32; c++)
1005 code[c] = ~classbits[c];
eric@webkit.org5c35e222007-11-29 11:08:50 +00001006 else
eric@webkit.org5c35e222007-11-29 11:08:50 +00001007 memcpy(code, classbits, 32);
eric@webkit.org5c35e222007-11-29 11:08:50 +00001008 code += 32;
1009 break;
eric@webkit.orge5416c02007-11-29 11:30:20 +00001010 }
darin@apple.com6b652e82007-11-30 18:54:34 +00001011
1012 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
1013 has been tested above. */
1014
eric@webkit.org5c35e222007-11-29 11:08:50 +00001015 case '{':
darin@apple.com51d6b262008-06-15 06:23:50 +00001016 if (!isQuantifier)
eric@webkit.org5c35e222007-11-29 11:08:50 +00001017 goto NORMAL_CHAR;
darin@apple.com51d6b262008-06-15 06:23:50 +00001018 ptr = readRepeatCounts(ptr + 1, &repeatMin, &repeat_max, errorCodePtr);
1019 if (*errorCodePtr)
eric@webkit.org5c35e222007-11-29 11:08:50 +00001020 goto FAILED;
1021 goto REPEAT;
1022
1023 case '*':
darin@apple.com51d6b262008-06-15 06:23:50 +00001024 repeatMin = 0;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001025 repeat_max = -1;
1026 goto REPEAT;
1027
1028 case '+':
darin@apple.com51d6b262008-06-15 06:23:50 +00001029 repeatMin = 1;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001030 repeat_max = -1;
1031 goto REPEAT;
1032
1033 case '?':
darin@apple.com51d6b262008-06-15 06:23:50 +00001034 repeatMin = 0;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001035 repeat_max = 1;
1036
1037 REPEAT:
1038 if (!previous) {
darin@apple.com51d6b262008-06-15 06:23:50 +00001039 *errorCodePtr = ERR9;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001040 goto FAILED;
1041 }
1042
darin@apple.com51d6b262008-06-15 06:23:50 +00001043 if (repeatMin == 0) {
1044 firstByte = zeroFirstByte; /* Adjust for zero repeat */
1045 reqByte = zeroReqByte; /* Ditto */
eric@webkit.org5c35e222007-11-29 11:08:50 +00001046 }
1047
1048 /* Remember whether this is a variable length repeat */
1049
darin@apple.com51d6b262008-06-15 06:23:50 +00001050 reqvary = (repeatMin == repeat_max) ? 0 : REQ_VARY;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001051
darin@apple.com51d6b262008-06-15 06:23:50 +00001052 opType = 0; /* Default single-char op codes */
eric@webkit.org5c35e222007-11-29 11:08:50 +00001053
1054 /* Save start of previous item, in case we have to move it up to make space
1055 for an inserted OP_ONCE for the additional '+' extension. */
darin@apple.com28e8eb92007-12-17 04:19:25 +00001056 /* FIXME: Probably don't need this because we don't use OP_ONCE. */
eric@webkit.org5c35e222007-11-29 11:08:50 +00001057
1058 tempcode = previous;
1059
1060 /* If the next character is '+', we have a possessive quantifier. This
1061 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
1062 If the next character is '?' this is a minimizing repeat, by default,
1063 but if PCRE_UNGREEDY is set, it works the other way round. We change the
1064 repeat type to the non-default. */
1065
eric@webkit.org61e47bf2007-11-30 23:43:45 +00001066 if (safelyCheckNextChar(ptr, patternEnd, '?')) {
darin@apple.com51d6b262008-06-15 06:23:50 +00001067 repeatType = 1;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001068 ptr++;
eric@webkit.orga818f3a2007-11-29 11:23:11 +00001069 } else
darin@apple.com51d6b262008-06-15 06:23:50 +00001070 repeatType = 0;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001071
1072 /* If previous was a character match, abolish the item and generate a
1073 repeat item instead. If a char item has a minumum of more than one, ensure
darin@apple.com51d6b262008-06-15 06:23:50 +00001074 that it is set in reqByte - it might not be if a sequence such as x{3} is
1075 the first thing in a branch because the x will have gone into firstByte
eric@webkit.org5c35e222007-11-29 11:08:50 +00001076 instead. */
1077
eric@webkit.org5bc5d412007-11-29 11:32:46 +00001078 if (*previous == OP_CHAR || *previous == OP_CHAR_IGNORING_CASE) {
eric@webkit.org5c35e222007-11-29 11:08:50 +00001079 /* Deal with UTF-8 characters that take up more than one byte. It's
1080 easier to write this out separately than try to macrify it. Use c to
1081 hold the length of the character in bytes, plus 0x80 to flag that it's a
1082 length rather than a small character. */
1083
eric@webkit.orga818f3a2007-11-29 11:23:11 +00001084 if (code[-1] & 0x80) {
darin@apple.com28e8eb92007-12-17 04:19:25 +00001085 unsigned char *lastchar = code - 1;
eric@webkit.orgcc9c2772007-11-29 11:32:01 +00001086 while((*lastchar & 0xc0) == 0x80)
1087 lastchar--;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001088 c = code - lastchar; /* Length of UTF-8 character */
1089 memcpy(utf8_char, lastchar, c); /* Save the char */
1090 c |= 0x80; /* Flag c as a length */
1091 }
eric@webkit.orga818f3a2007-11-29 11:23:11 +00001092 else {
eric@webkit.org5c35e222007-11-29 11:08:50 +00001093 c = code[-1];
darin@apple.com51d6b262008-06-15 06:23:50 +00001094 if (repeatMin > 1)
1095 reqByte = c | reqCaseOpt | cd.reqVaryOpt;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001096 }
1097
1098 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1099 }
1100
eric@webkit.org5bc5d412007-11-29 11:32:46 +00001101 else if (*previous == OP_ASCII_CHAR || *previous == OP_ASCII_LETTER_IGNORING_CASE) {
eric@webkit.org5c35e222007-11-29 11:08:50 +00001102 c = previous[1];
darin@apple.com51d6b262008-06-15 06:23:50 +00001103 if (repeatMin > 1)
1104 reqByte = c | reqCaseOpt | cd.reqVaryOpt;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001105 goto OUTPUT_SINGLE_REPEAT;
1106 }
1107
1108 /* If previous was a single negated character ([^a] or similar), we use
1109 one of the special opcodes, replacing it. The code is shared with single-
1110 character repeats by setting opt_type to add a suitable offset into
darin@apple.com51d6b262008-06-15 06:23:50 +00001111 repeatType. OP_NOT is currently used only for single-byte chars. */
eric@webkit.org5c35e222007-11-29 11:08:50 +00001112
eric@webkit.orga818f3a2007-11-29 11:23:11 +00001113 else if (*previous == OP_NOT) {
darin@apple.com51d6b262008-06-15 06:23:50 +00001114 opType = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
eric@webkit.org5c35e222007-11-29 11:08:50 +00001115 c = previous[1];
1116 goto OUTPUT_SINGLE_REPEAT;
1117 }
1118
1119 /* If previous was a character type match (\d or similar), abolish it and
1120 create a suitable repeat item. The code is shared with single-character
darin@apple.com51d6b262008-06-15 06:23:50 +00001121 repeats by setting opType to add a suitable offset into repeatType. */
eric@webkit.org5c35e222007-11-29 11:08:50 +00001122
darin@apple.com6b652e82007-11-30 18:54:34 +00001123 else if (*previous <= OP_NOT_NEWLINE) {
darin@apple.com51d6b262008-06-15 06:23:50 +00001124 opType = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
eric@webkit.org5c35e222007-11-29 11:08:50 +00001125 c = *previous;
1126
1127 OUTPUT_SINGLE_REPEAT:
eric@webkit.orga818f3a2007-11-29 11:23:11 +00001128 int prop_type = -1;
1129 int prop_value = -1;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001130
darin@apple.com28e8eb92007-12-17 04:19:25 +00001131 unsigned char* oldcode = code;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001132 code = previous; /* Usually overwrite previous item */
1133
1134 /* If the maximum is zero then the minimum must also be zero; Perl allows
1135 this case, so we do too - by simply omitting the item altogether. */
1136
1137 if (repeat_max == 0)
1138 goto END_REPEAT;
1139
darin@apple.com51d6b262008-06-15 06:23:50 +00001140 /* Combine the opType with the repeatType */
eric@webkit.org5c35e222007-11-29 11:08:50 +00001141
darin@apple.com51d6b262008-06-15 06:23:50 +00001142 repeatType += opType;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001143
1144 /* A minimum of zero is handled either as the special case * or ?, or as
1145 an UPTO, with the maximum given. */
1146
darin@apple.com51d6b262008-06-15 06:23:50 +00001147 if (repeatMin == 0) {
eric@webkit.org5c35e222007-11-29 11:08:50 +00001148 if (repeat_max == -1)
darin@apple.com51d6b262008-06-15 06:23:50 +00001149 *code++ = OP_STAR + repeatType;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001150 else if (repeat_max == 1)
darin@apple.com51d6b262008-06-15 06:23:50 +00001151 *code++ = OP_QUERY + repeatType;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001152 else {
darin@apple.com51d6b262008-06-15 06:23:50 +00001153 *code++ = OP_UPTO + repeatType;
darin@apple.com28e8eb92007-12-17 04:19:25 +00001154 put2ByteValueAndAdvance(code, repeat_max);
eric@webkit.org5c35e222007-11-29 11:08:50 +00001155 }
1156 }
1157
1158 /* A repeat minimum of 1 is optimized into some special cases. If the
1159 maximum is unlimited, we use OP_PLUS. Otherwise, the original item it
1160 left in place and, if the maximum is greater than 1, we use OP_UPTO with
1161 one less than the maximum. */
1162
darin@apple.com51d6b262008-06-15 06:23:50 +00001163 else if (repeatMin == 1) {
eric@webkit.org5c35e222007-11-29 11:08:50 +00001164 if (repeat_max == -1)
darin@apple.com51d6b262008-06-15 06:23:50 +00001165 *code++ = OP_PLUS + repeatType;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001166 else {
1167 code = oldcode; /* leave previous item in place */
1168 if (repeat_max == 1)
1169 goto END_REPEAT;
darin@apple.com51d6b262008-06-15 06:23:50 +00001170 *code++ = OP_UPTO + repeatType;
darin@apple.com28e8eb92007-12-17 04:19:25 +00001171 put2ByteValueAndAdvance(code, repeat_max - 1);
eric@webkit.org5c35e222007-11-29 11:08:50 +00001172 }
1173 }
1174
1175 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1176 handled as an EXACT followed by an UPTO. */
1177
eric@webkit.orga818f3a2007-11-29 11:23:11 +00001178 else {
darin@apple.com51d6b262008-06-15 06:23:50 +00001179 *code++ = OP_EXACT + opType; /* NB EXACT doesn't have repeatType */
1180 put2ByteValueAndAdvance(code, repeatMin);
eric@webkit.org5c35e222007-11-29 11:08:50 +00001181
1182 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
1183 we have to insert the character for the previous code. For a repeated
1184 Unicode property match, there are two extra bytes that define the
1185 required property. In UTF-8 mode, long characters have their length in
1186 c, with the 0x80 bit as a flag. */
1187
1188 if (repeat_max < 0) {
1189 if (c >= 128) {
1190 memcpy(code, utf8_char, c & 7);
1191 code += c & 7;
1192 } else {
1193 *code++ = c;
1194 if (prop_type >= 0) {
1195 *code++ = prop_type;
1196 *code++ = prop_value;
1197 }
1198 }
darin@apple.com51d6b262008-06-15 06:23:50 +00001199 *code++ = OP_STAR + repeatType;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001200 }
1201
1202 /* Else insert an UPTO if the max is greater than the min, again
1203 preceded by the character, for the previously inserted code. */
1204
darin@apple.com51d6b262008-06-15 06:23:50 +00001205 else if (repeat_max != repeatMin) {
eric@webkit.org5c35e222007-11-29 11:08:50 +00001206 if (c >= 128) {
1207 memcpy(code, utf8_char, c & 7);
1208 code += c & 7;
1209 } else
1210 *code++ = c;
1211 if (prop_type >= 0) {
1212 *code++ = prop_type;
1213 *code++ = prop_value;
1214 }
darin@apple.com51d6b262008-06-15 06:23:50 +00001215 repeat_max -= repeatMin;
1216 *code++ = OP_UPTO + repeatType;
darin@apple.com28e8eb92007-12-17 04:19:25 +00001217 put2ByteValueAndAdvance(code, repeat_max);
eric@webkit.org5c35e222007-11-29 11:08:50 +00001218 }
1219 }
1220
1221 /* The character or character type itself comes last in all cases. */
1222
1223 if (c >= 128) {
1224 memcpy(code, utf8_char, c & 7);
1225 code += c & 7;
1226 } else
1227 *code++ = c;
1228
1229 /* For a repeated Unicode property match, there are two extra bytes that
1230 define the required property. */
1231
1232 if (prop_type >= 0) {
1233 *code++ = prop_type;
1234 *code++ = prop_value;
1235 }
1236 }
1237
1238 /* If previous was a character class or a back reference, we put the repeat
1239 stuff after it, but just skip the item if the repeat was {0,0}. */
1240
1241 else if (*previous == OP_CLASS ||
1242 *previous == OP_NCLASS ||
1243 *previous == OP_XCLASS ||
1244 *previous == OP_REF)
1245 {
1246 if (repeat_max == 0) {
1247 code = previous;
1248 goto END_REPEAT;
1249 }
1250
darin@apple.com51d6b262008-06-15 06:23:50 +00001251 if (repeatMin == 0 && repeat_max == -1)
1252 *code++ = OP_CRSTAR + repeatType;
1253 else if (repeatMin == 1 && repeat_max == -1)
1254 *code++ = OP_CRPLUS + repeatType;
1255 else if (repeatMin == 0 && repeat_max == 1)
1256 *code++ = OP_CRQUERY + repeatType;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001257 else {
darin@apple.com51d6b262008-06-15 06:23:50 +00001258 *code++ = OP_CRRANGE + repeatType;
1259 put2ByteValueAndAdvance(code, repeatMin);
eric@webkit.orga818f3a2007-11-29 11:23:11 +00001260 if (repeat_max == -1)
1261 repeat_max = 0; /* 2-byte encoding for max */
darin@apple.com28e8eb92007-12-17 04:19:25 +00001262 put2ByteValueAndAdvance(code, repeat_max);
eric@webkit.org5c35e222007-11-29 11:08:50 +00001263 }
1264 }
1265
1266 /* If previous was a bracket group, we may have to replicate it in certain
1267 cases. */
1268
darin@apple.com28e8eb92007-12-17 04:19:25 +00001269 else if (*previous >= OP_BRA) {
eric@webkit.org5c35e222007-11-29 11:08:50 +00001270 int ketoffset = 0;
1271 int len = code - previous;
darin@apple.com28e8eb92007-12-17 04:19:25 +00001272 unsigned char* bralink = NULL;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001273
1274 /* If the maximum repeat count is unlimited, find the end of the bracket
1275 by scanning through from the start, and compute the offset back to it
1276 from the current code pointer. There may be an OP_OPT setting following
1277 the final KET, so we can't find the end just by going back from the code
1278 pointer. */
1279
1280 if (repeat_max == -1) {
darin@apple.com28e8eb92007-12-17 04:19:25 +00001281 const unsigned char* ket = previous;
1282 advanceToEndOfBracket(ket);
eric@webkit.org5c35e222007-11-29 11:08:50 +00001283 ketoffset = code - ket;
1284 }
1285
1286 /* The case of a zero minimum is special because of the need to stick
1287 OP_BRAZERO in front of it, and because the group appears once in the
1288 data, whereas in other cases it appears the minimum number of times. For
1289 this reason, it is simplest to treat this case separately, as otherwise
1290 the code gets far too messy. There are several special subcases when the
1291 minimum is zero. */
1292
darin@apple.com51d6b262008-06-15 06:23:50 +00001293 if (repeatMin == 0) {
eric@webkit.org5c35e222007-11-29 11:08:50 +00001294 /* If the maximum is also zero, we just omit the group from the output
1295 altogether. */
1296
1297 if (repeat_max == 0) {
1298 code = previous;
1299 goto END_REPEAT;
1300 }
1301
1302 /* If the maximum is 1 or unlimited, we just have to stick in the
1303 BRAZERO and do no more at this point. However, we do need to adjust
1304 any OP_RECURSE calls inside the group that refer to the group itself or
1305 any internal group, because the offset is from the start of the whole
1306 regex. Temporarily terminate the pattern while doing this. */
1307
1308 if (repeat_max <= 1) {
1309 *code = OP_END;
1310 memmove(previous+1, previous, len);
1311 code++;
darin@apple.com51d6b262008-06-15 06:23:50 +00001312 *previous++ = OP_BRAZERO + repeatType;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001313 }
1314
1315 /* If the maximum is greater than 1 and limited, we have to replicate
1316 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1317 The first one has to be handled carefully because it's the original
1318 copy, which has to be moved up. The remainder can be handled by code
1319 that is common with the non-zero minimum case below. We have to
darin@apple.come410bdf2007-12-04 19:08:28 +00001320 adjust the value of repeat_max, since one less copy is required. */
eric@webkit.org5c35e222007-11-29 11:08:50 +00001321
1322 else {
1323 *code = OP_END;
1324 memmove(previous + 2 + LINK_SIZE, previous, len);
1325 code += 2 + LINK_SIZE;
darin@apple.com51d6b262008-06-15 06:23:50 +00001326 *previous++ = OP_BRAZERO + repeatType;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001327 *previous++ = OP_BRA;
1328
1329 /* We chain together the bracket offset fields that have to be
1330 filled in later when the ends of the brackets are reached. */
1331
eric@webkit.orga818f3a2007-11-29 11:23:11 +00001332 int offset = (!bralink) ? 0 : previous - bralink;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001333 bralink = previous;
darin@apple.com28e8eb92007-12-17 04:19:25 +00001334 putLinkValueAllowZeroAndAdvance(previous, offset);
eric@webkit.org5c35e222007-11-29 11:08:50 +00001335 }
1336
1337 repeat_max--;
1338 }
1339
1340 /* If the minimum is greater than zero, replicate the group as many
1341 times as necessary, and adjust the maximum to the number of subsequent
1342 copies that we need. If we set a first char from the group, and didn't
1343 set a required char, copy the latter from the former. */
1344
1345 else {
darin@apple.com51d6b262008-06-15 06:23:50 +00001346 if (repeatMin > 1) {
1347 if (didGroupSetFirstByte && reqByte < 0)
1348 reqByte = firstByte;
1349 for (int i = 1; i < repeatMin; i++) {
eric@webkit.org5c35e222007-11-29 11:08:50 +00001350 memcpy(code, previous, len);
1351 code += len;
1352 }
1353 }
1354 if (repeat_max > 0)
darin@apple.com51d6b262008-06-15 06:23:50 +00001355 repeat_max -= repeatMin;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001356 }
1357
1358 /* This code is common to both the zero and non-zero minimum cases. If
1359 the maximum is limited, it replicates the group in a nested fashion,
1360 remembering the bracket starts on a stack. In the case of a zero minimum,
1361 the first one was set up above. In all cases the repeat_max now specifies
1362 the number of additional copies needed. */
1363
1364 if (repeat_max >= 0) {
1365 for (int i = repeat_max - 1; i >= 0; i--) {
darin@apple.com51d6b262008-06-15 06:23:50 +00001366 *code++ = OP_BRAZERO + repeatType;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001367
1368 /* All but the final copy start a new nesting, maintaining the
1369 chain of brackets outstanding. */
1370
1371 if (i != 0) {
1372 *code++ = OP_BRA;
1373 int offset = (!bralink) ? 0 : code - bralink;
1374 bralink = code;
darin@apple.com28e8eb92007-12-17 04:19:25 +00001375 putLinkValueAllowZeroAndAdvance(code, offset);
eric@webkit.org5c35e222007-11-29 11:08:50 +00001376 }
1377
1378 memcpy(code, previous, len);
1379 code += len;
1380 }
1381
1382 /* Now chain through the pending brackets, and fill in their length
1383 fields (which are holding the chain links pro tem). */
1384
1385 while (bralink) {
1386 int offset = code - bralink + 1;
darin@apple.com28e8eb92007-12-17 04:19:25 +00001387 unsigned char* bra = code - offset;
1388 int oldlinkoffset = getLinkValueAllowZero(bra + 1);
1389 bralink = (!oldlinkoffset) ? 0 : bralink - oldlinkoffset;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001390 *code++ = OP_KET;
darin@apple.com28e8eb92007-12-17 04:19:25 +00001391 putLinkValueAndAdvance(code, offset);
1392 putLinkValue(bra + 1, offset);
eric@webkit.org5c35e222007-11-29 11:08:50 +00001393 }
1394 }
1395
1396 /* If the maximum is unlimited, set a repeater in the final copy. We
1397 can't just offset backwards from the current code point, because we
1398 don't know if there's been an options resetting after the ket. The
1399 correct offset was computed above. */
1400
1401 else
darin@apple.com51d6b262008-06-15 06:23:50 +00001402 code[-ketoffset] = OP_KETRMAX + repeatType;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001403 }
1404
1405 /* Else there's some kind of shambles */
1406
1407 else {
darin@apple.com51d6b262008-06-15 06:23:50 +00001408 *errorCodePtr = ERR11;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001409 goto FAILED;
1410 }
1411
1412 /* In all case we no longer have a previous item. We also set the
1413 "follows varying string" flag for subsequently encountered reqbytes if
1414 it isn't already set and we have just passed a varying length item. */
1415
1416 END_REPEAT:
1417 previous = NULL;
darin@apple.com51d6b262008-06-15 06:23:50 +00001418 cd.reqVaryOpt |= reqvary;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001419 break;
1420
darin@apple.com6b652e82007-11-30 18:54:34 +00001421 /* Start of nested bracket sub-expression, or comment or lookahead or
1422 lookbehind or option setting or condition. First deal with special things
1423 that can come after a bracket; all are introduced by ?, and the appearance
1424 of any of them means that this is not a referencing group. They were
1425 checked for validity in the first pass over the string, so we don't have to
1426 check for syntax errors here. */
eric@webkit.org5c35e222007-11-29 11:08:50 +00001427
1428 case '(':
darin@apple.com51d6b262008-06-15 06:23:50 +00001429 skipBytes = 0;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001430
1431 if (*(++ptr) == '?') {
1432 switch (*(++ptr)) {
darin@apple.com28e8eb92007-12-17 04:19:25 +00001433 case ':': /* Non-extracting bracket */
1434 bravalue = OP_BRA;
1435 ptr++;
1436 break;
1437
1438 case '=': /* Positive lookahead */
1439 bravalue = OP_ASSERT;
1440 ptr++;
1441 break;
1442
1443 case '!': /* Negative lookahead */
1444 bravalue = OP_ASSERT_NOT;
1445 ptr++;
1446 break;
1447
eric@webkit.org5c35e222007-11-29 11:08:50 +00001448 /* Character after (? not specially recognized */
darin@apple.com28e8eb92007-12-17 04:19:25 +00001449
1450 default:
darin@apple.com51d6b262008-06-15 06:23:50 +00001451 *errorCodePtr = ERR12;
darin@apple.com28e8eb92007-12-17 04:19:25 +00001452 goto FAILED;
1453 }
eric@webkit.org5c35e222007-11-29 11:08:50 +00001454 }
1455
1456 /* Else we have a referencing group; adjust the opcode. If the bracket
1457 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
1458 arrange for the true number to follow later, in an OP_BRANUMBER item. */
1459
1460 else {
1461 if (++(*brackets) > EXTRACT_BASIC_MAX) {
1462 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
eric@webkit.orga69618d2007-11-29 11:21:07 +00001463 code[1 + LINK_SIZE] = OP_BRANUMBER;
darin@apple.com28e8eb92007-12-17 04:19:25 +00001464 put2ByteValue(code + 2 + LINK_SIZE, *brackets);
darin@apple.com51d6b262008-06-15 06:23:50 +00001465 skipBytes = 3;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001466 }
1467 else
1468 bravalue = OP_BRA + *brackets;
1469 }
1470
1471 /* Process nested bracketed re. Assertions may not be repeated, but other
1472 kinds can be. We copy code into a non-variable in order to be able
1473 to pass its address because some compilers complain otherwise. Pass in a
1474 new setting for the ims options if they have changed. */
1475
darin@apple.com28e8eb92007-12-17 04:19:25 +00001476 previous = (bravalue >= OP_BRAZERO) ? code : 0;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001477 *code = bravalue;
1478 tempcode = code;
darin@apple.com51d6b262008-06-15 06:23:50 +00001479 tempreqvary = cd.reqVaryOpt; /* Save value before bracket */
eric@webkit.org5c35e222007-11-29 11:08:50 +00001480
darin@apple.comd5cbb632007-12-10 06:22:04 +00001481 if (!compileBracket(
eric@webkit.org5c35e222007-11-29 11:08:50 +00001482 options,
1483 brackets, /* Extracting bracket count */
1484 &tempcode, /* Where to put code (updated) */
1485 &ptr, /* Input pointer (updated) */
1486 patternEnd,
darin@apple.com51d6b262008-06-15 06:23:50 +00001487 errorCodePtr, /* Where to put an error message */
1488 skipBytes, /* Skip over OP_BRANUMBER */
1489 &subFirstByte, /* For possible first char */
1490 &subReqByte, /* For possible last char */
eric@webkit.org5c35e222007-11-29 11:08:50 +00001491 cd)) /* Tables block */
1492 goto FAILED;
1493
1494 /* At the end of compiling, code is still pointing to the start of the
1495 group, while tempcode has been updated to point past the end of the group
1496 and any option resetting that may follow it. The pattern pointer (ptr)
1497 is on the bracket. */
1498
1499 /* Handle updating of the required and first characters. Update for normal
1500 brackets of all kinds, and conditions with two branches (see code above).
1501 If the bracket is followed by a quantifier with zero repeat, we have to
darin@apple.com51d6b262008-06-15 06:23:50 +00001502 back off. Hence the definition of zeroReqByte and zeroFirstByte outside the
eric@webkit.org5c35e222007-11-29 11:08:50 +00001503 main loop so that they can be accessed for the back off. */
1504
darin@apple.com51d6b262008-06-15 06:23:50 +00001505 zeroReqByte = reqByte;
1506 zeroFirstByte = firstByte;
1507 didGroupSetFirstByte = false;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001508
darin@apple.com28e8eb92007-12-17 04:19:25 +00001509 if (bravalue >= OP_BRA) {
darin@apple.com51d6b262008-06-15 06:23:50 +00001510 /* If we have not yet set a firstByte in this branch, take it from the
eric@webkit.org5c35e222007-11-29 11:08:50 +00001511 subpattern, remembering that it was set here so that a repeat of more
darin@apple.com51d6b262008-06-15 06:23:50 +00001512 than one can replicate it as reqByte if necessary. If the subpattern has
1513 no firstByte, set "none" for the whole branch. In both cases, a zero
1514 repeat forces firstByte to "none". */
eric@webkit.org5c35e222007-11-29 11:08:50 +00001515
darin@apple.com51d6b262008-06-15 06:23:50 +00001516 if (firstByte == REQ_UNSET) {
1517 if (subFirstByte >= 0) {
1518 firstByte = subFirstByte;
1519 didGroupSetFirstByte = true;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001520 }
1521 else
darin@apple.com51d6b262008-06-15 06:23:50 +00001522 firstByte = REQ_NONE;
1523 zeroFirstByte = REQ_NONE;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001524 }
1525
darin@apple.com51d6b262008-06-15 06:23:50 +00001526 /* If firstByte was previously set, convert the subpattern's firstByte
1527 into reqByte if there wasn't one, using the vary flag that was in
eric@webkit.org5c35e222007-11-29 11:08:50 +00001528 existence beforehand. */
1529
darin@apple.com51d6b262008-06-15 06:23:50 +00001530 else if (subFirstByte >= 0 && subReqByte < 0)
1531 subReqByte = subFirstByte | tempreqvary;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001532
1533 /* If the subpattern set a required byte (or set a first byte that isn't
1534 really the first byte - see above), set it. */
1535
darin@apple.com51d6b262008-06-15 06:23:50 +00001536 if (subReqByte >= 0)
1537 reqByte = subReqByte;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001538 }
1539
darin@apple.com51d6b262008-06-15 06:23:50 +00001540 /* For a forward assertion, we take the reqByte, if set. This can be
eric@webkit.org5c35e222007-11-29 11:08:50 +00001541 helpful if the pattern that follows the assertion doesn't set a different
darin@apple.com51d6b262008-06-15 06:23:50 +00001542 char. For example, it's useful for /(?=abcde).+/. We can't set firstByte
eric@webkit.org5c35e222007-11-29 11:08:50 +00001543 for an assertion, however because it leads to incorrect effect for patterns
darin@apple.com51d6b262008-06-15 06:23:50 +00001544 such as /(?=a)a.+/ when the "real" "a" would then become a reqByte instead
1545 of a firstByte. This is overcome by a scan at the end if there's no
1546 firstByte, looking for an asserted first char. */
eric@webkit.org5c35e222007-11-29 11:08:50 +00001547
darin@apple.com51d6b262008-06-15 06:23:50 +00001548 else if (bravalue == OP_ASSERT && subReqByte >= 0)
1549 reqByte = subReqByte;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001550
1551 /* Now update the main code pointer to the end of the group. */
1552
1553 code = tempcode;
1554
1555 /* Error if hit end of pattern */
1556
1557 if (ptr >= patternEnd || *ptr != ')') {
darin@apple.com51d6b262008-06-15 06:23:50 +00001558 *errorCodePtr = ERR14;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001559 goto FAILED;
1560 }
1561 break;
1562
darin@apple.com6b652e82007-11-30 18:54:34 +00001563 /* Check \ for being a real metacharacter; if not, fall through and handle
1564 it as a data character at the start of a string. Escape items are checked
1565 for validity in the pre-compiling pass. */
eric@webkit.org5c35e222007-11-29 11:08:50 +00001566
1567 case '\\':
1568 tempptr = ptr;
darin@apple.com51d6b262008-06-15 06:23:50 +00001569 c = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, false);
eric@webkit.org5c35e222007-11-29 11:08:50 +00001570
1571 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1572 are arranged to be the negation of the corresponding OP_values. For the
1573 back references, the values are ESC_REF plus the reference number. Only
1574 back references and those types that consume a character may be repeated.
1575 We can test for values between ESC_b and ESC_w for the latter; this may
1576 have to change if any new ones are ever created. */
1577
1578 if (c < 0) {
1579 /* For metasequences that actually match a character, we disable the
1580 setting of a first character if it hasn't already been set. */
1581
darin@apple.com51d6b262008-06-15 06:23:50 +00001582 if (firstByte == REQ_UNSET && -c > ESC_b && -c <= ESC_w)
1583 firstByte = REQ_NONE;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001584
1585 /* Set values to reset to if this is followed by a zero repeat. */
1586
darin@apple.com51d6b262008-06-15 06:23:50 +00001587 zeroFirstByte = firstByte;
1588 zeroReqByte = reqByte;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001589
1590 /* Back references are handled specially */
1591
1592 if (-c >= ESC_REF) {
1593 int number = -c - ESC_REF;
1594 previous = code;
1595 *code++ = OP_REF;
darin@apple.com28e8eb92007-12-17 04:19:25 +00001596 put2ByteValueAndAdvance(code, number);
eric@webkit.org5c35e222007-11-29 11:08:50 +00001597 }
1598
1599 /* For the rest, we can obtain the OP value by negating the escape
1600 value */
1601
1602 else {
darin@apple.comd5cbb632007-12-10 06:22:04 +00001603 previous = (-c > ESC_b && -c <= ESC_w) ? code : NULL;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001604 *code++ = -c;
1605 }
1606 continue;
1607 }
1608
1609 /* Fall through. */
1610
1611 /* Handle a literal character. It is guaranteed not to be whitespace or #
1612 when the extended flag is set. If we are in UTF-8 mode, it may be a
1613 multi-byte literal character. */
1614
1615 default:
1616 NORMAL_CHAR:
1617
1618 previous = code;
1619
1620 if (c < 128) {
darin@apple.com51d6b262008-06-15 06:23:50 +00001621 mcLength = 1;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001622 mcbuffer[0] = c;
1623
eric@webkit.org8a344672007-11-29 11:38:26 +00001624 if ((options & IgnoreCaseOption) && (c | 0x20) >= 'a' && (c | 0x20) <= 'z') {
eric@webkit.org5bc5d412007-11-29 11:32:46 +00001625 *code++ = OP_ASCII_LETTER_IGNORING_CASE;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001626 *code++ = c | 0x20;
1627 } else {
1628 *code++ = OP_ASCII_CHAR;
1629 *code++ = c;
1630 }
1631 } else {
darin@apple.com51d6b262008-06-15 06:23:50 +00001632 mcLength = encodeUTF8(c, mcbuffer);
eric@webkit.org5c35e222007-11-29 11:08:50 +00001633
eric@webkit.org8a344672007-11-29 11:38:26 +00001634 *code++ = (options & IgnoreCaseOption) ? OP_CHAR_IGNORING_CASE : OP_CHAR;
darin@apple.com51d6b262008-06-15 06:23:50 +00001635 for (c = 0; c < mcLength; c++)
eric@webkit.org5c35e222007-11-29 11:08:50 +00001636 *code++ = mcbuffer[c];
1637 }
1638
1639 /* Set the first and required bytes appropriately. If no previous first
1640 byte, set it from this character, but revert to none on a zero repeat.
darin@apple.com51d6b262008-06-15 06:23:50 +00001641 Otherwise, leave the firstByte value alone, and don't change it on a zero
eric@webkit.org5c35e222007-11-29 11:08:50 +00001642 repeat. */
1643
darin@apple.com51d6b262008-06-15 06:23:50 +00001644 if (firstByte == REQ_UNSET) {
1645 zeroFirstByte = REQ_NONE;
1646 zeroReqByte = reqByte;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001647
darin@apple.com51d6b262008-06-15 06:23:50 +00001648 /* If the character is more than one byte long, we can set firstByte
eric@webkit.org5c35e222007-11-29 11:08:50 +00001649 only if it is not to be matched caselessly. */
1650
darin@apple.com51d6b262008-06-15 06:23:50 +00001651 if (mcLength == 1 || reqCaseOpt == 0) {
1652 firstByte = mcbuffer[0] | reqCaseOpt;
1653 if (mcLength != 1)
1654 reqByte = code[-1] | cd.reqVaryOpt;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001655 }
1656 else
darin@apple.com51d6b262008-06-15 06:23:50 +00001657 firstByte = reqByte = REQ_NONE;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001658 }
1659
darin@apple.com51d6b262008-06-15 06:23:50 +00001660 /* firstByte was previously set; we can set reqByte only the length is
eric@webkit.org5c35e222007-11-29 11:08:50 +00001661 1 or the matching is caseful. */
1662
1663 else {
darin@apple.com51d6b262008-06-15 06:23:50 +00001664 zeroFirstByte = firstByte;
1665 zeroReqByte = reqByte;
1666 if (mcLength == 1 || reqCaseOpt == 0)
1667 reqByte = code[-1] | reqCaseOpt | cd.reqVaryOpt;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001668 }
1669
1670 break; /* End of literal character handling */
darind7737ab2005-09-09 00:51:07 +00001671 }
eric@webkit.org5c35e222007-11-29 11:08:50 +00001672 } /* end of big loop */
1673
1674 /* Control never reaches here by falling through, only by a goto for all the
1675 error states. Pass back the position in the pattern so that it can be displayed
1676 to the user for diagnosing the error. */
1677
darind7737ab2005-09-09 00:51:07 +00001678FAILED:
darin@apple.com51d6b262008-06-15 06:23:50 +00001679 *ptrPtr = ptr;
eric@webkit.org5c35e222007-11-29 11:08:50 +00001680 return false;
darind7737ab2005-09-09 00:51:07 +00001681}
1682
darind7737ab2005-09-09 00:51:07 +00001683/*************************************************
1684* Compile sequence of alternatives *
1685*************************************************/
1686
1687/* On entry, ptr is pointing past the bracket character, but on return
1688it points to the closing bracket, or vertical bar, or end of string.
1689The code variable is pointing at the byte into which the BRA operator has been
1690stored. If the ims options are changed at the start (for a (?ims: group) or
1691during any branch, we need to insert an OP_OPT item at the start of every
1692following branch to ensure they get set correctly at run time, and also pass
1693the new options into every subsequent branch compile.
1694
1695Argument:
1696 options option bits, including any changes for this subpattern
darind7737ab2005-09-09 00:51:07 +00001697 brackets -> int containing the number of extracting brackets used
darin@apple.com51d6b262008-06-15 06:23:50 +00001698 codePtr -> the address of the current code pointer
1699 ptrPtr -> the address of the current pattern pointer
1700 errorCodePtr -> pointer to error code variable
1701 skipBytes skip this many bytes at start (for OP_BRANUMBER)
darind7737ab2005-09-09 00:51:07 +00001702 firstbyteptr place to put the first required character, or a negative number
1703 reqbyteptr place to put the last required character, or a negative number
darind7737ab2005-09-09 00:51:07 +00001704 cd points to the data block with tables pointers etc.
1705
darin@apple.com7f984872007-11-12 23:04:41 +00001706Returns: true on success
darind7737ab2005-09-09 00:51:07 +00001707*/
1708
eric@webkit.orgc8d4c7d2007-11-29 11:09:42 +00001709static bool
darin@apple.com51d6b262008-06-15 06:23:50 +00001710compileBracket(int options, int* brackets, unsigned char** codePtr,
1711 const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int skipBytes,
darin@apple.com8ad18ea2007-12-07 19:59:58 +00001712 int* firstbyteptr, int* reqbyteptr, CompileData& cd)
darind7737ab2005-09-09 00:51:07 +00001713{
darin@apple.com51d6b262008-06-15 06:23:50 +00001714 const UChar* ptr = *ptrPtr;
1715 unsigned char* code = *codePtr;
1716 unsigned char* lastBranch = code;
darin@apple.com28e8eb92007-12-17 04:19:25 +00001717 unsigned char* start_bracket = code;
darin@apple.com51d6b262008-06-15 06:23:50 +00001718 int firstByte = REQ_UNSET;
1719 int reqByte = REQ_UNSET;
eric@webkit.org326e2022007-11-15 02:40:31 +00001720
1721 /* Offset is set zero to mark that this bracket is still open */
1722
darin@apple.com28e8eb92007-12-17 04:19:25 +00001723 putLinkValueAllowZero(code + 1, 0);
darin@apple.com51d6b262008-06-15 06:23:50 +00001724 code += 1 + LINK_SIZE + skipBytes;
eric@webkit.org326e2022007-11-15 02:40:31 +00001725
1726 /* Loop for each alternative branch */
1727
1728 while (true) {
1729 /* Now compile the branch */
1730
darin@apple.com51d6b262008-06-15 06:23:50 +00001731 int branchFirstByte;
1732 int branchReqByte;
1733 if (!compileBranch(options, brackets, &code, &ptr, patternEnd, errorCodePtr,
1734 &branchFirstByte, &branchReqByte, cd)) {
1735 *ptrPtr = ptr;
eric@webkit.org326e2022007-11-15 02:40:31 +00001736 return false;
1737 }
1738
darin@apple.com51d6b262008-06-15 06:23:50 +00001739 /* If this is the first branch, the firstByte and reqByte values for the
eric@webkit.org326e2022007-11-15 02:40:31 +00001740 branch become the values for the regex. */
1741
darin@apple.com51d6b262008-06-15 06:23:50 +00001742 if (*lastBranch != OP_ALT) {
1743 firstByte = branchFirstByte;
1744 reqByte = branchReqByte;
eric@webkit.org326e2022007-11-15 02:40:31 +00001745 }
1746
darin@apple.com51d6b262008-06-15 06:23:50 +00001747 /* If this is not the first branch, the first char and reqByte have to
eric@webkit.org326e2022007-11-15 02:40:31 +00001748 match the values from all the previous branches, except that if the previous
darin@apple.com51d6b262008-06-15 06:23:50 +00001749 value for reqByte didn't have REQ_VARY set, it can still match, and we set
eric@webkit.org326e2022007-11-15 02:40:31 +00001750 REQ_VARY for the regex. */
1751
1752 else {
darin@apple.com51d6b262008-06-15 06:23:50 +00001753 /* If we previously had a firstByte, but it doesn't match the new branch,
1754 we have to abandon the firstByte for the regex, but if there was previously
1755 no reqByte, it takes on the value of the old firstByte. */
eric@webkit.org326e2022007-11-15 02:40:31 +00001756
darin@apple.com51d6b262008-06-15 06:23:50 +00001757 if (firstByte >= 0 && firstByte != branchFirstByte) {
1758 if (reqByte < 0)
1759 reqByte = firstByte;
1760 firstByte = REQ_NONE;
eric@webkit.org326e2022007-11-15 02:40:31 +00001761 }
1762
darin@apple.com51d6b262008-06-15 06:23:50 +00001763 /* If we (now or from before) have no firstByte, a firstByte from the
1764 branch becomes a reqByte if there isn't a branch reqByte. */
eric@webkit.org326e2022007-11-15 02:40:31 +00001765
darin@apple.com51d6b262008-06-15 06:23:50 +00001766 if (firstByte < 0 && branchFirstByte >= 0 && branchReqByte < 0)
1767 branchReqByte = branchFirstByte;
eric@webkit.org326e2022007-11-15 02:40:31 +00001768
1769 /* Now ensure that the reqbytes match */
1770
darin@apple.com51d6b262008-06-15 06:23:50 +00001771 if ((reqByte & ~REQ_VARY) != (branchReqByte & ~REQ_VARY))
1772 reqByte = REQ_NONE;
eric@webkit.org326e2022007-11-15 02:40:31 +00001773 else
darin@apple.com51d6b262008-06-15 06:23:50 +00001774 reqByte |= branchReqByte; /* To "or" REQ_VARY */
eric@webkit.org326e2022007-11-15 02:40:31 +00001775 }
1776
1777 /* Reached end of expression, either ')' or end of pattern. Go back through
1778 the alternative branches and reverse the chain of offsets, with the field in
1779 the BRA item now becoming an offset to the first alternative. If there are
1780 no alternatives, it points to the end of the group. The length in the
1781 terminating ket is always the length of the whole bracketed item. If any of
1782 the ims options were changed inside the group, compile a resetting op-code
1783 following, except at the very end of the pattern. Return leaving the pointer
1784 at the terminating char. */
1785
1786 if (ptr >= patternEnd || *ptr != '|') {
darin@apple.com51d6b262008-06-15 06:23:50 +00001787 int length = code - lastBranch;
eric@webkit.org326e2022007-11-15 02:40:31 +00001788 do {
darin@apple.com51d6b262008-06-15 06:23:50 +00001789 int prevLength = getLinkValueAllowZero(lastBranch + 1);
1790 putLinkValue(lastBranch + 1, length);
1791 length = prevLength;
1792 lastBranch -= length;
eric@webkit.org326e2022007-11-15 02:40:31 +00001793 } while (length > 0);
1794
1795 /* Fill in the ket */
1796
1797 *code = OP_KET;
darin@apple.com28e8eb92007-12-17 04:19:25 +00001798 putLinkValue(code + 1, code - start_bracket);
eric@webkit.org326e2022007-11-15 02:40:31 +00001799 code += 1 + LINK_SIZE;
1800
1801 /* Set values to pass back */
1802
darin@apple.com51d6b262008-06-15 06:23:50 +00001803 *codePtr = code;
1804 *ptrPtr = ptr;
1805 *firstbyteptr = firstByte;
1806 *reqbyteptr = reqByte;
eric@webkit.org326e2022007-11-15 02:40:31 +00001807 return true;
1808 }
1809
1810 /* Another branch follows; insert an "or" node. Its length field points back
1811 to the previous branch while the bracket remains open. At the end the chain
1812 is reversed. It's done like this so that the start of the bracket has a
1813 zero offset until it is closed, making it possible to detect recursion. */
1814
1815 *code = OP_ALT;
darin@apple.com51d6b262008-06-15 06:23:50 +00001816 putLinkValue(code + 1, code - lastBranch);
1817 lastBranch = code;
eric@webkit.org326e2022007-11-15 02:40:31 +00001818 code += 1 + LINK_SIZE;
1819 ptr++;
darind7737ab2005-09-09 00:51:07 +00001820 }
eric@webkit.org326e2022007-11-15 02:40:31 +00001821 ASSERT_NOT_REACHED();
darind7737ab2005-09-09 00:51:07 +00001822}
1823
darind7737ab2005-09-09 00:51:07 +00001824/*************************************************
1825* Check for anchored expression *
1826*************************************************/
1827
1828/* Try to find out if this is an anchored regular expression. Consider each
darin@apple.comd5cbb632007-12-10 06:22:04 +00001829alternative branch. If they all start OP_CIRC, or with a bracket
1830all of whose alternatives start OP_CIRC (recurse ad lib), then
1831it's anchored.
darind7737ab2005-09-09 00:51:07 +00001832
1833Arguments:
darin@apple.comd5cbb632007-12-10 06:22:04 +00001834 code points to start of expression (the bracket)
1835 captureMap a bitmap of which brackets we are inside while testing; this
1836 handles up to substring 31; all brackets after that share
1837 the zero bit
1838 backrefMap the back reference bitmap
darind7737ab2005-09-09 00:51:07 +00001839*/
1840
darin@apple.com28e8eb92007-12-17 04:19:25 +00001841static bool branchIsAnchored(const unsigned char* code)
darin@apple.comd5cbb632007-12-10 06:22:04 +00001842{
darin@apple.com28e8eb92007-12-17 04:19:25 +00001843 const unsigned char* scode = firstSignificantOpcode(code);
darin@apple.comd5cbb632007-12-10 06:22:04 +00001844 int op = *scode;
1845
1846 /* Brackets */
darin@apple.com28e8eb92007-12-17 04:19:25 +00001847 if (op >= OP_BRA || op == OP_ASSERT)
darin@apple.comd5cbb632007-12-10 06:22:04 +00001848 return bracketIsAnchored(scode);
1849
1850 /* Check for explicit anchoring */
1851 return op == OP_CIRC;
1852}
1853
darin@apple.com28e8eb92007-12-17 04:19:25 +00001854static bool bracketIsAnchored(const unsigned char* code)
darind7737ab2005-09-09 00:51:07 +00001855{
eric@webkit.org76e87c22007-11-29 11:16:01 +00001856 do {
darin@apple.comd5cbb632007-12-10 06:22:04 +00001857 if (!branchIsAnchored(code + 1 + LINK_SIZE))
eric@webkit.org76e87c22007-11-29 11:16:01 +00001858 return false;
darin@apple.com28e8eb92007-12-17 04:19:25 +00001859 code += getLinkValue(code + 1);
eric@webkit.org76e87c22007-11-29 11:16:01 +00001860 } while (*code == OP_ALT); /* Loop for each alternative */
1861 return true;
darind7737ab2005-09-09 00:51:07 +00001862}
1863
darind7737ab2005-09-09 00:51:07 +00001864/*************************************************
1865* Check for starting with ^ or .* *
1866*************************************************/
1867
1868/* This is called to find out if every branch starts with ^ or .* so that
1869"first char" processing can be done to speed things up in multiline
1870matching and for non-DOTALL patterns that start with .* (which must start at
darin@apple.comd5cbb632007-12-10 06:22:04 +00001871the beginning or after \n)
1872
1873Except when the .* appears inside capturing parentheses, and there is a
1874subsequent back reference to those parentheses. By keeping a bitmap of the
1875first 31 back references, we can catch some of the more common cases more
1876precisely; all the greater back references share a single bit.
darind7737ab2005-09-09 00:51:07 +00001877
1878Arguments:
darin@apple.comd5cbb632007-12-10 06:22:04 +00001879 code points to start of expression (the bracket)
1880 captureMap a bitmap of which brackets we are inside while testing; this
1881 handles up to substring 31; all brackets after that share
1882 the zero bit
1883 backrefMap the back reference bitmap
darind7737ab2005-09-09 00:51:07 +00001884*/
1885
darin@apple.com28e8eb92007-12-17 04:19:25 +00001886static bool branchNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)
darin@apple.comd5cbb632007-12-10 06:22:04 +00001887{
darin@apple.com28e8eb92007-12-17 04:19:25 +00001888 const unsigned char* scode = firstSignificantOpcode(code);
darin@apple.comd5cbb632007-12-10 06:22:04 +00001889 int op = *scode;
1890
1891 /* Capturing brackets */
1892 if (op > OP_BRA) {
1893 int captureNum = op - OP_BRA;
1894 if (captureNum > EXTRACT_BASIC_MAX)
darin@apple.com28e8eb92007-12-17 04:19:25 +00001895 captureNum = get2ByteValue(scode + 2 + LINK_SIZE);
darin@apple.comd5cbb632007-12-10 06:22:04 +00001896 int bracketMask = (captureNum < 32) ? (1 << captureNum) : 1;
1897 return bracketNeedsLineStart(scode, captureMap | bracketMask, backrefMap);
1898 }
1899
1900 /* Other brackets */
darin@apple.com28e8eb92007-12-17 04:19:25 +00001901 if (op == OP_BRA || op == OP_ASSERT)
darin@apple.comd5cbb632007-12-10 06:22:04 +00001902 return bracketNeedsLineStart(scode, captureMap, backrefMap);
1903
1904 /* .* means "start at start or after \n" if it isn't in brackets that
1905 may be referenced. */
1906
1907 if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1908 return scode[1] == OP_NOT_NEWLINE && !(captureMap & backrefMap);
1909
1910 /* Explicit ^ */
eric@webkit.org7fbd8932008-03-31 08:01:09 +00001911 return op == OP_CIRC || op == OP_BOL;
darin@apple.comd5cbb632007-12-10 06:22:04 +00001912}
1913
darin@apple.com28e8eb92007-12-17 04:19:25 +00001914static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)
darind7737ab2005-09-09 00:51:07 +00001915{
eric@webkit.orgdbdae8d2007-11-29 11:13:33 +00001916 do {
darin@apple.comd5cbb632007-12-10 06:22:04 +00001917 if (!branchNeedsLineStart(code + 1 + LINK_SIZE, captureMap, backrefMap))
eric@webkit.orgdbdae8d2007-11-29 11:13:33 +00001918 return false;
darin@apple.com28e8eb92007-12-17 04:19:25 +00001919 code += getLinkValue(code + 1);
eric@webkit.org96e3d9b2007-11-29 11:15:13 +00001920 } while (*code == OP_ALT); /* Loop for each alternative */
eric@webkit.orgdbdae8d2007-11-29 11:13:33 +00001921 return true;
darind7737ab2005-09-09 00:51:07 +00001922}
1923
darind7737ab2005-09-09 00:51:07 +00001924/*************************************************
1925* Check for asserted fixed first char *
1926*************************************************/
1927
1928/* During compilation, the "first char" settings from forward assertions are
1929discarded, because they can cause conflicts with actual literals that follow.
1930However, if we end up without a first char setting for an unanchored pattern,
1931it is worth scanning the regex to see if there is an initial asserted first
1932char. If all branches start with the same asserted char, or with a bracket all
1933of whose alternatives start with the same asserted char (recurse ad lib), then
1934we return that char, otherwise -1.
1935
1936Arguments:
1937 code points to start of expression (the bracket)
1938 options pointer to the options (used to check casing changes)
darin@apple.com7f984872007-11-12 23:04:41 +00001939 inassert true if in an assertion
darind7737ab2005-09-09 00:51:07 +00001940
1941Returns: -1 or the fixed first char
1942*/
1943
darin@apple.com28e8eb92007-12-17 04:19:25 +00001944static int branchFindFirstAssertedCharacter(const unsigned char* code, bool inassert)
darin@apple.com4c77b712007-12-11 06:08:24 +00001945{
darin@apple.com28e8eb92007-12-17 04:19:25 +00001946 const unsigned char* scode = firstSignificantOpcodeSkippingAssertions(code);
darin@apple.com4c77b712007-12-11 06:08:24 +00001947 int op = *scode;
1948
1949 if (op >= OP_BRA)
1950 op = OP_BRA;
1951
1952 switch (op) {
1953 default:
1954 return -1;
1955
1956 case OP_BRA:
1957 case OP_ASSERT:
darin@apple.com4c77b712007-12-11 06:08:24 +00001958 return bracketFindFirstAssertedCharacter(scode, op == OP_ASSERT);
1959
1960 case OP_EXACT:
1961 scode += 2;
1962 /* Fall through */
1963
1964 case OP_CHAR:
1965 case OP_CHAR_IGNORING_CASE:
1966 case OP_ASCII_CHAR:
1967 case OP_ASCII_LETTER_IGNORING_CASE:
1968 case OP_PLUS:
1969 case OP_MINPLUS:
1970 if (!inassert)
1971 return -1;
1972 return scode[1];
1973 }
1974}
1975
darin@apple.com28e8eb92007-12-17 04:19:25 +00001976static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert)
darind7737ab2005-09-09 00:51:07 +00001977{
eric@webkit.org32f7c432007-11-29 11:12:11 +00001978 int c = -1;
1979 do {
darin@apple.com4c77b712007-12-11 06:08:24 +00001980 int d = branchFindFirstAssertedCharacter(code + 1 + LINK_SIZE, inassert);
1981 if (d < 0)
1982 return -1;
1983 if (c < 0)
1984 c = d;
1985 else if (c != d)
1986 return -1;
darin@apple.com28e8eb92007-12-17 04:19:25 +00001987 code += getLinkValue(code + 1);
eric@webkit.org32f7c432007-11-29 11:12:11 +00001988 } while (*code == OP_ALT);
1989 return c;
darind7737ab2005-09-09 00:51:07 +00001990}
1991
mrowe@apple.comcb17bd02008-03-28 06:41:17 +00001992static inline int multiplyWithOverflowCheck(int a, int b)
1993{
1994 if (!a || !b)
1995 return 0;
1996 if (a > MAX_PATTERN_SIZE / b)
1997 return -1;
1998 return a * b;
1999}
2000
darin@apple.comf9c8dd52008-01-03 06:39:50 +00002001static int calculateCompiledPatternLength(const UChar* pattern, int patternLength, JSRegExpIgnoreCaseOption ignoreCase,
darin@apple.com8ad18ea2007-12-07 19:59:58 +00002002 CompileData& cd, ErrorCode& errorcode)
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002003{
2004 /* Make a pass over the pattern to compute the
2005 amount of store required to hold the compiled code. This does not have to be
darin@apple.comf9c8dd52008-01-03 06:39:50 +00002006 perfect as long as errors are overestimates. */
mrowe@apple.comcb17bd02008-03-28 06:41:17 +00002007
2008 if (patternLength > MAX_PATTERN_SIZE) {
2009 errorcode = ERR16;
2010 return -1;
2011 }
2012
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002013 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
2014 int branch_extra = 0;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002015 int lastitemlength = 0;
eric@webkit.org70d2b412007-11-29 11:39:12 +00002016 unsigned brastackptr = 0;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002017 int brastack[BRASTACK_SIZE];
darin@apple.com28e8eb92007-12-17 04:19:25 +00002018 unsigned char bralenstack[BRASTACK_SIZE];
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002019 int bracount = 0;
2020
eric@webkit.org64029102007-11-29 11:06:57 +00002021 const UChar* ptr = (const UChar*)(pattern - 1);
2022 const UChar* patternEnd = (const UChar*)(pattern + patternLength);
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002023
eric@webkit.org70d2b412007-11-29 11:39:12 +00002024 while (++ptr < patternEnd) {
2025 int minRepeats = 0, maxRepeats = 0;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002026 int c = *ptr;
eric@webkit.org61e47bf2007-11-30 23:43:45 +00002027
darin@apple.com6b652e82007-11-30 18:54:34 +00002028 switch (c) {
2029 /* A backslashed item may be an escaped data character or it may be a
2030 character type. */
eric@webkit.org61e47bf2007-11-30 23:43:45 +00002031
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002032 case '\\':
darin@apple.comf9c8dd52008-01-03 06:39:50 +00002033 c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, false);
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002034 if (errorcode != 0)
eric@webkit.org70d2b412007-11-29 11:39:12 +00002035 return -1;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002036
2037 lastitemlength = 1; /* Default length of last item for repeats */
2038
eric@webkit.org90a7a132007-11-29 11:10:33 +00002039 if (c >= 0) { /* Data character */
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002040 length += 2; /* For a one-byte character */
2041
eric@webkit.org90a7a132007-11-29 11:10:33 +00002042 if (c > 127) {
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002043 int i;
darin@apple.com28e8eb92007-12-17 04:19:25 +00002044 for (i = 0; i < kjs_pcre_utf8_table1_size; i++)
2045 if (c <= kjs_pcre_utf8_table1[i]) break;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002046 length += i;
2047 lastitemlength += i;
2048 }
2049
2050 continue;
2051 }
2052
2053 /* Other escapes need one byte */
2054
2055 length++;
2056
2057 /* A back reference needs an additional 2 bytes, plus either one or 5
2058 bytes for a repeat. We also need to keep the value of the highest
2059 back reference. */
2060
eric@webkit.org90a7a132007-11-29 11:10:33 +00002061 if (c <= -ESC_REF) {
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002062 int refnum = -c - ESC_REF;
darin@apple.comd5cbb632007-12-10 06:22:04 +00002063 cd.backrefMap |= (refnum < 32) ? (1 << refnum) : 1;
darin@apple.com51d6b262008-06-15 06:23:50 +00002064 if (refnum > cd.topBackref)
2065 cd.topBackref = refnum;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002066 length += 2; /* For single back reference */
darin@apple.com28e8eb92007-12-17 04:19:25 +00002067 if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRepeat(ptr + 2, patternEnd)) {
2068 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
eric@webkit.org90a7a132007-11-29 11:10:33 +00002069 if (errorcode)
2070 return -1;
eric@webkit.org70d2b412007-11-29 11:39:12 +00002071 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
2072 (minRepeats == 1 && maxRepeats == -1))
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002073 length++;
eric@webkit.org90a7a132007-11-29 11:10:33 +00002074 else
2075 length += 5;
eric@webkit.org61e47bf2007-11-30 23:43:45 +00002076 if (safelyCheckNextChar(ptr, patternEnd, '?'))
darin@apple.com856001d2007-11-27 00:50:36 +00002077 ptr++;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002078 }
2079 }
2080 continue;
2081
darin@apple.com6b652e82007-11-30 18:54:34 +00002082 case '^': /* Single-byte metacharacters */
2083 case '.':
2084 case '$':
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002085 length++;
2086 lastitemlength = 1;
2087 continue;
2088
darin@apple.com6b652e82007-11-30 18:54:34 +00002089 case '*': /* These repeats won't be after brackets; */
2090 case '+': /* those are handled separately */
2091 case '?':
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002092 length++;
darin@apple.com6b652e82007-11-30 18:54:34 +00002093 goto POSSESSIVE;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002094
darin@apple.com6b652e82007-11-30 18:54:34 +00002095 /* This covers the cases of braced repeats after a single char, metachar,
2096 class, or back reference. */
eric@webkit.org61e47bf2007-11-30 23:43:45 +00002097
darin@apple.com6b652e82007-11-30 18:54:34 +00002098 case '{':
darin@apple.com28e8eb92007-12-17 04:19:25 +00002099 if (!isCountedRepeat(ptr + 1, patternEnd))
eric@webkit.org90a7a132007-11-29 11:10:33 +00002100 goto NORMAL_CHAR;
darin@apple.com28e8eb92007-12-17 04:19:25 +00002101 ptr = readRepeatCounts(ptr + 1, &minRepeats, &maxRepeats, &errorcode);
eric@webkit.org90a7a132007-11-29 11:10:33 +00002102 if (errorcode != 0)
eric@webkit.org70d2b412007-11-29 11:39:12 +00002103 return -1;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002104
2105 /* These special cases just insert one extra opcode */
2106
eric@webkit.org70d2b412007-11-29 11:39:12 +00002107 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
2108 (minRepeats == 1 && maxRepeats == -1))
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002109 length++;
2110
2111 /* These cases might insert additional copies of a preceding character. */
2112
eric@webkit.org90a7a132007-11-29 11:10:33 +00002113 else {
eric@webkit.org70d2b412007-11-29 11:39:12 +00002114 if (minRepeats != 1) {
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002115 length -= lastitemlength; /* Uncount the original char or metachar */
eric@webkit.org70d2b412007-11-29 11:39:12 +00002116 if (minRepeats > 0)
eric@webkit.org90a7a132007-11-29 11:10:33 +00002117 length += 3 + lastitemlength;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002118 }
darin@apple.comd5cbb632007-12-10 06:22:04 +00002119 length += lastitemlength + ((maxRepeats > 0) ? 3 : 1);
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002120 }
2121
eric@webkit.org61e47bf2007-11-30 23:43:45 +00002122 if (safelyCheckNextChar(ptr, patternEnd, '?'))
eric@webkit.org90a7a132007-11-29 11:10:33 +00002123 ptr++; /* Needs no extra length */
eric@webkit.org61e47bf2007-11-30 23:43:45 +00002124
darin@apple.com6b652e82007-11-30 18:54:34 +00002125 POSSESSIVE: /* Test for possessive quantifier */
eric@webkit.org61e47bf2007-11-30 23:43:45 +00002126 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002127 ptr++;
eric@webkit.orgc83d57e2007-11-29 11:28:29 +00002128 length += 2 + 2 * LINK_SIZE; /* Allow for atomic brackets */
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002129 }
2130 continue;
2131
darin@apple.com6b652e82007-11-30 18:54:34 +00002132 /* An alternation contains an offset to the next branch or ket. If any ims
2133 options changed in the previous branch(es), and/or if we are in a
2134 lookbehind assertion, extra space will be needed at the start of the
2135 branch. This is handled by branch_extra. */
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002136
eric@webkit.org70d2b412007-11-29 11:39:12 +00002137 case '|':
darin@apple.com8ad18ea2007-12-07 19:59:58 +00002138 if (brastackptr == 0)
2139 cd.needOuterBracket = true;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002140 length += 1 + LINK_SIZE + branch_extra;
2141 continue;
2142
darin@apple.com6b652e82007-11-30 18:54:34 +00002143 /* A character class uses 33 characters provided that all the character
2144 values are less than 256. Otherwise, it uses a bit map for low valued
2145 characters, and individual items for others. Don't worry about character
2146 types that aren't allowed in classes - they'll get picked up during the
2147 compile. A character class that contains only one single-byte character
2148 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
2149 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002150
darin@apple.com6b652e82007-11-30 18:54:34 +00002151 case '[': {
eric@webkit.org70d2b412007-11-29 11:39:12 +00002152 int class_optcount;
eric@webkit.org90a7a132007-11-29 11:10:33 +00002153 if (*(++ptr) == '^') {
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002154 class_optcount = 10; /* Greater than one */
2155 ptr++;
2156 }
eric@webkit.org90a7a132007-11-29 11:10:33 +00002157 else
2158 class_optcount = 0;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002159
eric@webkit.org70d2b412007-11-29 11:39:12 +00002160 bool class_utf8 = false;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002161
eric@webkit.org90a7a132007-11-29 11:10:33 +00002162 for (; ptr < patternEnd && *ptr != ']'; ++ptr) {
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002163 /* Check for escapes */
2164
eric@webkit.org90a7a132007-11-29 11:10:33 +00002165 if (*ptr == '\\') {
darin@apple.comf9c8dd52008-01-03 06:39:50 +00002166 c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, true);
eric@webkit.org90a7a132007-11-29 11:10:33 +00002167 if (errorcode != 0)
eric@webkit.org70d2b412007-11-29 11:39:12 +00002168 return -1;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002169
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002170 /* Handle escapes that turn into characters */
2171
eric@webkit.org90a7a132007-11-29 11:10:33 +00002172 if (c >= 0)
2173 goto NON_SPECIAL_CHARACTER;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002174
2175 /* Escapes that are meta-things. The normal ones just affect the
2176 bit map, but Unicode properties require an XCLASS extended item. */
2177
2178 else
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002179 class_optcount = 10; /* \d, \s etc; make sure > 1 */
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002180 }
2181
2182 /* Anything else increments the possible optimization count. We have to
2183 detect ranges here so that we can compute the number of extra ranges for
2184 caseless wide characters when UCP support is available. If there are wide
2185 characters, we are going to have to use an XCLASS, even for single
2186 characters. */
2187
eric@webkit.org90a7a132007-11-29 11:10:33 +00002188 else {
darin@apple.com6b652e82007-11-30 18:54:34 +00002189 c = *ptr;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002190
2191 /* Come here from handling \ above when it escapes to a char value */
2192
2193 NON_SPECIAL_CHARACTER:
2194 class_optcount++;
2195
eric@webkit.org3c683512007-11-29 11:24:39 +00002196 int d = -1;
eric@webkit.org61e47bf2007-11-30 23:43:45 +00002197 if (safelyCheckNextChar(ptr, patternEnd, '-')) {
eric@webkit.org64029102007-11-29 11:06:57 +00002198 UChar const *hyptr = ptr++;
eric@webkit.org61e47bf2007-11-30 23:43:45 +00002199 if (safelyCheckNextChar(ptr, patternEnd, '\\')) {
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002200 ptr++;
darin@apple.comf9c8dd52008-01-03 06:39:50 +00002201 d = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, true);
eric@webkit.org64029102007-11-29 11:06:57 +00002202 if (errorcode != 0)
eric@webkit.org70d2b412007-11-29 11:39:12 +00002203 return -1;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002204 }
eric@webkit.org61e47bf2007-11-30 23:43:45 +00002205 else if ((ptr + 1 < patternEnd) && ptr[1] != ']')
darin@apple.com6b652e82007-11-30 18:54:34 +00002206 d = *++ptr;
eric@webkit.org64029102007-11-29 11:06:57 +00002207 if (d < 0)
2208 ptr = hyptr; /* go back to hyphen as data */
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002209 }
2210
2211 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
2212 127 for caseless matching, we will need to use an XCLASS. */
2213
eric@webkit.org64029102007-11-29 11:06:57 +00002214 if (d >= 0) {
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002215 class_optcount = 10; /* Ensure > 1 */
eric@webkit.org64029102007-11-29 11:06:57 +00002216 if (d < c) {
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002217 errorcode = ERR8;
eric@webkit.org70d2b412007-11-29 11:39:12 +00002218 return -1;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002219 }
2220
eric@webkit.org64029102007-11-29 11:06:57 +00002221 if ((d > 255 || (ignoreCase && d > 127))) {
darin@apple.com28e8eb92007-12-17 04:19:25 +00002222 unsigned char buffer[6];
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002223 if (!class_utf8) /* Allow for XCLASS overhead */
2224 {
2225 class_utf8 = true;
2226 length += LINK_SIZE + 2;
2227 }
2228
2229 /* If we have UCP support, find out how many extra ranges are
2230 needed to map the other case of characters within this range. We
2231 have to mimic the range optimization here, because extending the
eric@webkit.org32f7c432007-11-29 11:12:11 +00002232 range upwards might push d over a boundary that makes it use
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002233 another byte in the UTF-8 representation. */
2234
eric@webkit.orga28f14e2007-11-29 11:05:07 +00002235 if (ignoreCase) {
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002236 int occ, ocd;
2237 int cc = c;
2238 int origd = d;
darin@apple.com28e8eb92007-12-17 04:19:25 +00002239 while (getOthercaseRange(&cc, origd, &occ, &ocd)) {
eric@webkit.org32f7c432007-11-29 11:12:11 +00002240 if (occ >= c && ocd <= d)
2241 continue; /* Skip embedded */
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002242
2243 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2244 { /* if there is overlap, */
2245 c = occ; /* noting that if occ < c */
2246 continue; /* we can't have ocd > d */
2247 } /* because a subrange is */
2248 if (ocd > d && occ <= d + 1) /* always shorter than */
2249 { /* the basic range. */
2250 d = ocd;
2251 continue;
2252 }
2253
2254 /* An extra item is needed */
2255
darin@apple.com28e8eb92007-12-17 04:19:25 +00002256 length += 1 + encodeUTF8(occ, buffer) +
2257 ((occ == ocd) ? 0 : encodeUTF8(ocd, buffer));
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002258 }
2259 }
2260
2261 /* The length of the (possibly extended) range */
2262
darin@apple.com28e8eb92007-12-17 04:19:25 +00002263 length += 1 + encodeUTF8(c, buffer) + encodeUTF8(d, buffer);
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002264 }
2265
2266 }
2267
2268 /* We have a single character. There is nothing to be done unless we
2269 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
2270 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
2271 support. */
2272
eric@webkit.orga28f14e2007-11-29 11:05:07 +00002273 else {
2274 if ((c > 255 || (ignoreCase && c > 127))) {
darin@apple.com28e8eb92007-12-17 04:19:25 +00002275 unsigned char buffer[6];
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002276 class_optcount = 10; /* Ensure > 1 */
2277 if (!class_utf8) /* Allow for XCLASS overhead */
2278 {
2279 class_utf8 = true;
2280 length += LINK_SIZE + 2;
2281 }
darin@apple.com28e8eb92007-12-17 04:19:25 +00002282 length += (ignoreCase ? 2 : 1) * (1 + encodeUTF8(c, buffer));
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002283 }
2284 }
2285 }
2286 }
2287
eric@webkit.org70d2b412007-11-29 11:39:12 +00002288 if (ptr >= patternEnd) { /* Missing terminating ']' */
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002289 errorcode = ERR6;
eric@webkit.org70d2b412007-11-29 11:39:12 +00002290 return -1;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002291 }
2292
darin@apple.coma7c53852007-12-06 20:24:52 +00002293 /* We can optimize when there was only one optimizable character.
2294 Note that this does not detect the case of a negated single character.
2295 In that case we do an incorrect length computation, but it's not a serious
2296 problem because the computed length is too large rather than too small. */
2297
eric@webkit.orga28f14e2007-11-29 11:05:07 +00002298 if (class_optcount == 1)
darin@apple.coma7c53852007-12-06 20:24:52 +00002299 goto NORMAL_CHAR;
2300
2301 /* Here, we handle repeats for the class opcodes. */
2302 {
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002303 length += 33;
2304
2305 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
2306 we also need extra for wrapping the whole thing in a sub-pattern. */
2307
darin@apple.com28e8eb92007-12-17 04:19:25 +00002308 if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRepeat(ptr + 2, patternEnd)) {
2309 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
eric@webkit.org70d2b412007-11-29 11:39:12 +00002310 if (errorcode != 0)
2311 return -1;
2312 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
2313 (minRepeats == 1 && maxRepeats == -1))
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002314 length++;
eric@webkit.orga28f14e2007-11-29 11:05:07 +00002315 else
2316 length += 5;
eric@webkit.org61e47bf2007-11-30 23:43:45 +00002317 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002318 ptr++;
eric@webkit.orgc83d57e2007-11-29 11:28:29 +00002319 length += 2 + 2 * LINK_SIZE;
eric@webkit.org61e47bf2007-11-30 23:43:45 +00002320 } else if (safelyCheckNextChar(ptr, patternEnd, '?'))
eric@webkit.orga28f14e2007-11-29 11:05:07 +00002321 ptr++;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002322 }
2323 }
2324 continue;
eric@webkit.org70d2b412007-11-29 11:39:12 +00002325 }
darin@apple.com6b652e82007-11-30 18:54:34 +00002326
2327 /* Brackets may be genuine groups or special things */
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002328
darin@apple.com6b652e82007-11-30 18:54:34 +00002329 case '(': {
eric@webkit.org70d2b412007-11-29 11:39:12 +00002330 int branch_newextra = 0;
2331 int bracket_length = 1 + LINK_SIZE;
2332 bool capturing = false;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002333
2334 /* Handle special forms of bracket, which all start (? */
2335
eric@webkit.org61e47bf2007-11-30 23:43:45 +00002336 if (safelyCheckNextChar(ptr, patternEnd, '?')) {
eric@webkit.orga28f14e2007-11-29 11:05:07 +00002337 switch (c = (ptr + 2 < patternEnd ? ptr[2] : 0)) {
darin@apple.com28e8eb92007-12-17 04:19:25 +00002338 /* Non-referencing groups and lookaheads just move the pointer on, and
2339 then behave like a non-special bracket, except that they don't increment
2340 the count of extracting brackets. Ditto for the "once only" bracket,
2341 which is in Perl from version 5.005. */
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002342
2343 case ':':
2344 case '=':
2345 case '!':
2346 ptr += 2;
2347 break;
2348
darin@apple.com28e8eb92007-12-17 04:19:25 +00002349 /* Else loop checking valid options until ) is met. Anything else is an
2350 error. If we are without any brackets, i.e. at top level, the settings
2351 act as if specified in the options, so massage the options immediately.
2352 This is for backward compatibility with Perl 5.004. */
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002353
2354 default:
2355 errorcode = ERR12;
eric@webkit.org70d2b412007-11-29 11:39:12 +00002356 return -1;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002357 }
eric@webkit.org70d2b412007-11-29 11:39:12 +00002358 } else
2359 capturing = 1;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002360
2361 /* Capturing brackets must be counted so we can process escapes in a
2362 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
2363 an additional 3 bytes of memory per capturing bracket. */
2364
eric@webkit.org70d2b412007-11-29 11:39:12 +00002365 if (capturing) {
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002366 bracount++;
eric@webkit.org70d2b412007-11-29 11:39:12 +00002367 if (bracount > EXTRACT_BASIC_MAX)
2368 bracket_length += 3;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002369 }
2370
2371 /* Save length for computing whole length at end if there's a repeat that
2372 requires duplication of the group. Also save the current value of
2373 branch_extra, and start the new group with the new value. If non-zero, this
2374 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
2375
eric@webkit.org70d2b412007-11-29 11:39:12 +00002376 if (brastackptr >= sizeof(brastack)/sizeof(int)) {
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002377 errorcode = ERR17;
eric@webkit.org70d2b412007-11-29 11:39:12 +00002378 return -1;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002379 }
2380
2381 bralenstack[brastackptr] = branch_extra;
2382 branch_extra = branch_newextra;
2383
2384 brastack[brastackptr++] = length;
2385 length += bracket_length;
2386 continue;
eric@webkit.org70d2b412007-11-29 11:39:12 +00002387 }
darin@apple.com6b652e82007-11-30 18:54:34 +00002388
2389 /* Handle ket. Look for subsequent maxRepeats/minRepeats; for certain sets of values we
2390 have to replicate this bracket up to that many times. If brastackptr is
2391 0 this is an unmatched bracket which will generate an error, but take care
2392 not to try to access brastack[-1] when computing the length and restoring
2393 the branch_extra value. */
2394
2395 case ')': {
eric@webkit.org70d2b412007-11-29 11:39:12 +00002396 int duplength;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002397 length += 1 + LINK_SIZE;
eric@webkit.orga28f14e2007-11-29 11:05:07 +00002398 if (brastackptr > 0) {
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002399 duplength = length - brastack[--brastackptr];
2400 branch_extra = bralenstack[brastackptr];
2401 }
eric@webkit.orga28f14e2007-11-29 11:05:07 +00002402 else
2403 duplength = 0;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002404
darin@apple.com28e8eb92007-12-17 04:19:25 +00002405 /* Leave ptr at the final char; for readRepeatCounts this happens
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002406 automatically; for the others we need an increment. */
2407
darin@apple.com28e8eb92007-12-17 04:19:25 +00002408 if ((ptr + 1 < patternEnd) && (c = ptr[1]) == '{' && isCountedRepeat(ptr + 2, patternEnd)) {
2409 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
eric@webkit.org70d2b412007-11-29 11:39:12 +00002410 if (errorcode)
2411 return -1;
2412 } else if (c == '*') {
2413 minRepeats = 0;
2414 maxRepeats = -1;
2415 ptr++;
2416 } else if (c == '+') {
2417 minRepeats = 1;
2418 maxRepeats = -1;
2419 ptr++;
2420 } else if (c == '?') {
2421 minRepeats = 0;
2422 maxRepeats = 1;
2423 ptr++;
2424 } else {
2425 minRepeats = 1;
2426 maxRepeats = 1;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002427 }
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002428
2429 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2430 group, and if the maximum is greater than zero, we have to replicate
2431 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2432 bracket set. */
2433
mrowe@apple.comcb17bd02008-03-28 06:41:17 +00002434 int repeatsLength;
eric@webkit.org70d2b412007-11-29 11:39:12 +00002435 if (minRepeats == 0) {
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002436 length++;
mrowe@apple.comcb17bd02008-03-28 06:41:17 +00002437 if (maxRepeats > 0) {
2438 repeatsLength = multiplyWithOverflowCheck(maxRepeats - 1, duplength + 3 + 2 * LINK_SIZE);
2439 if (repeatsLength < 0) {
2440 errorcode = ERR16;
2441 return -1;
2442 }
2443 length += repeatsLength;
2444 if (length > MAX_PATTERN_SIZE) {
2445 errorcode = ERR16;
2446 return -1;
2447 }
2448 }
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002449 }
2450
2451 /* When the minimum is greater than zero, we have to replicate up to
2452 minval-1 times, with no additions required in the copies. Then, if there
2453 is a limited maximum we have to replicate up to maxval-1 times allowing
2454 for a BRAZERO item before each optional copy and nesting brackets for all
2455 but one of the optional copies. */
2456
eric@webkit.orga28f14e2007-11-29 11:05:07 +00002457 else {
mrowe@apple.comcb17bd02008-03-28 06:41:17 +00002458 repeatsLength = multiplyWithOverflowCheck(minRepeats - 1, duplength);
2459 if (repeatsLength < 0) {
2460 errorcode = ERR16;
2461 return -1;
2462 }
2463 length += repeatsLength;
2464 if (maxRepeats > minRepeats) { /* Need this test as maxRepeats=-1 means no limit */
2465 repeatsLength = multiplyWithOverflowCheck(maxRepeats - minRepeats, duplength + 3 + 2 * LINK_SIZE);
2466 if (repeatsLength < 0) {
2467 errorcode = ERR16;
2468 return -1;
2469 }
2470 length += repeatsLength - (2 + 2 * LINK_SIZE);
2471 }
2472 if (length > MAX_PATTERN_SIZE) {
2473 errorcode = ERR16;
2474 return -1;
2475 }
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002476 }
2477
2478 /* Allow space for once brackets for "possessive quantifier" */
2479
eric@webkit.org61e47bf2007-11-30 23:43:45 +00002480 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002481 ptr++;
eric@webkit.orgc83d57e2007-11-29 11:28:29 +00002482 length += 2 + 2 * LINK_SIZE;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002483 }
2484 continue;
eric@webkit.org70d2b412007-11-29 11:39:12 +00002485 }
darin@apple.com6b652e82007-11-30 18:54:34 +00002486
2487 /* Non-special character. It won't be space or # in extended mode, so it is
2488 always a genuine character. If we are in a \Q...\E sequence, check for the
2489 end; if not, we have a literal. */
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002490
eric@webkit.org70d2b412007-11-29 11:39:12 +00002491 default:
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002492 NORMAL_CHAR:
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002493 length += 2; /* For a one-byte character */
2494 lastitemlength = 1; /* Default length of last item for repeats */
darin@apple.com6b652e82007-11-30 18:54:34 +00002495
eric@webkit.orga28f14e2007-11-29 11:05:07 +00002496 if (c > 127) {
darin@apple.com6b652e82007-11-30 18:54:34 +00002497 int i;
darin@apple.com28e8eb92007-12-17 04:19:25 +00002498 for (i = 0; i < kjs_pcre_utf8_table1_size; i++)
2499 if (c <= kjs_pcre_utf8_table1[i])
darin@apple.com6b652e82007-11-30 18:54:34 +00002500 break;
2501 length += i;
2502 lastitemlength += i;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002503 }
2504
2505 continue;
2506 }
2507 }
2508
2509 length += 2 + LINK_SIZE; /* For final KET and END */
darin@apple.comf9c8dd52008-01-03 06:39:50 +00002510
2511 cd.numCapturingBrackets = bracount;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002512 return length;
2513}
darind7737ab2005-09-09 00:51:07 +00002514
darind7737ab2005-09-09 00:51:07 +00002515/*************************************************
2516* Compile a Regular Expression *
2517*************************************************/
2518
2519/* This function takes a string and returns a pointer to a block of store
2520holding a compiled version of the expression. The original API for this
2521function had no error code return variable; it is retained for backwards
2522compatibility. The new function is given a new name.
2523
2524Arguments:
2525 pattern the regular expression
2526 options various option bits
darin@apple.com51d6b262008-06-15 06:23:50 +00002527 errorCodePtr pointer to error code variable (pcre_compile2() only)
darind7737ab2005-09-09 00:51:07 +00002528 can be NULL if you don't want a code value
darin@apple.com51d6b262008-06-15 06:23:50 +00002529 errorPtr pointer to pointer to error text
darind7737ab2005-09-09 00:51:07 +00002530 erroroffset ptr offset in pattern where error was detected
2531 tables pointer to character tables or NULL
2532
2533Returns: pointer to compiled data block, or NULL on error,
darin@apple.com51d6b262008-06-15 06:23:50 +00002534 with errorPtr and erroroffset set
darind7737ab2005-09-09 00:51:07 +00002535*/
2536
darin@apple.com51d6b262008-06-15 06:23:50 +00002537static inline JSRegExp* returnError(ErrorCode errorcode, const char** errorPtr)
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002538{
darin@apple.com51d6b262008-06-15 06:23:50 +00002539 *errorPtr = errorText(errorcode);
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002540 return 0;
2541}
2542
eric@webkit.org64029102007-11-29 11:06:57 +00002543JSRegExp* jsRegExpCompile(const UChar* pattern, int patternLength,
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002544 JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption multiline,
darin@apple.com51d6b262008-06-15 06:23:50 +00002545 unsigned* numSubpatterns, const char** errorPtr)
darind7737ab2005-09-09 00:51:07 +00002546{
darin@apple.com51d6b262008-06-15 06:23:50 +00002547 /* We can't pass back an error message if errorPtr is NULL; I guess the best we
eric@webkit.org487902b2007-11-15 02:00:15 +00002548 can do is just return NULL, but we can set a code value if there is a code pointer. */
darin@apple.com51d6b262008-06-15 06:23:50 +00002549 if (!errorPtr)
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002550 return 0;
darin@apple.com51d6b262008-06-15 06:23:50 +00002551 *errorPtr = NULL;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002552
darin@apple.com8ad18ea2007-12-07 19:59:58 +00002553 CompileData cd;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002554
2555 ErrorCode errorcode = ERR0;
darin@apple.comf9c8dd52008-01-03 06:39:50 +00002556 /* Call this once just to count the brackets. */
2557 calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
2558 /* Call it again to compute the length. */
2559 int length = calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002560 if (errorcode)
darin@apple.com51d6b262008-06-15 06:23:50 +00002561 return returnError(errorcode, errorPtr);
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002562
2563 if (length > MAX_PATTERN_SIZE)
darin@apple.com51d6b262008-06-15 06:23:50 +00002564 return returnError(ERR16, errorPtr);
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002565
eric@webkit.org386a6022007-11-29 11:04:06 +00002566 size_t size = length + sizeof(JSRegExp);
darin@apple.comcff41592008-06-28 22:21:20 +00002567#if REGEXP_HISTOGRAM
2568 size_t stringOffset = (size + sizeof(UChar) - 1) / sizeof(UChar) * sizeof(UChar);
2569 size = stringOffset + patternLength * sizeof(UChar);
2570#endif
eric@webkit.org386a6022007-11-29 11:04:06 +00002571 JSRegExp* re = reinterpret_cast<JSRegExp*>(new char[size]);
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002572
2573 if (!re)
darin@apple.com51d6b262008-06-15 06:23:50 +00002574 return returnError(ERR13, errorPtr);
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002575
eric@webkit.org8a344672007-11-29 11:38:26 +00002576 re->options = (ignoreCase ? IgnoreCaseOption : 0) | (multiline ? MatchAcrossMultipleLinesOption : 0);
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002577
2578 /* The starting points of the name/number translation table and of the code are
2579 passed around in the compile data block. */
2580
darin@apple.com28e8eb92007-12-17 04:19:25 +00002581 const unsigned char* codeStart = (const unsigned char*)(re + 1);
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002582
2583 /* Set up a starting, non-extracting bracket, then compile the expression. On
2584 error, errorcode will be set non-zero, so we don't need to look at the result
2585 of the function here. */
2586
eric@webkit.org64029102007-11-29 11:06:57 +00002587 const UChar* ptr = (const UChar*)pattern;
2588 const UChar* patternEnd = pattern + patternLength;
darin@apple.com28e8eb92007-12-17 04:19:25 +00002589 unsigned char* code = (unsigned char*)codeStart;
darin@apple.com51d6b262008-06-15 06:23:50 +00002590 int firstByte, reqByte;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002591 int bracketCount = 0;
darin@apple.com8ad18ea2007-12-07 19:59:58 +00002592 if (!cd.needOuterBracket)
darin@apple.com51d6b262008-06-15 06:23:50 +00002593 compileBranch(re->options, &bracketCount, &code, &ptr, patternEnd, &errorcode, &firstByte, &reqByte, cd);
darin@apple.com8ad18ea2007-12-07 19:59:58 +00002594 else {
2595 *code = OP_BRA;
darin@apple.com51d6b262008-06-15 06:23:50 +00002596 compileBracket(re->options, &bracketCount, &code, &ptr, patternEnd, &errorcode, 0, &firstByte, &reqByte, cd);
darin@apple.com8ad18ea2007-12-07 19:59:58 +00002597 }
darin@apple.com51d6b262008-06-15 06:23:50 +00002598 re->topBracket = bracketCount;
2599 re->topBackref = cd.topBackref;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002600
2601 /* If not reached end of pattern on success, there's an excess bracket. */
2602
2603 if (errorcode == 0 && ptr < patternEnd)
2604 errorcode = ERR10;
2605
2606 /* Fill in the terminating state and check for disastrous overflow, but
2607 if debugging, leave the test till after things are printed out. */
2608
2609 *code++ = OP_END;
darin@apple.come410bdf2007-12-04 19:08:28 +00002610
darin@apple.comd5cbb632007-12-10 06:22:04 +00002611 ASSERT(code - codeStart <= length);
2612 if (code - codeStart > length)
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002613 errorcode = ERR7;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002614
2615 /* Give an error if there's back reference to a non-existent capturing
2616 subpattern. */
2617
darin@apple.com51d6b262008-06-15 06:23:50 +00002618 if (re->topBackref > re->topBracket)
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002619 errorcode = ERR15;
2620
2621 /* Failed to compile, or error while post-processing */
2622
2623 if (errorcode != ERR0) {
2624 delete [] reinterpret_cast<char*>(re);
darin@apple.com51d6b262008-06-15 06:23:50 +00002625 return returnError(errorcode, errorPtr);
darind7737ab2005-09-09 00:51:07 +00002626 }
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002627
2628 /* If the anchored option was not passed, set the flag if we can determine that
2629 the pattern is anchored by virtue of ^ characters or \A or anything else (such
2630 as starting with .* when DOTALL is set).
2631
2632 Otherwise, if we know what the first character has to be, save it, because that
2633 speeds up unanchored matches no end. If not, see if we can set the
eric@webkit.org8a344672007-11-29 11:38:26 +00002634 UseMultiLineFirstByteOptimizationOption flag. This is helpful for multiline matches when all branches
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002635 start with ^. and also when all branches start with .* for non-DOTALL matches.
2636 */
2637
darin@apple.comd5cbb632007-12-10 06:22:04 +00002638 if (cd.needOuterBracket ? bracketIsAnchored(codeStart) : branchIsAnchored(codeStart))
eric@webkit.org8a344672007-11-29 11:38:26 +00002639 re->options |= IsAnchoredOption;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002640 else {
darin@apple.com51d6b262008-06-15 06:23:50 +00002641 if (firstByte < 0) {
2642 firstByte = (cd.needOuterBracket
darin@apple.com4c77b712007-12-11 06:08:24 +00002643 ? bracketFindFirstAssertedCharacter(codeStart, false)
2644 : branchFindFirstAssertedCharacter(codeStart, false))
2645 | ((re->options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0);
2646 }
darin@apple.com51d6b262008-06-15 06:23:50 +00002647 if (firstByte >= 0) {
2648 int ch = firstByte & 255;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002649 if (ch < 127) {
darin@apple.com51d6b262008-06-15 06:23:50 +00002650 re->firstByte = ((firstByte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? ch : firstByte;
eric@webkit.org8a344672007-11-29 11:38:26 +00002651 re->options |= UseFirstByteOptimizationOption;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002652 }
darin@apple.comd5cbb632007-12-10 06:22:04 +00002653 } else {
2654 if (cd.needOuterBracket ? bracketNeedsLineStart(codeStart, 0, cd.backrefMap) : branchNeedsLineStart(codeStart, 0, cd.backrefMap))
2655 re->options |= UseMultiLineFirstByteOptimizationOption;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002656 }
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002657 }
2658
2659 /* For an anchored pattern, we use the "required byte" only if it follows a
2660 variable length item in the regex. Remove the caseless flag for non-caseable
2661 bytes. */
2662
darin@apple.com51d6b262008-06-15 06:23:50 +00002663 if (reqByte >= 0 && (!(re->options & IsAnchoredOption) || (reqByte & REQ_VARY))) {
2664 int ch = reqByte & 255;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002665 if (ch < 127) {
darin@apple.com51d6b262008-06-15 06:23:50 +00002666 re->reqByte = ((reqByte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? (reqByte & ~REQ_IGNORE_CASE) : reqByte;
eric@webkit.org8a344672007-11-29 11:38:26 +00002667 re->options |= UseRequiredByteOptimizationOption;
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002668 }
2669 }
2670
darin@apple.comcff41592008-06-28 22:21:20 +00002671#if REGEXP_HISTOGRAM
2672 re->stringOffset = stringOffset;
2673 re->stringLength = patternLength;
2674 memcpy(reinterpret_cast<char*>(re) + stringOffset, pattern, patternLength * 2);
2675#endif
2676
eric@webkit.org1dab6f52007-11-15 01:17:31 +00002677 if (numSubpatterns)
darin@apple.com51d6b262008-06-15 06:23:50 +00002678 *numSubpatterns = re->topBracket;
eric@webkit.orga28f14e2007-11-29 11:05:07 +00002679 return re;
darind7737ab2005-09-09 00:51:07 +00002680}
2681
darin@apple.coma7c3b872007-11-04 05:22:44 +00002682void jsRegExpFree(JSRegExp* re)
2683{
darin@apple.comee752e72007-11-11 18:56:13 +00002684 delete [] reinterpret_cast<char*>(re);
darin@apple.coma7c3b872007-11-04 05:22:44 +00002685}