| /* |
| * Copyright (C) 2009 Apple Inc. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
| * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
| * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #ifndef RegexParser_h |
| #define RegexParser_h |
| |
| #include <wtf/Platform.h> |
| |
| #if ENABLE(YARR) |
| |
| #include <UString.h> |
| #include <wtf/ASCIICType.h> |
| #include <wtf/unicode/Unicode.h> |
| #include <limits.h> |
| |
| namespace JSC { namespace Yarr { |
| |
// Identifies the predefined character classes that the parser reports to a
// delegate through atomBuiltInCharacterClass() / atomCharacterClassBuiltIn().
enum BuiltInCharacterClassID {
    DigitClassID,   // reported for the d / D escapes
    SpaceClassID,   // reported for the s / S escapes
    WordClassID,    // reported for the w / W escapes
    NewlineClassID  // reported, inverted, for the '.' atom
};
| |
| // The Parser class should not be used directly - only via the Yarr::parse() method. |
| template<class Delegate> |
| class Parser { |
| private: |
| template<class FriendDelegate> |
| friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit); |
| |
// Reasons a parse can fail. The numeric value of each enumerator is used as
// an index into the errorMessages table in parse(), so the order here and the
// order of that table must stay in step.
enum ErrorCode {
    NoError = 0,
    PatternTooLarge,
    QuantifierOutOfOrder,
    QuantifierWithoutAtom,
    MissingParentheses,
    ParenthesesUnmatched,
    ParenthesesTypeInvalid,
    CharacterClassUnmatched,
    CharacterClassOutOfOrder,
    EscapeUnterminated,
    NumberOfErrorCodes // count of the codes above; sizes the message table
};
| |
| /* |
| * CharacterClassParserDelegate: |
| * |
| * The class CharacterClassParserDelegate is used in the parsing of character |
| * classes. This class handles detection of character ranges. This class |
| * implements enough of the delegate interface such that it can be passed to |
| * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused |
| * to perform the parsing of escape characters in character sets. |
| */ |
| class CharacterClassParserDelegate { |
| public: |
| CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) |
| : m_delegate(delegate) |
| , m_err(err) |
| , m_state(empty) |
| { |
| } |
| |
| /* |
| * begin(): |
| * |
| * Called at beginning of construction. |
| */ |
| void begin(bool invert) |
| { |
| m_delegate.atomCharacterClassBegin(invert); |
| } |
| |
| /* |
| * atomPatternCharacterUnescaped(): |
| * |
| * This method is called directly from parseCharacterClass(), to report a new |
| * pattern character token. This method differs from atomPatternCharacter(), |
| * which will be called from parseEscape(), since a hypen provided via this |
| * method may be indicating a character range, but a hyphen parsed by |
| * parseEscape() cannot be interpreted as doing so. |
| */ |
| void atomPatternCharacterUnescaped(UChar ch) |
| { |
| switch (m_state) { |
| case empty: |
| m_character = ch; |
| m_state = cachedCharacter; |
| break; |
| |
| case cachedCharacter: |
| if (ch == '-') |
| m_state = cachedCharacterHyphen; |
| else { |
| m_delegate.atomCharacterClassAtom(m_character); |
| m_character = ch; |
| } |
| break; |
| |
| case cachedCharacterHyphen: |
| if (ch >= m_character) |
| m_delegate.atomCharacterClassRange(m_character, ch); |
| else |
| m_err = CharacterClassOutOfOrder; |
| m_state = empty; |
| } |
| } |
| |
| /* |
| * atomPatternCharacter(): |
| * |
| * Adds a pattern character, called by parseEscape(), as such will not |
| * interpret a hyphen as indicating a character range. |
| */ |
| void atomPatternCharacter(UChar ch) |
| { |
| // Flush if a character is already pending to prevent the |
| // hyphen from begin interpreted as indicating a range. |
| if((ch == '-') && (m_state == cachedCharacter)) |
| flush(); |
| |
| atomPatternCharacterUnescaped(ch); |
| } |
| |
| /* |
| * atomBuiltInCharacterClass(): |
| * |
| * Adds a built-in character class, called by parseEscape(). |
| */ |
| void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) |
| { |
| flush(); |
| m_delegate.atomCharacterClassBuiltIn(classID, invert); |
| } |
| |
| /* |
| * end(): |
| * |
| * Called at end of construction. |
| */ |
| void end() |
| { |
| flush(); |
| m_delegate.atomCharacterClassEnd(); |
| } |
| |
| // parseEscape() should never call these delegate methods when |
| // invoked with inCharacterClass set. |
| void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); } |
| void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); } |
| |
| private: |
| void flush() |
| { |
| if (m_state != empty) // either cachedCharacter or cachedCharacterHyphen |
| m_delegate.atomCharacterClassAtom(m_character); |
| if (m_state == cachedCharacterHyphen) |
| m_delegate.atomCharacterClassAtom('-'); |
| m_state = empty; |
| } |
| |
| Delegate& m_delegate; |
| ErrorCode& m_err; |
| enum CharacterClassConstructionState { |
| empty, |
| cachedCharacter, |
| cachedCharacterHyphen, |
| } m_state; |
| UChar m_character; |
| }; |
| |
| Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit) |
| : m_delegate(delegate) |
| , m_backReferenceLimit(backReferenceLimit) |
| , m_err(NoError) |
| , m_data(pattern.data()) |
| , m_size(pattern.size()) |
| , m_index(0) |
| , m_parenthesesNestingDepth(0) |
| { |
| } |
| |
| /* |
| * parseEscape(): |
| * |
| * Helper for parseTokens() AND parseCharacterClass(). |
| * Unlike the other parser methods, this function does not report tokens |
| * directly to the member delegate (m_delegate), instead tokens are |
| * emitted to the delegate provided as an argument. In the case of atom |
| * escapes, parseTokens() will call parseEscape() passing m_delegate as |
| * an argument, and as such the escape will be reported to the delegate. |
| * |
| * However this method may also be used by parseCharacterClass(), in which |
| * case a CharacterClassParserDelegate will be passed as the delegate that |
| * tokens should be added to. A boolean flag is also provided to indicate |
| * whether that an escape in a CharacterClass is being parsed (some parsing |
| * rules change in this context). |
| * |
| * The boolean value returned by this method indicates whether the token |
| * parsed was an atom (outside of a characted class \b and \B will be |
| * interpreted as assertions). |
| */ |
| template<bool inCharacterClass, class EscapeDelegate> |
| bool parseEscape(EscapeDelegate& delegate) |
| { |
| ASSERT(!m_err); |
| ASSERT(peek() == '\\'); |
| consume(); |
| |
| if (atEndOfPattern()) { |
| m_err = EscapeUnterminated; |
| return false; |
| } |
| |
| switch (peek()) { |
| // Assertions |
| case 'b': |
| consume(); |
| if (inCharacterClass) |
| delegate.atomPatternCharacter('\b'); |
| else { |
| delegate.assertionWordBoundary(false); |
| return false; |
| } |
| break; |
| case 'B': |
| consume(); |
| if (inCharacterClass) |
| delegate.atomPatternCharacter('B'); |
| else { |
| delegate.assertionWordBoundary(true); |
| return false; |
| } |
| break; |
| |
| // CharacterClassEscape |
| case 'd': |
| consume(); |
| delegate.atomBuiltInCharacterClass(DigitClassID, false); |
| break; |
| case 's': |
| consume(); |
| delegate.atomBuiltInCharacterClass(SpaceClassID, false); |
| break; |
| case 'w': |
| consume(); |
| delegate.atomBuiltInCharacterClass(WordClassID, false); |
| break; |
| case 'D': |
| consume(); |
| delegate.atomBuiltInCharacterClass(DigitClassID, true); |
| break; |
| case 'S': |
| consume(); |
| delegate.atomBuiltInCharacterClass(SpaceClassID, true); |
| break; |
| case 'W': |
| consume(); |
| delegate.atomBuiltInCharacterClass(WordClassID, true); |
| break; |
| |
| // DecimalEscape |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| case '8': |
| case '9': { |
| // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. |
| // First, try to parse this as backreference. |
| if (!inCharacterClass) { |
| ParseState state = saveState(); |
| |
| unsigned backReference = consumeNumber(); |
| if (backReference <= m_backReferenceLimit) { |
| delegate.atomBackReference(backReference); |
| break; |
| } |
| |
| restoreState(state); |
| } |
| |
| // Not a backreference, and not octal. |
| if (peek() >= '8') { |
| delegate.atomPatternCharacter('\\'); |
| break; |
| } |
| |
| // Fall-through to handle this as an octal escape. |
| } |
| |
| // Octal escape |
| case '0': |
| delegate.atomPatternCharacter(consumeOctal()); |
| break; |
| |
| // ControlEscape |
| case 'f': |
| consume(); |
| delegate.atomPatternCharacter('\f'); |
| break; |
| case 'n': |
| consume(); |
| delegate.atomPatternCharacter('\n'); |
| break; |
| case 'r': |
| consume(); |
| delegate.atomPatternCharacter('\r'); |
| break; |
| case 't': |
| consume(); |
| delegate.atomPatternCharacter('\t'); |
| break; |
| case 'v': |
| consume(); |
| delegate.atomPatternCharacter('\v'); |
| break; |
| |
| // ControlLetter |
| case 'c': { |
| ParseState state = saveState(); |
| consume(); |
| if (!atEndOfPattern()) { |
| int control = consume(); |
| |
| // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. |
| if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) { |
| delegate.atomPatternCharacter(control & 0x1f); |
| break; |
| } |
| } |
| restoreState(state); |
| delegate.atomPatternCharacter('\\'); |
| break; |
| } |
| |
| // HexEscape |
| case 'x': { |
| consume(); |
| int x = tryConsumeHex(2); |
| if (x == -1) |
| delegate.atomPatternCharacter('x'); |
| else |
| delegate.atomPatternCharacter(x); |
| break; |
| } |
| |
| // UnicodeEscape |
| case 'u': { |
| consume(); |
| int u = tryConsumeHex(4); |
| if (u == -1) |
| delegate.atomPatternCharacter('u'); |
| else |
| delegate.atomPatternCharacter(u); |
| break; |
| } |
| |
| // IdentityEscape |
| default: |
| delegate.atomPatternCharacter(consume()); |
| } |
| |
| return true; |
| } |
| |
| /* |
| * parseAtomEscape(), parseCharacterClassEscape(): |
| * |
| * These methods alias to parseEscape(). |
| */ |
| bool parseAtomEscape() |
| { |
| return parseEscape<false>(m_delegate); |
| } |
| void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) |
| { |
| parseEscape<true>(delegate); |
| } |
| |
| /* |
| * parseCharacterClass(): |
| * |
| * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape) |
| * to an instance of CharacterClassParserDelegate, to describe the character class to the |
| * delegate. |
| */ |
| void parseCharacterClass() |
| { |
| ASSERT(!m_err); |
| ASSERT(peek() == '['); |
| consume(); |
| |
| CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); |
| |
| characterClassConstructor.begin(tryConsume('^')); |
| |
| while (!atEndOfPattern()) { |
| switch (peek()) { |
| case ']': |
| consume(); |
| characterClassConstructor.end(); |
| return; |
| |
| case '\\': |
| parseCharacterClassEscape(characterClassConstructor); |
| break; |
| |
| default: |
| characterClassConstructor.atomPatternCharacterUnescaped(consume()); |
| } |
| |
| if (m_err) |
| return; |
| } |
| |
| m_err = CharacterClassUnmatched; |
| } |
| |
| /* |
| * parseParenthesesBegin(): |
| * |
| * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. |
| */ |
| void parseParenthesesBegin() |
| { |
| ASSERT(!m_err); |
| ASSERT(peek() == '('); |
| consume(); |
| |
| if (tryConsume('?')) { |
| if (atEndOfPattern()) { |
| m_err = ParenthesesTypeInvalid; |
| return; |
| } |
| |
| switch (consume()) { |
| case ':': |
| m_delegate.atomParenthesesSubpatternBegin(false); |
| break; |
| |
| case '=': |
| m_delegate.atomParentheticalAssertionBegin(); |
| break; |
| |
| case '!': |
| m_delegate.atomParentheticalAssertionBegin(true); |
| break; |
| |
| default: |
| m_err = ParenthesesTypeInvalid; |
| } |
| } else |
| m_delegate.atomParenthesesSubpatternBegin(); |
| |
| ++m_parenthesesNestingDepth; |
| } |
| |
| /* |
| * parseParenthesesEnd(): |
| * |
| * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). |
| */ |
| void parseParenthesesEnd() |
| { |
| ASSERT(!m_err); |
| ASSERT(peek() == ')'); |
| consume(); |
| |
| if (m_parenthesesNestingDepth > 0) |
| m_delegate.atomParenthesesEnd(); |
| else |
| m_err = ParenthesesUnmatched; |
| |
| --m_parenthesesNestingDepth; |
| } |
| |
| /* |
| * parseQuantifier(): |
| * |
| * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. |
| */ |
| void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) |
| { |
| ASSERT(!m_err); |
| ASSERT(min <= max); |
| |
| if (lastTokenWasAnAtom) |
| m_delegate.quantifyAtom(min, max, !tryConsume('?')); |
| else |
| m_err = QuantifierWithoutAtom; |
| } |
| |
| /* |
| * parseTokens(): |
| * |
| * This method loops over the input pattern reporting tokens to the delegate. |
| * The method returns when a parse error is detected, or the end of the pattern |
| * is reached. One piece of state is tracked around the loop, which is whether |
| * the last token passed to the delegate was an atom (this is necessary to detect |
| * a parse error when a quantifier provided without an atom to quantify). |
| */ |
| void parseTokens() |
| { |
| bool lastTokenWasAnAtom = false; |
| |
| while (!atEndOfPattern()) { |
| switch (peek()) { |
| case '|': |
| consume(); |
| m_delegate.disjunction(); |
| lastTokenWasAnAtom = false; |
| break; |
| |
| case '(': |
| parseParenthesesBegin(); |
| lastTokenWasAnAtom = false; |
| break; |
| |
| case ')': |
| parseParenthesesEnd(); |
| lastTokenWasAnAtom = true; |
| break; |
| |
| case '^': |
| consume(); |
| m_delegate.assertionBOL(); |
| lastTokenWasAnAtom = false; |
| break; |
| |
| case '$': |
| consume(); |
| m_delegate.assertionEOL(); |
| lastTokenWasAnAtom = false; |
| break; |
| |
| case '.': |
| consume(); |
| m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); |
| lastTokenWasAnAtom = true; |
| break; |
| |
| case '[': |
| parseCharacterClass(); |
| lastTokenWasAnAtom = true; |
| break; |
| |
| case '\\': |
| lastTokenWasAnAtom = parseAtomEscape(); |
| break; |
| |
| case '*': |
| consume(); |
| parseQuantifier(lastTokenWasAnAtom, 0, UINT_MAX); |
| lastTokenWasAnAtom = false; |
| break; |
| |
| case '+': |
| consume(); |
| parseQuantifier(lastTokenWasAnAtom, 1, UINT_MAX); |
| lastTokenWasAnAtom = false; |
| break; |
| |
| case '?': |
| consume(); |
| parseQuantifier(lastTokenWasAnAtom, 0, 1); |
| lastTokenWasAnAtom = false; |
| break; |
| |
| case '{': { |
| ParseState state = saveState(); |
| |
| consume(); |
| if (peekIsDigit()) { |
| unsigned min = consumeNumber(); |
| unsigned max = min; |
| |
| if (tryConsume(',')) |
| max = peekIsDigit() ? consumeNumber() : UINT_MAX; |
| |
| if (tryConsume('}')) { |
| if (min <= max) |
| parseQuantifier(lastTokenWasAnAtom, min, max); |
| else |
| m_err = QuantifierOutOfOrder; |
| lastTokenWasAnAtom = false; |
| break; |
| } |
| } |
| |
| restoreState(state); |
| } // if we did not find a complete quantifer, fall through to the default case. |
| |
| default: |
| m_delegate.atomPatternCharacter(consume()); |
| lastTokenWasAnAtom = true; |
| } |
| |
| if (m_err) |
| return; |
| } |
| |
| if (m_parenthesesNestingDepth > 0) |
| m_err = MissingParentheses; |
| } |
| |
| /* |
| * parse(): |
| * |
| * This method calls regexBegin(), calls parseTokens() to parse over the input |
| * patterns, calls regexEnd() or regexError() as appropriate, and converts any |
| * error code to a const char* for a result. |
| */ |
| const char* parse() |
| { |
| m_delegate.regexBegin(); |
| |
| if (m_size > MAX_PATTERN_SIZE) |
| m_err = PatternTooLarge; |
| else |
| parseTokens(); |
| ASSERT(atEndOfPattern() || m_err); |
| |
| if (m_err) |
| m_delegate.regexError(); |
| else |
| m_delegate.regexEnd(); |
| |
| // The order of this array must match the ErrorCode enum. |
| static const char* errorMessages[NumberOfErrorCodes] = { |
| 0, // NoError |
| "regular expression too large", |
| "numbers out of order in {} quantifier", |
| "nothing to repeat", |
| "missing )", |
| "unmatched parentheses", |
| "unrecognized character after (?", |
| "missing terminating ] for character class", |
| "range out of order in character class", |
| "\\ at end of pattern" |
| }; |
| |
| return errorMessages[m_err]; |
| } |
| |
| |
| // Misc helper functions: |
| |
| typedef unsigned ParseState; |
| |
| ParseState saveState() |
| { |
| return m_index; |
| } |
| |
| void restoreState(ParseState state) |
| { |
| m_index = state; |
| } |
| |
| bool atEndOfPattern() |
| { |
| ASSERT(m_index <= m_size); |
| return m_index == m_size; |
| } |
| |
| int peek() |
| { |
| ASSERT(m_index < m_size); |
| return m_data[m_index]; |
| } |
| |
| bool peekIsDigit() |
| { |
| return !atEndOfPattern() && WTF::isASCIIDigit(peek()); |
| } |
| |
| unsigned peekDigit() |
| { |
| ASSERT(peekIsDigit()); |
| return peek() - '0'; |
| } |
| |
| int consume() |
| { |
| ASSERT(m_index < m_size); |
| return m_data[m_index++]; |
| } |
| |
| unsigned consumeDigit() |
| { |
| ASSERT(peekIsDigit()); |
| return consume() - '0'; |
| } |
| |
| unsigned consumeNumber() |
| { |
| unsigned n = consumeDigit(); |
| // check for overflow. |
| for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) { |
| n = newValue; |
| consume(); |
| } |
| return n; |
| } |
| |
| unsigned consumeOctal() |
| { |
| ASSERT(WTF::isASCIIOctalDigit(peek())); |
| |
| unsigned n = consumeDigit(); |
| while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) |
| n = n * 8 + consumeDigit(); |
| return n; |
| } |
| |
| bool tryConsume(UChar ch) |
| { |
| if (atEndOfPattern() || (m_data[m_index] != ch)) |
| return false; |
| ++m_index; |
| return true; |
| } |
| |
| int tryConsumeHex(int count) |
| { |
| ParseState state = saveState(); |
| |
| int n = 0; |
| while (count--) { |
| if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { |
| restoreState(state); |
| return -1; |
| } |
| n = (n << 4) | WTF::toASCIIHexValue(consume()); |
| } |
| return n; |
| } |
| |
| Delegate& m_delegate; |
| unsigned m_backReferenceLimit; |
| ErrorCode m_err; |
| const UChar* m_data; |
| unsigned m_size; |
| unsigned m_index; |
| unsigned m_parenthesesNestingDepth; |
| |
| // Derived by empirical testing of compile time in PCRE and WREC. |
| static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; |
| }; |
| |
| /* |
| * Yarr::parse(): |
| * |
| * The parse method is passed a pattern to be parsed and a delegate upon which |
| * callbacks will be made to record the parsed tokens forming the regex. |
| * Yarr::parse() returns null on success, or a const C string providing an error |
| * message where a parse error occurs. |
| * |
| * The Delegate must implement the following interface: |
| * |
| * void assertionBOL(); |
| * void assertionEOL(); |
| * void assertionWordBoundary(bool invert); |
| * |
| * void atomPatternCharacter(UChar ch); |
| * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); |
| * void atomCharacterClassBegin(bool invert) |
| * void atomCharacterClassAtom(UChar ch) |
| * void atomCharacterClassRange(UChar begin, UChar end) |
| * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) |
| * void atomCharacterClassEnd() |
| * void atomParenthesesSubpatternBegin(bool capture = true); |
| * void atomParentheticalAssertionBegin(bool invert = false); |
| * void atomParenthesesEnd(); |
| * void atomBackReference(unsigned subpatternId); |
| * |
| * void quantifyAtom(unsigned min, unsigned max, bool greedy); |
| * |
| * void disjunction(); |
| * |
| * void regexBegin(); |
| * void regexEnd(); |
| * void regexError(); |
| * |
| * Before any calls recording tokens are made, regexBegin() will be called on the |
| * delegate once. Once parsing is complete, either regexEnd() or regexError() will |
| * be called, as appropriate. |
| * |
| * The regular expression is described by a sequence of assertion*() and atom*() |
| * callbacks to the delegate, describing the terms in the regular expression. |
| * Following an atom a quantifyAtom() call may occur to indicate that the previous |
| * atom should be quantified. In the case of atoms described across multiple |
| * calls (parentheses and character classes) the call to quantifyAtom() will come |
| * after the call to the atom*End() method, never after atom*Begin(). |
| * |
| * Character classes may either be described by a single call to |
| * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. |
| * In the latter case, ...Begin() will be called, followed by a sequence of |
| * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). |
| * |
| * Sequences of atoms and assertions are broken into alternatives via calls to |
| * disjunction(). Assertions, atoms, and disjunctions emitted between calls to |
| * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. |
| * atomParenthesesBegin() is passed a subpatternId. In the case of a regular |
| * capturing subpattern, this will be the subpatternId associated with these |
| * parentheses, and will also by definition be the lowest subpatternId of these |
| * parentheses and of any nested parentheses. The atomParenthesesEnd() method |
| * is passed the subpatternId of the last capturing subexpression nested within |
| * these parentheses. In the case of a capturing subpattern with no nested |
| * capturing subpatterns, the same subpatternId will be passed to the begin and |
| * end functions. In the case of non-capturing subpatterns the subpatternId |
| * passed to the begin method is also the first possible subpatternId that might |
| * be nested within these parentheses. If a set of non-capturing parentheses does |
| * not contain any capturing subpatterns, then the subpatternId passed to begin |
| * will be greater than the subpatternId passed to end. |
| * |
| * NOTE: the subpatternId arguments described in this paragraph do not appear in |
| * the interface listed above, nor in the calls this parser makes |
| * (atomParenthesesSubpatternBegin(), atomParentheticalAssertionBegin() and |
| * atomParenthesesEnd() are invoked without a subpatternId); the paragraph |
| * appears to describe a different revision of the delegate interface. |
| */ |
| |
| template<class Delegate> |
| const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = UINT_MAX) |
| { |
| return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse(); |
| } |
| |
| } } // namespace JSC::Yarr |
| |
| #endif |
| |
| #endif // RegexParser_h |