| /* |
| * Copyright (C) 2009-2019 Apple Inc. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
| * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
| * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #pragma once |
| |
| #include "Yarr.h" |
| #include "YarrPattern.h" |
| #include "YarrUnicodeProperties.h" |
| #include <wtf/ASCIICType.h> |
| #include <wtf/HashSet.h> |
| #include <wtf/Optional.h> |
| #include <wtf/text/StringBuilder.h> |
| #include <wtf/text/WTFString.h> |
| |
| namespace JSC { namespace Yarr { |
| |
| // The Parser class should not be used directly - only via the Yarr::parse() method. |
| template<class Delegate, typename CharType> |
| class Parser { |
| private: |
| template<class FriendDelegate> |
| friend ErrorCode parse(FriendDelegate&, const String& pattern, bool isUnicode, unsigned backReferenceLimit); |
| |
| /* |
| * CharacterClassParserDelegate: |
| * |
| * The class CharacterClassParserDelegate is used in the parsing of character |
| * classes. This class handles detection of character ranges. This class |
| * implements enough of the delegate interface such that it can be passed to |
| * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused |
| * to perform the parsing of escape characters in character sets. |
| */ |
| class CharacterClassParserDelegate { |
| public: |
| CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) |
| : m_delegate(delegate) |
| , m_errorCode(err) |
| , m_state(Empty) |
| , m_character(0) |
| { |
| } |
| |
| /* |
| * begin(): |
| * |
| * Called at beginning of construction. |
| */ |
| void begin(bool invert) |
| { |
| m_delegate.atomCharacterClassBegin(invert); |
| } |
| |
| /* |
| * atomPatternCharacter(): |
| * |
| * This method is called either from parseCharacterClass() (for an unescaped |
| * character in a character class), or from parseEscape(). In the former case |
| * the value true will be passed for the argument 'hyphenIsRange', and in this |
| * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ |
| * is different to /[a\-z]/). |
| */ |
| void atomPatternCharacter(UChar32 ch, bool hyphenIsRange = false) |
| { |
| switch (m_state) { |
| case AfterCharacterClass: |
| // Following a builtin character class we need look out for a hyphen. |
| // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. |
| // If we see a hyphen following a charater class then unlike usual |
| // we'll report it to the delegate immediately, and put ourself into |
| // a poisoned state. Any following calls to add another character or |
| // character class will result in an error. (A hypen following a |
| // character-class is itself valid, but only at the end of a regex). |
| if (hyphenIsRange && ch == '-') { |
| m_delegate.atomCharacterClassAtom('-'); |
| m_state = AfterCharacterClassHyphen; |
| return; |
| } |
| // Otherwise just fall through - cached character so treat this as Empty. |
| FALLTHROUGH; |
| |
| case Empty: |
| m_character = ch; |
| m_state = CachedCharacter; |
| return; |
| |
| case CachedCharacter: |
| if (hyphenIsRange && ch == '-') |
| m_state = CachedCharacterHyphen; |
| else { |
| m_delegate.atomCharacterClassAtom(m_character); |
| m_character = ch; |
| } |
| return; |
| |
| case CachedCharacterHyphen: |
| if (ch < m_character) { |
| m_errorCode = ErrorCode::CharacterClassOutOfOrder; |
| return; |
| } |
| m_delegate.atomCharacterClassRange(m_character, ch); |
| m_state = Empty; |
| return; |
| |
| // See coment in atomBuiltInCharacterClass below. |
| // This too is technically an error, per ECMA-262, and again we |
| // we chose to allow this. Note a subtlely here that while we |
| // diverge from the spec's definition of CharacterRange we do |
| // remain in compliance with the grammar. For example, consider |
| // the expression /[\d-a-z]/. We comply with the grammar in |
| // this case by not allowing a-z to be matched as a range. |
| case AfterCharacterClassHyphen: |
| m_delegate.atomCharacterClassAtom(ch); |
| m_state = Empty; |
| return; |
| } |
| } |
| |
| /* |
| * atomBuiltInCharacterClass(): |
| * |
| * Adds a built-in character class, called by parseEscape(). |
| */ |
| void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) |
| { |
| switch (m_state) { |
| case CachedCharacter: |
| // Flush the currently cached character, then fall through. |
| m_delegate.atomCharacterClassAtom(m_character); |
| FALLTHROUGH; |
| case Empty: |
| case AfterCharacterClass: |
| m_state = AfterCharacterClass; |
| m_delegate.atomCharacterClassBuiltIn(classID, invert); |
| return; |
| |
| // If we hit either of these cases, we have an invalid range that |
| // looks something like /[x-\d]/ or /[\d-\d]/. |
| // According to ECMA-262 this should be a syntax error, but |
| // empirical testing shows this to break teh webz. Instead we |
| // comply with to the ECMA-262 grammar, and assume the grammar to |
| // have matched the range correctly, but tweak our interpretation |
| // of CharacterRange. Effectively we implicitly handle the hyphen |
| // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/. |
| case CachedCharacterHyphen: |
| m_delegate.atomCharacterClassAtom(m_character); |
| m_delegate.atomCharacterClassAtom('-'); |
| FALLTHROUGH; |
| case AfterCharacterClassHyphen: |
| m_delegate.atomCharacterClassBuiltIn(classID, invert); |
| m_state = Empty; |
| return; |
| } |
| } |
| |
| /* |
| * end(): |
| * |
| * Called at end of construction. |
| */ |
| void end() |
| { |
| if (m_state == CachedCharacter) |
| m_delegate.atomCharacterClassAtom(m_character); |
| else if (m_state == CachedCharacterHyphen) { |
| m_delegate.atomCharacterClassAtom(m_character); |
| m_delegate.atomCharacterClassAtom('-'); |
| } |
| m_delegate.atomCharacterClassEnd(); |
| } |
| |
| // parseEscape() should never call these delegate methods when |
| // invoked with inCharacterClass set. |
| NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); } |
| NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); } |
| NO_RETURN_DUE_TO_ASSERT void atomNamedBackReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); } |
| NO_RETURN_DUE_TO_ASSERT bool isValidNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); } |
| NO_RETURN_DUE_TO_ASSERT void atomNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); } |
| |
| private: |
| Delegate& m_delegate; |
| ErrorCode& m_errorCode; |
| enum CharacterClassConstructionState { |
| Empty, |
| CachedCharacter, |
| CachedCharacterHyphen, |
| AfterCharacterClass, |
| AfterCharacterClassHyphen, |
| } m_state; |
| UChar32 m_character; |
| }; |
| |
| Parser(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit) |
| : m_delegate(delegate) |
| , m_backReferenceLimit(backReferenceLimit) |
| , m_data(pattern.characters<CharType>()) |
| , m_size(pattern.length()) |
| , m_isUnicode(isUnicode) |
| { |
| } |
| |
| // The handling of IdentityEscapes is different depending on the unicode flag. |
| // For Unicode patterns, IdentityEscapes only include SyntaxCharacters or '/'. |
| // For non-unicode patterns, most any character can be escaped. |
| bool isIdentityEscapeAnError(int ch) |
| { |
| if (m_isUnicode && !strchr("^$\\.*+?()[]{}|/", ch)) { |
| m_errorCode = ErrorCode::InvalidIdentityEscape; |
| return true; |
| } |
| |
| return false; |
| } |
| |
| /* |
| * parseEscape(): |
| * |
| * Helper for parseTokens() AND parseCharacterClass(). |
| * Unlike the other parser methods, this function does not report tokens |
| * directly to the member delegate (m_delegate), instead tokens are |
| * emitted to the delegate provided as an argument. In the case of atom |
| * escapes, parseTokens() will call parseEscape() passing m_delegate as |
| * an argument, and as such the escape will be reported to the delegate. |
| * |
| * However this method may also be used by parseCharacterClass(), in which |
| * case a CharacterClassParserDelegate will be passed as the delegate that |
| * tokens should be added to. A boolean flag is also provided to indicate |
| * whether that an escape in a CharacterClass is being parsed (some parsing |
| * rules change in this context). |
| * |
| * The boolean value returned by this method indicates whether the token |
| * parsed was an atom (outside of a characted class \b and \B will be |
| * interpreted as assertions). |
| */ |
| template<bool inCharacterClass, class EscapeDelegate> |
| bool parseEscape(EscapeDelegate& delegate) |
| { |
| ASSERT(!hasError(m_errorCode)); |
| ASSERT(peek() == '\\'); |
| consume(); |
| |
| if (atEndOfPattern()) { |
| m_errorCode = ErrorCode::EscapeUnterminated; |
| return false; |
| } |
| |
| switch (peek()) { |
| // Assertions |
| case 'b': |
| consume(); |
| if (inCharacterClass) { |
| if (isIdentityEscapeAnError('b')) |
| break; |
| |
| delegate.atomPatternCharacter('\b'); |
| } else { |
| delegate.assertionWordBoundary(false); |
| return false; |
| } |
| break; |
| case 'B': |
| consume(); |
| if (inCharacterClass) { |
| if (isIdentityEscapeAnError('B')) |
| break; |
| |
| delegate.atomPatternCharacter('B'); |
| } else { |
| delegate.assertionWordBoundary(true); |
| return false; |
| } |
| break; |
| |
| // CharacterClassEscape |
| case 'd': |
| consume(); |
| delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, false); |
| break; |
| case 's': |
| consume(); |
| delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, false); |
| break; |
| case 'w': |
| consume(); |
| delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, false); |
| break; |
| case 'D': |
| consume(); |
| delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, true); |
| break; |
| case 'S': |
| consume(); |
| delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, true); |
| break; |
| case 'W': |
| consume(); |
| delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, true); |
| break; |
| |
| // DecimalEscape |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| case '8': |
| case '9': { |
| // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. |
| // First, try to parse this as backreference. |
| if (!inCharacterClass) { |
| ParseState state = saveState(); |
| |
| unsigned backReference = consumeNumber(); |
| if (backReference <= m_backReferenceLimit) { |
| delegate.atomBackReference(backReference); |
| break; |
| } |
| |
| restoreState(state); |
| |
| if (m_isUnicode) { |
| m_errorCode = ErrorCode::InvalidBackreference; |
| return false; |
| } |
| } |
| |
| // Not a backreference, and not octal. Just a number. |
| if (peek() >= '8') { |
| delegate.atomPatternCharacter(consume()); |
| break; |
| } |
| |
| // Fall-through to handle this as an octal escape. |
| FALLTHROUGH; |
| } |
| |
| // Octal escape |
| case '0': |
| delegate.atomPatternCharacter(consumeOctal()); |
| break; |
| |
| // ControlEscape |
| case 'f': |
| consume(); |
| delegate.atomPatternCharacter('\f'); |
| break; |
| case 'n': |
| consume(); |
| delegate.atomPatternCharacter('\n'); |
| break; |
| case 'r': |
| consume(); |
| delegate.atomPatternCharacter('\r'); |
| break; |
| case 't': |
| consume(); |
| delegate.atomPatternCharacter('\t'); |
| break; |
| case 'v': |
| consume(); |
| delegate.atomPatternCharacter('\v'); |
| break; |
| |
| // ControlLetter |
| case 'c': { |
| ParseState state = saveState(); |
| consume(); |
| if (!atEndOfPattern()) { |
| int control = consume(); |
| |
| // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. |
| if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) { |
| delegate.atomPatternCharacter(control & 0x1f); |
| break; |
| } |
| } |
| restoreState(state); |
| delegate.atomPatternCharacter('\\'); |
| break; |
| } |
| |
| // HexEscape |
| case 'x': { |
| consume(); |
| int x = tryConsumeHex(2); |
| if (x == -1) { |
| if (isIdentityEscapeAnError('x')) |
| break; |
| |
| delegate.atomPatternCharacter('x'); |
| } else |
| delegate.atomPatternCharacter(x); |
| break; |
| } |
| |
| // Named backreference |
| case 'k': { |
| consume(); |
| ParseState state = saveState(); |
| if (!atEndOfPattern() && !inCharacterClass) { |
| if (consume() == '<') { |
| auto groupName = tryConsumeGroupName(); |
| if (groupName) { |
| if (m_captureGroupNames.contains(groupName.value())) { |
| delegate.atomNamedBackReference(groupName.value()); |
| break; |
| } |
| |
| if (delegate.isValidNamedForwardReference(groupName.value())) { |
| delegate.atomNamedForwardReference(groupName.value()); |
| break; |
| } |
| } |
| if (m_isUnicode) { |
| m_errorCode = ErrorCode::InvalidBackreference; |
| break; |
| } |
| } |
| } |
| restoreState(state); |
| delegate.atomPatternCharacter('k'); |
| break; |
| } |
| |
| // Unicode property escapes |
| case 'p': |
| case 'P': { |
| int escapeChar = consume(); |
| |
| if (!m_isUnicode) { |
| if (isIdentityEscapeAnError(escapeChar)) |
| break; |
| delegate.atomPatternCharacter(escapeChar); |
| break; |
| } |
| |
| if (!atEndOfPattern() && peek() == '{') { |
| consume(); |
| auto optClassID = tryConsumeUnicodePropertyExpression(); |
| if (!optClassID) { |
| // tryConsumeUnicodePropertyExpression() will set m_errorCode for a malformed property expression |
| break; |
| } |
| delegate.atomBuiltInCharacterClass(optClassID.value(), escapeChar == 'P'); |
| } else |
| m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; |
| break; |
| } |
| |
| // UnicodeEscape |
| case 'u': { |
| consume(); |
| if (atEndOfPattern()) { |
| if (isIdentityEscapeAnError('u')) |
| break; |
| |
| delegate.atomPatternCharacter('u'); |
| break; |
| } |
| |
| if (m_isUnicode && peek() == '{') { |
| consume(); |
| UChar32 codePoint = 0; |
| do { |
| if (atEndOfPattern() || !isASCIIHexDigit(peek())) { |
| m_errorCode = ErrorCode::InvalidUnicodeEscape; |
| break; |
| } |
| |
| codePoint = (codePoint << 4) | toASCIIHexValue(consume()); |
| |
| if (codePoint > UCHAR_MAX_VALUE) |
| m_errorCode = ErrorCode::InvalidUnicodeEscape; |
| } while (!atEndOfPattern() && peek() != '}'); |
| if (!atEndOfPattern() && peek() == '}') |
| consume(); |
| else if (!hasError(m_errorCode)) |
| m_errorCode = ErrorCode::InvalidUnicodeEscape; |
| if (hasError(m_errorCode)) |
| return false; |
| |
| delegate.atomPatternCharacter(codePoint); |
| break; |
| } |
| int u = tryConsumeHex(4); |
| if (u == -1) { |
| if (isIdentityEscapeAnError('u')) |
| break; |
| |
| delegate.atomPatternCharacter('u'); |
| } else { |
| // If we have the first of a surrogate pair, look for the second. |
| if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= 6) && peek() == '\\') { |
| ParseState state = saveState(); |
| consume(); |
| |
| if (tryConsume('u')) { |
| int surrogate2 = tryConsumeHex(4); |
| if (U16_IS_TRAIL(surrogate2)) { |
| u = U16_GET_SUPPLEMENTARY(u, surrogate2); |
| delegate.atomPatternCharacter(u); |
| break; |
| } |
| } |
| |
| restoreState(state); |
| } |
| delegate.atomPatternCharacter(u); |
| } |
| break; |
| } |
| |
| // IdentityEscape |
| default: |
| int ch = peek(); |
| |
| if (ch == '-' && m_isUnicode && inCharacterClass) { |
| // \- is allowed for ClassEscape with unicode flag. |
| delegate.atomPatternCharacter(consume()); |
| break; |
| } |
| |
| if (isIdentityEscapeAnError(ch)) |
| break; |
| |
| delegate.atomPatternCharacter(consume()); |
| } |
| |
| return true; |
| } |
| |
| UChar32 consumePossibleSurrogatePair() |
| { |
| UChar32 ch = consume(); |
| if (U16_IS_LEAD(ch) && m_isUnicode && (patternRemaining() > 0)) { |
| ParseState state = saveState(); |
| |
| UChar32 surrogate2 = consume(); |
| if (U16_IS_TRAIL(surrogate2)) |
| ch = U16_GET_SUPPLEMENTARY(ch, surrogate2); |
| else |
| restoreState(state); |
| } |
| |
| return ch; |
| } |
| |
| /* |
| * parseAtomEscape(), parseCharacterClassEscape(): |
| * |
| * These methods alias to parseEscape(). |
| */ |
| bool parseAtomEscape() |
| { |
| return parseEscape<false>(m_delegate); |
| } |
| void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) |
| { |
| parseEscape<true>(delegate); |
| } |
| |
| /* |
| * parseCharacterClass(): |
| * |
| * Helper for parseTokens(); calls directly and indirectly (via parseCharacterClassEscape) |
| * to an instance of CharacterClassParserDelegate, to describe the character class to the |
| * delegate. |
| */ |
| void parseCharacterClass() |
| { |
| ASSERT(!hasError(m_errorCode)); |
| ASSERT(peek() == '['); |
| consume(); |
| |
| CharacterClassParserDelegate characterClassConstructor(m_delegate, m_errorCode); |
| |
| characterClassConstructor.begin(tryConsume('^')); |
| |
| while (!atEndOfPattern()) { |
| switch (peek()) { |
| case ']': |
| consume(); |
| characterClassConstructor.end(); |
| return; |
| |
| case '\\': |
| parseCharacterClassEscape(characterClassConstructor); |
| break; |
| |
| default: |
| characterClassConstructor.atomPatternCharacter(consumePossibleSurrogatePair(), true); |
| } |
| |
| if (hasError(m_errorCode)) |
| return; |
| } |
| |
| m_errorCode = ErrorCode::CharacterClassUnmatched; |
| } |
| |
| /* |
| * parseParenthesesBegin(): |
| * |
| * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. |
| */ |
| void parseParenthesesBegin() |
| { |
| ASSERT(!hasError(m_errorCode)); |
| ASSERT(peek() == '('); |
| consume(); |
| |
| if (tryConsume('?')) { |
| if (atEndOfPattern()) { |
| m_errorCode = ErrorCode::ParenthesesTypeInvalid; |
| return; |
| } |
| |
| switch (consume()) { |
| case ':': |
| m_delegate.atomParenthesesSubpatternBegin(false); |
| break; |
| |
| case '=': |
| m_delegate.atomParentheticalAssertionBegin(); |
| break; |
| |
| case '!': |
| m_delegate.atomParentheticalAssertionBegin(true); |
| break; |
| |
| case '<': { |
| auto groupName = tryConsumeGroupName(); |
| if (groupName) { |
| auto setAddResult = m_captureGroupNames.add(groupName.value()); |
| if (setAddResult.isNewEntry) |
| m_delegate.atomParenthesesSubpatternBegin(true, groupName); |
| else |
| m_errorCode = ErrorCode::DuplicateGroupName; |
| } else |
| m_errorCode = ErrorCode::InvalidGroupName; |
| |
| break; |
| } |
| |
| default: |
| m_errorCode = ErrorCode::ParenthesesTypeInvalid; |
| } |
| } else |
| m_delegate.atomParenthesesSubpatternBegin(); |
| |
| ++m_parenthesesNestingDepth; |
| } |
| |
| /* |
| * parseParenthesesEnd(): |
| * |
| * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). |
| */ |
| void parseParenthesesEnd() |
| { |
| ASSERT(!hasError(m_errorCode)); |
| ASSERT(peek() == ')'); |
| consume(); |
| |
| if (m_parenthesesNestingDepth > 0) |
| m_delegate.atomParenthesesEnd(); |
| else |
| m_errorCode = ErrorCode::ParenthesesUnmatched; |
| |
| --m_parenthesesNestingDepth; |
| } |
| |
| /* |
| * parseQuantifier(): |
| * |
| * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. |
| */ |
| void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) |
| { |
| ASSERT(!hasError(m_errorCode)); |
| ASSERT(min <= max); |
| |
| if (min == UINT_MAX) { |
| m_errorCode = ErrorCode::QuantifierTooLarge; |
| return; |
| } |
| |
| if (lastTokenWasAnAtom) |
| m_delegate.quantifyAtom(min, max, !tryConsume('?')); |
| else |
| m_errorCode = ErrorCode::QuantifierWithoutAtom; |
| } |
| |
| /* |
| * parseTokens(): |
| * |
| * This method loops over the input pattern reporting tokens to the delegate. |
| * The method returns when a parse error is detected, or the end of the pattern |
| * is reached. One piece of state is tracked around the loop, which is whether |
| * the last token passed to the delegate was an atom (this is necessary to detect |
| * a parse error when a quantifier provided without an atom to quantify). |
| */ |
| void parseTokens() |
| { |
| bool lastTokenWasAnAtom = false; |
| |
| while (!atEndOfPattern()) { |
| switch (peek()) { |
| case '|': |
| consume(); |
| m_delegate.disjunction(); |
| lastTokenWasAnAtom = false; |
| break; |
| |
| case '(': |
| parseParenthesesBegin(); |
| lastTokenWasAnAtom = false; |
| break; |
| |
| case ')': |
| parseParenthesesEnd(); |
| lastTokenWasAnAtom = true; |
| break; |
| |
| case '^': |
| consume(); |
| m_delegate.assertionBOL(); |
| lastTokenWasAnAtom = false; |
| break; |
| |
| case '$': |
| consume(); |
| m_delegate.assertionEOL(); |
| lastTokenWasAnAtom = false; |
| break; |
| |
| case '.': |
| consume(); |
| m_delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DotClassID, false); |
| lastTokenWasAnAtom = true; |
| break; |
| |
| case '[': |
| parseCharacterClass(); |
| lastTokenWasAnAtom = true; |
| break; |
| |
| case '\\': |
| lastTokenWasAnAtom = parseAtomEscape(); |
| break; |
| |
| case '*': |
| consume(); |
| parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite); |
| lastTokenWasAnAtom = false; |
| break; |
| |
| case '+': |
| consume(); |
| parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite); |
| lastTokenWasAnAtom = false; |
| break; |
| |
| case '?': |
| consume(); |
| parseQuantifier(lastTokenWasAnAtom, 0, 1); |
| lastTokenWasAnAtom = false; |
| break; |
| |
| case '{': { |
| ParseState state = saveState(); |
| |
| consume(); |
| if (peekIsDigit()) { |
| unsigned min = consumeNumber(); |
| unsigned max = min; |
| |
| if (tryConsume(',')) |
| max = peekIsDigit() ? consumeNumber() : quantifyInfinite; |
| |
| if (tryConsume('}')) { |
| if (min <= max) |
| parseQuantifier(lastTokenWasAnAtom, min, max); |
| else |
| m_errorCode = ErrorCode::QuantifierOutOfOrder; |
| lastTokenWasAnAtom = false; |
| break; |
| } |
| } |
| |
| restoreState(state); |
| } |
| // if we did not find a complete quantifer, fall through to the default case. |
| FALLTHROUGH; |
| |
| default: |
| m_delegate.atomPatternCharacter(consumePossibleSurrogatePair()); |
| lastTokenWasAnAtom = true; |
| } |
| |
| if (hasError(m_errorCode)) |
| return; |
| } |
| |
| if (m_parenthesesNestingDepth > 0) |
| m_errorCode = ErrorCode::MissingParentheses; |
| } |
| |
| /* |
| * parse(): |
| * |
| * This method calls parseTokens() to parse over the input and returns error code for a result. |
| */ |
| ErrorCode parse() |
| { |
| if (m_size > MAX_PATTERN_SIZE) |
| m_errorCode = ErrorCode::PatternTooLarge; |
| else |
| parseTokens(); |
| ASSERT(atEndOfPattern() || hasError(m_errorCode)); |
| |
| return m_errorCode; |
| } |
| |
| // Misc helper functions: |
| |
| typedef unsigned ParseState; |
| |
| ParseState saveState() |
| { |
| return m_index; |
| } |
| |
| void restoreState(ParseState state) |
| { |
| m_index = state; |
| } |
| |
| bool atEndOfPattern() |
| { |
| ASSERT(m_index <= m_size); |
| return m_index == m_size; |
| } |
| |
| unsigned patternRemaining() |
| { |
| ASSERT(m_index <= m_size); |
| return m_size - m_index; |
| } |
| |
| int peek() |
| { |
| ASSERT(m_index < m_size); |
| return m_data[m_index]; |
| } |
| |
| bool peekIsDigit() |
| { |
| return !atEndOfPattern() && WTF::isASCIIDigit(peek()); |
| } |
| |
| unsigned peekDigit() |
| { |
| ASSERT(peekIsDigit()); |
| return peek() - '0'; |
| } |
| |
| int tryConsumeUnicodeEscape() |
| { |
| if (!tryConsume('u')) |
| return -1; |
| |
| if (m_isUnicode && tryConsume('{')) { |
| int codePoint = 0; |
| do { |
| if (atEndOfPattern() || !isASCIIHexDigit(peek())) { |
| m_errorCode = ErrorCode::InvalidUnicodeEscape; |
| return -1; |
| } |
| |
| codePoint = (codePoint << 4) | toASCIIHexValue(consume()); |
| |
| if (codePoint > UCHAR_MAX_VALUE) { |
| m_errorCode = ErrorCode::InvalidUnicodeEscape; |
| return -1; |
| } |
| } while (!atEndOfPattern() && peek() != '}'); |
| if (!atEndOfPattern() && peek() == '}') |
| consume(); |
| else if (!hasError(m_errorCode)) |
| m_errorCode = ErrorCode::InvalidUnicodeEscape; |
| if (hasError(m_errorCode)) |
| return -1; |
| |
| return codePoint; |
| } |
| |
| int u = tryConsumeHex(4); |
| if (u == -1) |
| return -1; |
| |
| // If we have the first of a surrogate pair, look for the second. |
| if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= 6) && peek() == '\\') { |
| ParseState state = saveState(); |
| consume(); |
| |
| if (tryConsume('u')) { |
| int surrogate2 = tryConsumeHex(4); |
| if (U16_IS_TRAIL(surrogate2)) { |
| u = U16_GET_SUPPLEMENTARY(u, surrogate2); |
| return u; |
| } |
| } |
| |
| restoreState(state); |
| } |
| |
| return u; |
| } |
| |
| int tryConsumeIdentifierCharacter() |
| { |
| int ch = peek(); |
| |
| if (ch == '\\') { |
| consume(); |
| ch = tryConsumeUnicodeEscape(); |
| } else |
| consume(); |
| |
| return ch; |
| } |
| |
| bool isIdentifierStart(int ch) |
| { |
| return (WTF::isASCII(ch) && (WTF::isASCIIAlpha(ch) || ch == '_' || ch == '$')) || (U_GET_GC_MASK(ch) & U_GC_L_MASK); |
| } |
| |
| bool isIdentifierPart(int ch) |
| { |
| return (WTF::isASCII(ch) && (WTF::isASCIIAlpha(ch) || ch == '_' || ch == '$')) || (U_GET_GC_MASK(ch) & (U_GC_L_MASK | U_GC_MN_MASK | U_GC_MC_MASK | U_GC_ND_MASK | U_GC_PC_MASK)) || ch == 0x200C || ch == 0x200D; |
| } |
| |
| bool isUnicodePropertyValueExpressionChar(int ch) |
| { |
| return WTF::isASCIIAlphanumeric(ch) || ch == '_' || ch == '='; |
| } |
| |
| int consume() |
| { |
| ASSERT(m_index < m_size); |
| return m_data[m_index++]; |
| } |
| |
| unsigned consumeDigit() |
| { |
| ASSERT(peekIsDigit()); |
| return consume() - '0'; |
| } |
| |
| unsigned consumeNumber() |
| { |
| Checked<unsigned, RecordOverflow> n = consumeDigit(); |
| while (peekIsDigit()) |
| n = n * 10 + consumeDigit(); |
| return n.hasOverflowed() ? quantifyInfinite : n.unsafeGet(); |
| } |
| |
| unsigned consumeOctal() |
| { |
| ASSERT(WTF::isASCIIOctalDigit(peek())); |
| |
| unsigned n = consumeDigit(); |
| while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) |
| n = n * 8 + consumeDigit(); |
| return n; |
| } |
| |
| bool tryConsume(UChar ch) |
| { |
| if (atEndOfPattern() || (m_data[m_index] != ch)) |
| return false; |
| ++m_index; |
| return true; |
| } |
| |
| int tryConsumeHex(int count) |
| { |
| ParseState state = saveState(); |
| |
| int n = 0; |
| while (count--) { |
| if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { |
| restoreState(state); |
| return -1; |
| } |
| n = (n << 4) | WTF::toASCIIHexValue(consume()); |
| } |
| return n; |
| } |
| |
| Optional<String> tryConsumeGroupName() |
| { |
| if (atEndOfPattern()) |
| return WTF::nullopt; |
| |
| ParseState state = saveState(); |
| |
| int ch = tryConsumeIdentifierCharacter(); |
| |
| if (isIdentifierStart(ch)) { |
| StringBuilder identifierBuilder; |
| identifierBuilder.appendCharacter(ch); |
| |
| while (!atEndOfPattern()) { |
| ch = tryConsumeIdentifierCharacter(); |
| if (ch == '>') |
| return Optional<String>(identifierBuilder.toString()); |
| |
| if (!isIdentifierPart(ch)) |
| break; |
| |
| identifierBuilder.appendCharacter(ch); |
| } |
| } |
| |
| restoreState(state); |
| |
| return WTF::nullopt; |
| } |
| |
| Optional<BuiltInCharacterClassID> tryConsumeUnicodePropertyExpression() |
| { |
| if (atEndOfPattern() || !isUnicodePropertyValueExpressionChar(peek())) { |
| m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; |
| return WTF::nullopt; |
| } |
| |
| StringBuilder expressionBuilder; |
| String unicodePropertyName; |
| bool foundEquals = false; |
| unsigned errors = 0; |
| |
| expressionBuilder.appendCharacter(consume()); |
| |
| while (!atEndOfPattern()) { |
| int ch = peek(); |
| if (ch == '}') { |
| consume(); |
| if (errors) { |
| m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; |
| return WTF::nullopt; |
| } |
| |
| if (foundEquals) { |
| auto result = unicodeMatchPropertyValue(unicodePropertyName, expressionBuilder.toString()); |
| if (!result) |
| m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; |
| return result; |
| } |
| |
| auto result = unicodeMatchProperty(expressionBuilder.toString()); |
| if (!result) |
| m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; |
| return result; |
| } |
| |
| consume(); |
| if (ch == '=') { |
| if (!foundEquals) { |
| foundEquals = true; |
| unicodePropertyName = expressionBuilder.toString(); |
| expressionBuilder.clear(); |
| } else |
| errors++; |
| } else if (!isUnicodePropertyValueExpressionChar(ch)) |
| errors++; |
| else |
| expressionBuilder.appendCharacter(ch); |
| } |
| |
| m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; |
| return WTF::nullopt; |
| } |
| |
| Delegate& m_delegate; |
| unsigned m_backReferenceLimit; |
| ErrorCode m_errorCode { ErrorCode::NoError }; |
| const CharType* m_data; |
| unsigned m_size; |
| unsigned m_index { 0 }; |
| bool m_isUnicode; |
| unsigned m_parenthesesNestingDepth { 0 }; |
| HashSet<String> m_captureGroupNames; |
| |
| // Derived by empirical testing of compile time in PCRE and WREC. |
| static constexpr unsigned MAX_PATTERN_SIZE = 1024 * 1024; |
| }; |
| |
| /* |
| * Yarr::parse(): |
| * |
| * The parse method is passed a pattern to be parsed and a delegate upon which |
| * callbacks will be made to record the parsed tokens forming the regex. |
| * Yarr::parse() returns null on success, or a const C string providing an error |
| * message where a parse error occurs. |
| * |
| * The Delegate must implement the following interface: |
| * |
| * void assertionBOL(); |
| * void assertionEOL(); |
| * void assertionWordBoundary(bool invert); |
| * |
| * void atomPatternCharacter(UChar32 ch); |
| * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); |
| * void atomCharacterClassBegin(bool invert) |
| * void atomCharacterClassAtom(UChar32 ch) |
| * void atomCharacterClassRange(UChar32 begin, UChar32 end) |
| * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) |
| * void atomCharacterClassEnd() |
| * void atomParenthesesSubpatternBegin(bool capture = true, Optional<String> groupName); |
| * void atomParentheticalAssertionBegin(bool invert = false); |
| * void atomParenthesesEnd(); |
| * void atomBackReference(unsigned subpatternId); |
| * void atomNamedBackReference(const String& subpatternName); |
| * bool isValidNamedForwardReference(const String& subpatternName); |
| * void atomNamedForwardReference(const String& subpatternName); |
| * |
| * void quantifyAtom(unsigned min, unsigned max, bool greedy); |
| * |
| * void disjunction(); |
| * |
| * The regular expression is described by a sequence of assertion*() and atom*() |
| * callbacks to the delegate, describing the terms in the regular expression. |
| * Following an atom a quantifyAtom() call may occur to indicate that the previous |
| * atom should be quantified. In the case of atoms described across multiple |
| * calls (parentheses and character classes) the call to quantifyAtom() will come |
| * after the call to the atom*End() method, never after atom*Begin(). |
| * |
| * Character classes may either be described by a single call to |
| * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. |
| * In the latter case, ...Begin() will be called, followed by a sequence of |
| * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). |
| * |
| * Sequences of atoms and assertions are broken into alternatives via calls to |
| * disjunction(). Assertions, atoms, and disjunctions emitted between calls to |
| * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. |
| * atomParenthesesBegin() is passed a subpatternId. In the case of a regular |
| * capturing subpattern, this will be the subpatternId associated with these |
| * parentheses, and will also by definition be the lowest subpatternId of these |
| * parentheses and of any nested paretheses. The atomParenthesesEnd() method |
| * is passed the subpatternId of the last capturing subexpression nested within |
| * these paretheses. In the case of a capturing subpattern with no nested |
| * capturing subpatterns, the same subpatternId will be passed to the begin and |
| * end functions. In the case of non-capturing subpatterns the subpatternId |
| * passed to the begin method is also the first possible subpatternId that might |
| * be nested within these paretheses. If a set of non-capturing parentheses does |
| * not contain any capturing subpatterns, then the subpatternId passed to begin |
| * will be greater than the subpatternId passed to end. |
| */ |
| |
| template<class Delegate> |
| ErrorCode parse(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit = quantifyInfinite) |
| { |
| if (pattern.is8Bit()) |
| return Parser<Delegate, LChar>(delegate, pattern, isUnicode, backReferenceLimit).parse(); |
| return Parser<Delegate, UChar>(delegate, pattern, isUnicode, backReferenceLimit).parse(); |
| } |
| |
| } } // namespace JSC::Yarr |