| /* |
| * Copyright (C) 2009-2020 Apple Inc. All rights reserved. |
| * Copyright (C) 2020 Alexey Shvayka <shvaikalesh@gmail.com>. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, |
| * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS |
| * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF |
| * THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #pragma once |
| |
| #include "Yarr.h" |
| #include "YarrPattern.h" |
| #include "YarrUnicodeProperties.h" |
| #include <wtf/ASCIICType.h> |
| #include <wtf/HashSet.h> |
| #include <wtf/Optional.h> |
| #include <wtf/text/StringBuilder.h> |
| #include <wtf/text/WTFString.h> |
| |
| namespace JSC { namespace Yarr { |
| |
| // The Parser class should not be used directly - only via the Yarr::parse() method. |
| template<class Delegate, typename CharType> |
| class Parser { |
| private: |
| template<class FriendDelegate> |
| friend ErrorCode parse(FriendDelegate&, const String& pattern, bool isUnicode, unsigned backReferenceLimit, bool isNamedForwardReferenceAllowed); |
| |
| enum class UnicodeParseContext : uint8_t { PatternCodePoint, GroupName }; |
| |
| /* |
| * CharacterClassParserDelegate: |
| * |
| * The class CharacterClassParserDelegate is used in the parsing of character |
| * classes. This class handles detection of character ranges. This class |
| * implements enough of the delegate interface such that it can be passed to |
| * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused |
| * to perform the parsing of escape characters in character sets. |
| */ |
| class CharacterClassParserDelegate { |
| public: |
| CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err, bool isUnicode) |
| : m_delegate(delegate) |
| , m_errorCode(err) |
| , m_isUnicode(isUnicode) |
| , m_state(Empty) |
| , m_character(0) |
| { |
| } |
| |
| /* |
| * begin(): |
| * |
| * Called at beginning of construction. Forwards the 'invert' flag (true for |
| * a class written as /[^...]/) to the delegate's atomCharacterClassBegin(). |
| */ |
| void begin(bool invert) |
| { |
| m_delegate.atomCharacterClassBegin(invert); |
| } |
| |
| /* |
| * atomPatternCharacter(): |
| * |
| * This method is called either from parseCharacterClass() (for an unescaped |
| * character in a character class), or from parseEscape(). In the former case |
| * the value true will be passed for the argument 'hyphenIsRange', and in this |
| * mode we will allow a hyphen to be treated as indicating a range (i.e. /[a-z]/ |
| * is different to /[a\-z]/). |
| * |
| * A character is not reported to the delegate immediately: it is cached in |
| * m_character until we know whether it is a lone atom or the start of a range. |
| * On an out-of-order range (e.g. /[z-a]/) or, in a unicode pattern, a range |
| * that starts with a built-in class, m_errorCode is set and nothing is emitted. |
| */ |
| void atomPatternCharacter(UChar32 ch, bool hyphenIsRange = false) |
| { |
| switch (m_state) { |
| case AfterCharacterClass: |
| // Following a built-in character class we need look out for a hyphen. |
| // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. |
| // If we see a hyphen following a character class then unlike usual |
| // we'll report it to the delegate immediately, and put ourself into |
| // a poisoned state. In a unicode pattern, any following calls to add |
| // another character or character class will result in syntax error. |
| // A hyphen following a character class is itself valid, but only at |
| // the end of the character class. |
| if (hyphenIsRange && ch == '-') { |
| m_delegate.atomCharacterClassAtom('-'); |
| m_state = AfterCharacterClassHyphen; |
| return; |
| } |
| // Otherwise just fall through - no cached character so treat this as Empty. |
| FALLTHROUGH; |
| |
| case Empty: |
| m_character = ch; |
| m_state = CachedCharacter; |
| return; |
| |
| case CachedCharacter: |
| if (hyphenIsRange && ch == '-') |
| m_state = CachedCharacterHyphen; |
| else { |
| m_delegate.atomCharacterClassAtom(m_character); |
| m_character = ch; |
| } |
| return; |
| |
| case CachedCharacterHyphen: |
| if (ch < m_character) { |
| m_errorCode = ErrorCode::CharacterClassRangeOutOfOrder; |
| return; |
| } |
| m_delegate.atomCharacterClassRange(m_character, ch); |
| m_state = Empty; |
| return; |
| |
| // If we hit this case, we have an invalid range like /[\d-a]/. |
| // See comment in atomBuiltInCharacterClass() below. |
| case AfterCharacterClassHyphen: |
| if (m_isUnicode) { |
| m_errorCode = ErrorCode::CharacterClassRangeInvalid; |
| return; |
| } |
| m_delegate.atomCharacterClassAtom(ch); |
| m_state = Empty; |
| return; |
| } |
| } |
| |
| /* |
| * atomBuiltInCharacterClass(): |
| * |
| * Adds a built-in character class, called by parseEscape(). |
| * Any cached character (and pending hyphen) is flushed to the delegate first. |
| * In a unicode pattern, a built-in class used as a range endpoint sets |
| * m_errorCode to CharacterClassRangeInvalid instead of being emitted. |
| */ |
| void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) |
| { |
| switch (m_state) { |
| case CachedCharacter: |
| // Flush the currently cached character, then fall through. |
| m_delegate.atomCharacterClassAtom(m_character); |
| FALLTHROUGH; |
| case Empty: |
| case AfterCharacterClass: |
| m_delegate.atomCharacterClassBuiltIn(classID, invert); |
| m_state = AfterCharacterClass; |
| return; |
| |
| // If we hit either of these cases, we have an invalid range that |
| // looks something like /[a-\d]/ or /[\d-\d]/. |
| // Since ES2015, this should be syntax error in a unicode pattern, |
| // yet gracefully handled in a regular regex to avoid breaking the web. |
| // Effectively we handle the hyphen as if it was (implicitly) escaped, |
| // e.g. /[\d-a-z]/ is treated as /[\d\-a\-z]/. |
| // See usages of CharacterRangeOrUnion abstract op in |
| // https://tc39.es/ecma262/#sec-regular-expression-patterns-semantics |
| case CachedCharacterHyphen: |
| m_delegate.atomCharacterClassAtom(m_character); |
| m_delegate.atomCharacterClassAtom('-'); |
| FALLTHROUGH; |
| case AfterCharacterClassHyphen: |
| if (m_isUnicode) { |
| m_errorCode = ErrorCode::CharacterClassRangeInvalid; |
| return; |
| } |
| m_delegate.atomCharacterClassBuiltIn(classID, invert); |
| m_state = Empty; |
| return; |
| } |
| } |
| |
| /* |
| * end(): |
| * |
| * Called at end of construction. Flushes a still-cached character to the |
| * delegate (followed by a literal '-' if a hyphen was pending, as in /[a-]/) |
| * and then calls the delegate's atomCharacterClassEnd(). |
| */ |
| void end() |
| { |
| if (m_state == CachedCharacter) |
| m_delegate.atomCharacterClassAtom(m_character); |
| else if (m_state == CachedCharacterHyphen) { |
| m_delegate.atomCharacterClassAtom(m_character); |
| m_delegate.atomCharacterClassAtom('-'); |
| } |
| m_delegate.atomCharacterClassEnd(); |
| } |
| |
| // parseEscape() should never call these delegate methods when |
| // invoked with inCharacterClass set; reaching one is a release-asserted bug. |
| NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); } |
| NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); } |
| NO_RETURN_DUE_TO_ASSERT void atomNamedBackReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); } |
| NO_RETURN_DUE_TO_ASSERT void atomNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); } |
| |
| private: |
| Delegate& m_delegate; |
| ErrorCode& m_errorCode; |
| bool m_isUnicode; |
| enum CharacterClassConstructionState { |
| Empty, |
| CachedCharacter, |
| CachedCharacterHyphen, |
| AfterCharacterClass, |
| AfterCharacterClassHyphen, |
| } m_state; |
| UChar32 m_character; |
| }; |
| |
| Parser(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit, bool isNamedForwardReferenceAllowed) |
| : m_delegate(delegate) |
| , m_data(pattern.characters<CharType>()) |
| , m_size(pattern.length()) |
| , m_isUnicode(isUnicode) |
| , m_backReferenceLimit(backReferenceLimit) |
| , m_isNamedForwardReferenceAllowed(isNamedForwardReferenceAllowed) |
| { |
| } |
| |
| // The handling of IdentityEscapes is different depending on the unicode flag. |
| // For Unicode patterns, IdentityEscapes only include SyntaxCharacters or '/'. |
| // For non-unicode patterns, most any character can be escaped. |
| bool isIdentityEscapeAnError(int ch) |
| { |
| if (m_isUnicode && (!strchr("^$\\.*+?()[]{}|/", ch) || !ch)) { |
| m_errorCode = ErrorCode::InvalidIdentityEscape; |
| return true; |
| } |
| |
| return false; |
| } |
| |
| /* |
| * parseEscape(): |
| * |
| * Helper for parseTokens() AND parseCharacterClass(). |
| * Unlike the other parser methods, this function does not report tokens |
| * directly to the member delegate (m_delegate), instead tokens are |
| * emitted to the delegate provided as an argument. In the case of atom |
| * escapes, parseTokens() will call parseEscape() passing m_delegate as |
| * an argument, and as such the escape will be reported to the delegate. |
| * |
| * However this method may also be used by parseCharacterClass(), in which |
| * case a CharacterClassParserDelegate will be passed as the delegate that |
| * tokens should be added to. A boolean template flag is also provided to |
| * indicate whether an escape in a CharacterClass is being parsed (some |
| * parsing rules change in this context). |
| * |
| * The boolean value returned by this method indicates whether the token |
| * parsed was an atom (outside of a character class \b and \B will be |
| * interpreted as assertions, for which false is returned). False is also |
| * returned when the pattern ends right after the backslash. For every other |
| * failure m_errorCode is set but true is still returned, so callers must |
| * check m_errorCode rather than rely on the return value. |
| */ |
| template<bool inCharacterClass, class EscapeDelegate> |
| bool parseEscape(EscapeDelegate& delegate) |
| { |
| ASSERT(!hasError(m_errorCode)); |
| ASSERT(peek() == '\\'); |
| consume(); |
| |
| if (atEndOfPattern()) { |
| m_errorCode = ErrorCode::EscapeUnterminated; |
| return false; |
| } |
| |
| switch (peek()) { |
| // Assertions (inside a character class, \b is a backspace and \B an identity escape) |
| case 'b': |
| consume(); |
| if (inCharacterClass) |
| delegate.atomPatternCharacter('\b'); |
| else { |
| delegate.assertionWordBoundary(false); |
| return false; |
| } |
| break; |
| case 'B': |
| consume(); |
| if (inCharacterClass) { |
| if (isIdentityEscapeAnError('B')) |
| break; |
| |
| delegate.atomPatternCharacter('B'); |
| } else { |
| delegate.assertionWordBoundary(true); |
| return false; |
| } |
| break; |
| |
| // CharacterClassEscape |
| case 'd': |
| consume(); |
| delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, false); |
| break; |
| case 's': |
| consume(); |
| delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, false); |
| break; |
| case 'w': |
| consume(); |
| delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, false); |
| break; |
| case 'D': |
| consume(); |
| delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, true); |
| break; |
| case 'S': |
| consume(); |
| delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, true); |
| break; |
| case 'W': |
| consume(); |
| delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, true); |
| break; |
| |
| case '0': { |
| consume(); |
| |
| if (!peekIsDigit()) { |
| delegate.atomPatternCharacter(0); |
| break; |
| } |
| |
| if (m_isUnicode) { |
| m_errorCode = ErrorCode::InvalidOctalEscape; |
| break; |
| } |
| |
| delegate.atomPatternCharacter(consumeOctal(2)); |
| break; |
| } |
| |
| // DecimalEscape |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| case '8': |
| case '9': { |
| // For non-Unicode patterns, invalid backreferences are parsed as octal or decimal escapes. |
| // First, try to parse this as backreference. |
| if (!inCharacterClass) { |
| ParseState state = saveState(); |
| |
| unsigned backReference = consumeNumber(); |
| if (backReference <= m_backReferenceLimit) { |
| m_maxSeenBackReference = std::max(m_maxSeenBackReference, backReference); |
| delegate.atomBackReference(backReference); |
| break; |
| } |
| |
| restoreState(state); |
| if (m_isUnicode) { |
| m_errorCode = ErrorCode::InvalidBackreference; |
| break; |
| } |
| } |
| |
| if (m_isUnicode) { |
| m_errorCode = ErrorCode::InvalidOctalEscape; |
| break; |
| } |
| |
| delegate.atomPatternCharacter(peek() < '8' ? consumeOctal(3) : consume()); |
| break; |
| } |
| |
| // ControlEscape |
| case 'f': |
| consume(); |
| delegate.atomPatternCharacter('\f'); |
| break; |
| case 'n': |
| consume(); |
| delegate.atomPatternCharacter('\n'); |
| break; |
| case 'r': |
| consume(); |
| delegate.atomPatternCharacter('\r'); |
| break; |
| case 't': |
| consume(); |
| delegate.atomPatternCharacter('\t'); |
| break; |
| case 'v': |
| consume(); |
| delegate.atomPatternCharacter('\v'); |
| break; |
| |
| // ControlLetter |
| case 'c': { |
| ParseState state = saveState(); |
| consume(); |
| if (!atEndOfPattern()) { |
| int control = consume(); |
| |
| if (WTF::isASCIIAlpha(control)) { |
| delegate.atomPatternCharacter(control & 0x1f); |
| break; |
| } |
| |
| if (m_isUnicode) { |
| m_errorCode = ErrorCode::InvalidControlLetterEscape; |
| break; |
| } |
| |
| // https://tc39.es/ecma262/#prod-annexB-ClassControlLetter |
| if (inCharacterClass && (WTF::isASCIIDigit(control) || control == '_')) { |
| delegate.atomPatternCharacter(control & 0x1f); |
| break; |
| } |
| } |
| |
| if (m_isUnicode) { |
| m_errorCode = ErrorCode::InvalidIdentityEscape; |
| break; |
| } |
| |
| restoreState(state); |
| delegate.atomPatternCharacter('\\'); |
| break; |
| } |
| |
| // HexEscape |
| case 'x': { |
| consume(); |
| int x = tryConsumeHex(2); |
| if (x == -1) { |
| if (isIdentityEscapeAnError('x')) |
| break; |
| |
| delegate.atomPatternCharacter('x'); |
| } else |
| delegate.atomPatternCharacter(x); |
| break; |
| } |
| |
| // Named backreference |
| case 'k': { |
| consume(); |
| ParseState state = saveState(); |
| if (!inCharacterClass && tryConsume('<')) { |
| auto groupName = tryConsumeGroupName(); |
| if (hasError(m_errorCode)) |
| break; |
| |
| if (groupName) { |
| if (m_captureGroupNames.contains(groupName.value())) { |
| delegate.atomNamedBackReference(groupName.value()); |
| break; |
| } |
| |
| if (m_isNamedForwardReferenceAllowed) { |
| m_forwardReferenceNames.add(groupName.value()); |
| delegate.atomNamedForwardReference(groupName.value()); |
| break; |
| } |
| } |
| } |
| |
| restoreState(state); |
| if (!isIdentityEscapeAnError('k')) { |
| delegate.atomPatternCharacter('k'); |
| m_kIdentityEscapeSeen = true; |
| } |
| break; |
| } |
| |
| // Unicode property escapes |
| case 'p': |
| case 'P': { |
| int escapeChar = consume(); |
| |
| if (!m_isUnicode) { |
| if (isIdentityEscapeAnError(escapeChar)) |
| break; |
| delegate.atomPatternCharacter(escapeChar); |
| break; |
| } |
| |
| if (!atEndOfPattern() && peek() == '{') { |
| consume(); |
| auto optClassID = tryConsumeUnicodePropertyExpression(); |
| if (!optClassID) { |
| // tryConsumeUnicodePropertyExpression() will set m_errorCode for a malformed property expression |
| break; |
| } |
| delegate.atomBuiltInCharacterClass(optClassID.value(), escapeChar == 'P'); |
| } else |
| m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; |
| break; |
| } |
| |
| // UnicodeEscape |
| case 'u': { |
| int codePoint = tryConsumeUnicodeEscape<UnicodeParseContext::PatternCodePoint>(); |
| if (hasError(m_errorCode)) |
| break; |
| |
| delegate.atomPatternCharacter(codePoint == -1 ? 'u' : codePoint); |
| break; |
| } |
| |
| // IdentityEscape |
| default: |
| int ch = peek(); |
| |
| if (ch == '-' && m_isUnicode && inCharacterClass) { |
| // \- is allowed for ClassEscape with unicode flag. |
| delegate.atomPatternCharacter(consume()); |
| break; |
| } |
| |
| if (isIdentityEscapeAnError(ch)) |
| break; |
| |
| delegate.atomPatternCharacter(consume()); |
| } |
| |
| return true; |
| } |
| |
| template<UnicodeParseContext context> |
| UChar32 consumePossibleSurrogatePair() |
| { |
| bool unicodePatternOrGroupName = m_isUnicode || context == UnicodeParseContext::GroupName; |
| |
| UChar32 ch = consume(); |
| if (U16_IS_LEAD(ch) && unicodePatternOrGroupName && !atEndOfPattern()) { |
| ParseState state = saveState(); |
| |
| UChar32 surrogate2 = consume(); |
| if (U16_IS_TRAIL(surrogate2)) |
| ch = U16_GET_SUPPLEMENTARY(ch, surrogate2); |
| else |
| restoreState(state); |
| } |
| |
| return ch; |
| } |
| |
| /* |
| * parseAtomEscape(), parseCharacterClassEscape(): |
| * |
| * These methods alias to parseEscape(). |
| */ |
| bool parseAtomEscape() |
| { |
| return parseEscape<false>(m_delegate); |
| } |
| void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) |
| { |
| parseEscape<true>(delegate); |
| } |
| |
| /* |
| * parseCharacterClass(): |
| * |
| * Helper for parseTokens(); calls directly and indirectly (via parseCharacterClassEscape) |
| * to an instance of CharacterClassParserDelegate, to describe the character class to the |
| * delegate. |
| */ |
| void parseCharacterClass() |
| { |
| ASSERT(!hasError(m_errorCode)); |
| ASSERT(peek() == '['); |
| consume(); |
| |
| CharacterClassParserDelegate characterClassConstructor(m_delegate, m_errorCode, m_isUnicode); |
| |
| characterClassConstructor.begin(tryConsume('^')); |
| |
| while (!atEndOfPattern()) { |
| switch (peek()) { |
| case ']': |
| consume(); |
| characterClassConstructor.end(); |
| return; |
| |
| case '\\': |
| parseCharacterClassEscape(characterClassConstructor); |
| break; |
| |
| default: |
| characterClassConstructor.atomPatternCharacter(consumePossibleSurrogatePair<UnicodeParseContext::PatternCodePoint>(), true); |
| } |
| |
| if (hasError(m_errorCode)) |
| return; |
| } |
| |
| m_errorCode = ErrorCode::CharacterClassUnmatched; |
| } |
| |
| /* |
| * parseParenthesesBegin(): |
| * |
| * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. |
| */ |
| void parseParenthesesBegin() |
| { |
| ASSERT(!hasError(m_errorCode)); |
| ASSERT(peek() == '('); |
| consume(); |
| |
| auto type = ParenthesesType::Subpattern; |
| |
| if (tryConsume('?')) { |
| if (atEndOfPattern()) { |
| m_errorCode = ErrorCode::ParenthesesTypeInvalid; |
| return; |
| } |
| |
| switch (consume()) { |
| case ':': |
| m_delegate.atomParenthesesSubpatternBegin(false); |
| break; |
| |
| case '=': |
| m_delegate.atomParentheticalAssertionBegin(); |
| type = ParenthesesType::Assertion; |
| break; |
| |
| case '!': |
| m_delegate.atomParentheticalAssertionBegin(true); |
| type = ParenthesesType::Assertion; |
| break; |
| |
| case '<': { |
| auto groupName = tryConsumeGroupName(); |
| if (hasError(m_errorCode)) |
| break; |
| |
| if (groupName) { |
| if (m_kIdentityEscapeSeen) { |
| m_errorCode = ErrorCode::InvalidNamedBackReference; |
| break; |
| } |
| |
| auto setAddResult = m_captureGroupNames.add(groupName.value()); |
| if (setAddResult.isNewEntry) |
| m_delegate.atomParenthesesSubpatternBegin(true, groupName); |
| else |
| m_errorCode = ErrorCode::DuplicateGroupName; |
| } else |
| m_errorCode = ErrorCode::InvalidGroupName; |
| |
| break; |
| } |
| |
| default: |
| m_errorCode = ErrorCode::ParenthesesTypeInvalid; |
| } |
| } else |
| m_delegate.atomParenthesesSubpatternBegin(); |
| |
| if (type == ParenthesesType::Subpattern) |
| ++m_numSubpatterns; |
| |
| m_parenthesesStack.append(type); |
| } |
| |
| /* |
| * parseParenthesesEnd(): |
| * |
| * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). |
| * |
| * The boolean value returned by this method indicates whether the token parsed |
| * was either an Atom or, for web compatibility reasons, QuantifiableAssertion |
| * in non-Unicode pattern. |
| */ |
| bool parseParenthesesEnd() |
| { |
| ASSERT(!hasError(m_errorCode)); |
| ASSERT(peek() == ')'); |
| consume(); |
| |
| if (m_parenthesesStack.isEmpty()) { |
| m_errorCode = ErrorCode::ParenthesesUnmatched; |
| return false; |
| } |
| |
| m_delegate.atomParenthesesEnd(); |
| auto type = m_parenthesesStack.takeLast(); |
| return type == ParenthesesType::Subpattern || !m_isUnicode; |
| } |
| |
| /* |
| * parseQuantifier(): |
| * |
| * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. |
| */ |
| void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) |
| { |
| ASSERT(!hasError(m_errorCode)); |
| ASSERT(min <= max); |
| |
| if (min == UINT_MAX) { |
| m_errorCode = ErrorCode::QuantifierTooLarge; |
| return; |
| } |
| |
| if (lastTokenWasAnAtom) |
| m_delegate.quantifyAtom(min, max, !tryConsume('?')); |
| else |
| m_errorCode = ErrorCode::QuantifierWithoutAtom; |
| } |
| |
| /* |
| * parseTokens(): |
| * |
| * This method loops over the input pattern reporting tokens to the delegate. |
| * The method returns when a parse error is detected, or the end of the pattern |
| * is reached. One piece of state is tracked around the loop, which is whether |
| * the last token passed to the delegate was an atom (this is necessary to detect |
| * a parse error when a quantifier is provided without an atom to quantify). |
| * |
| * If the end of the pattern is reached while a group is still open, m_errorCode |
| * is set to MissingParentheses. |
| */ |
| void parseTokens() |
| { |
| bool lastTokenWasAnAtom = false; |
| |
| while (!atEndOfPattern()) { |
| switch (peek()) { |
| case '|': |
| consume(); |
| m_delegate.disjunction(); |
| lastTokenWasAnAtom = false; |
| break; |
| |
| case '(': |
| parseParenthesesBegin(); |
| lastTokenWasAnAtom = false; |
| break; |
| |
| case ')': |
| lastTokenWasAnAtom = parseParenthesesEnd(); |
| break; |
| |
| case '^': |
| consume(); |
| m_delegate.assertionBOL(); |
| lastTokenWasAnAtom = false; |
| break; |
| |
| case '$': |
| consume(); |
| m_delegate.assertionEOL(); |
| lastTokenWasAnAtom = false; |
| break; |
| |
| case '.': |
| consume(); |
| m_delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DotClassID, false); |
| lastTokenWasAnAtom = true; |
| break; |
| |
| case '[': |
| parseCharacterClass(); |
| lastTokenWasAnAtom = true; |
| break; |
| |
| case ']': |
| case '}': |
| if (m_isUnicode) { |
| m_errorCode = ErrorCode::BracketUnmatched; |
| break; |
| } |
| |
| m_delegate.atomPatternCharacter(consume()); |
| lastTokenWasAnAtom = true; |
| break; |
| |
| case '\\': |
| lastTokenWasAnAtom = parseAtomEscape(); |
| break; |
| |
| case '*': |
| consume(); |
| parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite); |
| lastTokenWasAnAtom = false; |
| break; |
| |
| case '+': |
| consume(); |
| parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite); |
| lastTokenWasAnAtom = false; |
| break; |
| |
| case '?': |
| consume(); |
| parseQuantifier(lastTokenWasAnAtom, 0, 1); |
| lastTokenWasAnAtom = false; |
| break; |
| |
| case '{': { |
| ParseState state = saveState(); |
| |
| consume(); |
| if (peekIsDigit()) { |
| unsigned min = consumeNumber(); |
| unsigned max = min; |
| |
| if (tryConsume(',')) |
| max = peekIsDigit() ? consumeNumber() : quantifyInfinite; |
| |
| if (tryConsume('}')) { |
| if (min <= max) |
| parseQuantifier(lastTokenWasAnAtom, min, max); |
| else |
| m_errorCode = ErrorCode::QuantifierOutOfOrder; |
| lastTokenWasAnAtom = false; |
| break; |
| } |
| } |
| |
| if (m_isUnicode) { |
| m_errorCode = ErrorCode::QuantifierIncomplete; |
| break; |
| } |
| |
| restoreState(state); |
| // If we did not find a complete quantifier, rewind and fall through to the default case so '{' is a literal. |
| FALLTHROUGH; |
| } |
| |
| default: |
| m_delegate.atomPatternCharacter(consumePossibleSurrogatePair<UnicodeParseContext::PatternCodePoint>()); |
| lastTokenWasAnAtom = true; |
| } |
| |
| if (hasError(m_errorCode)) |
| return; |
| } |
| |
| if (!m_parenthesesStack.isEmpty()) |
| m_errorCode = ErrorCode::MissingParentheses; |
| } |
| |
| /* |
| * parse(): |
| * |
| * This method calls parseTokens() to parse over the input and returns error code for a result. |
| */ |
| ErrorCode parse() |
| { |
| if (m_size > MAX_PATTERN_SIZE) |
| return ErrorCode::PatternTooLarge; |
| |
| parseTokens(); |
| |
| if (!hasError(m_errorCode)) { |
| ASSERT(atEndOfPattern()); |
| handleIllegalReferences(); |
| ASSERT(atEndOfPattern()); |
| } |
| |
| return m_errorCode; |
| } |
| |
| void handleIllegalReferences() |
| { |
| bool shouldReparse = false; |
| |
| if (m_maxSeenBackReference > m_numSubpatterns) { |
| // Contains illegal numeric backreference. See https://tc39.es/ecma262/#prod-annexB-AtomEscape |
| if (m_isUnicode) { |
| m_errorCode = ErrorCode::InvalidBackreference; |
| return; |
| } |
| |
| m_backReferenceLimit = m_numSubpatterns; |
| shouldReparse = true; |
| } |
| |
| if (m_kIdentityEscapeSeen && !m_captureGroupNames.isEmpty()) { |
| m_errorCode = ErrorCode::InvalidNamedBackReference; |
| return; |
| } |
| |
| if (containsIllegalNamedForwardReference()) { |
| // \k<a> is parsed as named reference in Unicode patterns because of strict IdentityEscape grammar. |
| // See https://tc39.es/ecma262/#sec-patterns-static-semantics-early-errors |
| if (m_isUnicode || !m_captureGroupNames.isEmpty()) { |
| m_errorCode = ErrorCode::InvalidNamedBackReference; |
| return; |
| } |
| |
| m_isNamedForwardReferenceAllowed = false; |
| shouldReparse = true; |
| } |
| |
| if (shouldReparse) { |
| resetForReparsing(); |
| parseTokens(); |
| } |
| } |
| |
| bool containsIllegalNamedForwardReference() |
| { |
| if (m_forwardReferenceNames.isEmpty()) |
| return false; |
| |
| if (m_captureGroupNames.isEmpty()) |
| return true; |
| |
| for (auto& entry : m_forwardReferenceNames) { |
| if (!m_captureGroupNames.contains(entry)) |
| return true; |
| } |
| |
| return false; |
| } |
| |
| void resetForReparsing() |
| { |
| ASSERT(!hasError(m_errorCode)); |
| |
| m_delegate.resetForReparsing(); |
| m_index = 0; |
| m_numSubpatterns = 0; |
| m_maxSeenBackReference = 0; |
| m_kIdentityEscapeSeen = false; |
| m_parenthesesStack.clear(); |
| m_captureGroupNames.clear(); |
| m_forwardReferenceNames.clear(); |
| } |
| |
| // Misc helper functions: |
| |
// A saved parse position; this is simply an index into the pattern.
using ParseState = unsigned;
| |
| ParseState saveState() |
| { |
| return m_index; |
| } |
| |
| void restoreState(ParseState state) |
| { |
| m_index = state; |
| } |
| |
| bool atEndOfPattern() |
| { |
| ASSERT(m_index <= m_size); |
| return m_index == m_size; |
| } |
| |
| unsigned patternRemaining() |
| { |
| ASSERT(m_index <= m_size); |
| return m_size - m_index; |
| } |
| |
| int peek() |
| { |
| ASSERT(m_index < m_size); |
| return m_data[m_index]; |
| } |
| |
| bool peekIsDigit() |
| { |
| return !atEndOfPattern() && WTF::isASCIIDigit(peek()); |
| } |
| |
| unsigned peekDigit() |
| { |
| ASSERT(peekIsDigit()); |
| return peek() - '0'; |
| } |
| |
| template<UnicodeParseContext context> |
| int tryConsumeUnicodeEscape() |
| { |
| ASSERT(!hasError(m_errorCode)); |
| |
| bool unicodePatternOrGroupName = m_isUnicode || context == UnicodeParseContext::GroupName; |
| |
| if (!tryConsume('u') || atEndOfPattern()) { |
| if (unicodePatternOrGroupName) |
| m_errorCode = ErrorCode::InvalidUnicodeEscape; |
| return -1; |
| } |
| |
| if (unicodePatternOrGroupName && tryConsume('{')) { |
| int codePoint = 0; |
| do { |
| if (atEndOfPattern() || !isASCIIHexDigit(peek())) { |
| m_errorCode = ErrorCode::InvalidUnicodeCodePointEscape; |
| return -1; |
| } |
| |
| codePoint = (codePoint << 4) | toASCIIHexValue(consume()); |
| |
| if (codePoint > UCHAR_MAX_VALUE) { |
| m_errorCode = ErrorCode::InvalidUnicodeCodePointEscape; |
| return -1; |
| } |
| } while (!atEndOfPattern() && peek() != '}'); |
| |
| if (!tryConsume('}')) { |
| m_errorCode = ErrorCode::InvalidUnicodeCodePointEscape; |
| return -1; |
| } |
| |
| return codePoint; |
| } |
| |
| int codeUnit = tryConsumeHex(4); |
| if (codeUnit == -1) { |
| if (unicodePatternOrGroupName) |
| m_errorCode = ErrorCode::InvalidUnicodeEscape; |
| return -1; |
| } |
| |
| // If we have the first of a surrogate pair, look for the second. |
| if (U16_IS_LEAD(codeUnit) && unicodePatternOrGroupName && patternRemaining() >= 6 && peek() == '\\') { |
| ParseState state = saveState(); |
| consume(); |
| |
| if (tryConsume('u')) { |
| int surrogate2 = tryConsumeHex(4); |
| if (U16_IS_TRAIL(surrogate2)) |
| return U16_GET_SUPPLEMENTARY(codeUnit, surrogate2); |
| } |
| |
| restoreState(state); |
| } |
| |
| return codeUnit; |
| } |
| |
| int tryConsumeIdentifierCharacter() |
| { |
| if (tryConsume('\\')) |
| return tryConsumeUnicodeEscape<UnicodeParseContext::GroupName>(); |
| |
| return consumePossibleSurrogatePair<UnicodeParseContext::GroupName>(); |
| } |
| |
| bool isIdentifierStart(int ch) |
| { |
| return (WTF::isASCII(ch) && (WTF::isASCIIAlpha(ch) || ch == '_' || ch == '$')) || (U_GET_GC_MASK(ch) & U_GC_L_MASK); |
| } |
| |
| bool isIdentifierPart(int ch) |
| { |
| return (WTF::isASCII(ch) && (WTF::isASCIIAlpha(ch) || ch == '_' || ch == '$')) || (U_GET_GC_MASK(ch) & (U_GC_L_MASK | U_GC_MN_MASK | U_GC_MC_MASK | U_GC_ND_MASK | U_GC_PC_MASK)) || ch == 0x200C || ch == 0x200D; |
| } |
| |
| bool isUnicodePropertyValueExpressionChar(int ch) |
| { |
| return WTF::isASCIIAlphanumeric(ch) || ch == '_' || ch == '='; |
| } |
| |
| int consume() |
| { |
| ASSERT(m_index < m_size); |
| return m_data[m_index++]; |
| } |
| |
| unsigned consumeDigit() |
| { |
| ASSERT(peekIsDigit()); |
| return consume() - '0'; |
| } |
| |
| unsigned consumeNumber() |
| { |
| Checked<unsigned, RecordOverflow> n = consumeDigit(); |
| while (peekIsDigit()) |
| n = n * 10 + consumeDigit(); |
| return n.hasOverflowed() ? quantifyInfinite : n.unsafeGet(); |
| } |
| |
| // https://tc39.es/ecma262/#prod-annexB-LegacyOctalEscapeSequence |
| unsigned consumeOctal(unsigned count) |
| { |
| unsigned octal = 0; |
| while (count-- && octal < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) |
| octal = octal * 8 + consumeDigit(); |
| return octal; |
| } |
| |
| bool tryConsume(UChar ch) |
| { |
| if (atEndOfPattern() || (m_data[m_index] != ch)) |
| return false; |
| ++m_index; |
| return true; |
| } |
| |
| int tryConsumeHex(int count) |
| { |
| ParseState state = saveState(); |
| |
| int n = 0; |
| while (count--) { |
| if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { |
| restoreState(state); |
| return -1; |
| } |
| n = (n << 4) | WTF::toASCIIHexValue(consume()); |
| } |
| return n; |
| } |
| |
| Optional<String> tryConsumeGroupName() |
| { |
| if (atEndOfPattern()) |
| return WTF::nullopt; |
| |
| ParseState state = saveState(); |
| |
| int ch = tryConsumeIdentifierCharacter(); |
| |
| if (isIdentifierStart(ch)) { |
| StringBuilder identifierBuilder; |
| identifierBuilder.appendCharacter(ch); |
| |
| while (!atEndOfPattern()) { |
| ch = tryConsumeIdentifierCharacter(); |
| if (ch == '>') |
| return Optional<String>(identifierBuilder.toString()); |
| |
| if (!isIdentifierPart(ch)) |
| break; |
| |
| identifierBuilder.appendCharacter(ch); |
| } |
| } |
| |
| restoreState(state); |
| |
| return WTF::nullopt; |
| } |
| |
| // Consumes a Unicode property expression up to and including its closing '}'. |
| // The expression is either `Name` (looked up with unicodeMatchProperty()) or |
| // `Name=Value` (looked up with unicodeMatchPropertyValue()). |
| // |
| // Returns the matched built-in character class. On any failure (empty or |
| // malformed expression, a second '=', a missing '}', or a failed lookup) |
| // m_errorCode is set to ErrorCode::InvalidUnicodePropertyExpression and nullopt |
| // is returned. Characters consumed before the failure are not given back. |
| Optional<BuiltInCharacterClassID> tryConsumeUnicodePropertyExpression() |
| { |
|     if (atEndOfPattern() || !isUnicodePropertyValueExpressionChar(peek())) { |
|         m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; |
|         return WTF::nullopt; |
|     } |
| |
|     StringBuilder currentPart; |
|     String propertyName; |
|     bool sawEquals = false; |
|     bool sawInvalidInput = false; |
| |
|     currentPart.appendCharacter(consume()); |
| |
|     while (!atEndOfPattern()) { |
|         int ch = peek(); |
|         consume(); |
| |
|         if (ch == '}') { |
|             // Invalid input is only reported once the closing brace has been consumed. |
|             if (sawInvalidInput) |
|                 break; |
| |
|             Optional<BuiltInCharacterClassID> classID; |
|             if (sawEquals) |
|                 classID = unicodeMatchPropertyValue(propertyName, currentPart.toString()); |
|             else |
|                 classID = unicodeMatchProperty(currentPart.toString()); |
| |
|             if (!classID) |
|                 m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; |
|             return classID; |
|         } |
| |
|         if (ch == '=') { |
|             if (sawEquals) { |
|                 sawInvalidInput = true; |
|                 continue; |
|             } |
|             // Everything gathered so far is the property name; start collecting the value. |
|             sawEquals = true; |
|             propertyName = currentPart.toString(); |
|             currentPart.clear(); |
|             continue; |
|         } |
| |
|         if (isUnicodePropertyValueExpressionChar(ch)) |
|             currentPart.appendCharacter(ch); |
|         else |
|             sawInvalidInput = true; |
|     } |
| |
|     // Reached on invalid input followed by '}' or when the pattern ends without '}'. |
|     m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; |
|     return WTF::nullopt; |
| } |
| |
| enum class ParenthesesType : uint8_t { Subpattern, Assertion }; /* Element type of m_parenthesesStack. */ |
| |
| Delegate& m_delegate; /* Held by reference; per the Yarr::parse() comment, the delegate receives the parse callbacks. */ |
| ErrorCode m_errorCode { ErrorCode::NoError }; /* Starts as NoError; e.g. tryConsumeUnicodePropertyExpression() stores InvalidUnicodePropertyExpression here on failure. */ |
| const CharType* m_data; /* NOTE(review): presumably the pattern's characters (CharType is LChar or UChar, see Yarr::parse()) - confirm against the constructor. */ |
| unsigned m_size; /* NOTE(review): presumably the number of characters in m_data - confirm. */ |
| unsigned m_index { 0 }; /* Starts at 0. NOTE(review): looks like the read position within m_data - confirm. */ |
| bool m_isUnicode; /* NOTE(review): presumably taken from the isUnicode argument of Yarr::parse() - confirm. */ |
| unsigned m_backReferenceLimit; /* NOTE(review): presumably taken from the backReferenceLimit argument of Yarr::parse() - confirm. */ |
| unsigned m_numSubpatterns { 0 }; |
| unsigned m_maxSeenBackReference { 0 }; |
| bool m_isNamedForwardReferenceAllowed; /* NOTE(review): presumably taken from the isNamedForwardReferenceAllowed argument of Yarr::parse() - confirm. */ |
| bool m_kIdentityEscapeSeen { false }; |
| Vector<ParenthesesType, 16> m_parenthesesStack; |
| HashSet<String> m_captureGroupNames; |
| HashSet<String> m_forwardReferenceNames; |
| |
| // Derived by empirical testing of compile time in PCRE and WREC. |
| static constexpr unsigned MAX_PATTERN_SIZE = 1024 * 1024; |
| }; |
| |
| /* |
| * Yarr::parse(): |
| * |
| * The parse method is passed a pattern to be parsed and a delegate upon which |
| * callbacks will be made to record the parsed tokens forming the regex. |
| * Yarr::parse() returns an ErrorCode: ErrorCode::NoError on success, or another |
| * ErrorCode value identifying the parse error that occurred. |
| * |
| * The Delegate must implement the following interface: |
| * |
| * void assertionBOL(); |
| * void assertionEOL(); |
| * void assertionWordBoundary(bool invert); |
| * |
| * void atomPatternCharacter(UChar32 ch); |
| * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); |
| * void atomCharacterClassBegin(bool invert); |
| * void atomCharacterClassAtom(UChar32 ch); |
| * void atomCharacterClassRange(UChar32 begin, UChar32 end); |
| * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert); |
| * void atomCharacterClassEnd(); |
| * void atomParenthesesSubpatternBegin(bool capture = true, Optional<String> groupName); |
| * void atomParentheticalAssertionBegin(bool invert = false); |
| * void atomParenthesesEnd(); |
| * void atomBackReference(unsigned subpatternId); |
| * void atomNamedBackReference(const String& subpatternName); |
| * void atomNamedForwardReference(const String& subpatternName); |
| * |
| * void quantifyAtom(unsigned min, unsigned max, bool greedy); |
| * |
| * void disjunction(); |
| * |
| * void resetForReparsing(); |
| * |
| * The regular expression is described by a sequence of assertion*() and atom*() |
| * callbacks to the delegate, describing the terms in the regular expression. |
| * Following an atom a quantifyAtom() call may occur to indicate that the previous |
| * atom should be quantified. In the case of atoms described across multiple |
| * calls (parentheses and character classes) the call to quantifyAtom() will come |
| * after the call to the atom*End() method, never after atom*Begin(). |
| * |
| * Character classes may either be described by a single call to |
| * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. |
| * In the latter case, ...Begin() will be called, followed by a sequence of |
| * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). |
| * |
| * Sequences of atoms and assertions are broken into alternatives via calls to |
| * disjunction(). Assertions, atoms, and disjunctions emitted between calls to |
| * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. |
| * atomParenthesesBegin() is passed a subpatternId. In the case of a regular |
| * capturing subpattern, this will be the subpatternId associated with these |
| * parentheses, and will also by definition be the lowest subpatternId of these |
| * parentheses and of any nested parentheses. The atomParenthesesEnd() method |
| * is passed the subpatternId of the last capturing subexpression nested within |
| * these parentheses. In the case of a capturing subpattern with no nested |
| * capturing subpatterns, the same subpatternId will be passed to the begin and |
| * end functions. In the case of non-capturing subpatterns the subpatternId |
| * passed to the begin method is also the first possible subpatternId that might |
| * be nested within these parentheses. If a set of non-capturing parentheses does |
| * not contain any capturing subpatterns, then the subpatternId passed to begin |
| * will be greater than the subpatternId passed to end. |
| */ |
| |
| // Entry point for parsing a regular expression pattern. Instantiates the Parser |
| // specialization that matches the pattern string's character width (UChar for |
| // 16-bit strings, LChar for 8-bit strings), runs it against `delegate`, and |
| // returns the ErrorCode produced by Parser::parse(). |
| template<class Delegate> |
| ErrorCode parse(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit = quantifyInfinite, bool isNamedForwardReferenceAllowed = true) |
| { |
|     if (!pattern.is8Bit()) { |
|         Parser<Delegate, UChar> wideParser(delegate, pattern, isUnicode, backReferenceLimit, isNamedForwardReferenceAllowed); |
|         return wideParser.parse(); |
|     } |
| |
|     Parser<Delegate, LChar> narrowParser(delegate, pattern, isUnicode, backReferenceLimit, isNamedForwardReferenceAllowed); |
|     return narrowParser.parse(); |
| } |
| |
| } } // namespace JSC::Yarr |