| /* |
| * Copyright (C) 2008 Apple Inc. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
| * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
| * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "config.h" |
| #include "WRECParser.h" |
| |
| #if ENABLE(WREC) |
| |
| #include "CharacterClassConstructor.h" |
| #include "WRECFunctors.h" |
| |
| using namespace WTF; |
| |
| namespace JSC { namespace WREC { |
| |
| // These error messages match the error messages used by PCRE. |
| const char* Parser::QuantifierOutOfOrder = "numbers out of order in {} quantifier"; |
| const char* Parser::QuantifierWithoutAtom = "nothing to repeat"; |
| const char* Parser::ParenthesesUnmatched = "unmatched parentheses"; |
| const char* Parser::ParenthesesTypeInvalid = "unrecognized character after (?"; |
| const char* Parser::ParenthesesNotSupported = ""; // Not a user-visible syntax error -- just signals a syntax that WREC doesn't support yet. |
| const char* Parser::CharacterClassUnmatched = "missing terminating ] for character class"; |
| const char* Parser::CharacterClassOutOfOrder = "range out of order in character class"; |
| const char* Parser::EscapeUnterminated = "\\ at end of pattern"; |
| |
| class PatternCharacterSequence { |
| typedef Generator::JumpList JumpList; |
| |
| public: |
| PatternCharacterSequence(Generator& generator, JumpList& failures) |
| : m_generator(generator) |
| , m_failures(failures) |
| { |
| } |
| |
| size_t size() { return m_sequence.size(); } |
| |
| void append(int ch) |
| { |
| m_sequence.append(ch); |
| } |
| |
| void flush() |
| { |
| if (!m_sequence.size()) |
| return; |
| |
| m_generator.generatePatternCharacterSequence(m_failures, m_sequence.begin(), m_sequence.size()); |
| m_sequence.clear(); |
| } |
| |
| void flush(const Quantifier& quantifier) |
| { |
| if (!m_sequence.size()) |
| return; |
| |
| m_generator.generatePatternCharacterSequence(m_failures, m_sequence.begin(), m_sequence.size() - 1); |
| |
| switch (quantifier.type) { |
| case Quantifier::None: |
| case Quantifier::Error: |
| ASSERT_NOT_REACHED(); |
| break; |
| |
| case Quantifier::Greedy: { |
| GeneratePatternCharacterFunctor functor(m_sequence.last()); |
| m_generator.generateGreedyQuantifier(m_failures, functor, quantifier.min, quantifier.max); |
| break; |
| } |
| |
| case Quantifier::NonGreedy: { |
| GeneratePatternCharacterFunctor functor(m_sequence.last()); |
| m_generator.generateNonGreedyQuantifier(m_failures, functor, quantifier.min, quantifier.max); |
| break; |
| } |
| } |
| |
| m_sequence.clear(); |
| } |
| |
| private: |
| Generator& m_generator; |
| JumpList& m_failures; |
| Vector<int, 8> m_sequence; |
| }; |
| |
| ALWAYS_INLINE Quantifier Parser::consumeGreedyQuantifier() |
| { |
| switch (peek()) { |
| case '?': |
| consume(); |
| return Quantifier(Quantifier::Greedy, 0, 1); |
| |
| case '*': |
| consume(); |
| return Quantifier(Quantifier::Greedy, 0); |
| |
| case '+': |
| consume(); |
| return Quantifier(Quantifier::Greedy, 1); |
| |
| case '{': { |
| SavedState state(*this); |
| consume(); |
| |
| // Accept: {n}, {n,}, {n,m}. |
| // Reject: {n,m} where n > m. |
| // Ignore: Anything else, such as {n, m}. |
| |
| if (!peekIsDigit()) { |
| state.restore(); |
| return Quantifier(); |
| } |
| |
| unsigned min = consumeNumber(); |
| unsigned max = min; |
| |
| if (peek() == ',') { |
| consume(); |
| max = peekIsDigit() ? consumeNumber() : Quantifier::Infinity; |
| } |
| |
| if (peek() != '}') { |
| state.restore(); |
| return Quantifier(); |
| } |
| consume(); |
| |
| if (min > max) { |
| setError(QuantifierOutOfOrder); |
| return Quantifier(Quantifier::Error); |
| } |
| |
| return Quantifier(Quantifier::Greedy, min, max); |
| } |
| |
| default: |
| return Quantifier(); // No quantifier. |
| } |
| } |
| |
| Quantifier Parser::consumeQuantifier() |
| { |
| Quantifier q = consumeGreedyQuantifier(); |
| |
| if ((q.type == Quantifier::Greedy) && (peek() == '?')) { |
| consume(); |
| q.type = Quantifier::NonGreedy; |
| } |
| |
| return q; |
| } |
| |
| bool Parser::parseCharacterClassQuantifier(JumpList& failures, const CharacterClass& charClass, bool invert) |
| { |
| Quantifier q = consumeQuantifier(); |
| |
| switch (q.type) { |
| case Quantifier::None: { |
| m_generator.generateCharacterClass(failures, charClass, invert); |
| break; |
| } |
| |
| case Quantifier::Greedy: { |
| GenerateCharacterClassFunctor functor(&charClass, invert); |
| m_generator.generateGreedyQuantifier(failures, functor, q.min, q.max); |
| break; |
| } |
| |
| case Quantifier::NonGreedy: { |
| GenerateCharacterClassFunctor functor(&charClass, invert); |
| m_generator.generateNonGreedyQuantifier(failures, functor, q.min, q.max); |
| break; |
| } |
| |
| case Quantifier::Error: |
| return false; |
| } |
| |
| return true; |
| } |
| |
| bool Parser::parseBackreferenceQuantifier(JumpList& failures, unsigned subpatternId) |
| { |
| Quantifier q = consumeQuantifier(); |
| |
| switch (q.type) { |
| case Quantifier::None: { |
| m_generator.generateBackreference(failures, subpatternId); |
| break; |
| } |
| |
| case Quantifier::Greedy: |
| case Quantifier::NonGreedy: |
| m_generator.generateBackreferenceQuantifier(failures, q.type, subpatternId, q.min, q.max); |
| return true; |
| |
| case Quantifier::Error: |
| return false; |
| } |
| |
| return true; |
| } |
| |
| bool Parser::parseParentheses(JumpList& failures) |
| { |
| ParenthesesType type = consumeParenthesesType(); |
| |
| // FIXME: WREC originally failed to backtrack correctly in cases such as |
| // "c".match(/(.*)c/). Now, most parentheses handling is disabled. For |
| // unsupported parentheses, we fall back on PCRE. |
| |
| switch (type) { |
| case Generator::Assertion: { |
| m_generator.generateParenthesesAssertion(failures); |
| |
| if (consume() != ')') { |
| setError(ParenthesesUnmatched); |
| return false; |
| } |
| |
| Quantifier quantifier = consumeQuantifier(); |
| if (quantifier.type != Quantifier::None && quantifier.min == 0) { |
| setError(ParenthesesNotSupported); |
| return false; |
| } |
| |
| return true; |
| } |
| case Generator::InvertedAssertion: { |
| m_generator.generateParenthesesInvertedAssertion(failures); |
| |
| if (consume() != ')') { |
| setError(ParenthesesUnmatched); |
| return false; |
| } |
| |
| Quantifier quantifier = consumeQuantifier(); |
| if (quantifier.type != Quantifier::None && quantifier.min == 0) { |
| setError(ParenthesesNotSupported); |
| return false; |
| } |
| |
| return true; |
| } |
| default: |
| setError(ParenthesesNotSupported); |
| return false; |
| } |
| } |
| |
| bool Parser::parseCharacterClass(JumpList& failures) |
| { |
| bool invert = false; |
| if (peek() == '^') { |
| consume(); |
| invert = true; |
| } |
| |
| CharacterClassConstructor constructor(m_ignoreCase); |
| |
| int ch; |
| while ((ch = peek()) != ']') { |
| switch (ch) { |
| case EndOfPattern: |
| setError(CharacterClassUnmatched); |
| return false; |
| |
| case '\\': { |
| consume(); |
| Escape escape = consumeEscape(true); |
| |
| switch (escape.type()) { |
| case Escape::PatternCharacter: { |
| int character = PatternCharacterEscape::cast(escape).character(); |
| if (character == '-') |
| constructor.flushBeforeEscapedHyphen(); |
| constructor.put(character); |
| break; |
| } |
| case Escape::CharacterClass: { |
| const CharacterClassEscape& characterClassEscape = CharacterClassEscape::cast(escape); |
| ASSERT(!characterClassEscape.invert()); |
| constructor.append(characterClassEscape.characterClass()); |
| break; |
| } |
| case Escape::Error: |
| return false; |
| case Escape::Backreference: |
| case Escape::WordBoundaryAssertion: { |
| ASSERT_NOT_REACHED(); |
| break; |
| } |
| } |
| break; |
| } |
| |
| default: |
| consume(); |
| constructor.put(ch); |
| } |
| } |
| consume(); |
| |
| // lazily catch reversed ranges ([z-a])in character classes |
| if (constructor.isUpsideDown()) { |
| setError(CharacterClassOutOfOrder); |
| return false; |
| } |
| |
| constructor.flush(); |
| CharacterClass charClass = constructor.charClass(); |
| return parseCharacterClassQuantifier(failures, charClass, invert); |
| } |
| |
| bool Parser::parseNonCharacterEscape(JumpList& failures, const Escape& escape) |
| { |
| switch (escape.type()) { |
| case Escape::PatternCharacter: |
| ASSERT_NOT_REACHED(); |
| return false; |
| |
| case Escape::CharacterClass: |
| return parseCharacterClassQuantifier(failures, CharacterClassEscape::cast(escape).characterClass(), CharacterClassEscape::cast(escape).invert()); |
| |
| case Escape::Backreference: |
| return parseBackreferenceQuantifier(failures, BackreferenceEscape::cast(escape).subpatternId()); |
| |
| case Escape::WordBoundaryAssertion: |
| m_generator.generateAssertionWordBoundary(failures, WordBoundaryAssertionEscape::cast(escape).invert()); |
| return true; |
| |
| case Escape::Error: |
| return false; |
| } |
| |
| ASSERT_NOT_REACHED(); |
| return false; |
| } |
| |
| Escape Parser::consumeEscape(bool inCharacterClass) |
| { |
| switch (peek()) { |
| case EndOfPattern: |
| setError(EscapeUnterminated); |
| return Escape(Escape::Error); |
| |
| // Assertions |
| case 'b': |
| consume(); |
| if (inCharacterClass) |
| return PatternCharacterEscape('\b'); |
| return WordBoundaryAssertionEscape(false); // do not invert |
| case 'B': |
| consume(); |
| if (inCharacterClass) |
| return PatternCharacterEscape('B'); |
| return WordBoundaryAssertionEscape(true); // invert |
| |
| // CharacterClassEscape |
| case 'd': |
| consume(); |
| return CharacterClassEscape(CharacterClass::digits(), false); |
| case 's': |
| consume(); |
| return CharacterClassEscape(CharacterClass::spaces(), false); |
| case 'w': |
| consume(); |
| return CharacterClassEscape(CharacterClass::wordchar(), false); |
| case 'D': |
| consume(); |
| return inCharacterClass |
| ? CharacterClassEscape(CharacterClass::nondigits(), false) |
| : CharacterClassEscape(CharacterClass::digits(), true); |
| case 'S': |
| consume(); |
| return inCharacterClass |
| ? CharacterClassEscape(CharacterClass::nonspaces(), false) |
| : CharacterClassEscape(CharacterClass::spaces(), true); |
| case 'W': |
| consume(); |
| return inCharacterClass |
| ? CharacterClassEscape(CharacterClass::nonwordchar(), false) |
| : CharacterClassEscape(CharacterClass::wordchar(), true); |
| |
| // DecimalEscape |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| case '8': |
| case '9': { |
| if (peekDigit() > m_numSubpatterns || inCharacterClass) { |
| // To match Firefox, we parse an invalid backreference in the range [1-7] |
| // as an octal escape. |
| return peekDigit() > 7 ? PatternCharacterEscape('\\') : PatternCharacterEscape(consumeOctal()); |
| } |
| |
| int value = 0; |
| do { |
| unsigned newValue = value * 10 + peekDigit(); |
| if (newValue > m_numSubpatterns) |
| break; |
| value = newValue; |
| consume(); |
| } while (peekIsDigit()); |
| |
| return BackreferenceEscape(value); |
| } |
| |
| // Octal escape |
| case '0': |
| consume(); |
| return PatternCharacterEscape(consumeOctal()); |
| |
| // ControlEscape |
| case 'f': |
| consume(); |
| return PatternCharacterEscape('\f'); |
| case 'n': |
| consume(); |
| return PatternCharacterEscape('\n'); |
| case 'r': |
| consume(); |
| return PatternCharacterEscape('\r'); |
| case 't': |
| consume(); |
| return PatternCharacterEscape('\t'); |
| case 'v': |
| consume(); |
| return PatternCharacterEscape('\v'); |
| |
| // ControlLetter |
| case 'c': { |
| SavedState state(*this); |
| consume(); |
| |
| int control = consume(); |
| // To match Firefox, inside a character class, we also accept numbers |
| // and '_' as control characters. |
| if ((!inCharacterClass && !isASCIIAlpha(control)) || (!isASCIIAlphanumeric(control) && control != '_')) { |
| state.restore(); |
| return PatternCharacterEscape('\\'); |
| } |
| return PatternCharacterEscape(control & 31); |
| } |
| |
| // HexEscape |
| case 'x': { |
| consume(); |
| |
| SavedState state(*this); |
| int x = consumeHex(2); |
| if (x == -1) { |
| state.restore(); |
| return PatternCharacterEscape('x'); |
| } |
| return PatternCharacterEscape(x); |
| } |
| |
| // UnicodeEscape |
| case 'u': { |
| consume(); |
| |
| SavedState state(*this); |
| int x = consumeHex(4); |
| if (x == -1) { |
| state.restore(); |
| return PatternCharacterEscape('u'); |
| } |
| return PatternCharacterEscape(x); |
| } |
| |
| // IdentityEscape |
| default: |
| return PatternCharacterEscape(consume()); |
| } |
| } |
| |
| void Parser::parseAlternative(JumpList& failures) |
| { |
| PatternCharacterSequence sequence(m_generator, failures); |
| |
| while (1) { |
| switch (peek()) { |
| case EndOfPattern: |
| case '|': |
| case ')': |
| sequence.flush(); |
| return; |
| |
| case '*': |
| case '+': |
| case '?': |
| case '{': { |
| Quantifier q = consumeQuantifier(); |
| |
| if (q.type == Quantifier::None) { |
| sequence.append(consume()); |
| continue; |
| } |
| |
| if (q.type == Quantifier::Error) |
| return; |
| |
| if (!sequence.size()) { |
| setError(QuantifierWithoutAtom); |
| return; |
| } |
| |
| sequence.flush(q); |
| continue; |
| } |
| |
| case '^': |
| consume(); |
| |
| sequence.flush(); |
| m_generator.generateAssertionBOL(failures); |
| continue; |
| |
| case '$': |
| consume(); |
| |
| sequence.flush(); |
| m_generator.generateAssertionEOL(failures); |
| continue; |
| |
| case '.': |
| consume(); |
| |
| sequence.flush(); |
| if (!parseCharacterClassQuantifier(failures, CharacterClass::newline(), true)) |
| return; |
| continue; |
| |
| case '[': |
| consume(); |
| |
| sequence.flush(); |
| if (!parseCharacterClass(failures)) |
| return; |
| continue; |
| |
| case '(': |
| consume(); |
| |
| sequence.flush(); |
| if (!parseParentheses(failures)) |
| return; |
| continue; |
| |
| case '\\': { |
| consume(); |
| |
| Escape escape = consumeEscape(false); |
| if (escape.type() == Escape::PatternCharacter) { |
| sequence.append(PatternCharacterEscape::cast(escape).character()); |
| continue; |
| } |
| |
| sequence.flush(); |
| if (!parseNonCharacterEscape(failures, escape)) |
| return; |
| continue; |
| } |
| |
| default: |
| sequence.append(consume()); |
| continue; |
| } |
| } |
| } |
| |
| /* |
| TOS holds index. |
| */ |
| void Parser::parseDisjunction(JumpList& failures) |
| { |
| parseAlternative(failures); |
| if (peek() != '|') |
| return; |
| |
| JumpList successes; |
| do { |
| consume(); |
| m_generator.terminateAlternative(successes, failures); |
| parseAlternative(failures); |
| } while (peek() == '|'); |
| |
| m_generator.terminateDisjunction(successes); |
| } |
| |
| Generator::ParenthesesType Parser::consumeParenthesesType() |
| { |
| if (peek() != '?') |
| return Generator::Capturing; |
| consume(); |
| |
| switch (consume()) { |
| case ':': |
| return Generator::NonCapturing; |
| |
| case '=': |
| return Generator::Assertion; |
| |
| case '!': |
| return Generator::InvertedAssertion; |
| |
| default: |
| setError(ParenthesesTypeInvalid); |
| return Generator::Error; |
| } |
| } |
| |
| } } // namespace JSC::WREC |
| |
| #endif // ENABLE(WREC) |