blob: 1709bf92908986ff10b740ef2cc60ae85bd54114 [file] [log] [blame]
/*
* Copyright (C) 2008 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "WRECParser.h"
#if ENABLE(WREC)
#include "CharacterClassConstructor.h"
#include "WRECFunctors.h"
using namespace WTF;
namespace JSC { namespace WREC {
// These error messages match the error messages used by PCRE.
const char* Parser::QuantifierOutOfOrder = "numbers out of order in {} quantifier";
const char* Parser::QuantifierWithoutAtom = "nothing to repeat";
const char* Parser::ParenthesesUnmatched = "unmatched parentheses";
const char* Parser::ParenthesesTypeInvalid = "unrecognized character after (?";
const char* Parser::ParenthesesNotSupported = ""; // Not a user-visible syntax error -- just signals a syntax that WREC doesn't support yet.
const char* Parser::CharacterClassUnmatched = "missing terminating ] for character class";
const char* Parser::CharacterClassOutOfOrder = "range out of order in character class";
const char* Parser::EscapeUnterminated = "\\ at end of pattern";
class PatternCharacterSequence {
typedef Generator::JumpList JumpList;
public:
PatternCharacterSequence(Generator& generator, JumpList& failures)
: m_generator(generator)
, m_failures(failures)
{
}
size_t size() { return m_sequence.size(); }
void append(int ch)
{
m_sequence.append(ch);
}
void flush()
{
if (!m_sequence.size())
return;
m_generator.generatePatternCharacterSequence(m_failures, m_sequence.begin(), m_sequence.size());
m_sequence.clear();
}
void flush(const Quantifier& quantifier)
{
if (!m_sequence.size())
return;
m_generator.generatePatternCharacterSequence(m_failures, m_sequence.begin(), m_sequence.size() - 1);
switch (quantifier.type) {
case Quantifier::None:
case Quantifier::Error:
ASSERT_NOT_REACHED();
break;
case Quantifier::Greedy: {
GeneratePatternCharacterFunctor functor(m_sequence.last());
m_generator.generateGreedyQuantifier(m_failures, functor, quantifier.min, quantifier.max);
break;
}
case Quantifier::NonGreedy: {
GeneratePatternCharacterFunctor functor(m_sequence.last());
m_generator.generateNonGreedyQuantifier(m_failures, functor, quantifier.min, quantifier.max);
break;
}
}
m_sequence.clear();
}
private:
Generator& m_generator;
JumpList& m_failures;
Vector<int, 8> m_sequence;
};
ALWAYS_INLINE Quantifier Parser::consumeGreedyQuantifier()
{
switch (peek()) {
case '?':
consume();
return Quantifier(Quantifier::Greedy, 0, 1);
case '*':
consume();
return Quantifier(Quantifier::Greedy, 0);
case '+':
consume();
return Quantifier(Quantifier::Greedy, 1);
case '{': {
SavedState state(*this);
consume();
// Accept: {n}, {n,}, {n,m}.
// Reject: {n,m} where n > m.
// Ignore: Anything else, such as {n, m}.
if (!peekIsDigit()) {
state.restore();
return Quantifier();
}
unsigned min = consumeNumber();
unsigned max = min;
if (peek() == ',') {
consume();
max = peekIsDigit() ? consumeNumber() : Quantifier::Infinity;
}
if (peek() != '}') {
state.restore();
return Quantifier();
}
consume();
if (min > max) {
setError(QuantifierOutOfOrder);
return Quantifier(Quantifier::Error);
}
return Quantifier(Quantifier::Greedy, min, max);
}
default:
return Quantifier(); // No quantifier.
}
}
Quantifier Parser::consumeQuantifier()
{
Quantifier q = consumeGreedyQuantifier();
if ((q.type == Quantifier::Greedy) && (peek() == '?')) {
consume();
q.type = Quantifier::NonGreedy;
}
return q;
}
bool Parser::parseCharacterClassQuantifier(JumpList& failures, const CharacterClass& charClass, bool invert)
{
Quantifier q = consumeQuantifier();
switch (q.type) {
case Quantifier::None: {
m_generator.generateCharacterClass(failures, charClass, invert);
break;
}
case Quantifier::Greedy: {
GenerateCharacterClassFunctor functor(&charClass, invert);
m_generator.generateGreedyQuantifier(failures, functor, q.min, q.max);
break;
}
case Quantifier::NonGreedy: {
GenerateCharacterClassFunctor functor(&charClass, invert);
m_generator.generateNonGreedyQuantifier(failures, functor, q.min, q.max);
break;
}
case Quantifier::Error:
return false;
}
return true;
}
bool Parser::parseBackreferenceQuantifier(JumpList& failures, unsigned subpatternId)
{
Quantifier q = consumeQuantifier();
switch (q.type) {
case Quantifier::None: {
m_generator.generateBackreference(failures, subpatternId);
break;
}
case Quantifier::Greedy:
case Quantifier::NonGreedy:
m_generator.generateBackreferenceQuantifier(failures, q.type, subpatternId, q.min, q.max);
return true;
case Quantifier::Error:
return false;
}
return true;
}
bool Parser::parseParentheses(JumpList& failures)
{
ParenthesesType type = consumeParenthesesType();
// FIXME: WREC originally failed to backtrack correctly in cases such as
// "c".match(/(.*)c/). Now, most parentheses handling is disabled. For
// unsupported parentheses, we fall back on PCRE.
switch (type) {
case Generator::Assertion: {
m_generator.generateParenthesesAssertion(failures);
if (consume() != ')') {
setError(ParenthesesUnmatched);
return false;
}
Quantifier quantifier = consumeQuantifier();
if (quantifier.type != Quantifier::None && quantifier.min == 0) {
setError(ParenthesesNotSupported);
return false;
}
return true;
}
case Generator::InvertedAssertion: {
m_generator.generateParenthesesInvertedAssertion(failures);
if (consume() != ')') {
setError(ParenthesesUnmatched);
return false;
}
Quantifier quantifier = consumeQuantifier();
if (quantifier.type != Quantifier::None && quantifier.min == 0) {
setError(ParenthesesNotSupported);
return false;
}
return true;
}
default:
setError(ParenthesesNotSupported);
return false;
}
}
bool Parser::parseCharacterClass(JumpList& failures)
{
bool invert = false;
if (peek() == '^') {
consume();
invert = true;
}
CharacterClassConstructor constructor(m_ignoreCase);
int ch;
while ((ch = peek()) != ']') {
switch (ch) {
case EndOfPattern:
setError(CharacterClassUnmatched);
return false;
case '\\': {
consume();
Escape escape = consumeEscape(true);
switch (escape.type()) {
case Escape::PatternCharacter: {
int character = PatternCharacterEscape::cast(escape).character();
if (character == '-')
constructor.flushBeforeEscapedHyphen();
constructor.put(character);
break;
}
case Escape::CharacterClass: {
const CharacterClassEscape& characterClassEscape = CharacterClassEscape::cast(escape);
ASSERT(!characterClassEscape.invert());
constructor.append(characterClassEscape.characterClass());
break;
}
case Escape::Error:
return false;
case Escape::Backreference:
case Escape::WordBoundaryAssertion: {
ASSERT_NOT_REACHED();
break;
}
}
break;
}
default:
consume();
constructor.put(ch);
}
}
consume();
// lazily catch reversed ranges ([z-a])in character classes
if (constructor.isUpsideDown()) {
setError(CharacterClassOutOfOrder);
return false;
}
constructor.flush();
CharacterClass charClass = constructor.charClass();
return parseCharacterClassQuantifier(failures, charClass, invert);
}
bool Parser::parseNonCharacterEscape(JumpList& failures, const Escape& escape)
{
switch (escape.type()) {
case Escape::PatternCharacter:
ASSERT_NOT_REACHED();
return false;
case Escape::CharacterClass:
return parseCharacterClassQuantifier(failures, CharacterClassEscape::cast(escape).characterClass(), CharacterClassEscape::cast(escape).invert());
case Escape::Backreference:
return parseBackreferenceQuantifier(failures, BackreferenceEscape::cast(escape).subpatternId());
case Escape::WordBoundaryAssertion:
m_generator.generateAssertionWordBoundary(failures, WordBoundaryAssertionEscape::cast(escape).invert());
return true;
case Escape::Error:
return false;
}
ASSERT_NOT_REACHED();
return false;
}
Escape Parser::consumeEscape(bool inCharacterClass)
{
switch (peek()) {
case EndOfPattern:
setError(EscapeUnterminated);
return Escape(Escape::Error);
// Assertions
case 'b':
consume();
if (inCharacterClass)
return PatternCharacterEscape('\b');
return WordBoundaryAssertionEscape(false); // do not invert
case 'B':
consume();
if (inCharacterClass)
return PatternCharacterEscape('B');
return WordBoundaryAssertionEscape(true); // invert
// CharacterClassEscape
case 'd':
consume();
return CharacterClassEscape(CharacterClass::digits(), false);
case 's':
consume();
return CharacterClassEscape(CharacterClass::spaces(), false);
case 'w':
consume();
return CharacterClassEscape(CharacterClass::wordchar(), false);
case 'D':
consume();
return inCharacterClass
? CharacterClassEscape(CharacterClass::nondigits(), false)
: CharacterClassEscape(CharacterClass::digits(), true);
case 'S':
consume();
return inCharacterClass
? CharacterClassEscape(CharacterClass::nonspaces(), false)
: CharacterClassEscape(CharacterClass::spaces(), true);
case 'W':
consume();
return inCharacterClass
? CharacterClassEscape(CharacterClass::nonwordchar(), false)
: CharacterClassEscape(CharacterClass::wordchar(), true);
// DecimalEscape
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': {
if (peekDigit() > m_numSubpatterns || inCharacterClass) {
// To match Firefox, we parse an invalid backreference in the range [1-7]
// as an octal escape.
return peekDigit() > 7 ? PatternCharacterEscape('\\') : PatternCharacterEscape(consumeOctal());
}
int value = 0;
do {
unsigned newValue = value * 10 + peekDigit();
if (newValue > m_numSubpatterns)
break;
value = newValue;
consume();
} while (peekIsDigit());
return BackreferenceEscape(value);
}
// Octal escape
case '0':
consume();
return PatternCharacterEscape(consumeOctal());
// ControlEscape
case 'f':
consume();
return PatternCharacterEscape('\f');
case 'n':
consume();
return PatternCharacterEscape('\n');
case 'r':
consume();
return PatternCharacterEscape('\r');
case 't':
consume();
return PatternCharacterEscape('\t');
case 'v':
consume();
return PatternCharacterEscape('\v');
// ControlLetter
case 'c': {
SavedState state(*this);
consume();
int control = consume();
// To match Firefox, inside a character class, we also accept numbers
// and '_' as control characters.
if ((!inCharacterClass && !isASCIIAlpha(control)) || (!isASCIIAlphanumeric(control) && control != '_')) {
state.restore();
return PatternCharacterEscape('\\');
}
return PatternCharacterEscape(control & 31);
}
// HexEscape
case 'x': {
consume();
SavedState state(*this);
int x = consumeHex(2);
if (x == -1) {
state.restore();
return PatternCharacterEscape('x');
}
return PatternCharacterEscape(x);
}
// UnicodeEscape
case 'u': {
consume();
SavedState state(*this);
int x = consumeHex(4);
if (x == -1) {
state.restore();
return PatternCharacterEscape('u');
}
return PatternCharacterEscape(x);
}
// IdentityEscape
default:
return PatternCharacterEscape(consume());
}
}
void Parser::parseAlternative(JumpList& failures)
{
PatternCharacterSequence sequence(m_generator, failures);
while (1) {
switch (peek()) {
case EndOfPattern:
case '|':
case ')':
sequence.flush();
return;
case '*':
case '+':
case '?':
case '{': {
Quantifier q = consumeQuantifier();
if (q.type == Quantifier::None) {
sequence.append(consume());
continue;
}
if (q.type == Quantifier::Error)
return;
if (!sequence.size()) {
setError(QuantifierWithoutAtom);
return;
}
sequence.flush(q);
continue;
}
case '^':
consume();
sequence.flush();
m_generator.generateAssertionBOL(failures);
continue;
case '$':
consume();
sequence.flush();
m_generator.generateAssertionEOL(failures);
continue;
case '.':
consume();
sequence.flush();
if (!parseCharacterClassQuantifier(failures, CharacterClass::newline(), true))
return;
continue;
case '[':
consume();
sequence.flush();
if (!parseCharacterClass(failures))
return;
continue;
case '(':
consume();
sequence.flush();
if (!parseParentheses(failures))
return;
continue;
case '\\': {
consume();
Escape escape = consumeEscape(false);
if (escape.type() == Escape::PatternCharacter) {
sequence.append(PatternCharacterEscape::cast(escape).character());
continue;
}
sequence.flush();
if (!parseNonCharacterEscape(failures, escape))
return;
continue;
}
default:
sequence.append(consume());
continue;
}
}
}
/*
TOS holds index.
*/
void Parser::parseDisjunction(JumpList& failures)
{
parseAlternative(failures);
if (peek() != '|')
return;
JumpList successes;
do {
consume();
m_generator.terminateAlternative(successes, failures);
parseAlternative(failures);
} while (peek() == '|');
m_generator.terminateDisjunction(successes);
}
Generator::ParenthesesType Parser::consumeParenthesesType()
{
if (peek() != '?')
return Generator::Capturing;
consume();
switch (consume()) {
case ':':
return Generator::NonCapturing;
case '=':
return Generator::Assertion;
case '!':
return Generator::InvertedAssertion;
default:
setError(ParenthesesTypeInvalid);
return Generator::Error;
}
}
} } // namespace JSC::WREC
#endif // ENABLE(WREC)