blob: 8426b115fd2a1713f8c88376c5c9e133811e7aae [file] [log] [blame]
/*
* Copyright (C) 2009, 2010-2012, 2014, 2016 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "ConcurrentJSLock.h"
#include "YarrErrorCode.h"
#include "YarrFlags.h"
#include "YarrPattern.h"
namespace WTF {
class BumpPointerAllocator;
}
using WTF::BumpPointerAllocator;
namespace JSC { namespace Yarr {
class ByteDisjunction;
struct ByteTerm {
union {
struct {
union {
UChar32 patternCharacter;
struct {
UChar32 lo;
UChar32 hi;
} casedCharacter;
CharacterClass* characterClass;
unsigned subpatternId;
};
union {
ByteDisjunction* parenthesesDisjunction;
unsigned parenthesesWidth;
};
QuantifierType quantityType;
unsigned quantityMinCount;
unsigned quantityMaxCount;
} atom;
struct {
int next;
int end;
bool onceThrough;
} alternative;
struct {
bool m_bol : 1;
bool m_eol : 1;
} anchors;
unsigned checkInputCount;
};
unsigned frameLocation { 0 };
enum class Type : uint8_t {
BodyAlternativeBegin,
BodyAlternativeDisjunction,
BodyAlternativeEnd,
AlternativeBegin,
AlternativeDisjunction,
AlternativeEnd,
SubpatternBegin,
SubpatternEnd,
AssertionBOL,
AssertionEOL,
AssertionWordBoundary,
PatternCharacterOnce,
PatternCharacterFixed,
PatternCharacterGreedy,
PatternCharacterNonGreedy,
PatternCasedCharacterOnce,
PatternCasedCharacterFixed,
PatternCasedCharacterGreedy,
PatternCasedCharacterNonGreedy,
CharacterClass,
BackReference,
ParenthesesSubpattern,
ParenthesesSubpatternOnceBegin,
ParenthesesSubpatternOnceEnd,
ParenthesesSubpatternTerminalBegin,
ParenthesesSubpatternTerminalEnd,
ParentheticalAssertionBegin,
ParentheticalAssertionEnd,
CheckInput,
UncheckInput,
DotStarEnclosure,
};
Type type;
bool m_capture : 1;
bool m_invert : 1;
unsigned inputPosition { 0 };
ByteTerm(UChar32 ch, unsigned inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
: frameLocation(frameLocation)
, m_capture(false)
, m_invert(false)
, inputPosition(inputPos)
{
atom.patternCharacter = ch;
atom.quantityType = quantityType;
atom.quantityMinCount = quantityCount;
atom.quantityMaxCount = quantityCount;
switch (quantityType) {
case QuantifierType::FixedCount:
type = (quantityCount == 1) ? ByteTerm::Type::PatternCharacterOnce : ByteTerm::Type::PatternCharacterFixed;
break;
case QuantifierType::Greedy:
type = ByteTerm::Type::PatternCharacterGreedy;
break;
case QuantifierType::NonGreedy:
type = ByteTerm::Type::PatternCharacterNonGreedy;
break;
}
}
ByteTerm(UChar32 lo, UChar32 hi, unsigned inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
: frameLocation(frameLocation)
, m_capture(false)
, m_invert(false)
, inputPosition(inputPos)
{
switch (quantityType) {
case QuantifierType::FixedCount:
type = (quantityCount == 1) ? ByteTerm::Type::PatternCasedCharacterOnce : ByteTerm::Type::PatternCasedCharacterFixed;
break;
case QuantifierType::Greedy:
type = ByteTerm::Type::PatternCasedCharacterGreedy;
break;
case QuantifierType::NonGreedy:
type = ByteTerm::Type::PatternCasedCharacterNonGreedy;
break;
}
atom.casedCharacter.lo = lo;
atom.casedCharacter.hi = hi;
atom.quantityType = quantityType;
atom.quantityMinCount = quantityCount;
atom.quantityMaxCount = quantityCount;
}
ByteTerm(CharacterClass* characterClass, bool invert, unsigned inputPos)
: type(ByteTerm::Type::CharacterClass)
, m_capture(false)
, m_invert(invert)
, inputPosition(inputPos)
{
atom.characterClass = characterClass;
atom.quantityType = QuantifierType::FixedCount;
atom.quantityMinCount = 1;
atom.quantityMaxCount = 1;
}
ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, unsigned inputPos)
: type(type)
, m_capture(capture)
, m_invert(false)
, inputPosition(inputPos)
{
atom.subpatternId = subpatternId;
atom.parenthesesDisjunction = parenthesesInfo;
atom.quantityType = QuantifierType::FixedCount;
atom.quantityMinCount = 1;
atom.quantityMaxCount = 1;
}
ByteTerm(Type type, bool invert = false)
: type(type)
, m_capture(false)
, m_invert(invert)
{
atom.quantityType = QuantifierType::FixedCount;
atom.quantityMinCount = 1;
atom.quantityMaxCount = 1;
}
ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, unsigned inputPos)
: type(type)
, m_capture(capture)
, m_invert(invert)
, inputPosition(inputPos)
{
atom.subpatternId = subpatternId;
atom.quantityType = QuantifierType::FixedCount;
atom.quantityMinCount = 1;
atom.quantityMaxCount = 1;
}
static ByteTerm BOL(unsigned inputPos)
{
ByteTerm term(Type::AssertionBOL);
term.inputPosition = inputPos;
return term;
}
static ByteTerm CheckInput(Checked<unsigned> count)
{
ByteTerm term(Type::CheckInput);
term.checkInputCount = count;
return term;
}
static ByteTerm UncheckInput(Checked<unsigned> count)
{
ByteTerm term(Type::UncheckInput);
term.checkInputCount = count;
return term;
}
static ByteTerm EOL(unsigned inputPos)
{
ByteTerm term(Type::AssertionEOL);
term.inputPosition = inputPos;
return term;
}
static ByteTerm WordBoundary(bool invert, unsigned inputPos)
{
ByteTerm term(Type::AssertionWordBoundary, invert);
term.inputPosition = inputPos;
return term;
}
static ByteTerm BackReference(unsigned subpatternId, unsigned inputPos)
{
return ByteTerm(Type::BackReference, subpatternId, false, false, inputPos);
}
static ByteTerm BodyAlternativeBegin(bool onceThrough)
{
ByteTerm term(Type::BodyAlternativeBegin);
term.alternative.next = 0;
term.alternative.end = 0;
term.alternative.onceThrough = onceThrough;
return term;
}
static ByteTerm BodyAlternativeDisjunction(bool onceThrough)
{
ByteTerm term(Type::BodyAlternativeDisjunction);
term.alternative.next = 0;
term.alternative.end = 0;
term.alternative.onceThrough = onceThrough;
return term;
}
static ByteTerm BodyAlternativeEnd()
{
ByteTerm term(Type::BodyAlternativeEnd);
term.alternative.next = 0;
term.alternative.end = 0;
term.alternative.onceThrough = false;
return term;
}
static ByteTerm AlternativeBegin()
{
ByteTerm term(Type::AlternativeBegin);
term.alternative.next = 0;
term.alternative.end = 0;
term.alternative.onceThrough = false;
return term;
}
static ByteTerm AlternativeDisjunction()
{
ByteTerm term(Type::AlternativeDisjunction);
term.alternative.next = 0;
term.alternative.end = 0;
term.alternative.onceThrough = false;
return term;
}
static ByteTerm AlternativeEnd()
{
ByteTerm term(Type::AlternativeEnd);
term.alternative.next = 0;
term.alternative.end = 0;
term.alternative.onceThrough = false;
return term;
}
static ByteTerm SubpatternBegin()
{
return ByteTerm(Type::SubpatternBegin);
}
static ByteTerm SubpatternEnd()
{
return ByteTerm(Type::SubpatternEnd);
}
static ByteTerm DotStarEnclosure(bool bolAnchor, bool eolAnchor)
{
ByteTerm term(Type::DotStarEnclosure);
term.anchors.m_bol = bolAnchor;
term.anchors.m_eol = eolAnchor;
return term;
}
bool invert()
{
return m_invert;
}
bool capture()
{
return m_capture;
}
};
class ByteDisjunction {
WTF_MAKE_FAST_ALLOCATED;
public:
ByteDisjunction(unsigned numSubpatterns, unsigned frameSize)
: m_numSubpatterns(numSubpatterns)
, m_frameSize(frameSize)
{
}
size_t estimatedSizeInBytes() const { return terms.capacity() * sizeof(ByteTerm); }
Vector<ByteTerm> terms;
unsigned m_numSubpatterns;
unsigned m_frameSize;
};
struct BytecodePattern {
WTF_MAKE_FAST_ALLOCATED;
public:
BytecodePattern(std::unique_ptr<ByteDisjunction> body, Vector<std::unique_ptr<ByteDisjunction>>& parenthesesInfoToAdopt, YarrPattern& pattern, BumpPointerAllocator* allocator, ConcurrentJSLock* lock)
: m_body(WTFMove(body))
, m_flags(pattern.m_flags)
, m_allocator(allocator)
, m_lock(lock)
{
m_body->terms.shrinkToFit();
newlineCharacterClass = pattern.newlineCharacterClass();
if (unicode() && ignoreCase())
wordcharCharacterClass = pattern.wordUnicodeIgnoreCaseCharCharacterClass();
else
wordcharCharacterClass = pattern.wordcharCharacterClass();
m_allParenthesesInfo.swap(parenthesesInfoToAdopt);
m_allParenthesesInfo.shrinkToFit();
m_userCharacterClasses.swap(pattern.m_userCharacterClasses);
m_userCharacterClasses.shrinkToFit();
}
size_t estimatedSizeInBytes() const { return m_body->estimatedSizeInBytes(); }
bool ignoreCase() const { return m_flags.contains(Flags::IgnoreCase); }
bool multiline() const { return m_flags.contains(Flags::Multiline); }
bool hasIndices() const { return m_flags.contains(Flags::HasIndices); }
bool sticky() const { return m_flags.contains(Flags::Sticky); }
bool unicode() const { return m_flags.contains(Flags::Unicode); }
bool dotAll() const { return m_flags.contains(Flags::DotAll); }
std::unique_ptr<ByteDisjunction> m_body;
OptionSet<Flags> m_flags;
// Each BytecodePattern is associated with a RegExp, each RegExp is associated
// with a VM. Cache a pointer to our VM's m_regExpAllocator.
BumpPointerAllocator* m_allocator;
ConcurrentJSLock* m_lock;
CharacterClass* newlineCharacterClass;
CharacterClass* wordcharCharacterClass;
private:
Vector<std::unique_ptr<ByteDisjunction>> m_allParenthesesInfo;
Vector<std::unique_ptr<CharacterClass>> m_userCharacterClasses;
};
JS_EXPORT_PRIVATE std::unique_ptr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*, ErrorCode&, ConcurrentJSLock* = nullptr);
JS_EXPORT_PRIVATE unsigned interpret(BytecodePattern*, StringView input, unsigned start, unsigned* output);
unsigned interpret(BytecodePattern*, const LChar* input, unsigned length, unsigned start, unsigned* output);
unsigned interpret(BytecodePattern*, const UChar* input, unsigned length, unsigned start, unsigned* output);
} } // namespace JSC::Yarr