blob: 0d51833f5c0685bb888e3aa2177c387f4376a544 [file] [log] [blame]
/*
* Copyright (C) 2014-2015 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#if ENABLE(CONTENT_EXTENSIONS)
#include "ContentExtensionsDebugging.h"
#include "NFA.h"
#include <unicode/utypes.h>
#include <wtf/ASCIICType.h>
#include <wtf/HashMap.h>
#include <wtf/Vector.h>
#include <wtf/text/StringBuilder.h>
#include <wtf/text/WTFString.h>
namespace WebCore {
namespace ContentExtensions {
enum class AtomQuantifier : uint8_t {
One,
ZeroOrOne,
ZeroOrMore,
OneOrMore
};
class Term {
public:
Term();
Term(char character, bool isCaseSensitive);
enum UniversalTransitionTag { UniversalTransition };
explicit Term(UniversalTransitionTag);
enum CharacterSetTermTag { CharacterSetTerm };
explicit Term(CharacterSetTermTag, bool isInverted);
enum GroupTermTag { GroupTerm };
explicit Term(GroupTermTag);
enum EndOfLineAssertionTermTag { EndOfLineAssertionTerm };
explicit Term(EndOfLineAssertionTermTag);
Term(const Term& other);
Term(Term&& other);
enum EmptyValueTag { EmptyValue };
Term(EmptyValueTag);
~Term();
bool isValid() const;
// CharacterSet terms only.
void addCharacter(UChar character, bool isCaseSensitive);
// Group terms only.
void extendGroupSubpattern(const Term&);
void quantify(const AtomQuantifier&);
ImmutableCharNFANodeBuilder generateGraph(NFA&, ImmutableCharNFANodeBuilder& source, const ActionList& finalActions) const;
void generateGraph(NFA&, ImmutableCharNFANodeBuilder& source, uint32_t destination) const;
bool isEndOfLineAssertion() const;
bool matchesAtLeastOneCharacter() const;
// Matches any string, the empty string included.
// This is very conservative. Patterns matching any string can return false here.
bool isKnownToMatchAnyString() const;
// Return true if the term can only match input of a single fixed length.
bool hasFixedLength() const;
Term& operator=(const Term& other);
Term& operator=(Term&& other);
bool operator==(const Term& other) const;
unsigned hash() const;
bool isEmptyValue() const;
#if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING
String toString() const;
#endif
#if CONTENT_EXTENSIONS_PERFORMANCE_REPORTING
size_t memoryUsed() const;
#endif
private:
// This is exact for character sets but conservative for groups.
// The return value can be false for a group equivalent to a universal transition.
bool isUniversalTransition() const;
void generateSubgraphForAtom(NFA&, ImmutableCharNFANodeBuilder& source, const ImmutableCharNFANodeBuilder& destination) const;
void generateSubgraphForAtom(NFA&, ImmutableCharNFANodeBuilder& source, uint32_t destination) const;
void destroy();
enum class TermType : uint8_t {
Empty,
CharacterSet,
Group
};
TermType m_termType { TermType::Empty };
AtomQuantifier m_quantifier { AtomQuantifier::One };
class CharacterSet {
public:
void set(UChar character)
{
RELEASE_ASSERT(character < 128);
m_characters[character / 64] |= (uint64_t(1) << (character % 64));
}
bool get(UChar character) const
{
RELEASE_ASSERT(character < 128);
return m_characters[character / 64] & (uint64_t(1) << (character % 64));
}
void invert()
{
ASSERT(!m_inverted);
m_inverted = true;
}
bool inverted() const { return m_inverted; }
unsigned bitCount() const
{
return WTF::bitCount(m_characters[0]) + WTF::bitCount(m_characters[1]);
}
bool operator==(const CharacterSet& other) const
{
return other.m_inverted == m_inverted
&& other.m_characters[0] == m_characters[0]
&& other.m_characters[1] == m_characters[1];
}
unsigned hash() const
{
return WTF::pairIntHash(WTF::pairIntHash(WTF::intHash(m_characters[0]), WTF::intHash(m_characters[1])), m_inverted);
}
private:
bool m_inverted { false };
uint64_t m_characters[2] { 0, 0 };
};
struct Group {
Vector<Term> terms;
bool operator==(const Group& other) const
{
return other.terms == terms;
}
unsigned hash() const
{
unsigned hash = 6421749;
for (const Term& term : terms) {
unsigned termHash = term.hash();
hash = (hash << 16) ^ ((termHash << 11) ^ hash);
hash += hash >> 11;
}
return hash;
}
};
union AtomData {
AtomData()
: invalidTerm(0)
{
}
~AtomData()
{
}
char invalidTerm;
CharacterSet characterSet;
Group group;
} m_atomData;
};
#if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING
inline String quantifierToString(AtomQuantifier quantifier)
{
switch (quantifier) {
case AtomQuantifier::One:
return "";
case AtomQuantifier::ZeroOrOne:
return "?";
case AtomQuantifier::ZeroOrMore:
return "*";
case AtomQuantifier::OneOrMore:
return "+";
}
}
inline String Term::toString() const
{
switch (m_termType) {
case TermType::Empty:
return "(Empty)";
case TermType::CharacterSet: {
StringBuilder builder;
builder.append('[');
for (UChar c = 0; c < 128; c++) {
if (m_atomData.characterSet.get(c)) {
if (isASCIIPrintable(c) && !isASCIISpace(c))
builder.append(c);
else {
builder.append('\\');
builder.append('u');
builder.appendNumber(c);
}
}
}
builder.append(']');
builder.append(quantifierToString(m_quantifier));
return builder.toString();
}
case TermType::Group: {
StringBuilder builder;
builder.append('(');
for (const Term& term : m_atomData.group.terms)
builder.append(term.toString());
builder.append(')');
builder.append(quantifierToString(m_quantifier));
return builder.toString();
}
}
}
#endif
inline Term::Term()
{
}
inline Term::Term(char character, bool isCaseSensitive)
: m_termType(TermType::CharacterSet)
{
new (NotNull, &m_atomData.characterSet) CharacterSet();
addCharacter(character, isCaseSensitive);
}
inline Term::Term(UniversalTransitionTag)
: m_termType(TermType::CharacterSet)
{
new (NotNull, &m_atomData.characterSet) CharacterSet();
for (UChar i = 1; i < 128; ++i)
m_atomData.characterSet.set(i);
}
inline Term::Term(CharacterSetTermTag, bool isInverted)
: m_termType(TermType::CharacterSet)
{
new (NotNull, &m_atomData.characterSet) CharacterSet();
if (isInverted)
m_atomData.characterSet.invert();
}
inline Term::Term(GroupTermTag)
: m_termType(TermType::Group)
{
new (NotNull, &m_atomData.group) Group();
}
inline Term::Term(EndOfLineAssertionTermTag)
: Term(CharacterSetTerm, false)
{
m_atomData.characterSet.set(0);
}
inline Term::Term(const Term& other)
: m_termType(other.m_termType)
, m_quantifier(other.m_quantifier)
{
switch (m_termType) {
case TermType::Empty:
break;
case TermType::CharacterSet:
new (NotNull, &m_atomData.characterSet) CharacterSet(other.m_atomData.characterSet);
break;
case TermType::Group:
new (NotNull, &m_atomData.group) Group(other.m_atomData.group);
break;
}
}
inline Term::Term(Term&& other)
: m_termType(WTFMove(other.m_termType))
, m_quantifier(WTFMove(other.m_quantifier))
{
switch (m_termType) {
case TermType::Empty:
break;
case TermType::CharacterSet:
new (NotNull, &m_atomData.characterSet) CharacterSet(WTFMove(other.m_atomData.characterSet));
break;
case TermType::Group:
new (NotNull, &m_atomData.group) Group(WTFMove(other.m_atomData.group));
break;
}
other.destroy();
}
inline Term::Term(EmptyValueTag)
: m_termType(TermType::Empty)
{
}
inline Term::~Term()
{
destroy();
}
inline bool Term::isValid() const
{
return m_termType != TermType::Empty;
}
inline void Term::addCharacter(UChar character, bool isCaseSensitive)
{
ASSERT(isASCII(character));
ASSERT_WITH_SECURITY_IMPLICATION(m_termType == TermType::CharacterSet);
if (m_termType != TermType::CharacterSet)
return;
if (isCaseSensitive || !isASCIIAlpha(character))
m_atomData.characterSet.set(character);
else {
m_atomData.characterSet.set(toASCIIUpper(character));
m_atomData.characterSet.set(toASCIILower(character));
}
}
inline void Term::extendGroupSubpattern(const Term& term)
{
ASSERT_WITH_SECURITY_IMPLICATION(m_termType == TermType::Group);
if (m_termType != TermType::Group)
return;
m_atomData.group.terms.append(term);
}
inline void Term::quantify(const AtomQuantifier& quantifier)
{
ASSERT_WITH_MESSAGE(m_quantifier == AtomQuantifier::One, "Transition to quantified term should only happen once.");
m_quantifier = quantifier;
}
inline ImmutableCharNFANodeBuilder Term::generateGraph(NFA& nfa, ImmutableCharNFANodeBuilder& source, const ActionList& finalActions) const
{
ImmutableCharNFANodeBuilder newEnd(nfa);
generateGraph(nfa, source, newEnd.nodeId());
newEnd.setActions(finalActions.begin(), finalActions.end());
return newEnd;
}
inline void Term::generateGraph(NFA& nfa, ImmutableCharNFANodeBuilder& source, uint32_t destination) const
{
ASSERT(isValid());
switch (m_quantifier) {
case AtomQuantifier::One: {
generateSubgraphForAtom(nfa, source, destination);
break;
}
case AtomQuantifier::ZeroOrOne: {
generateSubgraphForAtom(nfa, source, destination);
source.addEpsilonTransition(destination);
break;
}
case AtomQuantifier::ZeroOrMore: {
ImmutableCharNFANodeBuilder repeatStart(nfa);
source.addEpsilonTransition(repeatStart);
ImmutableCharNFANodeBuilder repeatEnd(nfa);
generateSubgraphForAtom(nfa, repeatStart, repeatEnd);
repeatEnd.addEpsilonTransition(repeatStart);
repeatEnd.addEpsilonTransition(destination);
source.addEpsilonTransition(destination);
break;
}
case AtomQuantifier::OneOrMore: {
ImmutableCharNFANodeBuilder repeatStart(nfa);
source.addEpsilonTransition(repeatStart);
ImmutableCharNFANodeBuilder repeatEnd(nfa);
generateSubgraphForAtom(nfa, repeatStart, repeatEnd);
repeatEnd.addEpsilonTransition(repeatStart);
repeatEnd.addEpsilonTransition(destination);
break;
}
}
}
inline bool Term::isEndOfLineAssertion() const
{
return m_termType == TermType::CharacterSet && m_atomData.characterSet.bitCount() == 1 && m_atomData.characterSet.get(0);
}
inline bool Term::matchesAtLeastOneCharacter() const
{
ASSERT(isValid());
if (m_quantifier == AtomQuantifier::ZeroOrOne || m_quantifier == AtomQuantifier::ZeroOrMore)
return false;
if (isEndOfLineAssertion())
return false;
if (m_termType == TermType::Group) {
for (const Term& term : m_atomData.group.terms) {
if (term.matchesAtLeastOneCharacter())
return true;
}
return false;
}
return true;
}
inline bool Term::isKnownToMatchAnyString() const
{
ASSERT(isValid());
switch (m_termType) {
case TermType::Empty:
ASSERT_NOT_REACHED();
break;
case TermType::CharacterSet:
// ".*" is the only simple term matching any string.
return isUniversalTransition() && m_quantifier == AtomQuantifier::ZeroOrMore;
break;
case TermType::Group: {
// There are infinitely many ways to match anything with groups, we just handle simple cases
if (m_atomData.group.terms.size() != 1)
return false;
const Term& firstTermInGroup = m_atomData.group.terms.first();
// -(.*) with any quantifier.
if (firstTermInGroup.isKnownToMatchAnyString())
return true;
if (firstTermInGroup.isUniversalTransition()) {
// -(.)*, (.+)*, (.?)* etc.
if (m_quantifier == AtomQuantifier::ZeroOrMore)
return true;
// -(.+)?.
if (m_quantifier == AtomQuantifier::ZeroOrOne && firstTermInGroup.m_quantifier == AtomQuantifier::OneOrMore)
return true;
// -(.?)+.
if (m_quantifier == AtomQuantifier::OneOrMore && firstTermInGroup.m_quantifier == AtomQuantifier::ZeroOrOne)
return true;
}
break;
}
}
return false;
}
inline bool Term::hasFixedLength() const
{
ASSERT(isValid());
switch (m_termType) {
case TermType::Empty:
ASSERT_NOT_REACHED();
break;
case TermType::CharacterSet:
return m_quantifier == AtomQuantifier::One;
case TermType::Group: {
if (m_quantifier != AtomQuantifier::One)
return false;
for (const Term& term : m_atomData.group.terms) {
if (!term.hasFixedLength())
return false;
}
return true;
}
}
return false;
}
inline Term& Term::operator=(const Term& other)
{
destroy();
new (NotNull, this) Term(other);
return *this;
}
inline Term& Term::operator=(Term&& other)
{
destroy();
new (NotNull, this) Term(WTFMove(other));
return *this;
}
inline bool Term::operator==(const Term& other) const
{
if (other.m_termType != m_termType || other.m_quantifier != m_quantifier)
return false;
switch (m_termType) {
case TermType::Empty:
return true;
case TermType::CharacterSet:
return m_atomData.characterSet == other.m_atomData.characterSet;
case TermType::Group:
return m_atomData.group == other.m_atomData.group;
}
ASSERT_NOT_REACHED();
return false;
}
inline unsigned Term::hash() const
{
unsigned primary = static_cast<unsigned>(m_termType) << 16 | static_cast<unsigned>(m_quantifier);
unsigned secondary = 0;
switch (m_termType) {
case TermType::Empty:
secondary = 52184393;
break;
case TermType::CharacterSet:
secondary = m_atomData.characterSet.hash();
break;
case TermType::Group:
secondary = m_atomData.group.hash();
break;
}
return WTF::pairIntHash(primary, secondary);
}
inline bool Term::isEmptyValue() const
{
return m_termType == TermType::Empty;
}
inline bool Term::isUniversalTransition() const
{
ASSERT(isValid());
switch (m_termType) {
case TermType::Empty:
ASSERT_NOT_REACHED();
break;
case TermType::CharacterSet:
return (m_atomData.characterSet.inverted() && !m_atomData.characterSet.bitCount())
|| (!m_atomData.characterSet.inverted() && m_atomData.characterSet.bitCount() == 127 && !m_atomData.characterSet.get(0));
case TermType::Group:
return m_atomData.group.terms.size() == 1 && m_atomData.group.terms.first().isUniversalTransition();
}
return false;
}
inline void Term::generateSubgraphForAtom(NFA& nfa, ImmutableCharNFANodeBuilder& source, const ImmutableCharNFANodeBuilder& destination) const
{
generateSubgraphForAtom(nfa, source, destination.nodeId());
}
inline void Term::generateSubgraphForAtom(NFA& nfa, ImmutableCharNFANodeBuilder& source, uint32_t destination) const
{
switch (m_termType) {
case TermType::Empty:
ASSERT_NOT_REACHED();
source.addEpsilonTransition(destination);
break;
case TermType::CharacterSet: {
if (!m_atomData.characterSet.inverted()) {
UChar i = 0;
while (true) {
while (i < 128 && !m_atomData.characterSet.get(i))
++i;
if (i == 128)
break;
UChar start = i;
++i;
while (i < 128 && m_atomData.characterSet.get(i))
++i;
source.addTransition(start, i - 1, destination);
}
} else {
UChar i = 1;
while (true) {
while (i < 128 && m_atomData.characterSet.get(i))
++i;
if (i == 128)
break;
UChar start = i;
++i;
while (i < 128 && !m_atomData.characterSet.get(i))
++i;
source.addTransition(start, i - 1, destination);
}
}
break;
}
case TermType::Group: {
if (m_atomData.group.terms.isEmpty()) {
// FIXME: any kind of empty term could be avoided in the parser. This case should turned into an assertion.
source.addEpsilonTransition(destination);
return;
}
if (m_atomData.group.terms.size() == 1) {
m_atomData.group.terms.first().generateGraph(nfa, source, destination);
return;
}
ImmutableCharNFANodeBuilder lastTarget = m_atomData.group.terms.first().generateGraph(nfa, source, ActionList());
for (unsigned i = 1; i < m_atomData.group.terms.size() - 1; ++i) {
const Term& currentTerm = m_atomData.group.terms[i];
ImmutableCharNFANodeBuilder newNode = currentTerm.generateGraph(nfa, lastTarget, ActionList());
lastTarget = WTFMove(newNode);
}
const Term& lastTerm = m_atomData.group.terms.last();
lastTerm.generateGraph(nfa, lastTarget, destination);
break;
}
}
}
inline void Term::destroy()
{
switch (m_termType) {
case TermType::Empty:
break;
case TermType::CharacterSet:
m_atomData.characterSet.~CharacterSet();
break;
case TermType::Group:
m_atomData.group.~Group();
break;
}
m_termType = TermType::Empty;
}
#if CONTENT_EXTENSIONS_PERFORMANCE_REPORTING
inline size_t Term::memoryUsed() const
{
size_t extraMemory = 0;
if (m_termType == TermType::Group) {
for (const Term& term : m_atomData.group.terms)
extraMemory += term.memoryUsed();
}
return sizeof(Term) + extraMemory;
}
#endif
} // namespace ContentExtensions
} // namespace WebCore
#endif // ENABLE(CONTENT_EXTENSIONS)