| /* |
| * Copyright (C) 2014, 2015 Apple Inc. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, |
| * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS |
| * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF |
| * THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "config.h" |
| #include "URLFilterParser.h" |
| |
| #if ENABLE(CONTENT_EXTENSIONS) |
| |
| #include "CombinedURLFilters.h" |
| #include "Term.h" |
| #include <JavaScriptCore/YarrParser.h> |
| #include <wtf/Deque.h> |
| #include <wtf/text/CString.h> |
| |
| namespace WebCore { |
| |
| namespace ContentExtensions { |
| |
| class PatternParser { |
| public: |
| PatternParser(bool patternIsCaseSensitive) |
| : m_patternIsCaseSensitive(patternIsCaseSensitive) |
| , m_parseStatus(URLFilterParser::Ok) |
| { |
| } |
| |
| void finalize(uint64_t patternId, CombinedURLFilters& combinedURLFilters) |
| { |
| if (hasError()) |
| return; |
| |
| sinkFloatingTermIfNecessary(); |
| |
| simplifySunkTerms(); |
| |
| // Check to see if there are any terms without ? or *. |
| bool matchesEverything = true; |
| for (const auto& term : m_sunkTerms) { |
| if (term.matchesAtLeastOneCharacter()) { |
| matchesEverything = false; |
| break; |
| } |
| } |
| if (matchesEverything) { |
| fail(URLFilterParser::MatchesEverything); |
| return; |
| } |
| |
| combinedURLFilters.addPattern(patternId, m_sunkTerms); |
| } |
| |
| URLFilterParser::ParseStatus parseStatus() const |
| { |
| return m_parseStatus; |
| } |
| |
| void atomPatternCharacter(UChar character) |
| { |
| if (hasError()) |
| return; |
| |
| if (!isASCII(character)) { |
| fail(URLFilterParser::NonASCII); |
| return; |
| } |
| |
| sinkFloatingTermIfNecessary(); |
| ASSERT(!m_floatingTerm.isValid()); |
| |
| char asciiChararacter = static_cast<char>(character); |
| m_floatingTerm = Term(asciiChararacter, m_patternIsCaseSensitive); |
| } |
| |
| void atomBuiltInCharacterClass(JSC::Yarr::BuiltInCharacterClassID builtInCharacterClassID, bool inverted) |
| { |
| if (hasError()) |
| return; |
| |
| sinkFloatingTermIfNecessary(); |
| ASSERT(!m_floatingTerm.isValid()); |
| |
| if (builtInCharacterClassID == JSC::Yarr::BuiltInCharacterClassID::DotClassID && !inverted) |
| m_floatingTerm = Term(Term::UniversalTransition); |
| else |
| fail(URLFilterParser::UnsupportedCharacterClass); |
| } |
| |
| void quantifyAtom(unsigned minimum, unsigned maximum, bool) |
| { |
| if (hasError()) |
| return; |
| |
| ASSERT(m_floatingTerm.isValid()); |
| |
| if (!minimum && maximum == 1) |
| m_floatingTerm.quantify(AtomQuantifier::ZeroOrOne); |
| else if (!minimum && maximum == JSC::Yarr::quantifyInfinite) |
| m_floatingTerm.quantify(AtomQuantifier::ZeroOrMore); |
| else if (minimum == 1 && maximum == JSC::Yarr::quantifyInfinite) |
| m_floatingTerm.quantify(AtomQuantifier::OneOrMore); |
| else |
| fail(URLFilterParser::InvalidQuantifier); |
| } |
| |
| void atomBackReference(unsigned) |
| { |
| fail(URLFilterParser::BackReference); |
| } |
| |
| void atomNamedBackReference(const String&) |
| { |
| fail(URLFilterParser::BackReference); |
| } |
| |
| bool isValidNamedForwardReference(const String&) |
| { |
| return false; |
| } |
| |
| void atomNamedForwardReference(const String&) |
| { |
| fail(URLFilterParser::ForwardReference); |
| } |
| |
| void assertionBOL() |
| { |
| if (hasError()) |
| return; |
| |
| if (m_floatingTerm.isValid() || !m_sunkTerms.isEmpty() || !m_openGroups.isEmpty()) { |
| fail(URLFilterParser::MisplacedStartOfLine); |
| return; |
| } |
| |
| m_hasBeginningOfLineAssertion = true; |
| } |
| |
| void assertionEOL() |
| { |
| if (hasError()) |
| return; |
| |
| sinkFloatingTermIfNecessary(); |
| ASSERT(!m_floatingTerm.isValid()); |
| |
| m_floatingTerm = Term(Term::EndOfLineAssertionTerm); |
| } |
| |
| void assertionWordBoundary(bool) |
| { |
| fail(URLFilterParser::WordBoundary); |
| } |
| |
| void atomCharacterClassBegin(bool inverted = false) |
| { |
| if (hasError()) |
| return; |
| |
| sinkFloatingTermIfNecessary(); |
| ASSERT(!m_floatingTerm.isValid()); |
| |
| m_floatingTerm = Term(Term::CharacterSetTerm, inverted); |
| } |
| |
| void atomCharacterClassAtom(UChar character) |
| { |
| if (hasError()) |
| return; |
| |
| ASSERT(isASCII(character)); |
| |
| m_floatingTerm.addCharacter(character, m_patternIsCaseSensitive); |
| } |
| |
| void atomCharacterClassRange(UChar a, UChar b) |
| { |
| if (hasError()) |
| return; |
| |
| ASSERT(a); |
| ASSERT(b); |
| ASSERT(isASCII(a)); |
| ASSERT(isASCII(b)); |
| |
| for (unsigned i = a; i <= b; ++i) |
| m_floatingTerm.addCharacter(static_cast<UChar>(i), m_patternIsCaseSensitive); |
| } |
| |
| void atomCharacterClassEnd() |
| { |
| // Nothing to do here. The character set atom may have a quantifier, we sink the atom lazily. |
| } |
| |
| void atomCharacterClassBuiltIn(JSC::Yarr::BuiltInCharacterClassID, bool) |
| { |
| fail(URLFilterParser::AtomCharacter); |
| } |
| |
| void atomParenthesesSubpatternBegin(bool = true, Optional<String> = WTF::nullopt) |
| { |
| if (hasError()) |
| return; |
| |
| sinkFloatingTermIfNecessary(); |
| |
| m_openGroups.append(Term(Term::GroupTerm)); |
| } |
| |
| void atomParentheticalAssertionBegin(bool = false) |
| { |
| fail(URLFilterParser::Group); |
| } |
| |
| void atomParenthesesEnd() |
| { |
| if (hasError()) |
| return; |
| |
| sinkFloatingTermIfNecessary(); |
| ASSERT(!m_floatingTerm.isValid()); |
| |
| m_floatingTerm = m_openGroups.takeLast(); |
| } |
| |
| void disjunction() |
| { |
| fail(URLFilterParser::Disjunction); |
| } |
| |
| private: |
| bool hasError() const |
| { |
| return m_parseStatus != URLFilterParser::Ok; |
| } |
| |
| void fail(URLFilterParser::ParseStatus reason) |
| { |
| if (hasError()) |
| return; |
| |
| m_parseStatus = reason; |
| } |
| |
| void sinkFloatingTermIfNecessary() |
| { |
| if (!m_floatingTerm.isValid()) |
| return; |
| |
| if (m_hasProcessedEndOfLineAssertion) { |
| fail(URLFilterParser::MisplacedEndOfLine); |
| m_floatingTerm = Term(); |
| return; |
| } |
| |
| if (m_floatingTerm.isEndOfLineAssertion()) |
| m_hasProcessedEndOfLineAssertion = true; |
| |
| if (!m_openGroups.isEmpty()) { |
| m_openGroups.last().extendGroupSubpattern(m_floatingTerm); |
| m_floatingTerm = Term(); |
| return; |
| } |
| |
| m_sunkTerms.append(m_floatingTerm); |
| m_floatingTerm = Term(); |
| } |
| |
| void simplifySunkTerms() |
| { |
| ASSERT(!m_floatingTerm.isValid()); |
| |
| if (m_sunkTerms.isEmpty()) |
| return; |
| |
| Term canonicalDotStar(Term::UniversalTransition); |
| canonicalDotStar.quantify(AtomQuantifier::ZeroOrMore); |
| |
| // Replace every ".*"-like terms by our canonical version. Remove any duplicate ".*". |
| { |
| unsigned termIndex = 0; |
| bool isAfterDotStar = false; |
| while (termIndex < m_sunkTerms.size()) { |
| if (isAfterDotStar && m_sunkTerms[termIndex].isKnownToMatchAnyString()) { |
| m_sunkTerms.remove(termIndex); |
| continue; |
| } |
| isAfterDotStar = false; |
| |
| if (m_sunkTerms[termIndex].isKnownToMatchAnyString()) { |
| m_sunkTerms[termIndex] = canonicalDotStar; |
| isAfterDotStar = true; |
| } |
| ++termIndex; |
| } |
| } |
| |
| // Add our ".*" in front if needed. |
| if (!m_hasBeginningOfLineAssertion && !m_sunkTerms.first().isKnownToMatchAnyString()) |
| m_sunkTerms.insert(0, canonicalDotStar); |
| |
| // Remove trailing ".*$". |
| if (m_sunkTerms.size() > 2 && m_sunkTerms.last().isEndOfLineAssertion() && m_sunkTerms[m_sunkTerms.size() - 2].isKnownToMatchAnyString()) |
| m_sunkTerms.shrink(m_sunkTerms.size() - 2); |
| |
| // Remove irrelevant terms that can match empty. For example in "foob?", matching "b" is irrelevant. |
| if (m_sunkTerms.last().isEndOfLineAssertion()) |
| return; |
| while (!m_sunkTerms.isEmpty() && !m_sunkTerms.last().matchesAtLeastOneCharacter()) |
| m_sunkTerms.removeLast(); |
| } |
| |
| bool m_patternIsCaseSensitive; |
| |
| Deque<Term> m_openGroups; |
| Vector<Term> m_sunkTerms; |
| Term m_floatingTerm; |
| bool m_hasBeginningOfLineAssertion { false }; |
| bool m_hasProcessedEndOfLineAssertion { false }; |
| |
| URLFilterParser::ParseStatus m_parseStatus; |
| }; |
| |
| URLFilterParser::URLFilterParser(CombinedURLFilters& combinedURLFilters) |
| : m_combinedURLFilters(combinedURLFilters) |
| { |
| } |
| |
| URLFilterParser::~URLFilterParser() = default; |
| |
| URLFilterParser::ParseStatus URLFilterParser::addPattern(const String& pattern, bool patternIsCaseSensitive, uint64_t patternId) |
| { |
| if (!pattern.isAllASCII()) |
| return NonASCII; |
| if (pattern.isEmpty()) |
| return EmptyPattern; |
| |
| ParseStatus status = Ok; |
| PatternParser patternParser(patternIsCaseSensitive); |
| if (!JSC::Yarr::hasError(JSC::Yarr::parse(patternParser, pattern, false, 0))) |
| patternParser.finalize(patternId, m_combinedURLFilters); |
| else |
| status = YarrError; |
| |
| if (status == Ok) |
| status = patternParser.parseStatus(); |
| |
| return status; |
| } |
| |
| String URLFilterParser::statusString(ParseStatus status) |
| { |
| switch (status) { |
| case Ok: |
| return "Ok"; |
| case MatchesEverything: |
| return "Matches everything."; |
| case NonASCII: |
| return "Only ASCII characters are supported in pattern."; |
| case UnsupportedCharacterClass: |
| return "Character class is not supported."; |
| case BackReference: |
| return "Patterns cannot contain backreferences."; |
| case ForwardReference: |
| return "Patterns cannot contain forward references."; |
| case MisplacedStartOfLine: |
| return "Start of line assertion can only appear as the first term in a filter."; |
| case WordBoundary: |
| return "Word boundaries assertions are not supported yet."; |
| case AtomCharacter: |
| return "Builtins character class atoms are not supported yet."; |
| case Group: |
| return "Groups are not supported yet."; |
| case Disjunction: |
| return "Disjunctions are not supported yet."; |
| case MisplacedEndOfLine: |
| return "The end of line assertion must be the last term in an expression."; |
| case EmptyPattern: |
| return "Empty pattern."; |
| case YarrError: |
| return "Internal error in YARR."; |
| case InvalidQuantifier: |
| return "Arbitrary atom repetitions are not supported."; |
| } |
| |
| RELEASE_ASSERT_NOT_REACHED(); |
| } |
| |
| } // namespace ContentExtensions |
| } // namespace WebCore |
| |
| #endif // ENABLE(CONTENT_EXTENSIONS) |