| /* |
| * Copyright (C) 2009, 2013-2017 Apple Inc. All rights reserved. |
| * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
| * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
| * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "config.h" |
| #include "YarrInterpreter.h" |
| |
| #include "Options.h" |
| #include "SuperSampler.h" |
| #include "Yarr.h" |
| #include "YarrCanonicalize.h" |
| #include <wtf/BumpPointerAllocator.h> |
| #include <wtf/CheckedArithmetic.h> |
| #include <wtf/DataLog.h> |
| #include <wtf/StackCheck.h> |
| #include <wtf/text/WTFString.h> |
| |
| namespace JSC { namespace Yarr { |
| |
| template<typename CharType> |
| class Interpreter { |
| public: |
| struct ParenthesesDisjunctionContext; |
| |
| struct BackTrackInfoParentheses { |
| uintptr_t begin; |
| uintptr_t matchAmount; |
| ParenthesesDisjunctionContext* lastContext; |
| }; |
| |
| static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context) |
| { |
| context->next = backTrack->lastContext; |
| backTrack->lastContext = context; |
| ++backTrack->matchAmount; |
| } |
| |
| static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack) |
| { |
| RELEASE_ASSERT(backTrack->matchAmount); |
| RELEASE_ASSERT(backTrack->lastContext); |
| backTrack->lastContext = backTrack->lastContext->next; |
| --backTrack->matchAmount; |
| } |
| |
| struct DisjunctionContext |
| { |
| DisjunctionContext() = default; |
| |
| void* operator new(size_t, void* where) |
| { |
| return where; |
| } |
| |
| static size_t allocationSize(unsigned numberOfFrames) |
| { |
| static_assert(alignof(DisjunctionContext) <= sizeof(void*)); |
| size_t rawSize = sizeof(DisjunctionContext) - sizeof(uintptr_t) + Checked<size_t>(numberOfFrames) * sizeof(uintptr_t); |
| size_t roundedSize = roundUpToMultipleOf<sizeof(void*)>(rawSize); |
| RELEASE_ASSERT(roundedSize >= rawSize); |
| return roundedSize; |
| } |
| |
| int term { 0 }; |
| unsigned matchBegin; |
| unsigned matchEnd; |
| uintptr_t frame[1]; |
| }; |
| |
| DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction) |
| { |
| size_t size = DisjunctionContext::allocationSize(disjunction->m_frameSize); |
| allocatorPool = allocatorPool->ensureCapacity(size); |
| RELEASE_ASSERT(allocatorPool); |
| return new (allocatorPool->alloc(size)) DisjunctionContext(); |
| } |
| |
| void freeDisjunctionContext(DisjunctionContext* context) |
| { |
| allocatorPool = allocatorPool->dealloc(context); |
| } |
| |
| struct ParenthesesDisjunctionContext |
| { |
| ParenthesesDisjunctionContext(unsigned* output, ByteTerm& term) |
| { |
| unsigned firstSubpatternId = term.atom.subpatternId; |
| unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns; |
| |
| for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) { |
| subpatternBackup[i] = output[(firstSubpatternId << 1) + i]; |
| output[(firstSubpatternId << 1) + i] = offsetNoMatch; |
| } |
| |
| new (getDisjunctionContext(term)) DisjunctionContext(); |
| } |
| |
| void* operator new(size_t, void* where) |
| { |
| return where; |
| } |
| |
| void restoreOutput(unsigned* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns) |
| { |
| for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) |
| output[(firstSubpatternId << 1) + i] = subpatternBackup[i]; |
| } |
| |
| DisjunctionContext* getDisjunctionContext(ByteTerm& term) |
| { |
| return bitwise_cast<DisjunctionContext*>(bitwise_cast<uintptr_t>(this) + allocationSize(term.atom.parenthesesDisjunction->m_numSubpatterns)); |
| } |
| |
| static size_t allocationSize(unsigned numberOfSubpatterns) |
| { |
| static_assert(alignof(ParenthesesDisjunctionContext) <= sizeof(void*)); |
| size_t rawSize = sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned) + (Checked<size_t>(numberOfSubpatterns) * 2U) * sizeof(unsigned); |
| size_t roundedSize = roundUpToMultipleOf<sizeof(void*)>(rawSize); |
| RELEASE_ASSERT(roundedSize >= rawSize); |
| return roundedSize; |
| } |
| |
| ParenthesesDisjunctionContext* next { nullptr }; |
| unsigned subpatternBackup[1]; |
| }; |
| |
| ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, unsigned* output, ByteTerm& term) |
| { |
| size_t size = Checked<size_t>(ParenthesesDisjunctionContext::allocationSize(term.atom.parenthesesDisjunction->m_numSubpatterns)) + DisjunctionContext::allocationSize(disjunction->m_frameSize); |
| allocatorPool = allocatorPool->ensureCapacity(size); |
| RELEASE_ASSERT(allocatorPool); |
| return new (allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term); |
| } |
| |
| void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context) |
| { |
| allocatorPool = allocatorPool->dealloc(context); |
| } |
| |
| class InputStream { |
| public: |
| InputStream(const CharType* input, unsigned start, unsigned length, bool decodeSurrogatePairs) |
| : input(input) |
| , pos(start) |
| , length(length) |
| , decodeSurrogatePairs(decodeSurrogatePairs) |
| { |
| } |
| |
| void next() |
| { |
| ++pos; |
| } |
| |
| void rewind(unsigned amount) |
| { |
| ASSERT(pos >= amount); |
| pos -= amount; |
| } |
| |
| int read() |
| { |
| ASSERT(pos < length); |
| if (pos < length) |
| return input[pos]; |
| return -1; |
| } |
| |
| int readPair() |
| { |
| ASSERT(pos + 1 < length); |
| return input[pos] | input[pos + 1] << 16; |
| } |
| |
| int readChecked(unsigned negativePositionOffest) |
| { |
| RELEASE_ASSERT(pos >= negativePositionOffest); |
| unsigned p = pos - negativePositionOffest; |
| ASSERT(p < length); |
| int result = input[p]; |
| if (U16_IS_LEAD(result) && decodeSurrogatePairs && p + 1 < length && U16_IS_TRAIL(input[p + 1])) { |
| if (atEnd()) |
| return -1; |
| |
| result = U16_GET_SUPPLEMENTARY(result, input[p + 1]); |
| next(); |
| } |
| return result; |
| } |
| |
| int readSurrogatePairChecked(unsigned negativePositionOffset) |
| { |
| RELEASE_ASSERT(pos >= negativePositionOffset); |
| unsigned p = pos - negativePositionOffset; |
| ASSERT(p < length); |
| if (p + 1 >= length) |
| return -1; |
| |
| int first = input[p]; |
| int second = input[p + 1]; |
| if (U16_IS_LEAD(first) && U16_IS_TRAIL(second)) |
| return U16_GET_SUPPLEMENTARY(first, second); |
| |
| return -1; |
| } |
| |
| int reread(unsigned from) |
| { |
| ASSERT(from < length); |
| int result = input[from]; |
| if (U16_IS_LEAD(result) && decodeSurrogatePairs && from + 1 < length && U16_IS_TRAIL(input[from + 1])) |
| result = U16_GET_SUPPLEMENTARY(result, input[from + 1]); |
| return result; |
| } |
| |
| int prev() |
| { |
| ASSERT(!(pos > length)); |
| if (pos && length) |
| return input[pos - 1]; |
| return -1; |
| } |
| |
| unsigned getPos() |
| { |
| return pos; |
| } |
| |
| void setPos(unsigned p) |
| { |
| pos = p; |
| } |
| |
| bool atStart() |
| { |
| return pos == 0; |
| } |
| |
| bool atEnd() |
| { |
| return pos == length; |
| } |
| |
| unsigned end() |
| { |
| return length; |
| } |
| |
| bool checkInput(unsigned count) |
| { |
| if (((pos + count) <= length) && ((pos + count) >= pos)) { |
| pos += count; |
| return true; |
| } |
| return false; |
| } |
| |
| void uncheckInput(unsigned count) |
| { |
| RELEASE_ASSERT(pos >= count); |
| pos -= count; |
| } |
| |
| bool atStart(unsigned negativePositionOffset) |
| { |
| return pos == negativePositionOffset; |
| } |
| |
| bool atEnd(unsigned negativePositionOffest) |
| { |
| RELEASE_ASSERT(pos >= negativePositionOffest); |
| return (pos - negativePositionOffest) == length; |
| } |
| |
| bool isAvailableInput(unsigned offset) |
| { |
| return (((pos + offset) <= length) && ((pos + offset) >= pos)); |
| } |
| |
| private: |
| const CharType* input; |
| unsigned pos; |
| unsigned length; |
| bool decodeSurrogatePairs; |
| }; |
| |
| bool testCharacterClass(CharacterClass* characterClass, int ch) |
| { |
| auto linearSearchMatches = [&ch](const Vector<UChar32>& matches) { |
| for (unsigned i = 0; i < matches.size(); ++i) { |
| if (ch == matches[i]) |
| return true; |
| } |
| |
| return false; |
| }; |
| |
| auto binarySearchMatches = [&ch](const Vector<UChar32>& matches) { |
| size_t low = 0; |
| size_t high = matches.size() - 1; |
| |
| while (low <= high) { |
| size_t mid = low + (high - low) / 2; |
| int diff = ch - matches[mid]; |
| if (!diff) |
| return true; |
| |
| if (diff < 0) { |
| if (mid == low) |
| return false; |
| high = mid - 1; |
| } else |
| low = mid + 1; |
| } |
| return false; |
| }; |
| |
| auto linearSearchRanges = [&ch](const Vector<CharacterRange>& ranges) { |
| for (unsigned i = 0; i < ranges.size(); ++i) { |
| if ((ch >= ranges[i].begin) && (ch <= ranges[i].end)) |
| return true; |
| } |
| |
| return false; |
| }; |
| |
| auto binarySearchRanges = [&ch](const Vector<CharacterRange>& ranges) { |
| size_t low = 0; |
| size_t high = ranges.size() - 1; |
| |
| while (low <= high) { |
| size_t mid = low + (high - low) / 2; |
| int rangeBeginDiff = ch - ranges[mid].begin; |
| if (rangeBeginDiff >= 0 && ch <= ranges[mid].end) |
| return true; |
| |
| if (rangeBeginDiff < 0) { |
| if (mid == low) |
| return false; |
| high = mid - 1; |
| } else |
| low = mid + 1; |
| } |
| return false; |
| }; |
| |
| if (characterClass->m_anyCharacter) |
| return true; |
| |
| const size_t thresholdForBinarySearch = 6; |
| |
| if (!isASCII(ch)) { |
| if (characterClass->m_matchesUnicode.size()) { |
| if (characterClass->m_matchesUnicode.size() > thresholdForBinarySearch) { |
| if (binarySearchMatches(characterClass->m_matchesUnicode)) |
| return true; |
| } else if (linearSearchMatches(characterClass->m_matchesUnicode)) |
| return true; |
| } |
| |
| if (characterClass->m_rangesUnicode.size()) { |
| if (characterClass->m_rangesUnicode.size() > thresholdForBinarySearch) { |
| if (binarySearchRanges(characterClass->m_rangesUnicode)) |
| return true; |
| } else if (linearSearchRanges(characterClass->m_rangesUnicode)) |
| return true; |
| } |
| } else { |
| if (characterClass->m_matches.size()) { |
| if (characterClass->m_matches.size() > thresholdForBinarySearch) { |
| if (binarySearchMatches(characterClass->m_matches)) |
| return true; |
| } else if (linearSearchMatches(characterClass->m_matches)) |
| return true; |
| } |
| |
| if (characterClass->m_ranges.size()) { |
| if (characterClass->m_ranges.size() > thresholdForBinarySearch) { |
| if (binarySearchRanges(characterClass->m_ranges)) |
| return true; |
| } else if (linearSearchRanges(characterClass->m_ranges)) |
| return true; |
| } |
| } |
| |
| return false; |
| } |
| |
| bool checkCharacter(int testChar, unsigned negativeInputOffset) |
| { |
| return testChar == input.readChecked(negativeInputOffset); |
| } |
| |
| bool checkSurrogatePair(int testUnicodeChar, unsigned negativeInputOffset) |
| { |
| return testUnicodeChar == input.readSurrogatePairChecked(negativeInputOffset); |
| } |
| |
| bool checkCasedCharacter(int loChar, int hiChar, unsigned negativeInputOffset) |
| { |
| int ch = input.readChecked(negativeInputOffset); |
| return (loChar == ch) || (hiChar == ch); |
| } |
| |
| bool checkCharacterClass(CharacterClass* characterClass, bool invert, unsigned negativeInputOffset) |
| { |
| int inputChar = input.readChecked(negativeInputOffset); |
| if (inputChar < 0) |
| return false; |
| |
| bool match = testCharacterClass(characterClass, inputChar); |
| return invert ? !match : match; |
| } |
| |
| bool checkCharacterClassDontAdvanceInputForNonBMP(CharacterClass* characterClass, unsigned negativeInputOffset) |
| { |
| int readCharacter = characterClass->hasOnlyNonBMPCharacters() ? input.readSurrogatePairChecked(negativeInputOffset) : input.readChecked(negativeInputOffset); |
| if (readCharacter < 0) |
| return false; |
| |
| return testCharacterClass(characterClass, readCharacter); |
| } |
| |
| bool tryConsumeBackReference(int matchBegin, int matchEnd, unsigned negativeInputOffset) |
| { |
| unsigned matchSize = (unsigned)(matchEnd - matchBegin); |
| |
| if (!input.checkInput(matchSize)) |
| return false; |
| |
| for (unsigned i = 0; i < matchSize; ++i) { |
| int oldCh = input.reread(matchBegin + i); |
| int ch; |
| if (!U_IS_BMP(oldCh)) { |
| ch = input.readSurrogatePairChecked(negativeInputOffset + matchSize - i); |
| ++i; |
| } else |
| ch = input.readChecked(negativeInputOffset + matchSize - i); |
| |
| if (oldCh == ch) |
| continue; |
| |
| if (pattern->ignoreCase()) { |
| // See ES 6.0, 21.2.2.8.2 for the definition of Canonicalize(). For non-Unicode |
| // patterns, Unicode values are never allowed to match against ASCII ones. |
| // For Unicode, we need to check all canonical equivalents of a character. |
| if (!unicode && (isASCII(oldCh) || isASCII(ch))) { |
| if (toASCIIUpper(oldCh) == toASCIIUpper(ch)) |
| continue; |
| } else if (areCanonicallyEquivalent(oldCh, ch, unicode ? CanonicalMode::Unicode : CanonicalMode::UCS2)) |
| continue; |
| } |
| |
| input.uncheckInput(matchSize); |
| return false; |
| } |
| |
| return true; |
| } |
| |
| bool matchAssertionBOL(ByteTerm& term) |
| { |
| return (input.atStart(term.inputPosition)) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition + 1))); |
| } |
| |
| bool matchAssertionEOL(ByteTerm& term) |
| { |
| if (term.inputPosition) |
| return (input.atEnd(term.inputPosition)) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition))); |
| |
| return (input.atEnd()) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.read())); |
| } |
| |
| bool matchAssertionWordBoundary(ByteTerm& term) |
| { |
| bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition + 1)); |
| bool readIsWordchar; |
| if (term.inputPosition) |
| readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition)); |
| else |
| readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read()); |
| |
| bool wordBoundary = prevIsWordchar != readIsWordchar; |
| return term.invert() ? !wordBoundary : wordBoundary; |
| } |
| |
| bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context) |
| { |
| BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); |
| |
| switch (term.atom.quantityType) { |
| case QuantifierType::FixedCount: |
| break; |
| |
| case QuantifierType::Greedy: |
| if (backTrack->matchAmount) { |
| --backTrack->matchAmount; |
| input.uncheckInput(U16_LENGTH(term.atom.patternCharacter)); |
| return true; |
| } |
| break; |
| |
| case QuantifierType::NonGreedy: |
| if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) { |
| ++backTrack->matchAmount; |
| if (checkCharacter(term.atom.patternCharacter, term.inputPosition + 1)) |
| return true; |
| } |
| input.setPos(backTrack->begin); |
| break; |
| } |
| |
| return false; |
| } |
| |
| bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context) |
| { |
| BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); |
| |
| switch (term.atom.quantityType) { |
| case QuantifierType::FixedCount: |
| break; |
| |
| case QuantifierType::Greedy: |
| if (backTrack->matchAmount) { |
| --backTrack->matchAmount; |
| input.uncheckInput(1); |
| return true; |
| } |
| break; |
| |
| case QuantifierType::NonGreedy: |
| if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) { |
| ++backTrack->matchAmount; |
| if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition + 1)) |
| return true; |
| } |
| input.uncheckInput(backTrack->matchAmount); |
| break; |
| } |
| |
| return false; |
| } |
| |
| bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::CharacterClass); |
| BackTrackInfoCharacterClass* backTrack = reinterpret_cast<BackTrackInfoCharacterClass*>(context->frame + term.frameLocation); |
| |
| switch (term.atom.quantityType) { |
| case QuantifierType::FixedCount: { |
| if (unicode) { |
| CharacterClass* charClass = term.atom.characterClass; |
| backTrack->begin = input.getPos(); |
| unsigned matchAmount = 0; |
| for (matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) { |
| if (term.invert()) { |
| if (!checkCharacterClass(charClass, term.invert(), term.inputPosition - matchAmount)) { |
| input.setPos(backTrack->begin); |
| return false; |
| } |
| } else { |
| unsigned matchOffset = matchAmount * (charClass->hasOnlyNonBMPCharacters() ? 2 : 1); |
| if (!checkCharacterClassDontAdvanceInputForNonBMP(charClass, term.inputPosition - matchOffset)) { |
| input.setPos(backTrack->begin); |
| return false; |
| } |
| } |
| } |
| |
| return true; |
| } |
| |
| for (unsigned matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) { |
| if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount)) |
| return false; |
| } |
| return true; |
| } |
| |
| case QuantifierType::Greedy: { |
| unsigned position = input.getPos(); |
| backTrack->begin = position; |
| unsigned matchAmount = 0; |
| while ((matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) { |
| if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) { |
| input.setPos(position); |
| break; |
| } |
| ++matchAmount; |
| position = input.getPos(); |
| } |
| backTrack->matchAmount = matchAmount; |
| |
| return true; |
| } |
| |
| case QuantifierType::NonGreedy: |
| backTrack->begin = input.getPos(); |
| backTrack->matchAmount = 0; |
| return true; |
| } |
| |
| RELEASE_ASSERT_NOT_REACHED(); |
| return false; |
| } |
| |
| bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::CharacterClass); |
| BackTrackInfoCharacterClass* backTrack = reinterpret_cast<BackTrackInfoCharacterClass*>(context->frame + term.frameLocation); |
| |
| switch (term.atom.quantityType) { |
| case QuantifierType::FixedCount: |
| if (unicode) |
| input.setPos(backTrack->begin); |
| break; |
| |
| case QuantifierType::Greedy: |
| if (backTrack->matchAmount) { |
| if (unicode) { |
| // Rematch one less match |
| input.setPos(backTrack->begin); |
| --backTrack->matchAmount; |
| for (unsigned matchAmount = 0; (matchAmount < backTrack->matchAmount) && input.checkInput(1); ++matchAmount) { |
| if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) { |
| input.uncheckInput(1); |
| break; |
| } |
| } |
| return true; |
| } |
| --backTrack->matchAmount; |
| input.uncheckInput(1); |
| return true; |
| } |
| break; |
| |
| case QuantifierType::NonGreedy: |
| if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) { |
| ++backTrack->matchAmount; |
| if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) |
| return true; |
| } |
| input.setPos(backTrack->begin); |
| break; |
| } |
| |
| return false; |
| } |
| |
| bool matchBackReference(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::BackReference); |
| BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); |
| |
| unsigned matchBegin = output[(term.atom.subpatternId << 1)]; |
| unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1]; |
| |
| // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that. |
| // In this case the result of match is empty string like when it references to a parentheses with zero-width match. |
| // Eg.: /(a\1)/ |
| if (matchEnd == offsetNoMatch) |
| return true; |
| |
| if (matchBegin == offsetNoMatch) |
| return true; |
| |
| ASSERT(matchBegin <= matchEnd); |
| |
| if (matchBegin == matchEnd) |
| return true; |
| |
| switch (term.atom.quantityType) { |
| case QuantifierType::FixedCount: { |
| backTrack->begin = input.getPos(); |
| for (unsigned matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) { |
| if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { |
| input.setPos(backTrack->begin); |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| case QuantifierType::Greedy: { |
| unsigned matchAmount = 0; |
| while ((matchAmount < term.atom.quantityMaxCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) |
| ++matchAmount; |
| backTrack->matchAmount = matchAmount; |
| return true; |
| } |
| |
| case QuantifierType::NonGreedy: |
| backTrack->begin = input.getPos(); |
| backTrack->matchAmount = 0; |
| return true; |
| } |
| |
| RELEASE_ASSERT_NOT_REACHED(); |
| return false; |
| } |
| |
| bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::BackReference); |
| BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); |
| |
| unsigned matchBegin = output[(term.atom.subpatternId << 1)]; |
| unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1]; |
| |
| if (matchBegin == offsetNoMatch) |
| return false; |
| |
| ASSERT(matchBegin <= matchEnd); |
| |
| if (matchBegin == matchEnd) |
| return false; |
| |
| switch (term.atom.quantityType) { |
| case QuantifierType::FixedCount: |
| // for quantityMaxCount == 1, could rewind. |
| input.setPos(backTrack->begin); |
| break; |
| |
| case QuantifierType::Greedy: |
| if (backTrack->matchAmount) { |
| --backTrack->matchAmount; |
| input.rewind(matchEnd - matchBegin); |
| return true; |
| } |
| break; |
| |
| case QuantifierType::NonGreedy: |
| if ((backTrack->matchAmount < term.atom.quantityMaxCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { |
| ++backTrack->matchAmount; |
| return true; |
| } |
| input.setPos(backTrack->begin); |
| break; |
| } |
| |
| return false; |
| } |
| |
| void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context) |
| { |
| if (term.capture()) { |
| unsigned subpatternId = term.atom.subpatternId; |
| output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin - term.inputPosition; |
| output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd - term.inputPosition; |
| } |
| } |
| void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context) |
| { |
| unsigned firstSubpatternId = term.atom.subpatternId; |
| unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; |
| context->restoreOutput(output, firstSubpatternId, count); |
| } |
| JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack) |
| { |
| while (backTrack->matchAmount) { |
| ParenthesesDisjunctionContext* context = backTrack->lastContext; |
| |
| JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true); |
| if (result == JSRegExpResult::Match) |
| return JSRegExpResult::Match; |
| |
| resetMatches(term, context); |
| popParenthesesDisjunctionContext(backTrack); |
| freeParenthesesDisjunctionContext(context); |
| |
| if (result != JSRegExpResult::NoMatch) |
| return result; |
| } |
| |
| return JSRegExpResult::NoMatch; |
| } |
| |
| bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParenthesesSubpatternOnceBegin); |
| ASSERT(term.atom.quantityMaxCount == 1); |
| |
| BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); |
| |
| switch (term.atom.quantityType) { |
| case QuantifierType::Greedy: { |
| // set this speculatively; if we get to the parens end this will be true. |
| backTrack->begin = input.getPos(); |
| break; |
| } |
| case QuantifierType::NonGreedy: { |
| backTrack->begin = notFound; |
| context->term += term.atom.parenthesesWidth; |
| return true; |
| } |
| case QuantifierType::FixedCount: |
| break; |
| } |
| |
| if (term.capture()) { |
| unsigned subpatternId = term.atom.subpatternId; |
| output[(subpatternId << 1)] = input.getPos() - term.inputPosition; |
| } |
| |
| return true; |
| } |
| |
| bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParenthesesSubpatternOnceEnd); |
| ASSERT(term.atom.quantityMaxCount == 1); |
| |
| if (term.capture()) { |
| unsigned subpatternId = term.atom.subpatternId; |
| output[(subpatternId << 1) + 1] = input.getPos() - term.inputPosition; |
| } |
| |
| if (term.atom.quantityType == QuantifierType::FixedCount) |
| return true; |
| |
| BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); |
| return backTrack->begin != input.getPos(); |
| } |
| |
| bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParenthesesSubpatternOnceBegin); |
| ASSERT(term.atom.quantityMaxCount == 1); |
| |
| BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); |
| |
| if (term.capture()) { |
| unsigned subpatternId = term.atom.subpatternId; |
| output[(subpatternId << 1)] = offsetNoMatch; |
| output[(subpatternId << 1) + 1] = offsetNoMatch; |
| } |
| |
| switch (term.atom.quantityType) { |
| case QuantifierType::Greedy: |
| // if we backtrack to this point, there is another chance - try matching nothing. |
| ASSERT(backTrack->begin != notFound); |
| backTrack->begin = notFound; |
| context->term += term.atom.parenthesesWidth; |
| return true; |
| case QuantifierType::NonGreedy: |
| ASSERT(backTrack->begin != notFound); |
| FALLTHROUGH; |
| case QuantifierType::FixedCount: |
| break; |
| } |
| |
| return false; |
| } |
| |
| bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParenthesesSubpatternOnceEnd); |
| ASSERT(term.atom.quantityMaxCount == 1); |
| |
| BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); |
| |
| switch (term.atom.quantityType) { |
| case QuantifierType::Greedy: |
| if (backTrack->begin == notFound) { |
| context->term -= term.atom.parenthesesWidth; |
| return false; |
| } |
| FALLTHROUGH; |
| case QuantifierType::NonGreedy: |
| if (backTrack->begin == notFound) { |
| backTrack->begin = input.getPos(); |
| if (term.capture()) { |
| // Technically this access to inputPosition should be accessing the begin term's |
| // inputPosition, but for repeats other than fixed these values should be |
| // the same anyway! (We don't pre-check for greedy or non-greedy matches.) |
| ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::Type::ParenthesesSubpatternOnceBegin); |
| ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition); |
| unsigned subpatternId = term.atom.subpatternId; |
| output[subpatternId << 1] = input.getPos() - term.inputPosition; |
| } |
| context->term -= term.atom.parenthesesWidth; |
| return true; |
| } |
| FALLTHROUGH; |
| case QuantifierType::FixedCount: |
| break; |
| } |
| |
| return false; |
| } |
| |
| bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParenthesesSubpatternTerminalBegin); |
| ASSERT(term.atom.quantityType == QuantifierType::Greedy); |
| ASSERT(term.atom.quantityMaxCount == quantifyInfinite); |
| ASSERT(!term.capture()); |
| |
| BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation); |
| backTrack->begin = input.getPos(); |
| return true; |
| } |
| |
| bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParenthesesSubpatternTerminalEnd); |
| |
| BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation); |
| // Empty match is a failed match. |
| if (backTrack->begin == input.getPos()) |
| return false; |
| |
| // Successful match! Okay, what's next? - loop around and try to match more! |
| context->term -= (term.atom.parenthesesWidth + 1); |
| return true; |
| } |
| |
| bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParenthesesSubpatternTerminalBegin); |
| ASSERT(term.atom.quantityType == QuantifierType::Greedy); |
| ASSERT(term.atom.quantityMaxCount == quantifyInfinite); |
| ASSERT(!term.capture()); |
| |
| // If we backtrack to this point, we have failed to match this iteration of the parens. |
| // Since this is greedy / zero minimum a failed is also accepted as a match! |
| context->term += term.atom.parenthesesWidth; |
| return true; |
| } |
| |
| bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*) |
| { |
| // 'Terminal' parentheses are at the end of the regex, and as such a match past end |
| // should always be returned as a successful match - we should never backtrack to here. |
| RELEASE_ASSERT_NOT_REACHED(); |
| return false; |
| } |
| |
| bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParentheticalAssertionBegin); |
| ASSERT(term.atom.quantityMaxCount == 1); |
| |
| BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); |
| |
| backTrack->begin = input.getPos(); |
| return true; |
| } |
| |
| bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParentheticalAssertionEnd); |
| ASSERT(term.atom.quantityMaxCount == 1); |
| |
| BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); |
| |
| input.setPos(backTrack->begin); |
| |
| // We've reached the end of the parens; if they are inverted, this is failure. |
| if (term.invert()) { |
| context->term -= term.atom.parenthesesWidth; |
| return false; |
| } |
| |
| return true; |
| } |
| |
| bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParentheticalAssertionBegin); |
| ASSERT(term.atom.quantityMaxCount == 1); |
| |
| // We've failed to match parens; if they are inverted, this is win! |
| if (term.invert()) { |
| context->term += term.atom.parenthesesWidth; |
| return true; |
| } |
| |
| return false; |
| } |
| |
| bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParentheticalAssertionEnd); |
| ASSERT(term.atom.quantityMaxCount == 1); |
| |
| BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); |
| |
| input.setPos(backTrack->begin); |
| |
| context->term -= term.atom.parenthesesWidth; |
| return false; |
| } |
| |
| JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParenthesesSubpattern); |
| |
| BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); |
| ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; |
| |
| backTrack->begin = input.getPos(); |
| backTrack->matchAmount = 0; |
| backTrack->lastContext = nullptr; |
| |
| ASSERT(term.atom.quantityType != QuantifierType::FixedCount || term.atom.quantityMinCount == term.atom.quantityMaxCount); |
| |
| unsigned minimumMatchCount = term.atom.quantityMinCount; |
| JSRegExpResult fixedMatchResult; |
| |
| // Handle fixed matches and the minimum part of a variable length match. |
| if (minimumMatchCount) { |
| // While we haven't yet reached our fixed limit, |
| while (backTrack->matchAmount < minimumMatchCount) { |
| // Try to do a match, and it it succeeds, add it to the list. |
| ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); |
| fixedMatchResult = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); |
| if (fixedMatchResult == JSRegExpResult::Match) |
| appendParenthesesDisjunctionContext(backTrack, context); |
| else { |
| // The match failed; try to find an alternate point to carry on from. |
| resetMatches(term, context); |
| freeParenthesesDisjunctionContext(context); |
| |
| if (fixedMatchResult != JSRegExpResult::NoMatch) |
| return fixedMatchResult; |
| JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); |
| if (backtrackResult != JSRegExpResult::Match) |
| return backtrackResult; |
| } |
| } |
| |
| ParenthesesDisjunctionContext* context = backTrack->lastContext; |
| recordParenthesesMatch(term, context); |
| } |
| |
| switch (term.atom.quantityType) { |
| case QuantifierType::FixedCount: { |
| ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount); |
| return JSRegExpResult::Match; |
| } |
| |
| case QuantifierType::Greedy: { |
| while (backTrack->matchAmount < term.atom.quantityMaxCount) { |
| ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); |
| JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); |
| if (result == JSRegExpResult::Match) |
| appendParenthesesDisjunctionContext(backTrack, context); |
| else { |
| resetMatches(term, context); |
| freeParenthesesDisjunctionContext(context); |
| |
| if (result != JSRegExpResult::NoMatch) |
| return result; |
| |
| break; |
| } |
| } |
| |
| if (backTrack->matchAmount) { |
| ParenthesesDisjunctionContext* context = backTrack->lastContext; |
| recordParenthesesMatch(term, context); |
| } |
| return JSRegExpResult::Match; |
| } |
| |
| case QuantifierType::NonGreedy: |
| return JSRegExpResult::Match; |
| } |
| |
| RELEASE_ASSERT_NOT_REACHED(); |
| return JSRegExpResult::ErrorNoMatch; |
| } |
| |
| // Rules for backtracking differ depending on whether this is greedy or non-greedy. |
| // |
| // Greedy matches never should try just adding more - you should already have done |
| // the 'more' cases. Always backtrack, at least a leetle bit. However cases where |
| // you backtrack an item off the list needs checking, since we'll never have matched |
| // the one less case. Tracking forwards, still add as much as possible. |
| // |
| // Non-greedy, we've already done the one less case, so don't match on popping. |
| // We haven't done the one more case, so always try to add that. |
| // |
| JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParenthesesSubpattern); |
| |
| BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); |
| ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; |
| |
| switch (term.atom.quantityType) { |
| case QuantifierType::FixedCount: { |
| ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount); |
| |
| ParenthesesDisjunctionContext* context = nullptr; |
| JSRegExpResult result = parenthesesDoBacktrack(term, backTrack); |
| |
| if (result != JSRegExpResult::Match) |
| return result; |
| |
| // While we haven't yet reached our fixed limit, |
| while (backTrack->matchAmount < term.atom.quantityMaxCount) { |
| // Try to do a match, and it it succeeds, add it to the list. |
| context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); |
| result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); |
| |
| if (result == JSRegExpResult::Match) |
| appendParenthesesDisjunctionContext(backTrack, context); |
| else { |
| // The match failed; try to find an alternate point to carry on from. |
| resetMatches(term, context); |
| freeParenthesesDisjunctionContext(context); |
| |
| if (result != JSRegExpResult::NoMatch) |
| return result; |
| JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); |
| if (backtrackResult != JSRegExpResult::Match) |
| return backtrackResult; |
| } |
| } |
| |
| ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount); |
| context = backTrack->lastContext; |
| recordParenthesesMatch(term, context); |
| return JSRegExpResult::Match; |
| } |
| |
| case QuantifierType::Greedy: { |
| if (!backTrack->matchAmount) |
| return JSRegExpResult::NoMatch; |
| |
| ParenthesesDisjunctionContext* context = backTrack->lastContext; |
| JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true); |
| if (result == JSRegExpResult::Match) { |
| while (backTrack->matchAmount < term.atom.quantityMaxCount) { |
| ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); |
| JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); |
| if (parenthesesResult == JSRegExpResult::Match) |
| appendParenthesesDisjunctionContext(backTrack, context); |
| else { |
| resetMatches(term, context); |
| freeParenthesesDisjunctionContext(context); |
| |
| if (parenthesesResult != JSRegExpResult::NoMatch) |
| return parenthesesResult; |
| |
| break; |
| } |
| } |
| } else { |
| resetMatches(term, context); |
| popParenthesesDisjunctionContext(backTrack); |
| freeParenthesesDisjunctionContext(context); |
| |
| if (backTrack->matchAmount < term.atom.quantityMinCount) { |
| while (backTrack->matchAmount) { |
| context = backTrack->lastContext; |
| resetMatches(term, context); |
| popParenthesesDisjunctionContext(backTrack); |
| freeParenthesesDisjunctionContext(context); |
| } |
| |
| input.setPos(backTrack->begin); |
| return result; |
| } |
| |
| if (result != JSRegExpResult::NoMatch) |
| return result; |
| } |
| |
| if (backTrack->matchAmount) { |
| ParenthesesDisjunctionContext* context = backTrack->lastContext; |
| recordParenthesesMatch(term, context); |
| } |
| return JSRegExpResult::Match; |
| } |
| |
| case QuantifierType::NonGreedy: { |
| // If we've not reached the limit, try to add one more match. |
| if (backTrack->matchAmount < term.atom.quantityMaxCount) { |
| ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); |
| JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); |
| if (result == JSRegExpResult::Match) { |
| appendParenthesesDisjunctionContext(backTrack, context); |
| recordParenthesesMatch(term, context); |
| return JSRegExpResult::Match; |
| } |
| |
| resetMatches(term, context); |
| freeParenthesesDisjunctionContext(context); |
| |
| if (result != JSRegExpResult::NoMatch) |
| return result; |
| } |
| |
| // Nope - okay backtrack looking for an alternative. |
| while (backTrack->matchAmount) { |
| ParenthesesDisjunctionContext* context = backTrack->lastContext; |
| JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true); |
| if (result == JSRegExpResult::Match) { |
| // successful backtrack! we're back in the game! |
| if (backTrack->matchAmount) { |
| context = backTrack->lastContext; |
| recordParenthesesMatch(term, context); |
| } |
| return JSRegExpResult::Match; |
| } |
| |
| // pop a match off the stack |
| resetMatches(term, context); |
| popParenthesesDisjunctionContext(backTrack); |
| freeParenthesesDisjunctionContext(context); |
| |
| if (result != JSRegExpResult::NoMatch) |
| return result; |
| } |
| |
| return JSRegExpResult::NoMatch; |
| } |
| } |
| |
| RELEASE_ASSERT_NOT_REACHED(); |
| return JSRegExpResult::ErrorNoMatch; |
| } |
| |
| bool matchDotStarEnclosure(ByteTerm& term, DisjunctionContext* context) |
| { |
| UNUSED_PARAM(term); |
| |
| if (pattern->dotAll()) { |
| context->matchBegin = startOffset; |
| context->matchEnd = input.end(); |
| return true; |
| } |
| |
| unsigned matchBegin = context->matchBegin; |
| |
| if (matchBegin > startOffset) { |
| for (matchBegin--; true; matchBegin--) { |
| if (testCharacterClass(pattern->newlineCharacterClass, input.reread(matchBegin))) { |
| ++matchBegin; |
| break; |
| } |
| |
| if (matchBegin == startOffset) |
| break; |
| } |
| } |
| |
| unsigned matchEnd = input.getPos(); |
| |
| for (; (matchEnd != input.end()) |
| && (!testCharacterClass(pattern->newlineCharacterClass, input.reread(matchEnd))); matchEnd++) { } |
| |
| if (((matchBegin && term.anchors.m_bol) |
| || ((matchEnd != input.end()) && term.anchors.m_eol)) |
| && !pattern->multiline()) |
| return false; |
| |
| context->matchBegin = matchBegin; |
| context->matchEnd = matchEnd; |
| return true; |
| } |
| |
| #define MATCH_NEXT() { ++context->term; goto matchAgain; } |
| #define BACKTRACK() { --context->term; goto backtrack; } |
| #define currentTerm() (disjunction->terms[context->term]) |
| JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) |
| { |
| if (UNLIKELY(!isSafeToRecurse())) |
| return JSRegExpResult::ErrorNoMemory; |
| |
| if (!--remainingMatchCount) |
| return JSRegExpResult::ErrorHitLimit; |
| |
| if (btrack) |
| BACKTRACK(); |
| |
| context->matchBegin = input.getPos(); |
| context->term = 0; |
| |
| matchAgain: |
| ASSERT(context->term < static_cast<int>(disjunction->terms.size())); |
| |
| switch (currentTerm().type) { |
| case ByteTerm::Type::SubpatternBegin: |
| MATCH_NEXT(); |
| case ByteTerm::Type::SubpatternEnd: |
| context->matchEnd = input.getPos(); |
| return JSRegExpResult::Match; |
| |
| case ByteTerm::Type::BodyAlternativeBegin: |
| MATCH_NEXT(); |
| case ByteTerm::Type::BodyAlternativeDisjunction: |
| case ByteTerm::Type::BodyAlternativeEnd: |
| context->matchEnd = input.getPos(); |
| return JSRegExpResult::Match; |
| |
| case ByteTerm::Type::AlternativeBegin: |
| MATCH_NEXT(); |
| case ByteTerm::Type::AlternativeDisjunction: |
| case ByteTerm::Type::AlternativeEnd: { |
| int offset = currentTerm().alternative.end; |
| BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); |
| backTrack->offset = offset; |
| context->term += offset; |
| MATCH_NEXT(); |
| } |
| |
| case ByteTerm::Type::AssertionBOL: |
| if (matchAssertionBOL(currentTerm())) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::AssertionEOL: |
| if (matchAssertionEOL(currentTerm())) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::AssertionWordBoundary: |
| if (matchAssertionWordBoundary(currentTerm())) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| |
| case ByteTerm::Type::PatternCharacterOnce: |
| case ByteTerm::Type::PatternCharacterFixed: { |
| if (unicode) { |
| if (!U_IS_BMP(currentTerm().atom.patternCharacter)) { |
| for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) { |
| if (!checkSurrogatePair(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 2 * matchAmount)) { |
| BACKTRACK(); |
| } |
| } |
| MATCH_NEXT(); |
| } |
| } |
| unsigned position = input.getPos(); // May need to back out reading a surrogate pair. |
| |
| for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) { |
| if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount)) { |
| input.setPos(position); |
| BACKTRACK(); |
| } |
| } |
| MATCH_NEXT(); |
| } |
| case ByteTerm::Type::PatternCharacterGreedy: { |
| BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); |
| unsigned matchAmount = 0; |
| unsigned position = input.getPos(); // May need to back out reading a surrogate pair. |
| while ((matchAmount < currentTerm().atom.quantityMaxCount) && input.checkInput(1)) { |
| if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + 1)) { |
| input.setPos(position); |
| break; |
| } |
| ++matchAmount; |
| position = input.getPos(); |
| } |
| backTrack->matchAmount = matchAmount; |
| |
| MATCH_NEXT(); |
| } |
| case ByteTerm::Type::PatternCharacterNonGreedy: { |
| BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); |
| backTrack->begin = input.getPos(); |
| backTrack->matchAmount = 0; |
| MATCH_NEXT(); |
| } |
| |
| case ByteTerm::Type::PatternCasedCharacterOnce: |
| case ByteTerm::Type::PatternCasedCharacterFixed: { |
| if (unicode) { |
| // Case insensitive matching of unicode characters is handled as Type::CharacterClass. |
| ASSERT(U_IS_BMP(currentTerm().atom.patternCharacter)); |
| |
| unsigned position = input.getPos(); // May need to back out reading a surrogate pair. |
| |
| for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) { |
| if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount)) { |
| input.setPos(position); |
| BACKTRACK(); |
| } |
| } |
| MATCH_NEXT(); |
| } |
| |
| for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) { |
| if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount)) |
| BACKTRACK(); |
| } |
| MATCH_NEXT(); |
| } |
| case ByteTerm::Type::PatternCasedCharacterGreedy: { |
| BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); |
| |
| // Case insensitive matching of unicode characters is handled as Type::CharacterClass. |
| ASSERT(!unicode || U_IS_BMP(currentTerm().atom.patternCharacter)); |
| |
| unsigned matchAmount = 0; |
| while ((matchAmount < currentTerm().atom.quantityMaxCount) && input.checkInput(1)) { |
| if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + 1)) { |
| input.uncheckInput(1); |
| break; |
| } |
| ++matchAmount; |
| } |
| backTrack->matchAmount = matchAmount; |
| |
| MATCH_NEXT(); |
| } |
| case ByteTerm::Type::PatternCasedCharacterNonGreedy: { |
| BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); |
| |
| // Case insensitive matching of unicode characters is handled as Type::CharacterClass. |
| ASSERT(!unicode || U_IS_BMP(currentTerm().atom.patternCharacter)); |
| |
| backTrack->matchAmount = 0; |
| MATCH_NEXT(); |
| } |
| |
| case ByteTerm::Type::CharacterClass: |
| if (matchCharacterClass(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::BackReference: |
| if (matchBackReference(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::ParenthesesSubpattern: { |
| JSRegExpResult result = matchParentheses(currentTerm(), context); |
| |
| if (result == JSRegExpResult::Match) { |
| MATCH_NEXT(); |
| } else if (result != JSRegExpResult::NoMatch) |
| return result; |
| |
| BACKTRACK(); |
| } |
| case ByteTerm::Type::ParenthesesSubpatternOnceBegin: |
| if (matchParenthesesOnceBegin(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::ParenthesesSubpatternOnceEnd: |
| if (matchParenthesesOnceEnd(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::ParenthesesSubpatternTerminalBegin: |
| if (matchParenthesesTerminalBegin(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::ParenthesesSubpatternTerminalEnd: |
| if (matchParenthesesTerminalEnd(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::ParentheticalAssertionBegin: |
| if (matchParentheticalAssertionBegin(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::ParentheticalAssertionEnd: |
| if (matchParentheticalAssertionEnd(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| |
| case ByteTerm::Type::CheckInput: |
| if (input.checkInput(currentTerm().checkInputCount)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| |
| case ByteTerm::Type::UncheckInput: |
| input.uncheckInput(currentTerm().checkInputCount); |
| MATCH_NEXT(); |
| |
| case ByteTerm::Type::DotStarEnclosure: |
| if (matchDotStarEnclosure(currentTerm(), context)) |
| return JSRegExpResult::Match; |
| BACKTRACK(); |
| } |
| |
| // We should never fall-through to here. |
| RELEASE_ASSERT_NOT_REACHED(); |
| |
| backtrack: |
| ASSERT(context->term < static_cast<int>(disjunction->terms.size())); |
| |
| switch (currentTerm().type) { |
| case ByteTerm::Type::SubpatternBegin: |
| return JSRegExpResult::NoMatch; |
| case ByteTerm::Type::SubpatternEnd: |
| RELEASE_ASSERT_NOT_REACHED(); |
| |
| case ByteTerm::Type::BodyAlternativeBegin: |
| case ByteTerm::Type::BodyAlternativeDisjunction: { |
| int offset = currentTerm().alternative.next; |
| context->term += offset; |
| if (offset > 0) |
| MATCH_NEXT(); |
| |
| if (input.atEnd() || pattern->sticky()) |
| return JSRegExpResult::NoMatch; |
| |
| input.next(); |
| |
| context->matchBegin = input.getPos(); |
| |
| if (currentTerm().alternative.onceThrough) |
| context->term += currentTerm().alternative.next; |
| |
| MATCH_NEXT(); |
| } |
| case ByteTerm::Type::BodyAlternativeEnd: |
| RELEASE_ASSERT_NOT_REACHED(); |
| |
| case ByteTerm::Type::AlternativeBegin: |
| case ByteTerm::Type::AlternativeDisjunction: { |
| int offset = currentTerm().alternative.next; |
| context->term += offset; |
| if (offset > 0) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| } |
| case ByteTerm::Type::AlternativeEnd: { |
| // We should never backtrack back into an alternative of the main body of the regex. |
| BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); |
| unsigned offset = backTrack->offset; |
| context->term -= offset; |
| BACKTRACK(); |
| } |
| |
| case ByteTerm::Type::AssertionBOL: |
| case ByteTerm::Type::AssertionEOL: |
| case ByteTerm::Type::AssertionWordBoundary: |
| BACKTRACK(); |
| |
| case ByteTerm::Type::PatternCharacterOnce: |
| case ByteTerm::Type::PatternCharacterFixed: |
| case ByteTerm::Type::PatternCharacterGreedy: |
| case ByteTerm::Type::PatternCharacterNonGreedy: |
| if (backtrackPatternCharacter(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::PatternCasedCharacterOnce: |
| case ByteTerm::Type::PatternCasedCharacterFixed: |
| case ByteTerm::Type::PatternCasedCharacterGreedy: |
| case ByteTerm::Type::PatternCasedCharacterNonGreedy: |
| if (backtrackPatternCasedCharacter(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::CharacterClass: |
| if (backtrackCharacterClass(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::BackReference: |
| if (backtrackBackReference(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::ParenthesesSubpattern: { |
| JSRegExpResult result = backtrackParentheses(currentTerm(), context); |
| |
| if (result == JSRegExpResult::Match) { |
| MATCH_NEXT(); |
| } else if (result != JSRegExpResult::NoMatch) |
| return result; |
| |
| BACKTRACK(); |
| } |
| case ByteTerm::Type::ParenthesesSubpatternOnceBegin: |
| if (backtrackParenthesesOnceBegin(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::ParenthesesSubpatternOnceEnd: |
| if (backtrackParenthesesOnceEnd(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::ParenthesesSubpatternTerminalBegin: |
| if (backtrackParenthesesTerminalBegin(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::ParenthesesSubpatternTerminalEnd: |
| if (backtrackParenthesesTerminalEnd(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::ParentheticalAssertionBegin: |
| if (backtrackParentheticalAssertionBegin(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::ParentheticalAssertionEnd: |
| if (backtrackParentheticalAssertionEnd(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| |
| case ByteTerm::Type::CheckInput: |
| input.uncheckInput(currentTerm().checkInputCount); |
| BACKTRACK(); |
| |
| case ByteTerm::Type::UncheckInput: |
| input.checkInput(currentTerm().checkInputCount); |
| BACKTRACK(); |
| |
| case ByteTerm::Type::DotStarEnclosure: |
| RELEASE_ASSERT_NOT_REACHED(); |
| } |
| |
| RELEASE_ASSERT_NOT_REACHED(); |
| return JSRegExpResult::ErrorNoMatch; |
| } |
| |
| JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) |
| { |
| JSRegExpResult result = matchDisjunction(disjunction, context, btrack); |
| |
| if (result == JSRegExpResult::Match) { |
| while (context->matchBegin == context->matchEnd) { |
| result = matchDisjunction(disjunction, context, true); |
| if (result != JSRegExpResult::Match) |
| return result; |
| } |
| return JSRegExpResult::Match; |
| } |
| |
| return result; |
| } |
| |
| // WTF_IGNORES_THREAD_SAFETY_ANALYSIS because this function does conditional locking. |
| unsigned interpret() WTF_IGNORES_THREAD_SAFETY_ANALYSIS |
| { |
| // FIXME: https://bugs.webkit.org/show_bug.cgi?id=195970 |
| // [Yarr Interpreter] The interpreter doesn't have checks for stack overflow due to deep recursion |
| if (!input.isAvailableInput(0)) |
| return offsetNoMatch; |
| |
| if (pattern->m_lock) |
| pattern->m_lock->lock(); |
| |
| for (unsigned i = 0; i < pattern->m_body->m_numSubpatterns + 1; ++i) |
| output[i << 1] = offsetNoMatch; |
| |
| allocatorPool = pattern->m_allocator->startAllocator(); |
| RELEASE_ASSERT(allocatorPool); |
| |
| DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get()); |
| |
| JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false); |
| if (result == JSRegExpResult::Match) { |
| output[0] = context->matchBegin; |
| output[1] = context->matchEnd; |
| } |
| |
| freeDisjunctionContext(context); |
| |
| pattern->m_allocator->stopAllocator(); |
| |
| ASSERT((result == JSRegExpResult::Match) == (output[0] != offsetNoMatch)); |
| |
| if (pattern->m_lock) |
| pattern->m_lock->unlock(); |
| |
| return output[0]; |
| } |
| |
| Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start) |
| : pattern(pattern) |
| , unicode(pattern->unicode()) |
| , output(output) |
| , input(input, start, length, pattern->unicode()) |
| , startOffset(start) |
| , remainingMatchCount(matchLimit) |
| { |
| } |
| |
| private: |
| inline bool isSafeToRecurse() { return m_stackCheck.isSafeToRecurse(); } |
| |
| BytecodePattern* pattern; |
| bool unicode; |
| unsigned* output; |
| InputStream input; |
| StackCheck m_stackCheck; |
| WTF::BumpPointerPool* allocatorPool { nullptr }; |
| unsigned startOffset; |
| unsigned remainingMatchCount; |
| }; |
| |
| class ByteCompiler { |
| struct ParenthesesStackEntry { |
| unsigned beginTerm; |
| unsigned savedAlternativeIndex; |
| ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/) |
| : beginTerm(beginTerm) |
| , savedAlternativeIndex(savedAlternativeIndex) |
| { |
| } |
| }; |
| |
| public: |
| ByteCompiler(YarrPattern& pattern) |
| : m_pattern(pattern) |
| { |
| } |
| |
| std::unique_ptr<BytecodePattern> compile(BumpPointerAllocator* allocator, ConcurrentJSLock* lock, ErrorCode& errorCode) |
| { |
| if (UNLIKELY(!isSafeToRecurse())) { |
| errorCode = ErrorCode::TooManyDisjunctions; |
| return nullptr; |
| } |
| |
| regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough()); |
| if (auto error = emitDisjunction(m_pattern.m_body, 0, 0)) { |
| errorCode = error.value(); |
| return nullptr; |
| } |
| regexEnd(); |
| |
| #ifndef NDEBUG |
| if (Options::dumpCompiledRegExpPatterns()) |
| dumpDisjunction(m_bodyDisjunction.get()); |
| #endif |
| |
| return makeUnique<BytecodePattern>(WTFMove(m_bodyDisjunction), m_allParenthesesInfo, m_pattern, allocator, lock); |
| } |
| |
| void checkInput(unsigned count) |
| { |
| m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count)); |
| } |
| |
| void uncheckInput(unsigned count) |
| { |
| m_bodyDisjunction->terms.append(ByteTerm::UncheckInput(count)); |
| } |
| |
| void assertionBOL(unsigned inputPosition) |
| { |
| m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition)); |
| } |
| |
| void assertionEOL(unsigned inputPosition) |
| { |
| m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition)); |
| } |
| |
| void assertionWordBoundary(bool invert, unsigned inputPosition) |
| { |
| m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition)); |
| } |
| |
| void atomPatternCharacter(UChar32 ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType) |
| { |
| if (m_pattern.ignoreCase()) { |
| UChar32 lo = u_tolower(ch); |
| UChar32 hi = u_toupper(ch); |
| |
| if (lo != hi) { |
| m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityMaxCount, quantityType)); |
| return; |
| } |
| } |
| |
| m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityMaxCount, quantityType)); |
| } |
| |
| void atomCharacterClass(CharacterClass* characterClass, bool invert, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType) |
| { |
| m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition)); |
| |
| m_bodyDisjunction->terms.last().atom.quantityMaxCount = quantityMaxCount; |
| m_bodyDisjunction->terms.last().atom.quantityType = quantityType; |
| m_bodyDisjunction->terms.last().frameLocation = frameLocation; |
| } |
| |
| void atomBackReference(unsigned subpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType) |
| { |
| ASSERT(subpatternId); |
| |
| m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition)); |
| |
| m_bodyDisjunction->terms.last().atom.quantityMaxCount = quantityMaxCount; |
| m_bodyDisjunction->terms.last().atom.quantityType = quantityType; |
| m_bodyDisjunction->terms.last().frameLocation = frameLocation; |
| } |
| |
| void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) |
| { |
| unsigned beginTerm = m_bodyDisjunction->terms.size(); |
| |
| m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::Type::ParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition)); |
| m_bodyDisjunction->terms.last().frameLocation = frameLocation; |
| m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); |
| m_bodyDisjunction->terms.last().frameLocation = alternativeFrameLocation; |
| |
| m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); |
| m_currentAlternativeIndex = beginTerm + 1; |
| } |
| |
| void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) |
| { |
| unsigned beginTerm = m_bodyDisjunction->terms.size(); |
| |
| m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::Type::ParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition)); |
| m_bodyDisjunction->terms.last().frameLocation = frameLocation; |
| m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); |
| m_bodyDisjunction->terms.last().frameLocation = alternativeFrameLocation; |
| |
| m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); |
| m_currentAlternativeIndex = beginTerm + 1; |
| } |
| |
| void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) |
| { |
| // Errrk! - this is a little crazy, we initially generate as a Type::ParenthesesSubpatternOnceBegin, |
| // then fix this up at the end! - simplifying this should make it much clearer. |
| // https://bugs.webkit.org/show_bug.cgi?id=50136 |
| |
| unsigned beginTerm = m_bodyDisjunction->terms.size(); |
| |
| m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::Type::ParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition)); |
| m_bodyDisjunction->terms.last().frameLocation = frameLocation; |
| m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); |
| m_bodyDisjunction->terms.last().frameLocation = alternativeFrameLocation; |
| |
| m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); |
| m_currentAlternativeIndex = beginTerm + 1; |
| } |
| |
| void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation) |
| { |
| unsigned beginTerm = m_bodyDisjunction->terms.size(); |
| |
| m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::Type::ParentheticalAssertionBegin, subpatternId, false, invert, 0)); |
| m_bodyDisjunction->terms.last().frameLocation = frameLocation; |
| m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); |
| m_bodyDisjunction->terms.last().frameLocation = alternativeFrameLocation; |
| |
| m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); |
| m_currentAlternativeIndex = beginTerm + 1; |
| } |
| |
| void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType) |
| { |
| unsigned beginTerm = popParenthesesStack(); |
| closeAlternative(beginTerm + 1); |
| unsigned endTerm = m_bodyDisjunction->terms.size(); |
| |
| ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::Type::ParentheticalAssertionBegin); |
| |
| bool invert = m_bodyDisjunction->terms[beginTerm].invert(); |
| unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; |
| |
| m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::Type::ParentheticalAssertionEnd, subpatternId, false, invert, inputPosition)); |
| m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; |
| m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; |
| m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; |
| |
| m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount; |
| m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; |
| m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount; |
| m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; |
| } |
| |
| void assertionDotStarEnclosure(bool bolAnchored, bool eolAnchored) |
| { |
| m_bodyDisjunction->terms.append(ByteTerm::DotStarEnclosure(bolAnchored, eolAnchored)); |
| } |
| |
| unsigned popParenthesesStack() |
| { |
| ASSERT(m_parenthesesStack.size()); |
| unsigned beginTerm = m_parenthesesStack.last().beginTerm; |
| m_currentAlternativeIndex = m_parenthesesStack.last().savedAlternativeIndex; |
| m_parenthesesStack.removeLast(); |
| |
| ASSERT(beginTerm < m_bodyDisjunction->terms.size()); |
| ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size()); |
| |
| return beginTerm; |
| } |
| |
| void closeAlternative(unsigned beginTerm) |
| { |
| unsigned origBeginTerm = beginTerm; |
| ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::Type::AlternativeBegin); |
| unsigned endIndex = m_bodyDisjunction->terms.size(); |
| |
| unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; |
| |
| if (!m_bodyDisjunction->terms[beginTerm].alternative.next) |
| m_bodyDisjunction->terms.remove(beginTerm); |
| else { |
| while (m_bodyDisjunction->terms[beginTerm].alternative.next) { |
| beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; |
| ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::Type::AlternativeDisjunction); |
| m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; |
| m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; |
| } |
| |
| m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; |
| |
| m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd()); |
| m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; |
| } |
| } |
| |
| void closeBodyAlternative() |
| { |
| unsigned beginTerm = 0; |
| unsigned origBeginTerm = 0; |
| ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::Type::BodyAlternativeBegin); |
| unsigned endIndex = m_bodyDisjunction->terms.size(); |
| |
| unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; |
| |
| while (m_bodyDisjunction->terms[beginTerm].alternative.next) { |
| beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; |
| ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::Type::BodyAlternativeDisjunction); |
| m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; |
| m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; |
| } |
| |
| m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; |
| |
| m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd()); |
| m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; |
| } |
| |
| void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType, unsigned callFrameSize = 0) |
| { |
| unsigned beginTerm = popParenthesesStack(); |
| closeAlternative(beginTerm + 1); |
| unsigned endTerm = m_bodyDisjunction->terms.size(); |
| |
| ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::Type::ParenthesesSubpatternOnceBegin); |
| |
| ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm]; |
| |
| bool capture = parenthesesBegin.capture(); |
| unsigned subpatternId = parenthesesBegin.atom.subpatternId; |
| |
| unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; |
| auto parenthesesDisjunction = makeUnique<ByteDisjunction>(numSubpatterns, callFrameSize); |
| |
| unsigned firstTermInParentheses = beginTerm + 1; |
| parenthesesDisjunction->terms.reserveInitialCapacity(endTerm - firstTermInParentheses + 2); |
| |
| parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin()); |
| for (unsigned termInParentheses = firstTermInParentheses; termInParentheses < endTerm; ++termInParentheses) |
| parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]); |
| parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd()); |
| |
| m_bodyDisjunction->terms.shrink(beginTerm); |
| |
| m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::Type::ParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, inputPosition)); |
| m_allParenthesesInfo.append(WTFMove(parenthesesDisjunction)); |
| |
| m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount; |
| m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount; |
| m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; |
| m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; |
| } |
| |
| void atomParenthesesOnceEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType) |
| { |
| unsigned beginTerm = popParenthesesStack(); |
| closeAlternative(beginTerm + 1); |
| unsigned endTerm = m_bodyDisjunction->terms.size(); |
| |
| ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::Type::ParenthesesSubpatternOnceBegin); |
| |
| bool capture = m_bodyDisjunction->terms[beginTerm].capture(); |
| unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; |
| |
| m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::Type::ParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition)); |
| m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; |
| m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; |
| m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; |
| |
| m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount; |
| m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount; |
| m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; |
| m_bodyDisjunction->terms[endTerm].atom.quantityMinCount = quantityMinCount; |
| m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount; |
| m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; |
| } |
| |
| void atomParenthesesTerminalEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType) |
| { |
| unsigned beginTerm = popParenthesesStack(); |
| closeAlternative(beginTerm + 1); |
| unsigned endTerm = m_bodyDisjunction->terms.size(); |
| |
| ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::Type::ParenthesesSubpatternTerminalBegin); |
| |
| bool capture = m_bodyDisjunction->terms[beginTerm].capture(); |
| unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; |
| |
| m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::Type::ParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition)); |
| m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; |
| m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; |
| m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; |
| |
| m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount; |
| m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount; |
| m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; |
| m_bodyDisjunction->terms[endTerm].atom.quantityMinCount = quantityMinCount; |
| m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount; |
| m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; |
| } |
| |
| void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough) |
| { |
| m_bodyDisjunction = makeUnique<ByteDisjunction>(numSubpatterns, callFrameSize); |
| m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough)); |
| m_bodyDisjunction->terms[0].frameLocation = 0; |
| m_currentAlternativeIndex = 0; |
| } |
| |
| void regexEnd() |
| { |
| closeBodyAlternative(); |
| } |
| |
| void alternativeBodyDisjunction(bool onceThrough) |
| { |
| unsigned newAlternativeIndex = m_bodyDisjunction->terms.size(); |
| m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; |
| m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough)); |
| |
| m_currentAlternativeIndex = newAlternativeIndex; |
| } |
| |
| void alternativeDisjunction() |
| { |
| unsigned newAlternativeIndex = m_bodyDisjunction->terms.size(); |
| m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; |
| m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction()); |
| |
| m_currentAlternativeIndex = newAlternativeIndex; |
| } |
| |
| std::optional<ErrorCode> WARN_UNUSED_RETURN emitDisjunction(PatternDisjunction* disjunction, CheckedUint32 inputCountAlreadyChecked, unsigned parenthesesInputCountAlreadyChecked) |
| { |
| if (UNLIKELY(!isSafeToRecurse())) |
| return ErrorCode::TooManyDisjunctions; |
| |
| for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { |
| auto currentCountAlreadyChecked = inputCountAlreadyChecked; |
| |
| PatternAlternative* alternative = disjunction->m_alternatives[alt].get(); |
| |
| if (alt) { |
| if (disjunction == m_pattern.m_body) |
| alternativeBodyDisjunction(alternative->onceThrough()); |
| else |
| alternativeDisjunction(); |
| } |
| |
| unsigned minimumSize = alternative->m_minimumSize; |
| ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked); |
| unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked; |
| |
| if (countToCheck) { |
| checkInput(countToCheck); |
| currentCountAlreadyChecked += countToCheck; |
| if (currentCountAlreadyChecked.hasOverflowed()) |
| return ErrorCode::OffsetTooLarge; |
| } |
| |
| for (auto& term : alternative->m_terms) { |
| switch (term.type) { |
| case PatternTerm::Type::AssertionBOL: |
| assertionBOL(currentCountAlreadyChecked - term.inputPosition); |
| break; |
| |
| case PatternTerm::Type::AssertionEOL: |
| assertionEOL(currentCountAlreadyChecked - term.inputPosition); |
| break; |
| |
| case PatternTerm::Type::AssertionWordBoundary: |
| assertionWordBoundary(term.invert(), currentCountAlreadyChecked - term.inputPosition); |
| break; |
| |
| case PatternTerm::Type::PatternCharacter: |
| atomPatternCharacter(term.patternCharacter, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityMaxCount, term.quantityType); |
| break; |
| |
| case PatternTerm::Type::CharacterClass: |
| atomCharacterClass(term.characterClass, term.invert(), currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityMaxCount, term.quantityType); |
| break; |
| |
| case PatternTerm::Type::BackReference: |
| atomBackReference(term.backReferenceSubpatternId, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityMaxCount, term.quantityType); |
| break; |
| |
| case PatternTerm::Type::ForwardReference: |
| break; |
| |
| case PatternTerm::Type::ParenthesesSubpattern: { |
| unsigned disjunctionAlreadyCheckedCount = 0; |
| if (term.quantityMaxCount == 1 && !term.parentheses.isCopy) { |
| unsigned alternativeFrameLocation = term.frameLocation; |
| // For QuantifierType::FixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame. |
| if (term.quantityType == QuantifierType::FixedCount) |
| disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize; |
| else |
| alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; |
| unsigned delegateEndInputOffset = currentCountAlreadyChecked - term.inputPosition; |
| atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, alternativeFrameLocation); |
| if (auto error = emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount)) |
| return error; |
| atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType); |
| } else if (term.parentheses.isTerminal) { |
| unsigned delegateEndInputOffset = currentCountAlreadyChecked - term.inputPosition; |
| atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesTerminal); |
| if (auto error = emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount)) |
| return error; |
| atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType); |
| } else { |
| unsigned delegateEndInputOffset = currentCountAlreadyChecked - term.inputPosition; |
| atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, 0); |
| if (auto error = emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0)) |
| return error; |
| atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); |
| } |
| break; |
| } |
| |
| case PatternTerm::Type::ParentheticalAssertion: { |
| unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion; |
| unsigned positiveInputOffset = currentCountAlreadyChecked - term.inputPosition; |
| unsigned uncheckAmount = 0; |
| if (positiveInputOffset > term.parentheses.disjunction->m_minimumSize) { |
| uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize; |
| uncheckInput(uncheckAmount); |
| currentCountAlreadyChecked -= uncheckAmount; |
| if (currentCountAlreadyChecked.hasOverflowed()) |
| return ErrorCode::OffsetTooLarge; |
| } |
| |
| atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation); |
| if (auto error = emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset - uncheckAmount)) |
| return error; |
| atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityMaxCount, term.quantityType); |
| if (uncheckAmount) { |
| checkInput(uncheckAmount); |
| currentCountAlreadyChecked += uncheckAmount; |
| if (currentCountAlreadyChecked.hasOverflowed()) |
| return ErrorCode::OffsetTooLarge; |
| } |
| break; |
| } |
| |
| case PatternTerm::Type::DotStarEnclosure: |
| assertionDotStarEnclosure(term.anchors.bolAnchor, term.anchors.eolAnchor); |
| break; |
| } |
| } |
| } |
| return std::nullopt; |
| } |
| #ifndef NDEBUG |
| void dumpDisjunction(ByteDisjunction* disjunction, unsigned nesting = 0) |
| { |
| PrintStream& out = WTF::dataFile(); |
| |
| unsigned termIndexNest = 0; |
| |
| if (!nesting) { |
| out.printf("ByteDisjunction(%p):\n", disjunction); |
| nesting = 1; |
| } else { |
| termIndexNest = nesting - 1; |
| nesting = 2; |
| } |
| |
| auto outputTermIndexAndNest = [&](size_t index, unsigned termNesting) { |
| for (unsigned nestingDepth = 0; nestingDepth < termIndexNest; nestingDepth++) |
| out.print(" "); |
| out.printf("%4zu", index); |
| for (unsigned nestingDepth = 0; nestingDepth < termNesting; nestingDepth++) |
| out.print(" "); |
| }; |
| |
| auto dumpQuantity = [&](ByteTerm& term) { |
| if (term.atom.quantityType == QuantifierType::FixedCount && term.atom.quantityMinCount == 1 && term.atom.quantityMaxCount == 1) |
| return; |
| |
| out.print(" {", term.atom.quantityMinCount); |
| if (term.atom.quantityMinCount != term.atom.quantityMaxCount) { |
| if (term.atom.quantityMaxCount == UINT_MAX) |
| out.print(",inf"); |
| else |
| out.print(",", term.atom.quantityMaxCount); |
| } |
| out.print("}"); |
| if (term.atom.quantityType == QuantifierType::Greedy) |
| out.print(" greedy"); |
| else if (term.atom.quantityType == QuantifierType::NonGreedy) |
| out.print(" non-greedy"); |
| }; |
| |
| auto dumpCaptured = [&](ByteTerm& term) { |
| if (term.capture()) |
| out.print(" captured (#", term.atom.subpatternId, ")"); |
| }; |
| |
| auto dumpInverted = [&](ByteTerm& term) { |
| if (term.invert()) |
| out.print(" inverted"); |
| }; |
| |
| auto dumpInputPosition = [&](ByteTerm& term) { |
| out.printf(" inputPosition %u", term.inputPosition); |
| }; |
| |
| auto dumpFrameLocation = [&](ByteTerm& term) { |
| out.printf(" frameLocation %u", term.frameLocation); |
| }; |
| |
| auto dumpCharacter = [&](ByteTerm& term) { |
| out.print(" "); |
| dumpUChar32(out, term.atom.patternCharacter); |
| }; |
| |
| auto dumpCharClass = [&](ByteTerm& term) { |
| out.print(" "); |
| dumpCharacterClass(out, &m_pattern, term.atom.characterClass); |
| }; |
| |
| for (size_t idx = 0; idx < disjunction->terms.size(); ++idx) { |
| ByteTerm term = disjunction->terms[idx]; |
| |
| bool outputNewline = true; |
| |
| switch (term.type) { |
| case ByteTerm::Type::BodyAlternativeBegin: |
| outputTermIndexAndNest(idx, nesting++); |
| out.print("BodyAlternativeBegin"); |
| if (term.alternative.onceThrough) |
| out.print(" onceThrough"); |
| dumpFrameLocation(term); |
| break; |
| case ByteTerm::Type::BodyAlternativeDisjunction: |
| outputTermIndexAndNest(idx, nesting - 1); |
| out.print("BodyAlternativeDisjunction"); |
| dumpFrameLocation(term); |
| break; |
| case ByteTerm::Type::BodyAlternativeEnd: |
| outputTermIndexAndNest(idx, --nesting); |
| out.print("BodyAlternativeEnd"); |
| dumpFrameLocation(term); |
| break; |
| case ByteTerm::Type::AlternativeBegin: |
| outputTermIndexAndNest(idx, nesting++); |
| out.print("AlternativeBegin"); |
| dumpFrameLocation(term); |
| break; |
| case ByteTerm::Type::AlternativeDisjunction: |
| outputTermIndexAndNest(idx, nesting - 1); |
| out.print("AlternativeDisjunction"); |
| dumpFrameLocation(term); |
| break; |
| case ByteTerm::Type::AlternativeEnd: |
| outputTermIndexAndNest(idx, --nesting); |
| out.print("AlternativeEnd"); |
| dumpFrameLocation(term); |
| break; |
| case ByteTerm::Type::SubpatternBegin: |
| outputTermIndexAndNest(idx, nesting++); |
| out.print("SubpatternBegin"); |
| break; |
| case ByteTerm::Type::SubpatternEnd: |
| outputTermIndexAndNest(idx, --nesting); |
| out.print("SubpatternEnd"); |
| break; |
| case ByteTerm::Type::AssertionBOL: |
| outputTermIndexAndNest(idx, nesting); |
| out.print("AssertionBOL"); |
| break; |
| case ByteTerm::Type::AssertionEOL: |
| outputTermIndexAndNest(idx, nesting); |
| out.print("AssertionEOL"); |
| break; |
| case ByteTerm::Type::AssertionWordBoundary: |
| outputTermIndexAndNest(idx, nesting); |
| out.print("AssertionWordBoundary"); |
| break; |
| case ByteTerm::Type::PatternCharacterOnce: |
| outputTermIndexAndNest(idx, nesting); |
| out.print("PatternCharacterOnce"); |
| dumpInverted(term); |
| dumpInputPosition(term); |
| dumpFrameLocation(term); |
| dumpCharacter(term); |
| dumpQuantity(term); |
| break; |
| case ByteTerm::Type::PatternCharacterFixed: |
| outputTermIndexAndNest(idx, nesting); |
| out.print("PatternCharacterFixed"); |
| dumpInverted(term); |
| dumpInputPosition(term); |
| dumpFrameLocation(term); |
| dumpCharacter(term); |
| out.print(" {", term.atom.quantityMinCount, "}"); |
| break; |
| case ByteTerm::Type::PatternCharacterGreedy: |
| outputTermIndexAndNest(idx, nesting); |
| out.print("PatternCharacterGreedy"); |
| dumpInverted(term); |
| dumpInputPosition(term); |
| dumpFrameLocation(term); |
| dumpCharacter(term); |
| dumpQuantity(term); |
| break; |
| case ByteTerm::Type::PatternCharacterNonGreedy: |
| outputTermIndexAndNest(idx, nesting); |
| out.print("PatternCharacterNonGreedy"); |
| dumpInverted(term); |
| dumpInputPosition(term); |
| dumpFrameLocation(term); |
| dumpCharacter(term); |
| dumpQuantity(term); |
| break; |
| case ByteTerm::Type::PatternCasedCharacterOnce: |
| outputTermIndexAndNest(idx, nesting); |
| out.print("PatternCasedCharacterOnce"); |
| break; |
| case ByteTerm::Type::PatternCasedCharacterFixed: |
| outputTermIndexAndNest(idx, nesting); |
| out.print("PatternCasedCharacterFixed"); |
| break; |
| case ByteTerm::Type::PatternCasedCharacterGreedy: |
| outputTermIndexAndNest(idx, nesting); |
| out.print("PatternCasedCharacterGreedy"); |
| break; |
| case ByteTerm::Type::PatternCasedCharacterNonGreedy: |
| outputTermIndexAndNest(idx, nesting); |
| out.print("PatternCasedCharacterNonGreedy"); |
| break; |
| case ByteTerm::Type::CharacterClass: |
| outputTermIndexAndNest(idx, nesting); |
| out.print("CharacterClass"); |
| dumpInverted(term); |
| dumpInputPosition(term); |
| dumpFrameLocation(term); |
| dumpCharClass(term); |
| dumpQuantity(term); |
| break; |
| case ByteTerm::Type::BackReference: |
| outputTermIndexAndNest(idx, nesting); |
| out.print("BackReference #", term.atom.subpatternId); |
| dumpQuantity(term); |
| break; |
| case ByteTerm::Type::ParenthesesSubpattern: |
| outputTermIndexAndNest(idx, nesting); |
| out.print("ParenthesesSubpattern"); |
| dumpCaptured(term); |
| dumpInverted(term); |
| dumpInputPosition(term); |
| dumpFrameLocation(term); |
| dumpQuantity(term); |
| out.print("\n"); |
| outputNewline = false; |
| dumpDisjunction(term.atom.parenthesesDisjunction, nesting); |
| break; |
| case ByteTerm::Type::ParenthesesSubpatternOnceBegin: |
| outputTermIndexAndNest(idx, nesting++); |
| out.print("ParenthesesSubpatternOnceBegin"); |
| dumpCaptured(term); |
| dumpInverted(term); |
| dumpInputPosition(term); |
| dumpFrameLocation(term); |
| break; |
| case ByteTerm::Type::ParenthesesSubpatternOnceEnd: |
| outputTermIndexAndNest(idx, --nesting); |
| out.print("ParenthesesSubpatternOnceEnd"); |
| dumpFrameLocation(term); |
| break; |
| case ByteTerm::Type::ParenthesesSubpatternTerminalBegin: |
| outputTermIndexAndNest(idx, nesting++); |
| out.print("ParenthesesSubpatternTerminalBegin"); |
| dumpInverted(term); |
| dumpInputPosition(term); |
| dumpFrameLocation(term); |
| break; |
| case ByteTerm::Type::ParenthesesSubpatternTerminalEnd: |
| outputTermIndexAndNest(idx, --nesting); |
| out.print("ParenthesesSubpatternTerminalEnd"); |
| dumpFrameLocation(term); |
| break; |
| case ByteTerm::Type::ParentheticalAssertionBegin: |
| outputTermIndexAndNest(idx, nesting++); |
| out.print("ParentheticalAssertionBegin"); |
| dumpInverted(term); |
| dumpInputPosition(term); |
| dumpFrameLocation(term); |
| break; |
| case ByteTerm::Type::ParentheticalAssertionEnd: |
| outputTermIndexAndNest(idx, --nesting); |
| out.print("ParentheticalAssertionEnd"); |
| dumpFrameLocation(term); |
| break; |
| case ByteTerm::Type::CheckInput: |
| outputTermIndexAndNest(idx, nesting); |
| out.print("CheckInput ", term.checkInputCount); |
| break; |
| case ByteTerm::Type::UncheckInput: |
| outputTermIndexAndNest(idx, nesting); |
| out.print("UncheckInput ", term.checkInputCount); |
| break; |
| case ByteTerm::Type::DotStarEnclosure: |
| outputTermIndexAndNest(idx, nesting); |
| out.print("DotStarEnclosure"); |
| break; |
| } |
| if (outputNewline) |
| out.print("\n"); |
| } |
| } |
| #endif |
| |
| private: |
| inline bool isSafeToRecurse() { return m_stackCheck.isSafeToRecurse(); } |
| |
| YarrPattern& m_pattern; |
| std::unique_ptr<ByteDisjunction> m_bodyDisjunction; |
| StackCheck m_stackCheck; |
| unsigned m_currentAlternativeIndex { 0 }; |
| Vector<ParenthesesStackEntry> m_parenthesesStack; |
| Vector<std::unique_ptr<ByteDisjunction>> m_allParenthesesInfo; |
| }; |
| |
| std::unique_ptr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator, ErrorCode& errorCode, ConcurrentJSLock* lock) |
| { |
| return ByteCompiler(pattern).compile(allocator, lock, errorCode); |
| } |
| |
| unsigned interpret(BytecodePattern* bytecode, StringView input, unsigned start, unsigned* output) |
| { |
| SuperSamplerScope superSamplerScope(false); |
| if (input.is8Bit()) |
| return Interpreter<LChar>(bytecode, output, input.characters8(), input.length(), start).interpret(); |
| return Interpreter<UChar>(bytecode, output, input.characters16(), input.length(), start).interpret(); |
| } |
| |
| unsigned interpret(BytecodePattern* bytecode, const LChar* input, unsigned length, unsigned start, unsigned* output) |
| { |
| SuperSamplerScope superSamplerScope(false); |
| return Interpreter<LChar>(bytecode, output, input, length, start).interpret(); |
| } |
| |
| unsigned interpret(BytecodePattern* bytecode, const UChar* input, unsigned length, unsigned start, unsigned* output) |
| { |
| SuperSamplerScope superSamplerScope(false); |
| return Interpreter<UChar>(bytecode, output, input, length, start).interpret(); |
| } |
| |
| // These should be the same for both UChar & LChar. |
| static_assert(sizeof(BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t))); |
| static_assert(sizeof(BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t))); |
| static_assert(sizeof(BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t))); |
| static_assert(sizeof(BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t))); |
| static_assert(sizeof(BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t))); |
| static_assert(sizeof(BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t))); |
| static_assert(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) <= (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t))); |
| |
| |
| } } |