| /* |
| * Copyright (C) 2008 Apple Inc. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
| * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
| * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "config.h" |
| #include "WREC.h" |
| |
| #if ENABLE(WREC) |
| |
| #include "CharacterClassConstructor.h" |
| #include "Machine.h" |
| #include "pcre_internal.h" |
| |
| using namespace WTF; |
| |
| namespace JSC { |
| |
| class GenerateAtomFunctor { |
| public: |
| virtual ~GenerateAtomFunctor() {} |
| |
| virtual void generateAtom(WRECGenerator*, JmpSrcVector&) = 0; |
| virtual void backtrack(WRECGenerator*) = 0; |
| }; |
| |
| class GeneratePatternCharacterFunctor : public GenerateAtomFunctor { |
| public: |
| GeneratePatternCharacterFunctor(const UChar ch) |
| : m_ch(ch) |
| { |
| } |
| |
| virtual void generateAtom(WRECGenerator*, JmpSrcVector&); |
| virtual void backtrack(WRECGenerator*); |
| |
| private: |
| const UChar m_ch; |
| }; |
| |
| class GenerateCharacterClassFunctor : public GenerateAtomFunctor { |
| public: |
| GenerateCharacterClassFunctor(CharacterClass* charClass, bool invert) |
| : m_charClass(charClass) |
| , m_invert(invert) |
| { |
| } |
| |
| virtual void generateAtom(WRECGenerator*, JmpSrcVector&); |
| virtual void backtrack(WRECGenerator*); |
| |
| private: |
| CharacterClass* m_charClass; |
| bool m_invert; |
| }; |
| |
| class GenerateBackreferenceFunctor : public GenerateAtomFunctor { |
| public: |
| GenerateBackreferenceFunctor(unsigned subpatternId) |
| : m_subpatternId(subpatternId) |
| { |
| } |
| |
| virtual void generateAtom(WRECGenerator*, JmpSrcVector&); |
| virtual void backtrack(WRECGenerator*); |
| |
| private: |
| unsigned m_subpatternId; |
| }; |
| |
| class GenerateParenthesesNonGreedyFunctor : public GenerateAtomFunctor { |
| public: |
| GenerateParenthesesNonGreedyFunctor(WRECGenerator::JmpDst start, WRECGenerator::JmpSrc success, WRECGenerator::JmpSrc fail) |
| : m_start(start) |
| , m_success(success) |
| , m_fail(fail) |
| { |
| } |
| |
| virtual void generateAtom(WRECGenerator*, JmpSrcVector&); |
| virtual void backtrack(WRECGenerator*); |
| |
| private: |
| WRECGenerator::JmpDst m_start; |
| WRECGenerator::JmpSrc m_success; |
| WRECGenerator::JmpSrc m_fail; |
| }; |
| |
| void GeneratePatternCharacterFunctor::generateAtom(WRECGenerator* wrec, JmpSrcVector& failures) |
| { |
| wrec->generatePatternCharacter(failures, m_ch); |
| } |
| void GeneratePatternCharacterFunctor::backtrack(WRECGenerator* wrec) |
| { |
| wrec->generateBacktrack1(); |
| } |
| |
| void GenerateCharacterClassFunctor::generateAtom(WRECGenerator* wrec, JmpSrcVector& failures) |
| { |
| wrec->generateCharacterClass(failures, *m_charClass, m_invert); |
| } |
| void GenerateCharacterClassFunctor::backtrack(WRECGenerator* wrec) |
| { |
| wrec->generateBacktrack1(); |
| } |
| |
| void GenerateBackreferenceFunctor::generateAtom(WRECGenerator* wrec, JmpSrcVector& failures) |
| { |
| wrec->generateBackreference(failures, m_subpatternId); |
| } |
| void GenerateBackreferenceFunctor::backtrack(WRECGenerator* wrec) |
| { |
| wrec->generateBacktrackBackreference(m_subpatternId); |
| } |
| |
| void GenerateParenthesesNonGreedyFunctor::generateAtom(WRECGenerator* wrec, JmpSrcVector& failures) |
| { |
| wrec->generateParenthesesNonGreedy(failures, m_start, m_success, m_fail); |
| } |
| |
| void GenerateParenthesesNonGreedyFunctor::backtrack(WRECGenerator*) |
| { |
| // FIXME: do something about this. |
| CRASH(); |
| } |
| |
| void WRECGenerator::generateBacktrack1() |
| { |
| m_jit.subl_i8r(1, currentPositionRegister); |
| } |
| |
| void WRECGenerator::generateBacktrackBackreference(unsigned subpatternId) |
| { |
| m_jit.subl_mr((2 * subpatternId + 1) * sizeof(int), outputRegister, currentPositionRegister); |
| m_jit.addl_mr((2 * subpatternId) * sizeof(int), outputRegister, currentPositionRegister); |
| } |
| |
| void WRECGenerator::generateBackreferenceQuantifier(JmpSrcVector& failures, Quantifier::Type quantifierType, unsigned subpatternId, unsigned min, unsigned max) |
| { |
| GenerateBackreferenceFunctor functor(subpatternId); |
| |
| m_jit.movl_mr((2 * subpatternId) * sizeof(int), outputRegister, currentValueRegister); |
| m_jit.cmpl_rm(currentValueRegister, ((2 * subpatternId) + 1) * sizeof(int), outputRegister); |
| JmpSrc skipIfEmpty = m_jit.emitUnlinkedJe(); |
| |
| ASSERT(quantifierType == Quantifier::Greedy || quantifierType == Quantifier::NonGreedy); |
| if (quantifierType == Quantifier::Greedy) |
| generateGreedyQuantifier(failures, functor, min, max); |
| else |
| generateNonGreedyQuantifier(failures, functor, min, max); |
| |
| m_jit.link(skipIfEmpty, m_jit.label()); |
| } |
| |
| void WRECGenerator::generateNonGreedyQuantifier(JmpSrcVector& failures, GenerateAtomFunctor& functor, unsigned min, unsigned max) |
| { |
| // comment me better! |
| JmpSrcVector newFailures; |
| |
| // (0) Setup: |
| // init quantifierCountRegister |
| m_jit.pushl_r(quantifierCountRegister); |
| m_jit.xorl_rr(quantifierCountRegister, quantifierCountRegister); |
| JmpSrc gotoStart = m_jit.emitUnlinkedJmp(); |
| |
| // (4) Failure case |
| |
| JmpDst quantifierFailed = m_jit.label(); |
| // (4.1) Restore original value of quantifierCountRegister from the stack |
| m_jit.popl_r(quantifierCountRegister); |
| failures.append(m_jit.emitUnlinkedJmp()); |
| |
| // (3) We just tried an alternative, and it failed - check we can try more. |
| |
| JmpDst alternativeFailed = m_jit.label(); |
| // (3.1) remove the value pushed prior to testing the alternative |
| m_jit.popl_r(currentPositionRegister); |
| // (3.2) if there is a limit, and we have reached it, game over. |
| if (max != Quantifier::noMaxSpecified) { |
| m_jit.cmpl_i32r(max, quantifierCountRegister); |
| m_jit.link(m_jit.emitUnlinkedJe(), quantifierFailed); |
| } |
| |
| // (1) Do a check for the atom |
| |
| // (1.0) This is where we start, if there is a minimum (then we must read at least one of the atom). |
| JmpDst testQuantifiedAtom = m_jit.label(); |
| if (min) |
| m_jit.link(gotoStart, testQuantifiedAtom); |
| // (1.1) Do a check for the atom check. |
| functor.generateAtom(this, newFailures); |
| // (1.2) If we get here, successful match! |
| m_jit.addl_i8r(1, quantifierCountRegister); |
| // (1.3) We needed to read the atom, and we failed - that's terminally bad news. |
| for (unsigned i = 0; i < newFailures.size(); ++i) |
| m_jit.link(newFailures[i], quantifierFailed); |
| newFailures.clear(); |
| // (1.4) If there is a minimum, check we have read enough ... |
| if (min) { |
| // ... except if min was 1 no need to keep checking! |
| if (min != 1) { |
| m_jit.cmpl_i32r(min, quantifierCountRegister); |
| m_jit.link(m_jit.emitUnlinkedJl(), testQuantifiedAtom); |
| } |
| } else |
| m_jit.link(gotoStart, m_jit.label()); |
| // if there was no minimum, this is where we start. |
| |
| // (2) Plant an alternative check for the remainder of the expression |
| |
| // (2.1) recursively call to parseAlternative, if it falls through, success! |
| m_jit.pushl_r(currentPositionRegister); |
| m_parser.parseAlternative(newFailures); |
| m_jit.addl_i8r(4, X86::esp); |
| m_jit.popl_r(quantifierCountRegister); |
| // (2.2) link failure cases to jump back up to alternativeFailed. |
| for (unsigned i = 0; i < newFailures.size(); ++i) |
| m_jit.link(newFailures[i], alternativeFailed); |
| newFailures.clear(); |
| } |
| |
| void WRECGenerator::generateGreedyQuantifier(JmpSrcVector& failures, GenerateAtomFunctor& functor, unsigned min, unsigned max) |
| { |
| if (!max) |
| return; |
| |
| // comment me better! |
| JmpSrcVector newFailures; |
| |
| // (0) Setup: |
| // init quantifierCountRegister |
| m_jit.pushl_r(quantifierCountRegister); |
| m_jit.xorl_rr(quantifierCountRegister, quantifierCountRegister); |
| |
| // (1) Greedily read as many of the atom as possible |
| |
| JmpDst readMore = m_jit.label(); |
| |
| // (1.1) Do a character class check. |
| functor.generateAtom(this, newFailures); |
| // (1.2) If we get here, successful match! |
| m_jit.addl_i8r(1, quantifierCountRegister); |
| // (1.3) loop, unless we've read max limit. |
| if (max != Quantifier::noMaxSpecified) { |
| if (max != 1) { |
| // if there is a limit, only loop while less than limit, otherwise fall throught to... |
| m_jit.cmpl_i32r(max, quantifierCountRegister); |
| m_jit.link(m_jit.emitUnlinkedJne(), readMore); |
| } |
| // ...if there is no min we need jump to the alternative test, if there is we can just fall through to it. |
| if (!min) |
| newFailures.append(m_jit.emitUnlinkedJmp()); |
| } else |
| m_jit.link(m_jit.emitUnlinkedJmp(), readMore); |
| // (1.4) check enough matches to bother trying an alternative... |
| if (min) { |
| // We will fall through to here if (min && max), after the max check. |
| // First, also link a |
| JmpDst minimumTest = m_jit.label(); |
| for (unsigned i = 0; i < newFailures.size(); ++i) |
| m_jit.link(newFailures[i], minimumTest); |
| newFailures.clear(); |
| // |
| m_jit.cmpl_i32r(min, quantifierCountRegister); |
| newFailures.append(m_jit.emitUnlinkedJae()); |
| } |
| |
| // (4) Failure case |
| |
| JmpDst quantifierFailed = m_jit.label(); |
| // (4.1) Restore original value of quantifierCountRegister from the stack |
| m_jit.popl_r(quantifierCountRegister); |
| failures.append(m_jit.emitUnlinkedJmp()); |
| |
| // (3) Backtrack |
| |
| JmpDst backtrack = m_jit.label(); |
| // (3.1) this was preserved prior to executing the alternative |
| m_jit.popl_r(currentPositionRegister); |
| // (3.2) check we can retry with fewer matches - backtracking fails if already at the minimum |
| m_jit.cmpl_i32r(min, quantifierCountRegister); |
| m_jit.link(m_jit.emitUnlinkedJe(), quantifierFailed); |
| // (3.3) roll off one match, and retry. |
| functor.backtrack(this); |
| m_jit.subl_i8r(1, quantifierCountRegister); |
| |
| // (2) Try an alternative. |
| |
| // (2.1) point to retry |
| JmpDst tryAlternative = m_jit.label(); |
| for (unsigned i = 0; i < newFailures.size(); ++i) |
| m_jit.link(newFailures[i], tryAlternative); |
| newFailures.clear(); |
| // (2.2) recursively call to parseAlternative, if it falls through, success! |
| m_jit.pushl_r(currentPositionRegister); |
| m_parser.parseAlternative(newFailures); |
| m_jit.addl_i8r(4, X86::esp); |
| m_jit.popl_r(quantifierCountRegister); |
| // (2.3) link failure cases to here. |
| for (unsigned i = 0; i < newFailures.size(); ++i) |
| m_jit.link(newFailures[i], backtrack); |
| newFailures.clear(); |
| } |
| |
| void WRECGenerator::generatePatternCharacter(JmpSrcVector& failures, int ch) |
| { |
| // check there is more input, read a char. |
| m_jit.cmpl_rr(lengthRegister, currentPositionRegister); |
| failures.append(m_jit.emitUnlinkedJe()); |
| m_jit.movzwl_mr(inputRegister, currentPositionRegister, 2, currentValueRegister); |
| |
| // used for unicode case insensitive |
| bool hasUpper = false; |
| JmpSrc isUpper; |
| |
| // if case insensitive match |
| if (m_parser.m_ignoreCase) { |
| UChar lower, upper; |
| |
| // check for ascii case sensitive characters |
| if (isASCIIAlpha(ch)) { |
| m_jit.orl_i32r(32, currentValueRegister); |
| ch |= 32; |
| } else if ((ch > 0x7f) && ((lower = Unicode::toLower(ch)) != (upper = Unicode::toUpper(ch)))) { |
| // handle unicode case sentitive characters - branch to success on upper |
| m_jit.cmpl_i32r(upper, currentValueRegister); |
| isUpper = m_jit.emitUnlinkedJe(); |
| hasUpper = true; |
| ch = lower; |
| } |
| } |
| |
| // checks for ch, or lower case version of ch, if insensitive |
| m_jit.cmpl_i32r((unsigned short)ch, currentValueRegister); |
| failures.append(m_jit.emitUnlinkedJne()); |
| |
| if (m_parser.m_ignoreCase && hasUpper) { |
| // for unicode case insensitive matches, branch here if upper matches. |
| m_jit.link(isUpper, m_jit.label()); |
| } |
| |
| // on success consume the char |
| m_jit.addl_i8r(1, currentPositionRegister); |
| } |
| |
| void WRECGenerator::generateCharacterClassInvertedRange(JmpSrcVector& failures, JmpSrcVector& matchDest, const CharacterClassRange* ranges, unsigned count, unsigned* matchIndex, const UChar* matches, unsigned matchCount) |
| { |
| do { |
| // pick which range we're going to generate |
| int which = count >> 1; |
| char lo = ranges[which].begin; |
| char hi = ranges[which].end; |
| |
| m_jit.cmpl_i32r((unsigned short)lo, currentValueRegister); |
| |
| // check if there are any ranges or matches below lo. If not, just jl to failure - |
| // if there is anything else to check, check that first, if it falls through jmp to failure. |
| if ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)) { |
| JmpSrc loOrAbove = m_jit.emitUnlinkedJge(); |
| |
| // generate code for all ranges before this one |
| if (which) |
| generateCharacterClassInvertedRange(failures, matchDest, ranges, which, matchIndex, matches, matchCount); |
| |
| do { |
| m_jit.cmpl_i32r((unsigned short)matches[*matchIndex], currentValueRegister); |
| matchDest.append(m_jit.emitUnlinkedJe()); |
| ++*matchIndex; |
| } while ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)); |
| failures.append(m_jit.emitUnlinkedJmp()); |
| |
| m_jit.link(loOrAbove, m_jit.label()); |
| } else if (which) { |
| JmpSrc loOrAbove = m_jit.emitUnlinkedJge(); |
| |
| generateCharacterClassInvertedRange(failures, matchDest, ranges, which, matchIndex, matches, matchCount); |
| failures.append(m_jit.emitUnlinkedJmp()); |
| |
| m_jit.link(loOrAbove, m_jit.label()); |
| } else |
| failures.append(m_jit.emitUnlinkedJl()); |
| |
| while ((*matchIndex < matchCount) && (matches[*matchIndex] <= hi)) |
| ++*matchIndex; |
| |
| m_jit.cmpl_i32r((unsigned short)hi, currentValueRegister); |
| matchDest.append(m_jit.emitUnlinkedJle()); |
| // fall through to here, the value is above hi. |
| |
| // shuffle along & loop around if there are any more matches to handle. |
| unsigned next = which + 1; |
| ranges += next; |
| count -= next; |
| } while (count); |
| } |
| |
| void WRECGenerator::generateCharacterClassInverted(JmpSrcVector& matchDest, CharacterClass& charClass) |
| { |
| JmpSrc unicodeFail; |
| if (charClass.numMatchesUnicode || charClass.numRangesUnicode) { |
| m_jit.cmpl_i8r(0x7f, currentValueRegister); |
| JmpSrc isAscii = m_jit.emitUnlinkedJle(); |
| |
| if (charClass.numMatchesUnicode) { |
| for (unsigned i = 0; i < charClass.numMatchesUnicode; ++i) { |
| UChar ch = charClass.matchesUnicode[i]; |
| m_jit.cmpl_i32r((unsigned short)ch, currentValueRegister); |
| matchDest.append(m_jit.emitUnlinkedJe()); |
| } |
| } |
| |
| if (charClass.numRangesUnicode) { |
| for (unsigned i = 0; i < charClass.numRangesUnicode; ++i) { |
| UChar lo = charClass.rangesUnicode[i].begin; |
| UChar hi = charClass.rangesUnicode[i].end; |
| |
| m_jit.cmpl_i32r((unsigned short)lo, currentValueRegister); |
| JmpSrc below = m_jit.emitUnlinkedJl(); |
| m_jit.cmpl_i32r((unsigned short)hi, currentValueRegister); |
| matchDest.append(m_jit.emitUnlinkedJle()); |
| m_jit.link(below, m_jit.label()); |
| } |
| } |
| |
| unicodeFail = m_jit.emitUnlinkedJmp(); |
| m_jit.link(isAscii, m_jit.label()); |
| } |
| |
| if (charClass.numRanges) { |
| unsigned matchIndex = 0; |
| JmpSrcVector failures; |
| generateCharacterClassInvertedRange(failures, matchDest, charClass.ranges, charClass.numRanges, &matchIndex, charClass.matches, charClass.numMatches); |
| while (matchIndex < charClass.numMatches) { |
| m_jit.cmpl_i32r((unsigned short)charClass.matches[matchIndex], currentValueRegister); |
| matchDest.append(m_jit.emitUnlinkedJe()); |
| ++matchIndex; |
| } |
| JmpDst noMatch = m_jit.label(); |
| for (unsigned i = 0; i < failures.size(); ++i) |
| m_jit.link(failures[i], noMatch); |
| failures.clear(); |
| } else if (charClass.numMatches) { |
| // optimization: gather 'a','A' etc back together, can mask & test once. |
| Vector<char> matchesAZaz; |
| |
| for (unsigned i = 0; i < charClass.numMatches; ++i) { |
| char ch = charClass.matches[i]; |
| if (m_parser.m_ignoreCase) { |
| if (isASCIILower(ch)) { |
| matchesAZaz.append(ch); |
| continue; |
| } |
| if (isASCIIUpper(ch)) |
| continue; |
| } |
| m_jit.cmpl_i32r((unsigned short)ch, currentValueRegister); |
| matchDest.append(m_jit.emitUnlinkedJe()); |
| } |
| |
| if (unsigned countAZaz = matchesAZaz.size()) { |
| m_jit.orl_i32r(32, currentValueRegister); |
| |
| for (unsigned i = 0; i < countAZaz; ++i) { |
| char ch = matchesAZaz[i]; |
| m_jit.cmpl_i32r((unsigned short)ch, currentValueRegister); |
| matchDest.append(m_jit.emitUnlinkedJe()); |
| } |
| } |
| } |
| |
| if (charClass.numMatchesUnicode || charClass.numRangesUnicode) |
| m_jit.link(unicodeFail, m_jit.label()); |
| } |
| |
| void WRECGenerator::generateCharacterClass(JmpSrcVector& failures, CharacterClass& charClass, bool invert) |
| { |
| m_jit.cmpl_rr(lengthRegister, currentPositionRegister); |
| failures.append(m_jit.emitUnlinkedJe()); |
| m_jit.movzwl_mr(inputRegister, currentPositionRegister, 2, currentValueRegister); |
| |
| if (invert) |
| generateCharacterClassInverted(failures, charClass); |
| else { |
| JmpSrcVector successes; |
| generateCharacterClassInverted(successes, charClass); |
| failures.append(m_jit.emitUnlinkedJmp()); |
| JmpDst here = m_jit.label(); |
| for (unsigned i = 0; i < successes.size(); ++i) |
| m_jit.link(successes[i], here); |
| } |
| |
| m_jit.addl_i8r(1, currentPositionRegister); |
| } |
| |
| WRECGenerator::JmpSrc WRECGenerator::generateParentheses(ParenthesesType type) |
| { |
| unsigned subpatternId = (type == capturing) ? ++m_parser.m_numSubpatterns : m_parser.m_numSubpatterns; |
| |
| // push pos, both to preserve for fail + reloaded in parseDisjunction |
| m_jit.pushl_r(currentPositionRegister); |
| |
| // Plant a disjunction, wrapped to invert behaviour - |
| JmpSrcVector newFailures; |
| m_parser.parseDisjunction(newFailures); |
| |
| if (type == capturing) { |
| m_jit.popl_r(currentValueRegister); |
| m_jit.movl_rm(currentValueRegister, (2 * subpatternId) * sizeof(int), outputRegister); |
| m_jit.movl_rm(currentPositionRegister, (2 * subpatternId + 1) * sizeof(int), outputRegister); |
| } else if (type == non_capturing) |
| m_jit.addl_i8r(4, X86::esp); |
| else |
| m_jit.popl_r(currentPositionRegister); |
| |
| // This is a little lame - jump to jump if there is a nested disjunction. |
| // (suggestion to fix: make parseDisjunction populate a JmpSrcVector of |
| // disjunct successes... this is probably not worth the compile cost in |
| // the common case to fix). |
| JmpSrc successfulMatch = m_jit.emitUnlinkedJmp(); |
| |
| JmpDst originalFailure = m_jit.label(); |
| for (unsigned i = 0; i < newFailures.size(); ++i) |
| m_jit.link(newFailures[i], originalFailure); |
| newFailures.clear(); |
| // fail: restore currentPositionRegister |
| m_jit.popl_r(currentPositionRegister); |
| |
| JmpSrc jumpToFail; |
| // If this was an inverted assert, fail = success! - just let the failure case drop through, |
| // success case goes to failures. Both paths restore curr pos. |
| if (type == inverted_assertion) |
| jumpToFail = successfulMatch; |
| else { |
| // plant a jump so any fail will link off to 'failures', |
| jumpToFail = m_jit.emitUnlinkedJmp(); |
| // link successes to jump here |
| m_jit.link(successfulMatch, m_jit.label()); |
| } |
| return jumpToFail; |
| } |
| |
| void WRECGenerator::generateParenthesesNonGreedy(JmpSrcVector& failures, JmpDst start, JmpSrc success, JmpSrc fail) |
| { |
| m_jit.link(m_jit.emitUnlinkedJmp(), start); |
| m_jit.link(success, m_jit.label()); |
| |
| failures.append(fail); |
| } |
| |
| WRECGenerator::JmpSrc WRECGenerator::generateParenthesesResetTrampoline(JmpSrcVector& newFailures, unsigned subpatternIdBefore, unsigned subpatternIdAfter) |
| { |
| JmpSrc skip = m_jit.emitUnlinkedJmp(); |
| |
| JmpDst subPatternResetTrampoline = m_jit.label(); |
| for (unsigned i = 0; i < newFailures.size(); ++i) |
| m_jit.link(newFailures[i], subPatternResetTrampoline); |
| newFailures.clear(); |
| for (unsigned i = subpatternIdBefore + 1; i <= subpatternIdAfter; ++i) { |
| m_jit.movl_i32m(-1, (2 * i) * sizeof(int), outputRegister); |
| m_jit.movl_i32m(-1, (2 * i + 1) * sizeof(int), outputRegister); |
| } |
| |
| JmpSrc newFailJump = m_jit.emitUnlinkedJmp(); |
| m_jit.link(skip, m_jit.label()); |
| |
| return newFailJump; |
| } |
| |
| void WRECGenerator::generateAssertionBOL(JmpSrcVector& failures) |
| { |
| if (m_parser.m_multiline) { |
| JmpSrcVector previousIsNewline; |
| |
| // begin of input == success |
| m_jit.cmpl_i8r(0, currentPositionRegister); |
| previousIsNewline.append(m_jit.emitUnlinkedJe()); |
| |
| // now check prev char against newline characters. |
| m_jit.movzwl_mr(-2, inputRegister, currentPositionRegister, 2, currentValueRegister); |
| generateCharacterClassInverted(previousIsNewline, getCharacterClassNewline()); |
| |
| failures.append(m_jit.emitUnlinkedJmp()); |
| |
| JmpDst success = m_jit.label(); |
| for (unsigned i = 0; i < previousIsNewline.size(); ++i) |
| m_jit.link(previousIsNewline[i], success); |
| previousIsNewline.clear(); |
| } else { |
| m_jit.cmpl_i8r(0, currentPositionRegister); |
| failures.append(m_jit.emitUnlinkedJne()); |
| } |
| } |
| |
| void WRECGenerator::generateAssertionEOL(JmpSrcVector& failures) |
| { |
| if (m_parser.m_multiline) { |
| JmpSrcVector nextIsNewline; |
| |
| // end of input == success |
| m_jit.cmpl_rr(lengthRegister, currentPositionRegister); |
| nextIsNewline.append(m_jit.emitUnlinkedJe()); |
| |
| // now check next char against newline characters. |
| m_jit.movzwl_mr(inputRegister, currentPositionRegister, 2, currentValueRegister); |
| generateCharacterClassInverted(nextIsNewline, getCharacterClassNewline()); |
| |
| failures.append(m_jit.emitUnlinkedJmp()); |
| |
| JmpDst success = m_jit.label(); |
| for (unsigned i = 0; i < nextIsNewline.size(); ++i) |
| m_jit.link(nextIsNewline[i], success); |
| nextIsNewline.clear(); |
| } else { |
| m_jit.cmpl_rr(lengthRegister, currentPositionRegister); |
| failures.append(m_jit.emitUnlinkedJne()); |
| } |
| } |
| |
| void WRECGenerator::generateAssertionWordBoundary(JmpSrcVector& failures, bool invert) |
| { |
| JmpSrcVector wordBoundary; |
| JmpSrcVector notWordBoundary; |
| |
| // (1) Check if the previous value was a word char |
| |
| // (1.1) check for begin of input |
| m_jit.cmpl_i8r(0, currentPositionRegister); |
| JmpSrc atBegin = m_jit.emitUnlinkedJe(); |
| // (1.2) load the last char, and chck if is word character |
| m_jit.movzwl_mr(-2, inputRegister, currentPositionRegister, 2, currentValueRegister); |
| JmpSrcVector previousIsWord; |
| generateCharacterClassInverted(previousIsWord, getCharacterClassWordchar()); |
| // (1.3) if we get here, previous is not a word char |
| m_jit.link(atBegin, m_jit.label()); |
| |
| // (2) Handle situation where previous was NOT a \w |
| |
| // (2.1) check for end of input |
| m_jit.cmpl_rr(lengthRegister, currentPositionRegister); |
| notWordBoundary.append(m_jit.emitUnlinkedJe()); |
| // (2.2) load the next char, and chck if is word character |
| m_jit.movzwl_mr(inputRegister, currentPositionRegister, 2, currentValueRegister); |
| generateCharacterClassInverted(wordBoundary, getCharacterClassWordchar()); |
| // (2.3) If we get here, neither chars are word chars |
| notWordBoundary.append(m_jit.emitUnlinkedJmp()); |
| |
| // (3) Handle situation where previous was a \w |
| |
| // (3.0) link success in first match to here |
| JmpDst section3 = m_jit.label(); |
| for (unsigned i = 0; i < previousIsWord.size(); ++i) |
| m_jit.link(previousIsWord[i], section3); |
| previousIsWord.clear(); |
| // (3.1) check for end of input |
| m_jit.cmpl_rr(lengthRegister, currentPositionRegister); |
| wordBoundary.append(m_jit.emitUnlinkedJe()); |
| // (3.2) load the next char, and chck if is word character |
| m_jit.movzwl_mr(inputRegister, currentPositionRegister, 2, currentValueRegister); |
| generateCharacterClassInverted(notWordBoundary, getCharacterClassWordchar()); |
| // (3.3) If we get here, this is an end of a word, within the input. |
| |
| // (4) Link everything up |
| |
| if (invert) { |
| // handle the fall through case |
| wordBoundary.append(m_jit.emitUnlinkedJmp()); |
| |
| // looking for non word boundaries, so link boundary fails to here. |
| JmpDst success = m_jit.label(); |
| for (unsigned i = 0; i < notWordBoundary.size(); ++i) |
| m_jit.link(notWordBoundary[i], success); |
| notWordBoundary.clear(); |
| |
| failures.append(wordBoundary.begin(), wordBoundary.size()); |
| } else { |
| // looking for word boundaries, so link successes here. |
| JmpDst success = m_jit.label(); |
| for (unsigned i = 0; i < wordBoundary.size(); ++i) |
| m_jit.link(wordBoundary[i], success); |
| wordBoundary.clear(); |
| |
| failures.append(notWordBoundary.begin(), notWordBoundary.size()); |
| } |
| } |
| |
| void WRECGenerator::generateBackreference(JmpSrcVector& failures, unsigned subpatternId) |
| { |
| m_jit.pushl_r(currentPositionRegister); |
| m_jit.pushl_r(quantifierCountRegister); |
| |
| // get the start pos of the backref into quantifierCountRegister (multipurpose!) |
| m_jit.movl_mr((2 * subpatternId) * sizeof(int), outputRegister, quantifierCountRegister); |
| |
| JmpSrc skipIncrement = m_jit.emitUnlinkedJmp(); |
| JmpDst topOfLoop = m_jit.label(); |
| m_jit.addl_i8r(1, currentPositionRegister); |
| m_jit.addl_i8r(1, quantifierCountRegister); |
| m_jit.link(skipIncrement, m_jit.label()); |
| |
| // check if we're at the end of backref (if we are, success!) |
| m_jit.cmpl_rm(quantifierCountRegister, ((2 * subpatternId) + 1) * sizeof(int), outputRegister); |
| JmpSrc endOfBackRef = m_jit.emitUnlinkedJe(); |
| |
| m_jit.movzwl_mr(inputRegister, quantifierCountRegister, 2, currentValueRegister); |
| |
| // check if we've run out of input (this would be a can o'fail) |
| m_jit.cmpl_rr(lengthRegister, currentPositionRegister); |
| JmpSrc endOfInput = m_jit.emitUnlinkedJe(); |
| |
| m_jit.cmpw_rm(currentValueRegister, inputRegister, currentPositionRegister, 2); |
| m_jit.link(m_jit.emitUnlinkedJe(), topOfLoop); |
| |
| m_jit.link(endOfInput, m_jit.label()); |
| // Failure |
| m_jit.popl_r(quantifierCountRegister); |
| m_jit.popl_r(currentPositionRegister); |
| failures.append(m_jit.emitUnlinkedJmp()); |
| |
| // Success |
| m_jit.link(endOfBackRef, m_jit.label()); |
| m_jit.popl_r(quantifierCountRegister); |
| m_jit.addl_i8r(4, X86::esp); |
| } |
| |
| void WRECGenerator::generateDisjunction(JmpSrcVector& successes, JmpSrcVector& failures) |
| { |
| successes.append(m_jit.emitUnlinkedJmp()); |
| |
| JmpDst here = m_jit.label(); |
| |
| for (unsigned i = 0; i < failures.size(); ++i) |
| m_jit.link(failures[i], here); |
| failures.clear(); |
| |
| m_jit.movl_mr(X86::esp, currentPositionRegister); |
| } |
| |
| void WRECGenerator::terminateDisjunction(JmpSrcVector& successes) |
| { |
| JmpDst here = m_jit.label(); |
| for (unsigned i = 0; i < successes.size(); ++i) |
| m_jit.link(successes[i], here); |
| successes.clear(); |
| } |
| |
| ALWAYS_INLINE Quantifier WRECParser::parseGreedyQuantifier() |
| { |
| switch (peek()) { |
| case '?': |
| consume(); |
| return Quantifier(Quantifier::Greedy, 0, 1); |
| |
| case '*': |
| consume(); |
| return Quantifier(Quantifier::Greedy, 0); |
| |
| case '+': |
| consume(); |
| return Quantifier(Quantifier::Greedy, 1); |
| |
| case '{': { |
| consume(); |
| // a numeric quantifier should always have a lower bound |
| if (!peekIsDigit()) { |
| m_err = Error_malformedQuantifier; |
| return Quantifier(Quantifier::Error); |
| } |
| int min = consumeNumber(); |
| |
| // this should either be a , or a } |
| switch (peek()) { |
| case '}': |
| // {n} - exactly n times. (technically I think a '?' is valid in the bnf - bit meaningless). |
| consume(); |
| return Quantifier(Quantifier::Greedy, min, min); |
| |
| case ',': |
| consume(); |
| switch (peek()) { |
| case '}': |
| // {n,} - n to inf times. |
| consume(); |
| return Quantifier(Quantifier::Greedy, min); |
| |
| case '0': |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| case '8': |
| case '9': { |
| // {n,m} - n to m times. |
| int max = consumeNumber(); |
| |
| if (peek() != '}') { |
| m_err = Error_malformedQuantifier; |
| return Quantifier(Quantifier::Error); |
| } |
| consume(); |
| |
| return Quantifier(Quantifier::Greedy, min, max); |
| } |
| |
| default: |
| m_err = Error_malformedQuantifier; |
| return Quantifier(Quantifier::Error); |
| } |
| |
| default: |
| m_err = Error_malformedQuantifier; |
| return Quantifier(Quantifier::Error); |
| } |
| } |
| // None of the above - no quantifier |
| default: |
| return Quantifier(); |
| } |
| } |
| |
| Quantifier WRECParser::parseQuantifier() |
| { |
| Quantifier q = parseGreedyQuantifier(); |
| |
| if ((q.type == Quantifier::Greedy) && (peek() == '?')) { |
| consume(); |
| q.type = Quantifier::NonGreedy; |
| } |
| |
| return q; |
| } |
| |
| bool WRECParser::parsePatternCharacterQualifier(JmpSrcVector& failures, int ch) |
| { |
| Quantifier q = parseQuantifier(); |
| |
| switch (q.type) { |
| case Quantifier::None: { |
| m_generator.generatePatternCharacter(failures, ch); |
| break; |
| } |
| |
| case Quantifier::Greedy: { |
| GeneratePatternCharacterFunctor functor(ch); |
| m_generator.generateGreedyQuantifier(failures, functor, q.min, q.max); |
| break; |
| } |
| |
| case Quantifier::NonGreedy: { |
| GeneratePatternCharacterFunctor functor(ch); |
| m_generator.generateNonGreedyQuantifier(failures, functor, q.min, q.max); |
| break; |
| } |
| |
| case Quantifier::Error: |
| return false; |
| } |
| |
| return true; |
| } |
| |
| bool WRECParser::parseCharacterClassQuantifier(JmpSrcVector& failures, CharacterClass& charClass, bool invert) |
| { |
| Quantifier q = parseQuantifier(); |
| |
| switch (q.type) { |
| case Quantifier::None: { |
| m_generator.generateCharacterClass(failures, charClass, invert); |
| break; |
| } |
| |
| case Quantifier::Greedy: { |
| GenerateCharacterClassFunctor functor(&charClass, invert); |
| m_generator.generateGreedyQuantifier(failures, functor, q.min, q.max); |
| break; |
| } |
| |
| case Quantifier::NonGreedy: { |
| GenerateCharacterClassFunctor functor(&charClass, invert); |
| m_generator.generateNonGreedyQuantifier(failures, functor, q.min, q.max); |
| break; |
| } |
| |
| case Quantifier::Error: |
| return false; |
| } |
| |
| return true; |
| } |
| |
| bool WRECParser::parseBackreferenceQuantifier(JmpSrcVector& failures, unsigned subpatternId) |
| { |
| Quantifier q = parseQuantifier(); |
| |
| switch (q.type) { |
| case Quantifier::None: { |
| m_generator.generateBackreference(failures, subpatternId); |
| break; |
| } |
| |
| case Quantifier::Greedy: |
| case Quantifier::NonGreedy: |
| m_generator.generateBackreferenceQuantifier(failures, q.type, subpatternId, q.min, q.max); |
| return true; |
| |
| case Quantifier::Error: |
| return false; |
| } |
| |
| return true; |
| } |
| |
| bool WRECParser::parseParentheses(JmpSrcVector&) |
| { |
| // FIXME: We don't currently backtrack correctly within parentheses in cases such as |
| // "c".match(/(.*)c/) so we fall back to PCRE for any regexp containing parentheses. |
| |
| m_err = TempError_unsupportedParentheses; |
| return false; |
| } |
| |
| bool WRECParser::parseCharacterClass(JmpSrcVector& failures) |
| { |
| bool invert = false; |
| if (peek() == '^') { |
| consume(); |
| invert = true; |
| } |
| |
| CharacterClassConstructor charClassConstructor(m_ignoreCase); |
| |
| UChar ch; |
| while ((ch = peek()) != ']') { |
| switch (ch) { |
| case EndOfPattern: |
| m_err = Error_malformedCharacterClass; |
| return false; |
| |
| case '\\': |
| consume(); |
| switch (ch = peek()) { |
| case EndOfPattern: |
| m_err = Error_malformedEscape; |
| return false; |
| case '0': |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| charClassConstructor.put(consumeOctal()); |
| break; |
| |
| // ControlEscape |
| case 'b': |
| consume(); |
| charClassConstructor.put('\b'); |
| break; |
| case 'f': |
| consume(); |
| charClassConstructor.put('\f'); |
| break; |
| case 'n': |
| consume(); |
| charClassConstructor.put('\n'); |
| break; |
| case 'r': |
| consume(); |
| charClassConstructor.put('\r'); |
| break; |
| case 't': |
| consume(); |
| charClassConstructor.put('\t'); |
| break; |
| case 'v': |
| consume(); |
| charClassConstructor.put('\v'); |
| break; |
| |
| // ControlLetter |
| case 'c': { |
| consume(); |
| int control = consume(); |
| if (!isASCIIAlpha(control)) { |
| m_err = Error_malformedEscape; |
| return false; |
| } |
| charClassConstructor.put(control&31); |
| break; |
| } |
| |
| // HexEscape |
| case 'x': { |
| consume(); |
| int x = consumeHex(2); |
| if (x == -1) { |
| m_err = Error_malformedEscape; |
| return false; |
| } |
| charClassConstructor.put(x); |
| break; |
| } |
| |
| // UnicodeEscape |
| case 'u': { |
| consume(); |
| int x = consumeHex(4); |
| if (x == -1) { |
| m_err = Error_malformedEscape; |
| return false; |
| } |
| charClassConstructor.put(x); |
| break; |
| } |
| |
| // CharacterClassEscape |
| case 'd': |
| consume(); |
| charClassConstructor.append(getCharacterClassDigits()); |
| break; |
| case 's': |
| consume(); |
| charClassConstructor.append(getCharacterClassSpaces()); |
| break; |
| case 'w': |
| consume(); |
| charClassConstructor.append(getCharacterClassWordchar()); |
| break; |
| case 'D': |
| consume(); |
| charClassConstructor.append(getCharacterClassNondigits()); |
| break; |
| case 'S': |
| consume(); |
| charClassConstructor.append(getCharacterClassNonspaces()); |
| break; |
| case 'W': |
| consume(); |
| charClassConstructor.append(getCharacterClassNonwordchar()); |
| break; |
| |
| case '-': |
| consume(); |
| charClassConstructor.flushBeforeEscapedHyphen(); |
| charClassConstructor.put(ch); |
| break; |
| |
| // IdentityEscape |
| default: { |
| // TODO: check this test for IdentifierPart. |
| int ch = consume(); |
| if (isASCIIAlphanumeric(ch) || (ch == '_')) { |
| m_err = Error_malformedEscape; |
| return false; |
| } |
| charClassConstructor.put(ch); |
| break; |
| } |
| } |
| break; |
| |
| default: |
| consume(); |
| charClassConstructor.put(ch); |
| } |
| } |
| consume(); |
| |
| // lazily catch reversed ranges ([z-a])in character classes |
| if (charClassConstructor.isUpsideDown()) { |
| m_err = Error_malformedCharacterClass; |
| return false; |
| } |
| |
| charClassConstructor.flush(); |
| CharacterClass charClass = charClassConstructor.charClass(); |
| return parseCharacterClassQuantifier(failures, charClass, invert); |
| } |
| |
| bool WRECParser::parseOctalEscape(JmpSrcVector& failures) |
| { |
| return parsePatternCharacterQualifier(failures, consumeOctal()); |
| } |
| |
| bool WRECParser::parseEscape(JmpSrcVector& failures) |
| { |
| switch (peek()) { |
| case EndOfPattern: |
| m_err = Error_malformedEscape; |
| return false; |
| |
| // Assertions |
| case 'b': |
| consume(); |
| m_generator.generateAssertionWordBoundary(failures, false); |
| return true; |
| |
| case 'B': |
| consume(); |
| m_generator.generateAssertionWordBoundary(failures, true); |
| return true; |
| |
| // Octal escape |
| case '0': |
| consume(); |
| return parseOctalEscape(failures); |
| |
| // DecimalEscape |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': { |
| // FIXME: This does not support forward references. It's not clear whether they |
| // should be valid. |
| unsigned value = peekDigit(); |
| if (value > m_numSubpatterns) |
| return parseOctalEscape(failures); |
| } |
| case '8': |
| case '9': { |
| unsigned value = peekDigit(); |
| if (value > m_numSubpatterns) { |
| consume(); |
| m_err = Error_malformedEscape; |
| return false; |
| } |
| consume(); |
| |
| while (peekIsDigit()) { |
| unsigned newValue = value * 10 + peekDigit(); |
| if (newValue > m_numSubpatterns) |
| break; |
| value = newValue; |
| consume(); |
| } |
| |
| parseBackreferenceQuantifier(failures, value); |
| return true; |
| } |
| |
| // ControlEscape |
| case 'f': |
| consume(); |
| return parsePatternCharacterQualifier(failures, '\f'); |
| case 'n': |
| consume(); |
| return parsePatternCharacterQualifier(failures, '\n'); |
| case 'r': |
| consume(); |
| return parsePatternCharacterQualifier(failures, '\r'); |
| case 't': |
| consume(); |
| return parsePatternCharacterQualifier(failures, '\t'); |
| case 'v': |
| consume(); |
| return parsePatternCharacterQualifier(failures, '\v'); |
| |
| // ControlLetter |
| case 'c': { |
| consume(); |
| int control = consume(); |
| if (!isASCIIAlpha(control)) { |
| m_err = Error_malformedEscape; |
| return false; |
| } |
| return parsePatternCharacterQualifier(failures, control&31); |
| } |
| |
| // HexEscape |
| case 'x': { |
| consume(); |
| int x = consumeHex(2); |
| if (x == -1) { |
| m_err = Error_malformedEscape; |
| return false; |
| } |
| return parsePatternCharacterQualifier(failures, x); |
| } |
| |
| // UnicodeEscape |
| case 'u': { |
| consume(); |
| int x = consumeHex(4); |
| if (x == -1) { |
| m_err = Error_malformedEscape; |
| return false; |
| } |
| return parsePatternCharacterQualifier(failures, x); |
| } |
| |
| // CharacterClassEscape |
| case 'd': |
| consume(); |
| return parseCharacterClassQuantifier(failures, getCharacterClassDigits(), false); |
| case 's': |
| consume(); |
| return parseCharacterClassQuantifier(failures, getCharacterClassSpaces(), false); |
| case 'w': |
| consume(); |
| return parseCharacterClassQuantifier(failures, getCharacterClassWordchar(), false); |
| case 'D': |
| consume(); |
| return parseCharacterClassQuantifier(failures, getCharacterClassDigits(), true); |
| case 'S': |
| consume(); |
| return parseCharacterClassQuantifier(failures, getCharacterClassSpaces(), true); |
| case 'W': |
| consume(); |
| return parseCharacterClassQuantifier(failures, getCharacterClassWordchar(), true); |
| |
| // IdentityEscape |
| default: { |
| // TODO: check this test for IdentifierPart. |
| int ch = consume(); |
| if (isASCIIAlphanumeric(ch) || (ch == '_')) { |
| m_err = Error_malformedEscape; |
| return false; |
| } |
| return parsePatternCharacterQualifier(failures, ch); |
| } |
| } |
| } |
| |
| bool WRECParser::parseTerm(JmpSrcVector& failures) |
| { |
| switch (peek()) { |
| case EndOfPattern: |
| case '*': |
| case '+': |
| case '?': |
| case ')': |
| case ']': |
| case '{': |
| case '}': |
| case '|': |
| // Not allowed in a Term! |
| return false; |
| |
| case '^': |
| consume(); |
| m_generator.generateAssertionBOL(failures); |
| return true; |
| |
| case '$': |
| consume(); |
| m_generator.generateAssertionEOL(failures); |
| return true; |
| |
| case '\\': |
| // b & B are also assertions... |
| consume(); |
| return parseEscape(failures); |
| |
| case '.': |
| consume(); |
| return parseCharacterClassQuantifier(failures, getCharacterClassNewline(), true); |
| |
| case '[': |
| // CharacterClass |
| consume(); |
| return parseCharacterClass(failures); |
| |
| case '(': |
| consume(); |
| return parseParentheses(failures); |
| |
| default: |
| // Anything else is a PatternCharacter |
| return parsePatternCharacterQualifier(failures, consume()); |
| } |
| } |
| |
| /* |
| interface req: CURR_POS is on stack (can be reloaded). |
| */ |
| void WRECParser::parseDisjunction(JmpSrcVector& failures) |
| { |
| parseAlternative(failures); |
| |
| if (peek() == '|') { |
| JmpSrcVector successes; |
| |
| do { |
| consume(); |
| |
| m_generator.generateDisjunction(successes, failures); |
| |
| parseAlternative(failures); |
| } while (peek() == '|'); |
| |
| m_generator.terminateDisjunction(successes); |
| } |
| } |
| |
| } // namespace JSC |
| |
| #endif // ENABLE(WREC) |