JavaScriptCore/yarr/RegexParser.h - WebKit - Git at Google

 /*
  * Copyright (C) 2009 Apple Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 #ifndef RegexParser_h
 #define RegexParser_h

 #include <wtf/Platform.h>

 #if ENABLE(YARR)

 #include <UString.h>
 #include <wtf/ASCIICType.h>
 #include <wtf/unicode/Unicode.h>
 #include <limits.h>

 namespace JSC { namespace Yarr {

 enum BuiltInCharacterClassID {
     DigitClassID,
     SpaceClassID,
     WordClassID,
     NewlineClassID,
 };

 // The Parser class should not be used directly - only via the Yarr::parse() method.
 template<class Delegate>
 class Parser {
 private:
     template<class FriendDelegate>
     friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit);

     enum ErrorCode {
         NoError,
         PatternTooLarge,
         QuantifierOutOfOrder,
         QuantifierWithoutAtom,
         MissingParentheses,
         ParenthesesUnmatched,
         ParenthesesTypeInvalid,
         CharacterClassUnmatched,
         CharacterClassOutOfOrder,
         EscapeUnterminated,
         NumberOfErrorCodes
     };

     /*
      * CharacterClassParserDelegate:
      *
      * The class CharacterClassParserDelegate is used in the parsing of character
      * classes.  This class handles detection of character ranges.  This class
      * implements enough of the delegate interface such that it can be passed to
      * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused
      * to perform the parsing of escape characters in character sets.
      */
     class CharacterClassParserDelegate {
     public:
         CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
             : m_delegate(delegate)
             , m_err(err)
             , m_state(empty)
         {
         }

         /*
          * begin():
          *
          * Called at beginning of construction.
          */
         void begin(bool invert)
         {
             m_delegate.atomCharacterClassBegin(invert);
         }

         /*
          * atomPatternCharacterUnescaped():
          *
          * This method is called directly from parseCharacterClass(), to report a new
          * pattern character token.  This method differs from atomPatternCharacter(),
          * which will be called from parseEscape(), since a hypen provided via this
          * method may be indicating a character range, but a hyphen parsed by
          * parseEscape() cannot be interpreted as doing so.
          */
         void atomPatternCharacterUnescaped(UChar ch)
         {
             switch (m_state) {
             case empty:
                 m_character = ch;
                 m_state = cachedCharacter;
                 break;

             case cachedCharacter:
                 if (ch == '-')
                     m_state = cachedCharacterHyphen;
                 else {
                     m_delegate.atomCharacterClassAtom(m_character);
                     m_character = ch;
                 }
                 break;

             case cachedCharacterHyphen:
                 if (ch >= m_character)
                     m_delegate.atomCharacterClassRange(m_character, ch);
                 else
                     m_err = CharacterClassOutOfOrder;
                 m_state = empty;
             }
         }

         /*
          * atomPatternCharacter():
          *
          * Adds a pattern character, called by parseEscape(), as such will not
          * interpret a hyphen as indicating a character range.
          */
         void atomPatternCharacter(UChar ch)
         {
             // Flush if a character is already pending to prevent the
             // hyphen from begin interpreted as indicating a range.
             if((ch == '-') && (m_state == cachedCharacter))
                 flush();

             atomPatternCharacterUnescaped(ch);
         }

         /*
          * atomBuiltInCharacterClass():
          *
          * Adds a built-in character class, called by parseEscape().
          */
         void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
         {
             flush();
             m_delegate.atomCharacterClassBuiltIn(classID, invert);
         }

         /*
          * end():
          *
          * Called at end of construction.
          */
         void end()
         {
             flush();
             m_delegate.atomCharacterClassEnd();
         }

         // parseEscape() should never call these delegate methods when
         // invoked with inCharacterClass set.
         void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
         void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }

     private:
         void flush()
         {
             if (m_state != empty) // either cachedCharacter or cachedCharacterHyphen
                 m_delegate.atomCharacterClassAtom(m_character);
             if (m_state == cachedCharacterHyphen)
                 m_delegate.atomCharacterClassAtom('-');
             m_state = empty;
         }

         Delegate& m_delegate;
         ErrorCode& m_err;
         enum CharacterClassConstructionState {
             empty,
             cachedCharacter,
             cachedCharacterHyphen,
         } m_state;
         UChar m_character;
     };

     Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit)
         : m_delegate(delegate)
         , m_backReferenceLimit(backReferenceLimit)
         , m_err(NoError)
         , m_data(pattern.data())
         , m_size(pattern.size())
         , m_index(0)
         , m_parenthesesNestingDepth(0)
     {
     }

     /*
      * parseEscape():
      *
      * Helper for parseTokens() AND parseCharacterClass().
      * Unlike the other parser methods, this function does not report tokens
      * directly to the member delegate (m_delegate), instead tokens are
      * emitted to the delegate provided as an argument.  In the case of atom
      * escapes, parseTokens() will call parseEscape() passing m_delegate as
      * an argument, and as such the escape will be reported to the delegate.
      *
      * However this method may also be used by parseCharacterClass(), in which
      * case a CharacterClassParserDelegate will be passed as the delegate that
      * tokens should be added to.  A boolean flag is also provided to indicate
      * whether that an escape in a CharacterClass is being parsed (some parsing
      * rules change in this context).
      *
      * The boolean value returned by this method indicates whether the token
      * parsed was an atom (outside of a characted class \b and \B will be
      * interpreted as assertions).
      */
     template<bool inCharacterClass, class EscapeDelegate>
     bool parseEscape(EscapeDelegate& delegate)
     {
         ASSERT(!m_err);
         ASSERT(peek() == '\\');
         consume();

         if (atEndOfPattern()) {
             m_err = EscapeUnterminated;
             return false;
         }

         switch (peek()) {
         // Assertions
         case 'b':
             consume();
             if (inCharacterClass)
                 delegate.atomPatternCharacter('\b');
             else {
                 delegate.assertionWordBoundary(false);
                 return false;
             }
             break;
         case 'B':
             consume();
             if (inCharacterClass)
                 delegate.atomPatternCharacter('B');
             else {
                 delegate.assertionWordBoundary(true);
                 return false;
             }
             break;

         // CharacterClassEscape
         case 'd':
             consume();
             delegate.atomBuiltInCharacterClass(DigitClassID, false);
             break;
         case 's':
             consume();
             delegate.atomBuiltInCharacterClass(SpaceClassID, false);
             break;
         case 'w':
             consume();
             delegate.atomBuiltInCharacterClass(WordClassID, false);
             break;
         case 'D':
             consume();
             delegate.atomBuiltInCharacterClass(DigitClassID, true);
             break;
         case 'S':
             consume();
             delegate.atomBuiltInCharacterClass(SpaceClassID, true);
             break;
         case 'W':
             consume();
             delegate.atomBuiltInCharacterClass(WordClassID, true);
             break;

         // DecimalEscape
         case '1':
         case '2':
         case '3':
         case '4':
         case '5':
         case '6':
         case '7':
         case '8':
         case '9': {
             // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
             // First, try to parse this as backreference.
             if (!inCharacterClass) {
                 ParseState state = saveState();

                 unsigned backReference = consumeNumber();
                 if (backReference <= m_backReferenceLimit) {
                     delegate.atomBackReference(backReference);
                     break;
                 }

                 restoreState(state);
             }

             // Not a backreference, and not octal.
             if (peek() >= '8') {
                 delegate.atomPatternCharacter('\\');
                 break;
             }

             // Fall-through to handle this as an octal escape.
         }

         // Octal escape
         case '0':
             delegate.atomPatternCharacter(consumeOctal());
             break;

         // ControlEscape
         case 'f':
             consume();
             delegate.atomPatternCharacter('\f');
             break;
         case 'n':
             consume();
             delegate.atomPatternCharacter('\n');
             break;
         case 'r':
             consume();
             delegate.atomPatternCharacter('\r');
             break;
         case 't':
             consume();
             delegate.atomPatternCharacter('\t');
             break;
         case 'v':
             consume();
             delegate.atomPatternCharacter('\v');
             break;

         // ControlLetter
         case 'c': {
             ParseState state = saveState();
             consume();
             if (!atEndOfPattern()) {
                 int control = consume();

                 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
                 if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
                     delegate.atomPatternCharacter(control & 0x1f);
                     break;
                 }
             }
             restoreState(state);
             delegate.atomPatternCharacter('\\');
             break;
         }

         // HexEscape
         case 'x': {
             consume();
             int x = tryConsumeHex(2);
             if (x == -1)
                 delegate.atomPatternCharacter('x');
             else
                 delegate.atomPatternCharacter(x);
             break;
         }

         // UnicodeEscape
         case 'u': {
             consume();
             int u = tryConsumeHex(4);
             if (u == -1)
                 delegate.atomPatternCharacter('u');
             else
                 delegate.atomPatternCharacter(u);
             break;
         }

         // IdentityEscape
         default:
             delegate.atomPatternCharacter(consume());
         }

         return true;
     }

     /*
      * parseAtomEscape(), parseCharacterClassEscape():
      *
      * These methods alias to parseEscape().
      */
     bool parseAtomEscape()
     {
         return parseEscape<false>(m_delegate);
     }
     void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
     {
         parseEscape<true>(delegate);
     }

     /*
      * parseCharacterClass():
      *
      * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
      * to an instance of CharacterClassParserDelegate, to describe the character class to the
      * delegate.
      */
     void parseCharacterClass()
     {
         ASSERT(!m_err);
         ASSERT(peek() == '[');
         consume();

         CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);

         characterClassConstructor.begin(tryConsume('^'));

         while (!atEndOfPattern()) {
             switch (peek()) {
             case ']':
                 consume();
                 characterClassConstructor.end();
                 return;

             case '\\':
                 parseCharacterClassEscape(characterClassConstructor);
                 break;

             default:
                 characterClassConstructor.atomPatternCharacterUnescaped(consume());
             }

             if (m_err)
                 return;
         }

         m_err = CharacterClassUnmatched;
     }

     /*
      * parseParenthesesBegin():
      *
      * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
      */
     void parseParenthesesBegin()
     {
         ASSERT(!m_err);
         ASSERT(peek() == '(');
         consume();

         if (tryConsume('?')) {
             if (atEndOfPattern()) {
                 m_err = ParenthesesTypeInvalid;
                 return;
             }

             switch (consume()) {
             case ':':
                 m_delegate.atomParenthesesSubpatternBegin(false);
                 break;

             case '=':
                 m_delegate.atomParentheticalAssertionBegin();
                 break;

             case '!':
                 m_delegate.atomParentheticalAssertionBegin(true);
                 break;

             default:
                 m_err = ParenthesesTypeInvalid;
             }
         } else
             m_delegate.atomParenthesesSubpatternBegin();

         ++m_parenthesesNestingDepth;
     }

     /*
      * parseParenthesesEnd():
      *
      * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
      */
     void parseParenthesesEnd()
     {
         ASSERT(!m_err);
         ASSERT(peek() == ')');
         consume();

         if (m_parenthesesNestingDepth > 0)
             m_delegate.atomParenthesesEnd();
         else
             m_err = ParenthesesUnmatched;

         --m_parenthesesNestingDepth;
     }

     /*
      * parseQuantifier():
      *
      * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
      */
     void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
     {
         ASSERT(!m_err);
         ASSERT(min <= max);

         if (lastTokenWasAnAtom)
             m_delegate.quantifyAtom(min, max, !tryConsume('?'));
         else
             m_err = QuantifierWithoutAtom;
     }

     /*
      * parseTokens():
      *
      * This method loops over the input pattern reporting tokens to the delegate.
      * The method returns when a parse error is detected, or the end of the pattern
      * is reached.  One piece of state is tracked around the loop, which is whether
      * the last token passed to the delegate was an atom (this is necessary to detect
      * a parse error when a quantifier provided without an atom to quantify).
      */
     void parseTokens()
     {
         bool lastTokenWasAnAtom = false;

         while (!atEndOfPattern()) {
             switch (peek()) {
             case '|':
                 consume();
                 m_delegate.disjunction();
                 lastTokenWasAnAtom = false;
                 break;

             case '(':
                 parseParenthesesBegin();
                 lastTokenWasAnAtom = false;
                 break;

             case ')':
                 parseParenthesesEnd();
                 lastTokenWasAnAtom = true;
                 break;

             case '^':
                 consume();
                 m_delegate.assertionBOL();
                 lastTokenWasAnAtom = false;
                 break;

             case '$':
                 consume();
                 m_delegate.assertionEOL();
                 lastTokenWasAnAtom = false;
                 break;

             case '.':
                 consume();
                 m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
                 lastTokenWasAnAtom = true;
                 break;

             case '[':
                 parseCharacterClass();
                 lastTokenWasAnAtom = true;
                 break;

             case '\\':
                 lastTokenWasAnAtom = parseAtomEscape();
                 break;

             case '*':
                 consume();
                 parseQuantifier(lastTokenWasAnAtom, 0, UINT_MAX);
                 lastTokenWasAnAtom = false;
                 break;

             case '+':
                 consume();
                 parseQuantifier(lastTokenWasAnAtom, 1, UINT_MAX);
                 lastTokenWasAnAtom = false;
                 break;

             case '?':
                 consume();
                 parseQuantifier(lastTokenWasAnAtom, 0, 1);
                 lastTokenWasAnAtom = false;
                 break;

             case '{': {
                 ParseState state = saveState();

                 consume();
                 if (peekIsDigit()) {
                     unsigned min = consumeNumber();
                     unsigned max = min;

                     if (tryConsume(','))
                         max = peekIsDigit() ? consumeNumber() : UINT_MAX;

                     if (tryConsume('}')) {
                         if (min <= max)
                             parseQuantifier(lastTokenWasAnAtom, min, max);
                         else
                             m_err = QuantifierOutOfOrder;
                         lastTokenWasAnAtom = false;
                         break;
                     }
                 }

                 restoreState(state);
             } // if we did not find a complete quantifer, fall through to the default case.

             default:
                 m_delegate.atomPatternCharacter(consume());
                 lastTokenWasAnAtom = true;
             }

             if (m_err)
                 return;
         }

         if (m_parenthesesNestingDepth > 0)
             m_err = MissingParentheses;
     }

     /*
      * parse():
      *
      * This method calls regexBegin(), calls parseTokens() to parse over the input
      * patterns, calls regexEnd() or regexError() as appropriate, and converts any
      * error code to a const char* for a result.
      */
     const char* parse()
     {
         m_delegate.regexBegin();

         if (m_size > MAX_PATTERN_SIZE)
             m_err = PatternTooLarge;
         else
             parseTokens();
         ASSERT(atEndOfPattern() || m_err);

         if (m_err)
             m_delegate.regexError();
         else
             m_delegate.regexEnd();

         // The order of this array must match the ErrorCode enum.
         static const char* errorMessages[NumberOfErrorCodes] = {
             0, // NoError
             "regular expression too large",
             "numbers out of order in {} quantifier",
             "nothing to repeat",
             "missing )",
             "unmatched parentheses",
             "unrecognized character after (?",
             "missing terminating ] for character class",
             "range out of order in character class",
             "\\ at end of pattern"
         };

         return errorMessages[m_err];
     }


     // Misc helper functions:

     typedef unsigned ParseState;

     ParseState saveState()
     {
         return m_index;
     }

     void restoreState(ParseState state)
     {
         m_index = state;
     }

     bool atEndOfPattern()
     {
         ASSERT(m_index <= m_size);
         return m_index == m_size;
     }

     int peek()
     {
         ASSERT(m_index < m_size);
         return m_data[m_index];
     }

     bool peekIsDigit()
     {
         return !atEndOfPattern() && WTF::isASCIIDigit(peek());
     }

     unsigned peekDigit()
     {
         ASSERT(peekIsDigit());
         return peek() - '0';
     }

     int consume()
     {
         ASSERT(m_index < m_size);
         return m_data[m_index++];
     }

     unsigned consumeDigit()
     {
         ASSERT(peekIsDigit());
         return consume() - '0';
     }

     unsigned consumeNumber()
     {
         unsigned n = consumeDigit();
         // check for overflow.
         for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) {
             n = newValue;
             consume();
         }
         return n;
     }

     unsigned consumeOctal()
     {
         ASSERT(WTF::isASCIIOctalDigit(peek()));

         unsigned n = consumeDigit();
         while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
             n = n * 8 + consumeDigit();
         return n;
     }

     bool tryConsume(UChar ch)
     {
         if (atEndOfPattern() || (m_data[m_index] != ch))
             return false;
         ++m_index;
         return true;
     }

     int tryConsumeHex(int count)
     {
         ParseState state = saveState();

         int n = 0;
         while (count--) {
             if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
                 restoreState(state);
                 return -1;
             }
             n = (n << 4) | WTF::toASCIIHexValue(consume());
         }
         return n;
     }

     Delegate& m_delegate;
     unsigned m_backReferenceLimit;
     ErrorCode m_err;
     const UChar* m_data;
     unsigned m_size;
     unsigned m_index;
     unsigned m_parenthesesNestingDepth;

     // Derived by empirical testing of compile time in PCRE and WREC.
     static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
 };

 /*
  * Yarr::parse():
  *
  * The parse method is passed a pattern to be parsed and a delegate upon which
  * callbacks will be made to record the parsed tokens forming the regex.
  * Yarr::parse() returns null on success, or a const C string providing an error
  * message where a parse error occurs.
  *
  * The Delegate must implement the following interface:
  *
  *    void assertionBOL();
  *    void assertionEOL();
  *    void assertionWordBoundary(bool invert);
  *
  *    void atomPatternCharacter(UChar ch);
  *    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
  *    void atomCharacterClassBegin(bool invert)
  *    void atomCharacterClassAtom(UChar ch)
  *    void atomCharacterClassRange(UChar begin, UChar end)
  *    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
  *    void atomCharacterClassEnd()
  *    void atomParenthesesSubpatternBegin(bool capture = true);
  *    void atomParentheticalAssertionBegin(bool invert = false);
  *    void atomParenthesesEnd();
  *    void atomBackReference(unsigned subpatternId);
  *
  *    void quantifyAtom(unsigned min, unsigned max, bool greedy);
  *
  *    void disjunction();
  *
  *    void regexBegin();
  *    void regexEnd();
  *    void regexError();
  *
  * Before any call recording tokens are made, regexBegin() will be called on the
  * delegate once.  Once parsing is complete either regexEnd() or regexError() will
  * be called, as appropriate.
  *
  * The regular expression is described by a sequence of assertion*() and atom*()
  * callbacks to the delegate, describing the terms in the regular expression.
  * Following an atom a quantifyAtom() call may occur to indicate that the previous
  * atom should be quantified.  In the case of atoms described across multiple
  * calls (parentheses and character classes) the call to quantifyAtom() will come
  * after the call to the atom*End() method, never after atom*Begin().
  *
  * Character classes may either be described by a single call to
  * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
  * In the latter case, ...Begin() will be called, followed by a sequence of
  * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
  *
  * Sequences of atoms and assertions are broken into alternatives via calls to
  * disjunction().  Assertions, atoms, and disjunctions emitted between calls to
  * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
  * atomParenthesesBegin() is passed a subpatternId.  In the case of a regular
  * capturing subpattern, this will be the subpatternId associated with these
  * parentheses, and will also by definition be the lowest subpatternId of these
  * parentheses and of any nested paretheses.  The atomParenthesesEnd() method
  * is passed the subpatternId of the last capturing subexpression nested within
  * these paretheses.  In the case of a capturing subpattern with no nested
  * capturing subpatterns, the same subpatternId will be passed to the begin and
  * end functions.  In the case of non-capturing subpatterns the subpatternId
  * passed to the begin method is also the first possible subpatternId that might
  * be nested within these paretheses.  If a set of non-capturing parentheses does
  * not contain any capturing subpatterns, then the subpatternId passed to begin
  * will be greater than the subpatternId passed to end.
  */

 template<class Delegate>
 const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = UINT_MAX)
 {
     return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse();
 }

 } } // namespace JSC::Yarr

 #endif

 #endif // RegexParser_h
	/*
	* Copyright (C) 2009 Apple Inc. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	#ifndef RegexParser_h
	#define RegexParser_h

	#include <wtf/Platform.h>

	#if ENABLE(YARR)

	#include <UString.h>
	#include <wtf/ASCIICType.h>
	#include <wtf/unicode/Unicode.h>
	#include <limits.h>

	namespace JSC { namespace Yarr {

	enum BuiltInCharacterClassID {
	DigitClassID,
	SpaceClassID,
	WordClassID,
	NewlineClassID,
	};

	// The Parser class should not be used directly - only via the Yarr::parse() method.
	template<class Delegate>
	class Parser {
	private:
	template<class FriendDelegate>
	friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit);

	enum ErrorCode {
	NoError,
	PatternTooLarge,
	QuantifierOutOfOrder,
	QuantifierWithoutAtom,
	MissingParentheses,
	ParenthesesUnmatched,
	ParenthesesTypeInvalid,
	CharacterClassUnmatched,
	CharacterClassOutOfOrder,
	EscapeUnterminated,
	NumberOfErrorCodes
	};

	/*
	* CharacterClassParserDelegate:
	*
	* The class CharacterClassParserDelegate is used in the parsing of character
	* classes. This class handles detection of character ranges. This class
	* implements enough of the delegate interface such that it can be passed to
	* parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused
	* to perform the parsing of escape characters in character sets.
	*/
	class CharacterClassParserDelegate {
	public:
	CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
	: m_delegate(delegate)
	, m_err(err)
	, m_state(empty)
	{
	}

	/*
	* begin():
	*
	* Called at beginning of construction.
	*/
	void begin(bool invert)
	{
	m_delegate.atomCharacterClassBegin(invert);
	}

	/*
	* atomPatternCharacterUnescaped():
	*
	* This method is called directly from parseCharacterClass(), to report a new
	* pattern character token. This method differs from atomPatternCharacter(),
	* which will be called from parseEscape(), since a hypen provided via this
	* method may be indicating a character range, but a hyphen parsed by
	* parseEscape() cannot be interpreted as doing so.
	*/
	void atomPatternCharacterUnescaped(UChar ch)
	{
	switch (m_state) {
	case empty:
	m_character = ch;
	m_state = cachedCharacter;
	break;

	case cachedCharacter:
	if (ch == '-')
	m_state = cachedCharacterHyphen;
	else {
	m_delegate.atomCharacterClassAtom(m_character);
	m_character = ch;
	}
	break;

	case cachedCharacterHyphen:
	if (ch >= m_character)
	m_delegate.atomCharacterClassRange(m_character, ch);
	else
	m_err = CharacterClassOutOfOrder;
	m_state = empty;
	}
	}

	/*
	* atomPatternCharacter():
	*
	* Adds a pattern character, called by parseEscape(), as such will not
	* interpret a hyphen as indicating a character range.
	*/
	void atomPatternCharacter(UChar ch)
	{
	// Flush if a character is already pending to prevent the
	// hyphen from begin interpreted as indicating a range.
	if((ch == '-') && (m_state == cachedCharacter))
	flush();

	atomPatternCharacterUnescaped(ch);
	}

	/*
	* atomBuiltInCharacterClass():
	*
	* Adds a built-in character class, called by parseEscape().
	*/
	void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
	{
	flush();
	m_delegate.atomCharacterClassBuiltIn(classID, invert);
	}

	/*
	* end():
	*
	* Called at end of construction.
	*/
	void end()
	{
	flush();
	m_delegate.atomCharacterClassEnd();
	}

	// parseEscape() should never call these delegate methods when
	// invoked with inCharacterClass set.
	void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
	void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }

	private:
	void flush()
	{
	if (m_state != empty) // either cachedCharacter or cachedCharacterHyphen
	m_delegate.atomCharacterClassAtom(m_character);
	if (m_state == cachedCharacterHyphen)
	m_delegate.atomCharacterClassAtom('-');
	m_state = empty;
	}

	Delegate& m_delegate;
	ErrorCode& m_err;
	enum CharacterClassConstructionState {
	empty,
	cachedCharacter,
	cachedCharacterHyphen,
	} m_state;
	UChar m_character;
	};

	Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit)
	: m_delegate(delegate)
	, m_backReferenceLimit(backReferenceLimit)
	, m_err(NoError)
	, m_data(pattern.data())
	, m_size(pattern.size())
	, m_index(0)
	, m_parenthesesNestingDepth(0)
	{
	}

	/*
	* parseEscape():
	*
	* Helper for parseTokens() AND parseCharacterClass().
	* Unlike the other parser methods, this function does not report tokens
	* directly to the member delegate (m_delegate), instead tokens are
	* emitted to the delegate provided as an argument. In the case of atom
	* escapes, parseTokens() will call parseEscape() passing m_delegate as
	* an argument, and as such the escape will be reported to the delegate.
	*
	* However this method may also be used by parseCharacterClass(), in which
	* case a CharacterClassParserDelegate will be passed as the delegate that
	* tokens should be added to. A boolean flag is also provided to indicate
	* whether that an escape in a CharacterClass is being parsed (some parsing
	* rules change in this context).
	*
	* The boolean value returned by this method indicates whether the token
	* parsed was an atom (outside of a characted class \b and \B will be
	* interpreted as assertions).
	*/
	template<bool inCharacterClass, class EscapeDelegate>
	bool parseEscape(EscapeDelegate& delegate)
	{
	ASSERT(!m_err);
	ASSERT(peek() == '\\');
	consume();

	if (atEndOfPattern()) {
	m_err = EscapeUnterminated;
	return false;
	}

	switch (peek()) {
	// Assertions
	case 'b':
	consume();
	if (inCharacterClass)
	delegate.atomPatternCharacter('\b');
	else {
	delegate.assertionWordBoundary(false);
	return false;
	}
	break;
	case 'B':
	consume();
	if (inCharacterClass)
	delegate.atomPatternCharacter('B');
	else {
	delegate.assertionWordBoundary(true);
	return false;
	}
	break;

	// CharacterClassEscape
	case 'd':
	consume();
	delegate.atomBuiltInCharacterClass(DigitClassID, false);
	break;
	case 's':
	consume();
	delegate.atomBuiltInCharacterClass(SpaceClassID, false);
	break;
	case 'w':
	consume();
	delegate.atomBuiltInCharacterClass(WordClassID, false);
	break;
	case 'D':
	consume();
	delegate.atomBuiltInCharacterClass(DigitClassID, true);
	break;
	case 'S':
	consume();
	delegate.atomBuiltInCharacterClass(SpaceClassID, true);
	break;
	case 'W':
	consume();
	delegate.atomBuiltInCharacterClass(WordClassID, true);
	break;

	// DecimalEscape
	case '1':
	case '2':
	case '3':
	case '4':
	case '5':
	case '6':
	case '7':
	case '8':
	case '9': {
	// To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
	// First, try to parse this as backreference.
	if (!inCharacterClass) {
	ParseState state = saveState();

	unsigned backReference = consumeNumber();
	if (backReference <= m_backReferenceLimit) {
	delegate.atomBackReference(backReference);
	break;
	}

	restoreState(state);
	}

	// Not a backreference, and not octal.
	if (peek() >= '8') {
	delegate.atomPatternCharacter('\\');
	break;
	}

	// Fall-through to handle this as an octal escape.
	}

	// Octal escape
	case '0':
	delegate.atomPatternCharacter(consumeOctal());
	break;

	// ControlEscape
	case 'f':
	consume();
	delegate.atomPatternCharacter('\f');
	break;
	case 'n':
	consume();
	delegate.atomPatternCharacter('\n');
	break;
	case 'r':
	consume();
	delegate.atomPatternCharacter('\r');
	break;
	case 't':
	consume();
	delegate.atomPatternCharacter('\t');
	break;
	case 'v':
	consume();
	delegate.atomPatternCharacter('\v');
	break;

	// ControlLetter
	case 'c': {
	ParseState state = saveState();
	consume();
	if (!atEndOfPattern()) {
	int control = consume();

	// To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
	if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) \|\| (control == '_') : WTF::isASCIIAlpha(control)) {
	delegate.atomPatternCharacter(control & 0x1f);
	break;
	}
	}
	restoreState(state);
	delegate.atomPatternCharacter('\\');
	break;
	}

	// HexEscape
	case 'x': {
	consume();
	int x = tryConsumeHex(2);
	if (x == -1)
	delegate.atomPatternCharacter('x');
	else
	delegate.atomPatternCharacter(x);
	break;
	}

	// UnicodeEscape
	case 'u': {
	consume();
	int u = tryConsumeHex(4);
	if (u == -1)
	delegate.atomPatternCharacter('u');
	else
	delegate.atomPatternCharacter(u);
	break;
	}

	// IdentityEscape
	default:
	delegate.atomPatternCharacter(consume());
	}

	return true;
	}

	/*
	* parseAtomEscape(), parseCharacterClassEscape():
	*
	* These methods alias to parseEscape().
	*/
	bool parseAtomEscape()
	{
	return parseEscape<false>(m_delegate);
	}
	void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
	{
	parseEscape<true>(delegate);
	}

	/*
	* parseCharacterClass():
	*
	* Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
	* to an instance of CharacterClassParserDelegate, to describe the character class to the
	* delegate.
	*/
	void parseCharacterClass()
	{
	ASSERT(!m_err);
	ASSERT(peek() == '[');
	consume();

	CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);

	characterClassConstructor.begin(tryConsume('^'));

	while (!atEndOfPattern()) {
	switch (peek()) {
	case ']':
	consume();
	characterClassConstructor.end();
	return;

	case '\\':
	parseCharacterClassEscape(characterClassConstructor);
	break;

	default:
	characterClassConstructor.atomPatternCharacterUnescaped(consume());
	}

	if (m_err)
	return;
	}

	m_err = CharacterClassUnmatched;
	}

	/*
	* parseParenthesesBegin():
	*
	* Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
	*/
	void parseParenthesesBegin()
	{
	ASSERT(!m_err);
	ASSERT(peek() == '(');
	consume();

	if (tryConsume('?')) {
	if (atEndOfPattern()) {
	m_err = ParenthesesTypeInvalid;
	return;
	}

	switch (consume()) {
	case ':':
	m_delegate.atomParenthesesSubpatternBegin(false);
	break;

	case '=':
	m_delegate.atomParentheticalAssertionBegin();
	break;

	case '!':
	m_delegate.atomParentheticalAssertionBegin(true);
	break;

	default:
	m_err = ParenthesesTypeInvalid;
	}
	} else
	m_delegate.atomParenthesesSubpatternBegin();

	++m_parenthesesNestingDepth;
	}

	/*
	* parseParenthesesEnd():
	*
	* Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
	*/
	void parseParenthesesEnd()
	{
	ASSERT(!m_err);
	ASSERT(peek() == ')');
	consume();

	if (m_parenthesesNestingDepth > 0)
	m_delegate.atomParenthesesEnd();
	else
	m_err = ParenthesesUnmatched;

	--m_parenthesesNestingDepth;
	}

	/*
	* parseQuantifier():
	*
	* Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
	*/
	void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
	{
	ASSERT(!m_err);
	ASSERT(min <= max);

	if (lastTokenWasAnAtom)
	m_delegate.quantifyAtom(min, max, !tryConsume('?'));
	else
	m_err = QuantifierWithoutAtom;
	}

	/*
	* parseTokens():
	*
	* This method loops over the input pattern reporting tokens to the delegate.
	* The method returns when a parse error is detected, or the end of the pattern
	* is reached. One piece of state is tracked around the loop, which is whether
	* the last token passed to the delegate was an atom (this is necessary to detect
	* a parse error when a quantifier provided without an atom to quantify).
	*/
	void parseTokens()
	{
	bool lastTokenWasAnAtom = false;

	while (!atEndOfPattern()) {
	switch (peek()) {
	case '\|':
	consume();
	m_delegate.disjunction();
	lastTokenWasAnAtom = false;
	break;

	case '(':
	parseParenthesesBegin();
	lastTokenWasAnAtom = false;
	break;

	case ')':
	parseParenthesesEnd();
	lastTokenWasAnAtom = true;
	break;

	case '^':
	consume();
	m_delegate.assertionBOL();
	lastTokenWasAnAtom = false;
	break;

	case '$':
	consume();
	m_delegate.assertionEOL();
	lastTokenWasAnAtom = false;
	break;

	case '.':
	consume();
	m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
	lastTokenWasAnAtom = true;
	break;

	case '[':
	parseCharacterClass();
	lastTokenWasAnAtom = true;
	break;

	case '\\':
	lastTokenWasAnAtom = parseAtomEscape();
	break;

	case '*':
	consume();
	parseQuantifier(lastTokenWasAnAtom, 0, UINT_MAX);
	lastTokenWasAnAtom = false;
	break;

	case '+':
	consume();
	parseQuantifier(lastTokenWasAnAtom, 1, UINT_MAX);
	lastTokenWasAnAtom = false;
	break;

	case '?':
	consume();
	parseQuantifier(lastTokenWasAnAtom, 0, 1);
	lastTokenWasAnAtom = false;
	break;

	case '{': {
	ParseState state = saveState();

	consume();
	if (peekIsDigit()) {
	unsigned min = consumeNumber();
	unsigned max = min;

	if (tryConsume(','))
	max = peekIsDigit() ? consumeNumber() : UINT_MAX;

	if (tryConsume('}')) {
	if (min <= max)
	parseQuantifier(lastTokenWasAnAtom, min, max);
	else
	m_err = QuantifierOutOfOrder;
	lastTokenWasAnAtom = false;
	break;
	}
	}

	restoreState(state);
	} // if we did not find a complete quantifer, fall through to the default case.

	default:
	m_delegate.atomPatternCharacter(consume());
	lastTokenWasAnAtom = true;
	}

	if (m_err)
	return;
	}

	if (m_parenthesesNestingDepth > 0)
	m_err = MissingParentheses;
	}

	/*
	* parse():
	*
	* This method calls regexBegin(), calls parseTokens() to parse over the input
	* patterns, calls regexEnd() or regexError() as appropriate, and converts any
	* error code to a const char* for a result.
	*/
	const char* parse()
	{
	m_delegate.regexBegin();

	if (m_size > MAX_PATTERN_SIZE)
	m_err = PatternTooLarge;
	else
	parseTokens();
	ASSERT(atEndOfPattern() \|\| m_err);

	if (m_err)
	m_delegate.regexError();
	else
	m_delegate.regexEnd();

	// The order of this array must match the ErrorCode enum.
	static const char* errorMessages[NumberOfErrorCodes] = {
	0, // NoError
	"regular expression too large",
	"numbers out of order in {} quantifier",
	"nothing to repeat",
	"missing )",
	"unmatched parentheses",
	"unrecognized character after (?",
	"missing terminating ] for character class",
	"range out of order in character class",
	"\\ at end of pattern"
	};

	return errorMessages[m_err];
	}


	// Misc helper functions:

	typedef unsigned ParseState;

	ParseState saveState()
	{
	return m_index;
	}

	void restoreState(ParseState state)
	{
	m_index = state;
	}

	bool atEndOfPattern()
	{
	ASSERT(m_index <= m_size);
	return m_index == m_size;
	}

	int peek()
	{
	ASSERT(m_index < m_size);
	return m_data[m_index];
	}

	bool peekIsDigit()
	{
	return !atEndOfPattern() && WTF::isASCIIDigit(peek());
	}

	unsigned peekDigit()
	{
	ASSERT(peekIsDigit());
	return peek() - '0';
	}

	int consume()
	{
	ASSERT(m_index < m_size);
	return m_data[m_index++];
	}

	unsigned consumeDigit()
	{
	ASSERT(peekIsDigit());
	return consume() - '0';
	}

	unsigned consumeNumber()
	{
	unsigned n = consumeDigit();
	// check for overflow.
	for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) {
	n = newValue;
	consume();
	}
	return n;
	}

	unsigned consumeOctal()
	{
	ASSERT(WTF::isASCIIOctalDigit(peek()));

	unsigned n = consumeDigit();
	while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
	n = n * 8 + consumeDigit();
	return n;
	}

	bool tryConsume(UChar ch)
	{
	if (atEndOfPattern() \|\| (m_data[m_index] != ch))
	return false;
	++m_index;
	return true;
	}

	int tryConsumeHex(int count)
	{
	ParseState state = saveState();

	int n = 0;
	while (count--) {
	if (atEndOfPattern() \|\| !WTF::isASCIIHexDigit(peek())) {
	restoreState(state);
	return -1;
	}
	n = (n << 4) \| WTF::toASCIIHexValue(consume());
	}
	return n;
	}

	Delegate& m_delegate;
	unsigned m_backReferenceLimit;
	ErrorCode m_err;
	const UChar* m_data;
	unsigned m_size;
	unsigned m_index;
	unsigned m_parenthesesNestingDepth;

	// Derived by empirical testing of compile time in PCRE and WREC.
	static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
	};

	/*
	* Yarr::parse():
	*
	* The parse method is passed a pattern to be parsed and a delegate upon which
	* callbacks will be made to record the parsed tokens forming the regex.
	* Yarr::parse() returns null on success, or a const C string providing an error
	* message where a parse error occurs.
	*
	* The Delegate must implement the following interface:
	*
	* void assertionBOL();
	* void assertionEOL();
	* void assertionWordBoundary(bool invert);
	*
	* void atomPatternCharacter(UChar ch);
	* void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
	* void atomCharacterClassBegin(bool invert)
	* void atomCharacterClassAtom(UChar ch)
	* void atomCharacterClassRange(UChar begin, UChar end)
	* void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
	* void atomCharacterClassEnd()
	* void atomParenthesesSubpatternBegin(bool capture = true);
	* void atomParentheticalAssertionBegin(bool invert = false);
	* void atomParenthesesEnd();
	* void atomBackReference(unsigned subpatternId);
	*
	* void quantifyAtom(unsigned min, unsigned max, bool greedy);
	*
	* void disjunction();
	*
	* void regexBegin();
	* void regexEnd();
	* void regexError();
	*
	* Before any call recording tokens are made, regexBegin() will be called on the
	* delegate once. Once parsing is complete either regexEnd() or regexError() will
	* be called, as appropriate.
	*
	* The regular expression is described by a sequence of assertion() and atom()
	* callbacks to the delegate, describing the terms in the regular expression.
	* Following an atom a quantifyAtom() call may occur to indicate that the previous
	* atom should be quantified. In the case of atoms described across multiple
	* calls (parentheses and character classes) the call to quantifyAtom() will come
	* after the call to the atomEnd() method, never after atomBegin().
	*
	* Character classes may either be described by a single call to
	* atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
	* In the latter case, ...Begin() will be called, followed by a sequence of
	* calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
	*
	* Sequences of atoms and assertions are broken into alternatives via calls to
	* disjunction(). Assertions, atoms, and disjunctions emitted between calls to
	* atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
	* atomParenthesesBegin() is passed a subpatternId. In the case of a regular
	* capturing subpattern, this will be the subpatternId associated with these
	* parentheses, and will also by definition be the lowest subpatternId of these
	* parentheses and of any nested paretheses. The atomParenthesesEnd() method
	* is passed the subpatternId of the last capturing subexpression nested within
	* these paretheses. In the case of a capturing subpattern with no nested
	* capturing subpatterns, the same subpatternId will be passed to the begin and
	* end functions. In the case of non-capturing subpatterns the subpatternId
	* passed to the begin method is also the first possible subpatternId that might
	* be nested within these paretheses. If a set of non-capturing parentheses does
	* not contain any capturing subpatterns, then the subpatternId passed to begin
	* will be greater than the subpatternId passed to end.
	*/

	template<class Delegate>
	const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = UINT_MAX)
	{
	return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse();
	}

	} } // namespace JSC::Yarr

	#endif

	#endif // RegexParser_h