| /* |
| * Copyright (C) 2008 Apple Inc. All Rights Reserved. |
| * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ |
| * Copyright (C) 2010 Google, Inc. All Rights Reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
| * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
| * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #ifndef MarkupTokenizerBase_h |
| #define MarkupTokenizerBase_h |
| |
| #include "SegmentedString.h" |
| #include <wtf/Noncopyable.h> |
| #include <wtf/PassOwnPtr.h> |
| #include <wtf/Vector.h> |
| #include <wtf/text/AtomicString.h> |
| #include <wtf/text/TextPosition.h> |
| |
| namespace WebCore { |
| |
| // Never use this type for a variable, as it contains several non-virtual functions. |
| template<typename Token, typename State> |
| class MarkupTokenizerBase { |
| WTF_MAKE_NONCOPYABLE(MarkupTokenizerBase); |
| WTF_MAKE_FAST_ALLOCATED; |
| public: |
| virtual ~MarkupTokenizerBase() { } |
| |
| typename State::State state() const { return m_state; } |
| void setState(typename State::State state) { m_state = state; } |
| |
| bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; } |
| void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; } |
| |
| // This method needs to be defined in a template specialization when subclassing this template |
| inline bool shouldSkipNullCharacters() const; |
| |
| protected: |
| // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream |
| class InputStreamPreprocessor { |
| WTF_MAKE_NONCOPYABLE(InputStreamPreprocessor); |
| public: |
| InputStreamPreprocessor(MarkupTokenizerBase<Token, State>* tokenizer) |
| : m_tokenizer(tokenizer) |
| , m_nextInputCharacter('\0') |
| , m_skipNextNewLine(false) |
| { |
| } |
| |
| UChar nextInputCharacter() const { return m_nextInputCharacter; } |
| |
| // Returns whether we succeeded in peeking at the next character. |
| // The only way we can fail to peek is if there are no more |
| // characters in |source| (after collapsing \r\n, etc). |
| ALWAYS_INLINE bool peek(SegmentedString& source) |
| { |
| PeekAgain: |
| m_nextInputCharacter = *source; |
| |
| // Every branch in this function is expensive, so we have a |
| // fast-reject branch for characters that don't require special |
| // handling. Please run the parser benchmark whenever you touch |
| // this function. It's very hot. |
| static const UChar specialCharacterMask = '\n' | '\r' | '\0'; |
| if (m_nextInputCharacter & ~specialCharacterMask) { |
| m_skipNextNewLine = false; |
| return true; |
| } |
| |
| if (m_nextInputCharacter == '\n' && m_skipNextNewLine) { |
| m_skipNextNewLine = false; |
| source.advancePastNewlineAndUpdateLineNumber(); |
| if (source.isEmpty()) |
| return false; |
| m_nextInputCharacter = *source; |
| } |
| if (m_nextInputCharacter == '\r') { |
| m_nextInputCharacter = '\n'; |
| m_skipNextNewLine = true; |
| } else { |
| m_skipNextNewLine = false; |
| // FIXME: The spec indicates that the surrogate pair range as well as |
| // a number of specific character values are parse errors and should be replaced |
| // by the replacement character. We suspect this is a problem with the spec as doing |
| // that filtering breaks surrogate pair handling and causes us not to match Minefield. |
| if (m_nextInputCharacter == '\0' && !shouldTreatNullAsEndOfFileMarker(source)) { |
| if (m_tokenizer->shouldSkipNullCharacters()) { |
| source.advancePastNonNewline(); |
| if (source.isEmpty()) |
| return false; |
| goto PeekAgain; |
| } |
| m_nextInputCharacter = 0xFFFD; |
| } |
| } |
| return true; |
| } |
| |
| // Returns whether there are more characters in |source| after advancing. |
| bool advance(SegmentedString& source) |
| { |
| source.advanceAndUpdateLineNumber(); |
| if (source.isEmpty()) |
| return false; |
| return peek(source); |
| } |
| |
| static const UChar endOfFileMarker = 0; |
| |
| private: |
| bool shouldTreatNullAsEndOfFileMarker(SegmentedString& source) const |
| { |
| return source.isClosed() && source.length() == 1; |
| } |
| |
| MarkupTokenizerBase<Token, State>* m_tokenizer; |
| |
| // http://www.whatwg.org/specs/web-apps/current-work/#next-input-character |
| UChar m_nextInputCharacter; |
| bool m_skipNextNewLine; |
| }; |
| |
| MarkupTokenizerBase() : m_inputStreamPreprocessor(this) { reset(); } |
| |
| inline void bufferCharacter(UChar character) |
| { |
| ASSERT(character != InputStreamPreprocessor::endOfFileMarker); |
| m_token->ensureIsCharacterToken(); |
| m_token->appendToCharacter(character); |
| } |
| |
| inline void bufferCodePoint(unsigned); |
| |
| // This method can get hidden in subclasses |
| inline bool emitAndResumeIn(SegmentedString& source, typename State::State state) |
| { |
| m_state = state; |
| source.advanceAndUpdateLineNumber(); |
| return true; |
| } |
| |
| // This method can get hidden in subclasses |
| inline bool emitAndReconsumeIn(SegmentedString&, typename State::State state) |
| { |
| m_state = state; |
| return true; |
| } |
| |
| inline bool emitEndOfFile(SegmentedString& source) |
| { |
| if (haveBufferedCharacterToken()) |
| return true; |
| m_state = State::DataState; |
| source.advanceAndUpdateLineNumber(); |
| m_token->clear(); |
| m_token->makeEndOfFile(); |
| return true; |
| } |
| |
| void reset() |
| { |
| m_state = State::DataState; |
| m_token = 0; |
| } |
| |
| inline bool haveBufferedCharacterToken() |
| { |
| return m_token->type() == Token::Type::Character; |
| } |
| |
| typename State::State m_state; |
| |
| // m_token is owned by the caller. If nextToken is not on the stack, |
| // this member might be pointing to unallocated memory. |
| Token* m_token; |
| |
| bool m_forceNullCharacterReplacement; |
| |
| // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character |
| UChar m_additionalAllowedCharacter; |
| |
| // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream |
| InputStreamPreprocessor m_inputStreamPreprocessor; |
| }; |
| |
| } |
| |
| #endif // MarkupTokenizerBase_h |