// Source/WebCore/xml/parser/MarkupTokenizerBase.h - WebKit - Git at Google

 /*
  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
  * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 #ifndef MarkupTokenizerBase_h
 #define MarkupTokenizerBase_h

 #include "SegmentedString.h"
 #include <wtf/Noncopyable.h>
 #include <wtf/PassOwnPtr.h>
 #include <wtf/Vector.h>
 #include <wtf/text/AtomicString.h>
 #include <wtf/text/TextPosition.h>

 namespace WebCore {

 // Never use this type for a variable, as it contains several non-virtual functions.
 template<typename Token, typename State>
 class MarkupTokenizerBase {
     WTF_MAKE_NONCOPYABLE(MarkupTokenizerBase);
     WTF_MAKE_FAST_ALLOCATED;
 public:
     virtual ~MarkupTokenizerBase() { }

     typename State::State state() const { return m_state; }
     void setState(typename State::State state) { m_state = state; }

     bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; }
     void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; }

     // This method needs to be defined in a template specialization when subclassing this template
     inline bool shouldSkipNullCharacters() const;

 protected:
     // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
     class InputStreamPreprocessor {
         WTF_MAKE_NONCOPYABLE(InputStreamPreprocessor);
     public:
         InputStreamPreprocessor(MarkupTokenizerBase<Token, State>* tokenizer)
             : m_tokenizer(tokenizer)
             , m_nextInputCharacter('\0')
             , m_skipNextNewLine(false)
         {
         }

         UChar nextInputCharacter() const { return m_nextInputCharacter; }

         // Returns whether we succeeded in peeking at the next character.
         // The only way we can fail to peek is if there are no more
         // characters in |source| (after collapsing \r\n, etc).
         ALWAYS_INLINE bool peek(SegmentedString& source)
         {
         PeekAgain:
             m_nextInputCharacter = *source;

             // Every branch in this function is expensive, so we have a
             // fast-reject branch for characters that don't require special
             // handling. Please run the parser benchmark whenever you touch
             // this function. It's very hot.
             static const UChar specialCharacterMask = '\n' | '\r' | '\0';
             if (m_nextInputCharacter & ~specialCharacterMask) {
                 m_skipNextNewLine = false;
                 return true;
             }

             if (m_nextInputCharacter == '\n' && m_skipNextNewLine) {
                 m_skipNextNewLine = false;
                 source.advancePastNewlineAndUpdateLineNumber();
                 if (source.isEmpty())
                     return false;
                 m_nextInputCharacter = *source;
             }
             if (m_nextInputCharacter == '\r') {
                 m_nextInputCharacter = '\n';
                 m_skipNextNewLine = true;
             } else {
                 m_skipNextNewLine = false;
                 // FIXME: The spec indicates that the surrogate pair range as well as
                 // a number of specific character values are parse errors and should be replaced
                 // by the replacement character. We suspect this is a problem with the spec as doing
                 // that filtering breaks surrogate pair handling and causes us not to match Minefield.
                 if (m_nextInputCharacter == '\0' && !shouldTreatNullAsEndOfFileMarker(source)) {
                     if (m_tokenizer->shouldSkipNullCharacters()) {
                         source.advancePastNonNewline();
                         if (source.isEmpty())
                             return false;
                         goto PeekAgain;
                     }
                     m_nextInputCharacter = 0xFFFD;
                 }
             }
             return true;
         }

         // Returns whether there are more characters in |source| after advancing.
         bool advance(SegmentedString& source)
         {
             source.advanceAndUpdateLineNumber();
             if (source.isEmpty())
                 return false;
             return peek(source);
         }

         static const UChar endOfFileMarker = 0;

     private:
         bool shouldTreatNullAsEndOfFileMarker(SegmentedString& source) const
         {
             return source.isClosed() && source.length() == 1;
         }

         MarkupTokenizerBase<Token, State>* m_tokenizer;

         // http://www.whatwg.org/specs/web-apps/current-work/#next-input-character
         UChar m_nextInputCharacter;
         bool m_skipNextNewLine;
     };

     MarkupTokenizerBase() : m_inputStreamPreprocessor(this) { reset(); }

     inline void bufferCharacter(UChar character)
     {
         ASSERT(character != InputStreamPreprocessor::endOfFileMarker);
         m_token->ensureIsCharacterToken();
         m_token->appendToCharacter(character);
     }

     inline void bufferCodePoint(unsigned);

     // This method can get hidden in subclasses
     inline bool emitAndResumeIn(SegmentedString& source, typename State::State state)
     {
         m_state = state;
         source.advanceAndUpdateLineNumber();
         return true;
     }

     // This method can get hidden in subclasses
     inline bool emitAndReconsumeIn(SegmentedString&, typename State::State state)
     {
         m_state = state;
         return true;
     }

     inline bool emitEndOfFile(SegmentedString& source)
     {
         if (haveBufferedCharacterToken())
             return true;
         m_state = State::DataState;
         source.advanceAndUpdateLineNumber();
         m_token->clear();
         m_token->makeEndOfFile();
         return true;
     }

     void reset()
     {
         m_state = State::DataState;
         m_token = 0;
     }

     inline bool haveBufferedCharacterToken()
     {
         return m_token->type() == Token::Type::Character;
     }

     typename State::State m_state;

     // m_token is owned by the caller. If nextToken is not on the stack,
     // this member might be pointing to unallocated memory.
     Token* m_token;

     bool m_forceNullCharacterReplacement;

     // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character
     UChar m_additionalAllowedCharacter;

     // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
     InputStreamPreprocessor m_inputStreamPreprocessor;
 };

 }

 #endif // MarkupTokenizerBase_h
	/*
	* Copyright (C) 2008 Apple Inc. All Rights Reserved.
	* Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
	* Copyright (C) 2010 Google, Inc. All Rights Reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	#ifndef MarkupTokenizerBase_h
	#define MarkupTokenizerBase_h

	#include "SegmentedString.h"
	#include <wtf/Noncopyable.h>
	#include <wtf/PassOwnPtr.h>
	#include <wtf/Vector.h>
	#include <wtf/text/AtomicString.h>
	#include <wtf/text/TextPosition.h>

	namespace WebCore {

	// Never use this type for a variable, as it contains several non-virtual functions.
	template<typename Token, typename State>
	class MarkupTokenizerBase {
	WTF_MAKE_NONCOPYABLE(MarkupTokenizerBase);
	WTF_MAKE_FAST_ALLOCATED;
	public:
	virtual ~MarkupTokenizerBase() { }

	typename State::State state() const { return m_state; }
	void setState(typename State::State state) { m_state = state; }

	bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; }
	void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; }

	// This method needs to be defined in a template specialization when subclassing this template
	inline bool shouldSkipNullCharacters() const;

	protected:
	// http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
	class InputStreamPreprocessor {
	WTF_MAKE_NONCOPYABLE(InputStreamPreprocessor);
	public:
	InputStreamPreprocessor(MarkupTokenizerBase<Token, State>* tokenizer)
	: m_tokenizer(tokenizer)
	, m_nextInputCharacter('\0')
	, m_skipNextNewLine(false)
	{
	}

	UChar nextInputCharacter() const { return m_nextInputCharacter; }

	// Returns whether we succeeded in peeking at the next character.
	// The only way we can fail to peek is if there are no more
	// characters in \|source\| (after collapsing \r\n, etc).
	ALWAYS_INLINE bool peek(SegmentedString& source)
	{
	PeekAgain:
	m_nextInputCharacter = *source;

	// Every branch in this function is expensive, so we have a
	// fast-reject branch for characters that don't require special
	// handling. Please run the parser benchmark whenever you touch
	// this function. It's very hot.
	static const UChar specialCharacterMask = '\n' \| '\r' \| '\0';
	if (m_nextInputCharacter & ~specialCharacterMask) {
	m_skipNextNewLine = false;
	return true;
	}

	if (m_nextInputCharacter == '\n' && m_skipNextNewLine) {
	m_skipNextNewLine = false;
	source.advancePastNewlineAndUpdateLineNumber();
	if (source.isEmpty())
	return false;
	m_nextInputCharacter = *source;
	}
	if (m_nextInputCharacter == '\r') {
	m_nextInputCharacter = '\n';
	m_skipNextNewLine = true;
	} else {
	m_skipNextNewLine = false;
	// FIXME: The spec indicates that the surrogate pair range as well as
	// a number of specific character values are parse errors and should be replaced
	// by the replacement character. We suspect this is a problem with the spec as doing
	// that filtering breaks surrogate pair handling and causes us not to match Minefield.
	if (m_nextInputCharacter == '\0' && !shouldTreatNullAsEndOfFileMarker(source)) {
	if (m_tokenizer->shouldSkipNullCharacters()) {
	source.advancePastNonNewline();
	if (source.isEmpty())
	return false;
	goto PeekAgain;
	}
	m_nextInputCharacter = 0xFFFD;
	}
	}
	return true;
	}

	// Returns whether there are more characters in \|source\| after advancing.
	bool advance(SegmentedString& source)
	{
	source.advanceAndUpdateLineNumber();
	if (source.isEmpty())
	return false;
	return peek(source);
	}

	static const UChar endOfFileMarker = 0;

	private:
	bool shouldTreatNullAsEndOfFileMarker(SegmentedString& source) const
	{
	return source.isClosed() && source.length() == 1;
	}

	MarkupTokenizerBase<Token, State>* m_tokenizer;

	// http://www.whatwg.org/specs/web-apps/current-work/#next-input-character
	UChar m_nextInputCharacter;
	bool m_skipNextNewLine;
	};

	MarkupTokenizerBase() : m_inputStreamPreprocessor(this) { reset(); }

	inline void bufferCharacter(UChar character)
	{
	ASSERT(character != InputStreamPreprocessor::endOfFileMarker);
	m_token->ensureIsCharacterToken();
	m_token->appendToCharacter(character);
	}

	inline void bufferCodePoint(unsigned);

	// This method can get hidden in subclasses
	inline bool emitAndResumeIn(SegmentedString& source, typename State::State state)
	{
	m_state = state;
	source.advanceAndUpdateLineNumber();
	return true;
	}

	// This method can get hidden in subclasses
	inline bool emitAndReconsumeIn(SegmentedString&, typename State::State state)
	{
	m_state = state;
	return true;
	}

	inline bool emitEndOfFile(SegmentedString& source)
	{
	if (haveBufferedCharacterToken())
	return true;
	m_state = State::DataState;
	source.advanceAndUpdateLineNumber();
	m_token->clear();
	m_token->makeEndOfFile();
	return true;
	}

	void reset()
	{
	m_state = State::DataState;
	m_token = 0;
	}

	inline bool haveBufferedCharacterToken()
	{
	return m_token->type() == Token::Type::Character;
	}

	typename State::State m_state;

	// m_token is owned by the caller. If nextToken is not on the stack,
	// this member might be pointing to unallocated memory.
	Token* m_token;

	bool m_forceNullCharacterReplacement;

	// http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character
	UChar m_additionalAllowedCharacter;

	// http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
	InputStreamPreprocessor m_inputStreamPreprocessor;
	};

	}

	#endif // MarkupTokenizerBase_h