| /* |
| Copyright (C) 1997 Martin Jones (mjones@kde.org) |
| (C) 1997 Torben Weis (weis@kde.org) |
| (C) 1998 Waldo Bastian (bastian@kde.org) |
| (C) 2001 Dirk Mueller (mueller@kde.org) |
| Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. |
| |
| This library is free software; you can redistribute it and/or |
| modify it under the terms of the GNU Library General Public |
| License as published by the Free Software Foundation; either |
| version 2 of the License, or (at your option) any later version. |
| |
| This library is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| Library General Public License for more details. |
| |
| You should have received a copy of the GNU Library General Public License |
| along with this library; see the file COPYING.LIB. If not, write to |
| the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
| Boston, MA 02110-1301, USA. |
| */ |
| |
| #ifndef LegacyHTMLDocumentParser_h |
| #define LegacyHTMLDocumentParser_h |
| |
| #include "CachedResourceClient.h" |
| #include "CachedResourceHandle.h" |
| #include "FragmentScriptingPermission.h" |
| #include "NamedNodeMap.h" |
| #include "ScriptableDocumentParser.h" |
| #include "SegmentedString.h" |
| #include "Timer.h" |
| #include <wtf/Deque.h> |
| #include <wtf/OwnPtr.h> |
| #include <wtf/Vector.h> |
| |
| namespace WebCore { |
| |
| class CachedScript; |
| class DocumentFragment; |
| class Document; |
| class HTMLDocument; |
| class HTMLScriptElement; |
| class HTMLViewSourceDocument; |
| class FrameView; |
| class LegacyHTMLTreeBuilder; |
| class Node; |
| class LegacyPreloadScanner; |
| class ScriptSourceCode; |
| |
| /** |
| * @internal |
| * represents one HTML tag. Consists of a numerical id, and the list |
| * of attributes. Can also represent text. In this case the id = 0 and |
| * text contains the text. |
| */ |
| struct Token { |
| Token() |
| : beginTag(true) |
| , selfClosingTag(false) |
| , brokenXMLStyle(false) |
| , m_sourceInfo(0) |
| { } |
| ~Token() { } |
| |
| void addAttribute(AtomicString& attrName, const AtomicString& v, bool viewSourceMode); |
| |
| bool isOpenTag(const QualifiedName& fullName) const { return beginTag && fullName.localName() == tagName; } |
| bool isCloseTag(const QualifiedName& fullName) const { return !beginTag && fullName.localName() == tagName; } |
| |
| void reset() |
| { |
| attrs = 0; |
| text = 0; |
| tagName = nullAtom; |
| beginTag = true; |
| selfClosingTag = false; |
| brokenXMLStyle = false; |
| if (m_sourceInfo) |
| m_sourceInfo->clear(); |
| } |
| |
| void addViewSourceChar(UChar c) { if (!m_sourceInfo.get()) m_sourceInfo.set(new Vector<UChar>); m_sourceInfo->append(c); } |
| |
| RefPtr<NamedNodeMap> attrs; |
| RefPtr<StringImpl> text; |
| AtomicString tagName; |
| bool beginTag; |
| bool selfClosingTag; |
| bool brokenXMLStyle; |
| OwnPtr<Vector<UChar> > m_sourceInfo; |
| }; |
| |
// State values stored in DoctypeToken::m_state. The numeric values are the
// same consecutive sequence the compiler would assign implicitly; they are
// spelled out here only for readability.
enum DoctypeState {
    DoctypeBegin = 0,
    DoctypeBeforeName = 1,
    DoctypeName = 2,
    DoctypeAfterName = 3,
    DoctypeBeforePublicID = 4,
    DoctypePublicID = 5,
    DoctypeAfterPublicID = 6,
    DoctypeBeforeSystemID = 7,
    DoctypeSystemID = 8,
    DoctypeAfterSystemID = 9,
    DoctypeBogus = 10
};
| |
| class DoctypeToken { |
| public: |
| DoctypeToken() {} |
| |
| void reset() |
| { |
| m_name.clear(); |
| m_publicID.clear(); |
| m_systemID.clear(); |
| m_state = DoctypeBegin; |
| m_source.clear(); |
| m_forceQuirks = false; |
| } |
| |
| DoctypeState state() { return m_state; } |
| void setState(DoctypeState s) { m_state = s; } |
| |
| Vector<UChar> m_name; |
| Vector<UChar> m_publicID; |
| Vector<UChar> m_systemID; |
| DoctypeState m_state; |
| |
| Vector<UChar> m_source; |
| |
| bool m_forceQuirks; // Used by the HTML5 parser. |
| }; |
| |
| //----------------------------------------------------------------------------- |
| |
| // FIXME: This class does too much. Right now it is both an HTML tokenizer as well |
| // as handling all of the non-tokenizer-specific junk related to tokenizing HTML |
| // (like dealing with <script> tags). The HTML tokenizer bits should be pushed |
| // down into a separate HTML tokenizer class. |
| |
| class LegacyHTMLDocumentParser : public ScriptableDocumentParser, public CachedResourceClient { |
| public: |
| LegacyHTMLDocumentParser(HTMLDocument*, bool reportErrors); |
| LegacyHTMLDocumentParser(HTMLViewSourceDocument*); |
| LegacyHTMLDocumentParser(DocumentFragment*, FragmentScriptingPermission = FragmentScriptingAllowed); |
| virtual ~LegacyHTMLDocumentParser(); |
| |
| bool forceSynchronous() const { return m_state.forceSynchronous(); } |
| void setForceSynchronous(bool force); |
| |
| static void parseDocumentFragment(const String&, DocumentFragment*, FragmentScriptingPermission = FragmentScriptingAllowed); |
| |
| protected: |
| // Exposed for FTPDirectoryDocumentParser |
| virtual void insert(const SegmentedString&); |
| virtual void finish(); |
| |
| private: |
| // ScriptableDocumentParser |
| virtual void append(const SegmentedString&); |
| virtual bool finishWasCalled(); |
| virtual bool isWaitingForScripts() const; |
| virtual void stopParsing(); |
| virtual bool processingData() const; |
| virtual bool isExecutingScript() const { return !!m_executingScript; } |
| |
| virtual int lineNumber() const { return m_lineNumber; } |
| virtual int columnNumber() const { return 1; } |
| |
| virtual bool processingContentWrittenByScript() const { return m_src.excludeLineNumbers(); } |
| |
| virtual void executeScriptsWaitingForStylesheets(); |
| |
| virtual LegacyHTMLTreeBuilder* htmlTreeBuilder() const { return m_treeBuilder.get(); } |
| |
| class State; |
| |
| void begin(); |
| void end(); |
| void reset(); |
| |
| void willWriteHTML(const SegmentedString&); |
| void write(const SegmentedString&, bool appendData); |
| ALWAYS_INLINE void advance(State&); |
| void didWriteHTML(); |
| |
| PassRefPtr<Node> processToken(); |
| void processDoctypeToken(); |
| |
| State processListing(SegmentedString, State); |
| State parseComment(SegmentedString&, State); |
| State parseDoctype(SegmentedString&, State); |
| State parseServer(SegmentedString&, State); |
| State parseText(SegmentedString&, State); |
| State parseNonHTMLText(SegmentedString&, State); |
| State parseTag(SegmentedString&, State); |
| State parseEntity(SegmentedString&, UChar*& dest, State, unsigned& cBufferPos, bool start, bool parsingTag); |
| State parseProcessingInstruction(SegmentedString&, State); |
| State scriptHandler(State); |
| State scriptExecution(const ScriptSourceCode&, State); |
| void setSrc(const SegmentedString&); |
| |
| // check if we have enough space in the buffer. |
| // if not enlarge it |
| inline void checkBuffer(int len = 10) |
| { |
| if ((m_dest - m_buffer) > m_bufferSize - len) |
| enlargeBuffer(len); |
| } |
| |
| inline void checkScriptBuffer(int len = 10) |
| { |
| if (m_scriptCodeSize + len >= m_scriptCodeCapacity) |
| enlargeScriptBuffer(len); |
| } |
| |
| void enlargeBuffer(int len); |
| void enlargeScriptBuffer(int len); |
| |
| bool continueProcessing(int& processedCount, double startTime, State&); |
| void timerFired(Timer<LegacyHTMLDocumentParser>*); |
| void allDataProcessed(); |
| |
| // from CachedResourceClient |
| void notifyFinished(CachedResource*); |
| |
| void executeExternalScriptsIfReady(); |
| void executeExternalScriptsTimerFired(Timer<LegacyHTMLDocumentParser>*); |
| bool continueExecutingExternalScripts(double startTime); |
| |
| // Internal buffers |
| /////////////////// |
| UChar* m_buffer; |
| int m_bufferSize; |
| UChar* m_dest; |
| |
| Token m_currentToken; |
| |
| // This buffer holds the raw characters we've seen between the beginning of |
| // the attribute name and the first character of the attribute value. |
| Vector<UChar, 32> m_rawAttributeBeforeValue; |
| |
| // DocumentParser flags |
| ////////////////// |
| // are we in quotes within a html tag |
| enum { NoQuote, SingleQuote, DoubleQuote } tquote; |
| |
| // Are we in a &... character entity description? |
| enum EntityState { |
| NoEntity = 0, |
| SearchEntity = 1, |
| NumericSearch = 2, |
| Hexadecimal = 3, |
| Decimal = 4, |
| EntityName = 5, |
| SearchSemicolon = 6 |
| }; |
| unsigned EntityUnicodeValue; |
| |
| enum TagState { |
| NoTag = 0, |
| TagName = 1, |
| SearchAttribute = 2, |
| AttributeName = 3, |
| SearchEqual = 4, |
| SearchValue = 5, |
| QuotedValue = 6, |
| Value = 7, |
| SearchEnd = 8 |
| }; |
| |
| class State { |
| public: |
| State() : m_bits(0) { } |
| |
| TagState tagState() const { return static_cast<TagState>(m_bits & TagMask); } |
| void setTagState(TagState t) { m_bits = (m_bits & ~TagMask) | t; } |
| EntityState entityState() const { return static_cast<EntityState>((m_bits & EntityMask) >> EntityShift); } |
| void setEntityState(EntityState e) { m_bits = (m_bits & ~EntityMask) | (e << EntityShift); } |
| |
| bool inScript() const { return testBit(InScript); } |
| void setInScript(bool v) { setBit(InScript, v); } |
| bool inStyle() const { return testBit(InStyle); } |
| void setInStyle(bool v) { setBit(InStyle, v); } |
| bool inXmp() const { return testBit(InXmp); } |
| void setInXmp(bool v) { setBit(InXmp, v); } |
| bool inTitle() const { return testBit(InTitle); } |
| void setInTitle(bool v) { setBit(InTitle, v); } |
| bool inIFrame() const { return testBit(InIFrame); } |
| void setInIFrame(bool v) { setBit(InIFrame, v); } |
| bool inPlainText() const { return testBit(InPlainText); } |
| void setInPlainText(bool v) { setBit(InPlainText, v); } |
| bool inProcessingInstruction() const { return testBit(InProcessingInstruction); } |
| void setInProcessingInstruction(bool v) { return setBit(InProcessingInstruction, v); } |
| bool inComment() const { return testBit(InComment); } |
| void setInComment(bool v) { setBit(InComment, v); } |
| bool inDoctype() const { return testBit(InDoctype); } |
| void setInDoctype(bool v) { setBit(InDoctype, v); } |
| bool inTextArea() const { return testBit(InTextArea); } |
| void setInTextArea(bool v) { setBit(InTextArea, v); } |
| bool escaped() const { return testBit(Escaped); } |
| void setEscaped(bool v) { setBit(Escaped, v); } |
| bool inServer() const { return testBit(InServer); } |
| void setInServer(bool v) { setBit(InServer, v); } |
| bool skipLF() const { return testBit(SkipLF); } |
| void setSkipLF(bool v) { setBit(SkipLF, v); } |
| bool startTag() const { return testBit(StartTag); } |
| void setStartTag(bool v) { setBit(StartTag, v); } |
| bool discardLF() const { return testBit(DiscardLF); } |
| void setDiscardLF(bool v) { setBit(DiscardLF, v); } |
| bool allowYield() const { return testBit(AllowYield); } |
| void setAllowYield(bool v) { setBit(AllowYield, v); } |
| bool loadingExtScript() const { return testBit(LoadingExtScript); } |
| void setLoadingExtScript(bool v) { setBit(LoadingExtScript, v); } |
| bool forceSynchronous() const { return testBit(ForceSynchronous); } |
| void setForceSynchronous(bool v) { setBit(ForceSynchronous, v); } |
| |
| bool inAnyNonHTMLText() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame); } |
| bool hasTagState() const { return m_bits & TagMask; } |
| bool hasEntityState() const { return m_bits & EntityMask; } |
| |
| bool needsSpecialWriteHandling() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame | TagMask | EntityMask | InPlainText | InComment | InDoctype | InServer | InProcessingInstruction | StartTag); } |
| |
| private: |
| static const int EntityShift = 4; |
| enum StateBits { |
| TagMask = (1 << 4) - 1, |
| EntityMask = (1 << 7) - (1 << 4), |
| InScript = 1 << 7, |
| InStyle = 1 << 8, |
| // Bit 9 unused |
| InXmp = 1 << 10, |
| InTitle = 1 << 11, |
| InPlainText = 1 << 12, |
| InProcessingInstruction = 1 << 13, |
| InComment = 1 << 14, |
| InTextArea = 1 << 15, |
| Escaped = 1 << 16, |
| InServer = 1 << 17, |
| SkipLF = 1 << 18, |
| StartTag = 1 << 19, |
| DiscardLF = 1 << 20, // FIXME: should clarify difference between skip and discard |
| AllowYield = 1 << 21, |
| LoadingExtScript = 1 << 22, |
| ForceSynchronous = 1 << 23, |
| InIFrame = 1 << 24, |
| InDoctype = 1 << 25 |
| }; |
| |
| void setBit(StateBits bit, bool value) |
| { |
| if (value) |
| m_bits |= bit; |
| else |
| m_bits &= ~bit; |
| } |
| bool testBit(StateBits bit) const { return m_bits & bit; } |
| |
| unsigned m_bits; |
| }; |
| |
| State m_state; |
| |
| DoctypeToken m_doctypeToken; |
| int m_doctypeSearchCount; |
| int m_doctypeSecondarySearchCount; |
| |
| bool m_brokenServer; |
| |
| // Name of an attribute that we just scanned. |
| AtomicString m_attrName; |
| |
| // Used to store the code of a scripting sequence |
| UChar* m_scriptCode; |
| // Size of the script sequenze stored in @ref #scriptCode |
| int m_scriptCodeSize; |
| // Maximal size that can be stored in @ref #scriptCode |
| int m_scriptCodeCapacity; |
| // resync point of script code size |
| int m_scriptCodeResync; |
| |
| // Stores characters if we are scanning for a string like "</script>" |
| UChar searchBuffer[10]; |
| |
| // Counts where we are in the string we are scanning for |
| int searchCount; |
| // the stopper string |
| const char* m_searchStopper; |
| int m_searchStopperLength; |
| |
| // if no more data is coming, just parse what we have (including ext scripts that |
| // may be still downloading) and finish |
| bool m_noMoreData; |
| // URL to get source code of script from |
| String m_scriptTagSrcAttrValue; |
| String m_scriptTagCharsetAttrValue; |
| // the HTML code we will parse after the external script we are waiting for has loaded |
| SegmentedString m_pendingSrc; |
| |
| // the HTML code we will parse after this particular script has |
| // loaded, but before all pending HTML |
| SegmentedString* m_currentPrependingSrc; |
| |
| // true if we are executing a script while parsing a document. This causes the parsing of |
| // the output of the script to be postponed until after the script has finished executing |
| int m_executingScript; |
| Deque<CachedResourceHandle<CachedScript> > m_pendingScripts; |
| RefPtr<HTMLScriptElement> m_scriptNode; |
| |
| bool m_requestingScript; |
| bool m_hasScriptsWaitingForStylesheets; |
| |
| // if we found one broken comment, there are most likely others as well |
| // store a flag to get rid of the O(n^2) behaviour in such a case. |
| bool m_brokenComments; |
| // current line number |
| int m_lineNumber; |
| int m_currentScriptTagStartLineNumber; |
| int m_currentTagStartLineNumber; |
| |
| double m_tokenizerTimeDelay; |
| int m_tokenizerChunkSize; |
| |
| // The timer for continued processing. |
| Timer<LegacyHTMLDocumentParser> m_timer; |
| |
| // The timer for continued executing external scripts. |
| Timer<LegacyHTMLDocumentParser> m_externalScriptsTimer; |
| |
| // This buffer can hold arbitrarily long user-defined attribute names, such as in EMBED tags. |
| // So any fixed number might be too small, but rather than rewriting all usage of this buffer |
| // we'll just make it large enough to handle all imaginable cases. |
| #define CBUFLEN 1024 |
| UChar m_cBuffer[CBUFLEN + 2]; |
| unsigned int m_cBufferPos; |
| |
| SegmentedString m_src; |
| OwnPtr<LegacyHTMLTreeBuilder> m_treeBuilder; |
| bool m_inWrite; |
| bool m_fragment; |
| FragmentScriptingPermission m_scriptingPermission; |
| |
| OwnPtr<LegacyPreloadScanner> m_preloadScanner; |
| }; |
| |
| UChar decodeNamedEntity(const char*); |
| |
| } // namespace WebCore |
| |
| #endif // LegacyHTMLDocumentParser_h |