/*
    This file is part of the KDE libraries

    Copyright (C) 1997 Martin Jones (mjones@kde.org)
              (C) 1997 Torben Weis (weis@kde.org)
              (C) 1998 Waldo Bastian (bastian@kde.org)
              (C) 2001 Dirk Mueller (mueller@kde.org)
    Copyright (C) 2003, 2004, 2005, 2006 Apple Computer, Inc.

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public License
    along with this library; see the file COPYING.LIB.  If not, write to
    the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
    Boston, MA 02111-1307, USA.
*/
kociendabb0c24b2001-08-24 14:24:40 +000025
26#ifndef HTMLTOKENIZER_H
27#define HTMLTOKENIZER_H
28
darinb53ebdc2006-07-09 15:10:21 +000029#include "DeprecatedPtrQueue.h"
eseidel40eb1b92006-03-25 22:20:36 +000030#include "NamedMappedAttrMap.h"
darind03140b2006-01-19 08:59:31 +000031#include "SegmentedString.h"
darina52f4e12006-02-02 02:51:03 +000032#include "Timer.h"
darinb53ebdc2006-07-09 15:10:21 +000033#include "XMLTokenizer.h"
darine775cf72006-07-09 22:48:56 +000034#include "CachedResourceClient.h"
kociendabb0c24b2001-08-24 14:24:40 +000035
darind03140b2006-01-19 08:59:31 +000036namespace WebCore {
hyatt3b4f6d42004-02-07 01:19:44 +000037
// Forward declarations for types this header only uses by pointer,
// reference, or as a template argument.
class CachedScript;
class DocumentFragment;
class Document;
class HTMLDocument;
class HTMLViewSourceDocument;
class FrameView;
class HTMLParser;
class Node;
kociendabb0c24b2001-08-24 14:24:40 +000046
hyatt3b4f6d42004-02-07 01:19:44 +000047/**
48 * @internal
49 * represents one HTML tag. Consists of a numerical id, and the list
50 * of attributes. Can also represent text. In this case the id = 0 and
51 * text contains the text.
52 */
53class Token
54{
55public:
darin2a4c3742005-12-27 18:26:16 +000056 Token() : beginTag(true), flat(false) { }
hyatt59136b72005-07-09 20:19:28 +000057
darinb9481ed2006-03-20 02:57:59 +000058 void addAttribute(Document*, const AtomicString& attrName, const AtomicString& v);
hyatt59136b72005-07-09 20:19:28 +000059
darind03140b2006-01-19 08:59:31 +000060 bool isOpenTag(const QualifiedName& fullName) const { return beginTag && fullName.localName() == tagName; }
61 bool isCloseTag(const QualifiedName& fullName) const { return !beginTag && fullName.localName() == tagName; }
hyatt59136b72005-07-09 20:19:28 +000062
hyatt3b4f6d42004-02-07 01:19:44 +000063 void reset()
64 {
darin2a4c3742005-12-27 18:26:16 +000065 attrs = 0;
66 text = 0;
darind03140b2006-01-19 08:59:31 +000067 tagName = nullAtom;
hyatt59136b72005-07-09 20:19:28 +000068 beginTag = true;
hyatt3b4f6d42004-02-07 01:19:44 +000069 flat = false;
70 }
hyatt59136b72005-07-09 20:19:28 +000071
darinb9481ed2006-03-20 02:57:59 +000072 RefPtr<NamedMappedAttrMap> attrs;
73 RefPtr<StringImpl> text;
darind03140b2006-01-19 08:59:31 +000074 AtomicString tagName;
darin2a4c3742005-12-27 18:26:16 +000075 bool beginTag;
76 bool flat;
kociendabb0c24b2001-08-24 14:24:40 +000077};
78
//-----------------------------------------------------------------------------
80
darine775cf72006-07-09 22:48:56 +000081class HTMLTokenizer : public Tokenizer, public CachedResourceClient
kociendabb0c24b2001-08-24 14:24:40 +000082{
kociendabb0c24b2001-08-24 14:24:40 +000083public:
hyatt3ad24072006-06-26 23:53:02 +000084 HTMLTokenizer(HTMLDocument*);
85 HTMLTokenizer(HTMLViewSourceDocument*);
darinb9481ed2006-03-20 02:57:59 +000086 HTMLTokenizer(DocumentFragment*);
kociendabb0c24b2001-08-24 14:24:40 +000087 virtual ~HTMLTokenizer();
88
darin7ab31092006-05-10 04:59:57 +000089 virtual bool write(const SegmentedString&, bool appendData);
darina3cce732004-07-22 20:50:10 +000090 virtual void finish();
jensd7ffc9e2005-02-17 19:53:50 +000091 virtual void setForceSynchronous(bool force);
darined60ff22004-11-12 22:04:26 +000092 virtual bool isWaitingForScripts() const;
ggaren33e65442005-10-22 01:41:36 +000093 virtual void stopParsing();
hyatt9c4ba9b2004-11-10 03:47:56 +000094 virtual bool processingData() const;
eseidel25c8c222006-03-21 06:46:47 +000095 virtual int executingScript() const { return m_executingScript; }
kociendabb0c24b2001-08-24 14:24:40 +000096
darina52f4e12006-02-02 02:51:03 +000097private:
mjsb1c8f662005-10-18 03:15:31 +000098 class State;
99
100 // Where we are in parsing a tag
darina3cce732004-07-22 20:50:10 +0000101 void begin();
102 void end();
103
kociendabb0c24b2001-08-24 14:24:40 +0000104 void reset();
darinb9481ed2006-03-20 02:57:59 +0000105 PassRefPtr<Node> processToken();
kociendabb0c24b2001-08-24 14:24:40 +0000106
hyattd2c53f22006-01-15 07:12:43 +0000107 State processListing(SegmentedString, State);
108 State parseComment(SegmentedString&, State);
109 State parseServer(SegmentedString&, State);
110 State parseText(SegmentedString&, State);
111 State parseSpecial(SegmentedString&, State);
112 State parseTag(SegmentedString&, State);
darin7ab31092006-05-10 04:59:57 +0000113 State parseEntity(SegmentedString&, UChar*& dest, State, unsigned& _cBufferPos, bool start, bool parsingTag);
hyattd2c53f22006-01-15 07:12:43 +0000114 State parseProcessingInstruction(SegmentedString&, State);
mjsb1c8f662005-10-18 03:15:31 +0000115 State scriptHandler(State);
darinb9481ed2006-03-20 02:57:59 +0000116 State scriptExecution(const DeprecatedString& script, State state, DeprecatedString scriptURL = DeprecatedString(), int baseLine = 0);
darin7ab31092006-05-10 04:59:57 +0000117 void setSrc(const SegmentedString&);
kociendabb0c24b2001-08-24 14:24:40 +0000118
119 // check if we have enough space in the buffer.
120 // if not enlarge it
121 inline void checkBuffer(int len = 10)
122 {
123 if ( (dest - buffer) > size-len )
124 enlargeBuffer(len);
125 }
126 inline void checkScriptBuffer(int len = 10)
127 {
128 if ( scriptCodeSize + len >= scriptCodeMaxSize )
129 enlargeScriptBuffer(len);
130 }
131
132 void enlargeBuffer(int len);
133 void enlargeScriptBuffer(int len);
134
darin7ab31092006-05-10 04:59:57 +0000135 bool continueProcessing(int& processedCount, double startTime, State&);
darina52f4e12006-02-02 02:51:03 +0000136 void timerFired(Timer<HTMLTokenizer>*);
hyatt9c4ba9b2004-11-10 03:47:56 +0000137 void allDataProcessed();
138
darine775cf72006-07-09 22:48:56 +0000139 // from CachedResourceClient
140 void notifyFinished(CachedResource *finishedObj);
mjs9d0f55f2003-11-17 23:27:42 +0000141
kociendabb0c24b2001-08-24 14:24:40 +0000142 // Internal buffers
143 ///////////////////
darin7ab31092006-05-10 04:59:57 +0000144 UChar* buffer;
145 UChar* dest;
kociendabb0c24b2001-08-24 14:24:40 +0000146
hyatt3b4f6d42004-02-07 01:19:44 +0000147 Token currToken;
kociendabb0c24b2001-08-24 14:24:40 +0000148
149 // the size of buffer
150 int size;
151
152 // Tokenizer flags
153 //////////////////
154 // are we in quotes within a html tag
darin7ab31092006-05-10 04:59:57 +0000155 enum { NoQuote, SingleQuote, DoubleQuote } tquote;
kociendabb0c24b2001-08-24 14:24:40 +0000156
kociendabb0c24b2001-08-24 14:24:40 +0000157 // Are we in a &... character entity description?
mjsb1c8f662005-10-18 03:15:31 +0000158 enum EntityState {
kociendabb0c24b2001-08-24 14:24:40 +0000159 NoEntity = 0,
mjsb1c8f662005-10-18 03:15:31 +0000160 SearchEntity = 1,
161 NumericSearch = 2,
162 Hexadecimal = 3,
163 Decimal = 4,
164 EntityName = 5,
165 SearchSemicolon = 6
166 };
darin45265222003-05-07 16:01:49 +0000167 unsigned EntityUnicodeValue;
kociendabb0c24b2001-08-24 14:24:40 +0000168
mjsb1c8f662005-10-18 03:15:31 +0000169 enum TagState {
170 NoTag = 0,
171 TagName = 1,
172 SearchAttribute = 2,
173 AttributeName = 3,
174 SearchEqual = 4,
175 SearchValue = 5,
176 QuotedValue = 6,
177 Value = 7,
178 SearchEnd = 8
179 };
kociendabb0c24b2001-08-24 14:24:40 +0000180
mjsb1c8f662005-10-18 03:15:31 +0000181 class State {
182 public:
183 State() : m_bits(0) {}
kociendabb0c24b2001-08-24 14:24:40 +0000184
mjsb1c8f662005-10-18 03:15:31 +0000185 TagState tagState() const { return static_cast<TagState>(m_bits & TagMask); }
186 void setTagState(TagState t) { m_bits = (m_bits & ~TagMask) | t; }
187 EntityState entityState() const { return static_cast<EntityState>((m_bits & EntityMask) >> EntityShift); }
188 void setEntityState(EntityState e) { m_bits = (m_bits & ~EntityMask) | (e << EntityShift); }
kociendabb0c24b2001-08-24 14:24:40 +0000189
mjsb1c8f662005-10-18 03:15:31 +0000190 bool inScript() const { return testBit(InScript); }
191 void setInScript(bool v) { setBit(InScript, v); }
192 bool inStyle() const { return testBit(InStyle); }
193 void setInStyle(bool v) { setBit(InStyle, v); }
194 bool inSelect() const { return testBit(InSelect); }
195 void setInSelect(bool v) { setBit(InSelect, v); }
196 bool inXmp() const { return testBit(InXmp); }
197 void setInXmp(bool v) { setBit(InXmp, v); }
198 bool inTitle() const { return testBit(InTitle); }
199 void setInTitle(bool v) { setBit(InTitle, v); }
200 bool inPlainText() const { return testBit(InPlainText); }
201 void setInPlainText(bool v) { setBit(InPlainText, v); }
202 bool inProcessingInstruction() const { return testBit(InProcessingInstruction); }
203 void setInProcessingInstruction(bool v) { return setBit(InProcessingInstruction, v); }
204 bool inComment() const { return testBit(InComment); }
205 void setInComment(bool v) { setBit(InComment, v); }
206 bool inTextArea() const { return testBit(InTextArea); }
207 void setInTextArea(bool v) { setBit(InTextArea, v); }
208 bool escaped() const { return testBit(Escaped); }
209 void setEscaped(bool v) { setBit(Escaped, v); }
210 bool inServer() const { return testBit(InServer); }
211 void setInServer(bool v) { setBit(InServer, v); }
212 bool skipLF() const { return testBit(SkipLF); }
213 void setSkipLF(bool v) { setBit(SkipLF, v); }
214 bool startTag() const { return testBit(StartTag); }
215 void setStartTag(bool v) { setBit(StartTag, v); }
216 bool discardLF() const { return testBit(DiscardLF); }
217 void setDiscardLF(bool v) { setBit(DiscardLF, v); }
218 bool allowYield() const { return testBit(AllowYield); }
219 void setAllowYield(bool v) { setBit(AllowYield, v); }
220 bool loadingExtScript() const { return testBit(LoadingExtScript); }
221 void setLoadingExtScript(bool v) { setBit(LoadingExtScript, v); }
222 bool forceSynchronous() const { return testBit(ForceSynchronous); }
223 void setForceSynchronous(bool v) { setBit(ForceSynchronous, v); }
kociendabb0c24b2001-08-24 14:24:40 +0000224
mjsb1c8f662005-10-18 03:15:31 +0000225 bool inAnySpecial() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle); }
226 bool hasTagState() const { return m_bits & TagMask; }
227 bool hasEntityState() const { return m_bits & EntityMask; }
darinf028f812002-06-10 20:08:04 +0000228
mjsb1c8f662005-10-18 03:15:31 +0000229 bool needsSpecialWriteHandling() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | TagMask | EntityMask | InPlainText | InComment | InServer | InProcessingInstruction | StartTag); }
kociendabb0c24b2001-08-24 14:24:40 +0000230
mjsb1c8f662005-10-18 03:15:31 +0000231 private:
232 static const int EntityShift = 4;
233 enum StateBits {
234 TagMask = (1 << 4) - 1,
235 EntityMask = (1 << 7) - (1 << 4),
236 InScript = 1 << 7,
237 InStyle = 1 << 8,
238 InSelect = 1 << 9,
239 InXmp = 1 << 10,
240 InTitle = 1 << 11,
241 InPlainText = 1 << 12,
242 InProcessingInstruction = 1 << 13,
243 InComment = 1 << 14,
244 InTextArea = 1 << 15,
245 Escaped = 1 << 16,
246 InServer = 1 << 17,
247 SkipLF = 1 << 18,
248 StartTag = 1 << 19,
249 DiscardLF = 1 << 20, // FIXME: should clarify difference between skip and discard
250 AllowYield = 1 << 21,
251 LoadingExtScript = 1 << 22,
weinigab5f09e2006-07-29 23:15:25 +0000252 ForceSynchronous = 1 << 23
mjsb1c8f662005-10-18 03:15:31 +0000253 };
254
255 void setBit(StateBits bit, bool value)
256 {
257 if (value)
258 m_bits |= bit;
259 else
260 m_bits &= ~bit;
261 }
262 bool testBit(StateBits bit) const { return m_bits & bit; }
kociendabb0c24b2001-08-24 14:24:40 +0000263
mjsb1c8f662005-10-18 03:15:31 +0000264 unsigned m_bits;
265 };
kociendabb0c24b2001-08-24 14:24:40 +0000266
mjsb1c8f662005-10-18 03:15:31 +0000267 State m_state;
mjs6f821c82002-03-22 00:31:57 +0000268
269 bool brokenServer;
270
hyatt2773dff2005-07-18 21:44:31 +0000271 // Name of an attribute that we just scanned.
darind03140b2006-01-19 08:59:31 +0000272 AtomicString attrName;
hyattbaa79d02002-12-16 01:51:51 +0000273
kociendabb0c24b2001-08-24 14:24:40 +0000274 // Used to store the code of a srcipting sequence
darin7ab31092006-05-10 04:59:57 +0000275 UChar* scriptCode;
kociendabb0c24b2001-08-24 14:24:40 +0000276 // Size of the script sequenze stored in @ref #scriptCode
277 int scriptCodeSize;
278 // Maximal size that can be stored in @ref #scriptCode
279 int scriptCodeMaxSize;
gramps0aed4d62001-09-19 15:53:27 +0000280 // resync point of script code size
281 int scriptCodeResync;
mjs6f821c82002-03-22 00:31:57 +0000282
kociendabb0c24b2001-08-24 14:24:40 +0000283 // Stores characters if we are scanning for a string like "</script>"
darin7ab31092006-05-10 04:59:57 +0000284 UChar searchBuffer[10];
kociendabb0c24b2001-08-24 14:24:40 +0000285 // Counts where we are in the string we are scanning for
286 int searchCount;
287 // The string we are searching for
darin7ab31092006-05-10 04:59:57 +0000288 const UChar* searchFor;
gramps0aed4d62001-09-19 15:53:27 +0000289 // the stopper string
290 const char* searchStopper;
291 // the stopper len
292 int searchStopperLen;
kociendabb0c24b2001-08-24 14:24:40 +0000293 // if no more data is coming, just parse what we have (including ext scripts that
294 // may be still downloading) and finish
295 bool noMoreData;
296 // URL to get source code of script from
darinb9481ed2006-03-20 02:57:59 +0000297 DeprecatedString scriptSrc;
darinb3547a32006-09-06 04:40:44 +0000298 String scriptSrcCharset;
kociendabb0c24b2001-08-24 14:24:40 +0000299 bool javascript;
300 // the HTML code we will parse after the external script we are waiting for has loaded
hyattd2c53f22006-01-15 07:12:43 +0000301 SegmentedString pendingSrc;
mjs4ed3d112004-07-20 20:45:22 +0000302
303 // the HTML code we will parse after this particular script has
304 // loaded, but before all pending HTML
hyattd2c53f22006-01-15 07:12:43 +0000305 SegmentedString *currentPrependingSrc;
mjs4ed3d112004-07-20 20:45:22 +0000306
kociendabb0c24b2001-08-24 14:24:40 +0000307 // true if we are executing a script while parsing a document. This causes the parsing of
308 // the output of the script to be postponed until after the script has finished executing
309 int m_executingScript;
darinb9481ed2006-03-20 02:57:59 +0000310 DeprecatedPtrQueue<CachedScript> pendingScripts;
311 RefPtr<Node> scriptNode;
kociendabb0c24b2001-08-24 14:24:40 +0000312
ddkilzer24652a92006-07-02 11:39:43 +0000313 bool m_requestingScript;
314
kociendabb0c24b2001-08-24 14:24:40 +0000315 // if we found one broken comment, there are most likely others as well
316 // store a flag to get rid of the O(n^2) behaviour in such a case.
317 bool brokenComments;
mjs6f821c82002-03-22 00:31:57 +0000318 // current line number
319 int lineno;
320 // line number at which the current <script> started
321 int scriptStartLineno;
322 int tagStartLineno;
kociendabb0c24b2001-08-24 14:24:40 +0000323
hyatt9c4ba9b2004-11-10 03:47:56 +0000324 // The timer for continued processing.
darina52f4e12006-02-02 02:51:03 +0000325 Timer<HTMLTokenizer> m_timer;
hyatt9c4ba9b2004-11-10 03:47:56 +0000326
sullivan6062ecb2003-07-24 22:43:50 +0000327// This buffer can hold arbitrarily long user-defined attribute names, such as in EMBED tags.
328// So any fixed number might be too small, but rather than rewriting all usage of this buffer
329// we'll just make it large enough to handle all imaginable cases.
330#define CBUFLEN 1024
kociendabb0c24b2001-08-24 14:24:40 +0000331 char cBuffer[CBUFLEN+2];
mjsb1c8f662005-10-18 03:15:31 +0000332 unsigned int m_cBufferPos;
kdecker2e74e952005-03-15 21:44:32 +0000333
hyattd2c53f22006-01-15 07:12:43 +0000334 SegmentedString src;
darinb9481ed2006-03-20 02:57:59 +0000335 Document* m_doc;
darin644b75e2006-02-21 06:59:15 +0000336 HTMLParser* parser;
darin895eae12003-01-12 17:01:13 +0000337 bool inWrite;
darine9700da2006-03-06 23:09:48 +0000338 bool m_fragment;
kociendabb0c24b2001-08-24 14:24:40 +0000339};
darinb95d6c42002-06-04 00:19:07 +0000340
darinb9481ed2006-03-20 02:57:59 +0000341void parseHTMLDocumentFragment(const String&, DocumentFragment*);
darin644b75e2006-02-21 06:59:15 +0000342
darin7ab31092006-05-10 04:59:57 +0000343UChar decodeNamedEntity(const char*);
eseidel363bc0d2005-10-27 06:03:33 +0000344
hyatt3b4f6d42004-02-07 01:19:44 +0000345}
346
darin644b75e2006-02-21 06:59:15 +0000347#endif