// blob: fd18595df2ee772db15764b82325527fb2de98dc [file] [log] [blame]
/*
This file is part of the KDE libraries
Copyright (C) 1997 Martin Jones (mjones@kde.org)
(C) 1997 Torben Weis (weis@kde.org)
(C) 1998 Waldo Bastian (bastian@kde.org)
(C) 1999 Lars Knoll (knoll@kde.org)
(C) 1999 Antti Koivisto (koivisto@kde.org)
(C) 2001 Dirk Mueller (mueller@kde.org)
Copyright (C) 2004, 2005, 2006 Apple Computer, Inc.
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public License
along with this library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.
*/
#include "config.h"
#include "htmltokenizer.h"
#include "CachedScript.h"
#include "DocLoader.h"
#include "DocumentFragmentImpl.h"
#include "EventNames.h"
#include "Frame.h"
#include "FrameView.h"
#include "HTMLElementImpl.h"
#include "SystemTime.h"
#include "csshelper.h"
#include "html_documentimpl.h"
#include "htmlnames.h"
#include "htmlparser.h"
#include "kjs_proxy.h"
#include <ctype.h>
#include <stdlib.h>
#include "kentities.c"
// Uncomment to compile in the printf() timing traces guarded by
// "#if INSTRUMENT_LAYOUT_SCHEDULING" in scriptHandler() and scriptExecution().
// #define INSTRUMENT_LAYOUT_SCHEDULING 1
// NOTE(review): no use of TOKENIZER_CHUNK_SIZE is visible in this part of the file;
// presumably the number of characters processed between yield checks -- confirm.
#define TOKENIZER_CHUNK_SIZE 4096
namespace WebCore {
using namespace HTMLNames;
using namespace EventNames;
// FIXME: We would like this constant to be 200ms.
// Yielding more aggressively results in increased responsiveness and better incremental rendering.
// It slows down overall page-load on slower machines, though, so for now we set a value of 500.
// The value is expressed in seconds (0.500 is the 500ms mentioned above).
const double tokenizerTimeDelay = 0.500;
// The "<!--" opener; parseTag() compares incoming characters against it one at a
// time, using searchCount as the index of the next expected character.
static const char commentStart [] = "<!--";
// Closing-tag prefixes (without the trailing '>') for the elements whose content
// is collected verbatim by parseSpecial().
// NOTE(review): presumably these are installed as searchStopper by the code that
// enters those elements; that assignment is not visible in this part of the file.
static const char scriptEnd [] = "</script";
static const char xmpEnd [] = "</xmp";
static const char styleEnd [] = "</style";
static const char textareaEnd [] = "</textarea";
static const char titleEnd [] = "</title";
// Allocate / release an uninitialized array of N QChars through fastMalloc()/fastFree().
// No QChar constructors or destructors are run by these macros.
#define KHTML_ALLOC_QCHAR_VEC( N ) (QChar*) fastMalloc( sizeof(QChar)*( N ) )
#define KHTML_DELETE_QCHAR_VEC( P ) fastFree((char*)( P ))
// Full support for MS Windows extensions to Latin-1.
// Technically these extensions should only be activated for pages
// marked "windows-1252" or "cp1252", but
// in the standard Microsoft way, these extensions infect hundreds of thousands
// of web pages. Note that people with non-latin-1 Microsoft extensions
// are SOL.
//
// See: http://www.microsoft.com/globaldev/reference/WinCP.asp
// http://www.bbsinc.com/iso8859.html
// http://www.obviously.com/
//
// There may be better equivalents
// We need this for entities at least. For non-entity text, we could
// handle this in the text encoding.
// To cover non-entity text, I think this function would need to be called
// in more places. There seem to be some places that don't call fixUpChar.
// Replacement code points for the 32 values 0x80..0x9F, indexed by (code - 0x80).
// Slots that hold their own index value (0x81, 0x8D, 0x8F, 0x90, 0x9D) are left unchanged.
static const unsigned short windowsLatin1ExtensionArray[32] = {
    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178  // 98-9F
};

// Maps a character in the range 0x80..0x9F to the entry of
// windowsLatin1ExtensionArray for that value; every other character is
// returned untouched.
static inline QChar fixUpChar(QChar c)
{
    const unsigned short value = c.unicode();
    const bool inExtensionRange = value >= 0x0080 && value <= 0x009F;
    if (inExtensionRange)
        return windowsLatin1ExtensionArray[value - 0x0080];
    return c;
}
// Returns true when the first 'length' characters of s2 equal s1, where each
// character of s1 is accepted either as written or in its ASCII upper-case form.
//
// s1     - the pattern (callers in this file pass searchStopper); must provide at
//          least 'length' characters.
// s2     - the text to test; must provide at least 'length' QChars.
// length - number of characters to compare.
inline bool tagMatch(const char *s1, const QChar *s2, uint length)
{
    for (uint i = 0; i != length; ++i) {
        char c1 = s1[i];
        // Fold case by hand instead of calling toupper(): toupper() follows the
        // current C locale (in which 'i' need not map to 'I') and must not be
        // handed a negative char. Only 'a'..'z' are folded, mirroring the manual
        // lower-casing done elsewhere in this file.
        char uc1 = (c1 >= 'a' && c1 <= 'z') ? static_cast<char>(c1 - ('a' - 'A')) : c1;
        QChar c2 = s2[i];
        if (c1 != c2 && uc1 != c2)
            return false;
    }
    return true;
}
// Appends the attribute (attrName, v) to this token.
// Nothing is added when attrName is empty or is the lone "/" left over from
// markup such as <br />. The attribute map is created on first use.
// The doc parameter is not used by this implementation.
void Token::addAttribute(DocumentImpl* doc, const AtomicString& attrName, const AtomicString& v)
{
    if (attrName.isEmpty() || attrName == "/")
        return;

    AttributeImpl* attribute = new MappedAttributeImpl(attrName, v);
    if (!attrs)
        attrs = new NamedMappedAttrMapImpl(0);
    attrs->insertAttribute(attribute);
}
// ----------------------------------------------------------------------------
// Creates a tokenizer for a whole document.
// All buffers start out unallocated and m_fragment is false; the constructor then
// creates an HTMLParser for doc and calls begin() to set up the initial state.
HTMLTokenizer::HTMLTokenizer(DocumentImpl* doc)
: buffer(0)
, scriptCode(0)
, scriptCodeSize(0)
, scriptCodeMaxSize(0)
, scriptCodeResync(0)
, m_executingScript(0)
, m_timer(this, &HTMLTokenizer::timerFired)
, m_doc(doc)
, inWrite(false)
, m_fragment(false)
{
parser = new HTMLParser(doc);
// begin() calls reset() and allocates the token text buffer.
begin();
}
// Creates a tokenizer that parses into a document fragment.
// Identical to the document constructor except that m_doc is taken from
// frag->getDocument(), the HTMLParser is created for the fragment, and
// m_fragment is true (scriptHandler() and scriptExecution() check this flag and
// do not execute scripts when it is set).
HTMLTokenizer::HTMLTokenizer(DocumentFragmentImpl* frag)
: buffer(0)
, scriptCode(0)
, scriptCodeSize(0)
, scriptCodeMaxSize(0)
, scriptCodeResync(0)
, m_executingScript(0)
, m_timer(this, &HTMLTokenizer::timerFired)
, m_doc(frag->getDocument())
, inWrite(false)
, m_fragment(true)
{
parser = new HTMLParser(frag);
// begin() calls reset() and allocates the token text buffer.
begin();
}
// Returns the tokenizer to an empty state: releases every queued external
// script, frees both character buffers, stops the timer, clears the yield and
// force-synchronous flags and resets the current token.
// Must not be called while a script is executing (asserted).
void HTMLTokenizer::reset()
{
    ASSERT(m_executingScript == 0);

    // Give up our reference on each external script that is still queued.
    while (!pendingScripts.isEmpty()) {
        CachedScript *cs = pendingScripts.dequeue();
        ASSERT(cs->accessCount() > 0);
        cs->deref(this);
    }

    // Token text buffer.
    if (buffer) {
        KHTML_DELETE_QCHAR_VEC(buffer);
    }
    dest = 0;
    buffer = 0;
    size = 0;

    // Script / special-content buffer.
    if (scriptCode) {
        KHTML_DELETE_QCHAR_VEC(scriptCode);
    }
    scriptCode = 0;
    scriptCodeResync = 0;
    scriptCodeMaxSize = 0;
    scriptCodeSize = 0;

    m_timer.stop();
    m_state.setAllowYield(false);
    m_state.setForceSynchronous(false);

    currToken.reset();
}
// Prepares the tokenizer for a new input stream: clears all state through
// reset(), allocates a fresh token text buffer and re-initializes the parsing
// flags, pending sources and line counters.
void HTMLTokenizer::begin()
{
    // reset() asserts that no script is executing, so zero the counter first.
    m_executingScript = 0;
    m_state.setLoadingExtScript(false);
    reset();

    // 255 QChars are allocated while size is recorded as 254.
    buffer = KHTML_ALLOC_QCHAR_VEC( 255 );
    dest = buffer;
    size = 254;

    // Tag / entity scanning state.
    tquote = NoQuote;
    searchCount = 0;
    m_state.setEntityState(NoEntity);

    // Script bookkeeping.
    scriptSrc = QString::null;
    pendingSrc.clear();
    currentPrependingSrc = 0;

    // Input quirks and end-of-data flags.
    noMoreData = false;
    brokenComments = false;
    brokenServer = false;

    // Line tracking.
    lineno = 0;
    scriptStartLineno = 0;
    tagStartLineno = 0;

    m_state.setForceSynchronous(false);
}
// Stores 'force' as the force-synchronous flag of the tokenizer state.
// Note that reset() and begin() both clear this flag again.
void HTMLTokenizer::setForceSynchronous(bool force)
{
m_state.setForceSynchronous(force);
}
// Copies the text in 'list' into the token text buffer (dest) as preformatted
// text. Both '\r' and '\n' are written as '\n', and the '\n' of a "\r\n" pair is
// dropped. When state.discardLF() is set, the first line break is swallowed
// instead of being written.
// Returns the updated state (skipLF / discardLF flags).
HTMLTokenizer::State HTMLTokenizer::processListing(SegmentedString list, State state)
{
    for (; !list.isEmpty(); ++list) {
        const bool isLineFeed = (*list == '\n');

        // A '\n' directly after a '\r' belongs to the same line break: skip it.
        if (state.skipLF()) {
            state.setSkipLF(false);
            if (isLineFeed)
                continue;
        }

        checkBuffer();

        const bool isCarriageReturn = (*list == '\r');
        if (!isLineFeed && !isCarriageReturn) {
            // Ordinary character: copy it and stop discarding line breaks.
            state.setDiscardLF(false);
            *dest++ = *list;
            continue;
        }

        if (state.discardLF())
            state.setDiscardLF(false); // this is the one line break we discard
        else
            *dest++ = '\n';

        // Remember a '\r' so that a following '\n' (MS-DOS CRLF) is skipped.
        if (isCarriageReturn)
            state.setSkipLF(true);
    }
    return state;
}
// Collects the raw content of exactly one of <script>, <style>, <textarea>,
// <title> or <xmp> (see the third ASSERT) into scriptCode until the closing tag
// held in searchStopper has been seen, then emits the content and the closing
// tag as tokens (or hands a script over to scriptHandler()).
//
// src   - input; consumed as far as parsing gets.
// state - current tokenizer state; the updated copy is returned.
//
// The function returns either when the closing tag has been completely
// processed or when src runs dry; in the latter case the partial content stays
// in scriptCode for the next call.
HTMLTokenizer::State HTMLTokenizer::parseSpecial(SegmentedString &src, State state)
{
ASSERT(state.inTextArea() || state.inTitle() || !state.hasEntityState());
ASSERT(!state.hasTagState());
ASSERT(state.inXmp() + state.inTextArea() + state.inTitle() + state.inStyle() + state.inScript() == 1 );
// Remember the line on which the script text starts; it is passed to
// scriptExecution() as the base line.
if (state.inScript())
scriptStartLineno = lineno + src.lineCount();
// Resume a comment that was still open when the previous chunk ended.
if (state.inComment())
state = parseComment(src, state);
while ( !src.isEmpty() ) {
checkScriptBuffer();
unsigned char ch = src->latin1();
// The collected text ends in "<!-" and the next character is '-': a comment
// starts here (not recognized inside textarea, xmp or title, nor once the
// closing tag has been matched or brokenComments is set).
if (!scriptCodeResync && !brokenComments && !state.inTextArea() && !state.inXmp() && !state.inTitle() && ch == '-' && scriptCodeSize >= 3 && !src.escaped() && scriptCode[scriptCodeSize-3] == '<' && scriptCode[scriptCodeSize-2] == '!' && scriptCode[scriptCodeSize-1] == '-') {
state.setInComment(true);
state = parseComment(src, state);
continue;
}
// scriptCodeResync != 0 means the closing tag name was already matched; an
// unquoted '>' now completes the closing tag.
if ( scriptCodeResync && !tquote && ( ch == '>' ) ) {
++src;
// scriptCodeResync was stored as (start of closing tag + 1), so this cuts the
// collected text off right before the closing tag.
scriptCodeSize = scriptCodeResync-1;
scriptCodeResync = 0;
scriptCode[ scriptCodeSize ] = scriptCode[ scriptCodeSize + 1 ] = 0;
if (state.inScript())
state = scriptHandler(state);
else {
// Emit the collected text, then a synthetic end-tag token for the element.
state = processListing(SegmentedString(scriptCode, scriptCodeSize), state);
processToken();
if (state.inStyle()) {
currToken.tagName = styleTag.localName();
currToken.beginTag = false;
} else if (state.inTextArea()) {
currToken.tagName = textareaTag.localName();
currToken.beginTag = false;
} else if (state.inTitle()) {
currToken.tagName = titleTag.localName();
currToken.beginTag = false;
} else if (state.inXmp()) {
currToken.tagName = xmpTag.localName();
currToken.beginTag = false;
}
processToken();
state.setInStyle(false);
state.setInScript(false);
state.setInTextArea(false);
state.setInTitle(false);
state.setInXmp(false);
tquote = NoQuote;
scriptCodeSize = scriptCodeResync = 0;
}
return state;
}
// possible end of tagname, lets check.
// A delimiter ('>', '/', or a non-NUL character <= ' ') directly after text that
// matches searchStopper (ignoring ASCII case) marks the closing tag. Record
// where it starts (+1, so that the value is never 0) and rescan this character.
if ( !scriptCodeResync && !state.escaped() && !src.escaped() && ( ch == '>' || ch == '/' || ch <= ' ' ) && ch &&
scriptCodeSize >= searchStopperLen &&
tagMatch( searchStopper, scriptCode+scriptCodeSize-searchStopperLen, searchStopperLen )) {
scriptCodeResync = scriptCodeSize-searchStopperLen+1;
tquote = NoQuote;
continue;
}
// Inside the closing tag: track quotes so that a quoted '>' does not end the
// tag. A line break cancels an open quote.
if ( scriptCodeResync && !state.escaped() ) {
if(ch == '\"')
tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
else if(ch == '\'')
tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
tquote = NoQuote;
}
// A backslash escapes the next character; a second backslash cancels the escape.
state.setEscaped(!state.escaped() && ch == '\\');
// Entities are decoded only inside <textarea> and <title>; parseEntity() writes
// through scriptCodeDest, so the size is recomputed from it afterwards.
if (!scriptCodeResync && (state.inTextArea() || state.inTitle()) && !src.escaped() && ch == '&') {
QChar *scriptCodeDest = scriptCode+scriptCodeSize;
++src;
state = parseEntity(src, scriptCodeDest, state, m_cBufferPos, true, false);
scriptCodeSize = scriptCodeDest-scriptCode;
}
else {
scriptCode[scriptCodeSize++] = fixUpChar(*src);
++src;
}
}
return state;
}
// Called by parseSpecial() once a complete <script> element has been collected
// in scriptCode. Emits the script text and the </script> end tag as tokens and
// then either
//  - requests the external script named by scriptSrc and suspends tokenizing
//    until it has loaded, or
//  - executes the inline script text immediately.
// Input that is still unparsed at that point is parked in pendingSrc (or in the
// enclosing prepending source) so that text written by the script is parsed first.
//
// state - current tokenizer state; the updated copy is returned.
HTMLTokenizer::State HTMLTokenizer::scriptHandler(State state)
{
// We are inside a <script>
bool doScriptExec = false;
// (Bugzilla 3837) Scripts following a frameset element should not execute or,
// in the case of extern scripts, even load.
bool followingFrameset = (parser->doc()->body() && parser->doc()->body()->hasTagName(framesetTag));
CachedScript* cs = 0;
// don't load external scripts for standalone documents (for now)
if (!scriptSrc.isEmpty() && parser->doc()->frame()) {
// forget what we just got; load from src url instead
if (!parser->skipMode() && !followingFrameset) {
#if INSTRUMENT_LAYOUT_SCHEDULING
if (!parser->doc()->ownerElement())
printf("Requesting script at time %d\n", parser->doc()->elapsedTime());
#endif
// A successfully requested script is queued; on failure (or when loading is
// skipped) scriptNode is cleared instead.
if ( (cs = parser->doc()->docLoader()->requestScript(scriptSrc, scriptSrcCharset) ))
pendingScripts.enqueue(cs);
else
scriptNode = 0;
} else
scriptNode = 0;
scriptSrc=QString::null;
}
else {
#ifdef TOKEN_DEBUG
kdDebug( 6036 ) << "---START SCRIPT---" << endl;
kdDebug( 6036 ) << QString(scriptCode, scriptCodeSize) << endl;
kdDebug( 6036 ) << "---END SCRIPT---" << endl;
#endif
scriptNode = 0;
// Parse scriptCode containing <script> info
doScriptExec = true;
}
// Copy the script text into the token buffer and keep a copy (exScript) for
// execution before processToken() consumes the buffer.
state = processListing(SegmentedString(scriptCode, scriptCodeSize), state);
QString exScript( buffer, dest-buffer );
processToken();
// Emit the matching </script> end tag.
currToken.tagName = scriptTag.localName();
currToken.beginTag = false;
processToken();
// While the script runs, currentPrependingSrc points at a local
// SegmentedString; the previous value is restored before returning.
SegmentedString *savedPrependingSrc = currentPrependingSrc;
SegmentedString prependingSrc;
currentPrependingSrc = &prependingSrc;
if (!parser->skipMode() && !followingFrameset) {
if (cs) {
// External script: park the remaining input and wait for the load.
if (savedPrependingSrc) {
savedPrependingSrc->append(src);
} else {
pendingSrc.prepend(src);
}
setSrc(SegmentedString());
scriptCodeSize = scriptCodeResync = 0;
// the ref() call below may call notifyFinished if the script is already in cache,
// and that mucks with the state directly, so we must write it back to the object.
m_state = state;
cs->ref(this);
state = m_state;
// will be 0 if script was already loaded and ref() executed it
if (!pendingScripts.isEmpty())
state.setLoadingExtScript(true);
}
else if (!m_fragment && doScriptExec && javascript ) {
// Inline script: park the remaining input, then run the script text.
if (!m_executingScript)
pendingSrc.prepend(src);
else
prependingSrc = src;
setSrc(SegmentedString());
scriptCodeSize = scriptCodeResync = 0;
state = scriptExecution(exScript, state, QString::null, scriptStartLineno);
}
}
state.setInScript(false);
scriptCodeSize = scriptCodeResync = 0;
// Outermost script and nothing left to load: resume with the parked input.
if (!m_executingScript && !state.loadingExtScript()) {
src.append(pendingSrc);
pendingSrc.clear();
} else if (!prependingSrc.isEmpty()) {
// restore first so that the write appends in the right place
// (does not hurt to do it again below)
currentPrependingSrc = savedPrependingSrc;
// we need to do this slightly modified bit of one of the write() cases
// because we want to prepend to pendingSrc rather than appending
// if there's no previous prependingSrc
if (state.loadingExtScript()) {
if (currentPrependingSrc) {
currentPrependingSrc->append(prependingSrc);
} else {
pendingSrc.prepend(prependingSrc);
}
} else {
// write() works on m_state, so sync the local copy around the call.
m_state = state;
write(prependingSrc, false);
state = m_state;
}
}
currentPrependingSrc = savedPrependingSrc;
return state;
}
// Executes script source 'str' in the document's frame and then resumes
// tokenizing with whatever input was parked or written while it ran.
//
// str       - script source text.
// state     - current tokenizer state; the updated copy is returned.
// scriptURL - URL reported for the script; when null, the URL of the frame's
//             document is used instead.
// baseLine  - line number passed to executeScript().
//
// Does nothing for fragment tokenizers or when the document has no frame.
HTMLTokenizer::State HTMLTokenizer::scriptExecution(const QString& str, State state, QString scriptURL, int baseLine)
{
if (m_fragment || !m_doc->frame())
return state;
// inScript is cleared for the duration of the call and restored afterwards;
// m_executingScript counts nested executions.
bool oldscript = state.inScript();
m_executingScript++;
state.setInScript(false);
QString url = scriptURL.isNull() ? m_doc->frame()->document()->URL() : scriptURL;
// While the script runs, currentPrependingSrc points at a local
// SegmentedString; the previous value is restored before returning.
SegmentedString *savedPrependingSrc = currentPrependingSrc;
SegmentedString prependingSrc;
currentPrependingSrc = &prependingSrc;
#if INSTRUMENT_LAYOUT_SCHEDULING
if (!parser->doc()->ownerElement())
printf("beginning script execution at %d\n", parser->doc()->elapsedTime());
#endif
// The local state is written to m_state before the script runs and read back
// afterwards, so changes made to m_state during execution are picked up.
m_state = state;
m_doc->frame()->executeScript(url,baseLine,0,str);
state = m_state;
state.setAllowYield(true);
#if INSTRUMENT_LAYOUT_SCHEDULING
if (!parser->doc()->ownerElement())
printf("ending script execution at %d\n", parser->doc()->elapsedTime());
#endif
m_executingScript--;
state.setInScript(oldscript);
// Outermost script and nothing left to load: resume with the parked input.
if (!m_executingScript && !state.loadingExtScript()) {
src.append(pendingSrc);
pendingSrc.clear();
} else if (!prependingSrc.isEmpty()) {
// restore first so that the write appends in the right place
// (does not hurt to do it again below)
currentPrependingSrc = savedPrependingSrc;
// we need to do this slightly modified bit of one of the write() cases
// because we want to prepend to pendingSrc rather than appending
// if there's no previous prependingSrc
if (state.loadingExtScript()) {
if (currentPrependingSrc) {
currentPrependingSrc->append(prependingSrc);
} else {
pendingSrc.prepend(prependingSrc);
}
} else {
// write() works on m_state, so sync the local copy around the call.
m_state = state;
write(prependingSrc, false);
state = m_state;
}
}
currentPrependingSrc = savedPrependingSrc;
return state;
}
// Consumes the body of a comment, appending every character to scriptCode,
// until the comment terminator is found or src is exhausted.
//
// Quirks mode (and always inside <script>/<style>): the comment ends at "-->"
// or "--!>"; when brokenComments is set, any '>' outside script/style ends it.
// Strict mode: "--" pairs toggle canClose, and a '>' ends the comment only
// while canClose is set (or under the brokenComments rule above).
//
// src   - input; consumed up to and including the terminating '>'.
// state - current tokenizer state; inComment is cleared in the returned copy
//         once the comment has ended.
HTMLTokenizer::State HTMLTokenizer::parseComment(SegmentedString &src, State state)
{
// FIXME: Why does this code even run for comments inside <script> and <style>? This seems bogus.
bool strict = !parser->doc()->inCompatMode() && !state.inScript() && !state.inStyle();
int delimiterCount = 0;
bool canClose = false;
// Reserve room for the rest of src up front; the loop writes without further checks.
checkScriptBuffer(src.length());
while ( !src.isEmpty() ) {
scriptCode[ scriptCodeSize++ ] = *src;
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
qDebug("comment is now: *%s*",
QConstString((QChar*)src.operator->(), kMin(16U, src.length())).qstring().latin1());
#endif
if (strict) {
// Count consecutive '-' characters; every completed "--" pair flips canClose.
if (src->unicode() == '-') {
delimiterCount++;
if (delimiterCount == 2) {
delimiterCount = 0;
canClose = !canClose;
}
}
else
delimiterCount = 0;
}
if ((!strict || canClose) && src->unicode() == '>') {
bool handleBrokenComments = brokenComments && !(state.inScript() || state.inStyle());
int endCharsCount = 1; // start off with one for the '>' character
if (!strict) {
// In quirks mode just check for -->
if (scriptCodeSize > 2 && scriptCode[scriptCodeSize-3] == '-' && scriptCode[scriptCodeSize-2] == '-') {
endCharsCount = 3;
}
else if (scriptCodeSize > 3 && scriptCode[scriptCodeSize-4] == '-' && scriptCode[scriptCodeSize-3] == '-' &&
scriptCode[scriptCodeSize-2] == '!') {
// Other browsers will accept --!> as a close comment, even though it's
// not technically valid.
endCharsCount = 4;
}
}
if (canClose || handleBrokenComments || endCharsCount > 1) {
++src;
// Outside script/xmp/textarea/style the comment text is dropped from
// scriptCode; inside those elements it stays part of the collected content.
if (!(state.inScript() || state.inXmp() || state.inTextArea() || state.inStyle())) {
#ifdef INCLUDE_COMMENTS_IN_DOM // FIXME: Turn this on soon.
checkScriptBuffer();
scriptCode[scriptCodeSize] = 0;
scriptCode[scriptCodeSize + 1] = 0;
currToken.tagName = commentAtom;
currToken.beginTag = true;
state = processListing(SegmentedString(scriptCode, scriptCodeSize - endCharsCount), state);
processToken();
currToken.tagName = commentAtom;
currToken.beginTag = false;
processToken();
#endif
scriptCodeSize = 0;
}
state.setInComment(false);
return state; // Finished parsing comment
}
}
++src;
}
return state;
}
// Consumes a server-side include block up to and including its closing "%>".
// Every character is appended to scriptCode while scanning; when the terminator
// is found the collected text is thrown away (scriptCodeSize = 0) and inServer
// is cleared in the returned state. If src ends first, the partial text stays
// in scriptCode for the next call.
HTMLTokenizer::State HTMLTokenizer::parseServer(SegmentedString& src, State state)
{
    // Reserve room for the rest of src up front; the loop writes without further checks.
    checkScriptBuffer(src.length());

    for (; !src.isEmpty(); ++src) {
        scriptCode[scriptCodeSize] = *src;
        ++scriptCodeSize;

        const bool endsBlock = src->unicode() == '>'
            && scriptCodeSize > 1
            && scriptCode[scriptCodeSize - 2] == '%';
        if (!endsBlock)
            continue;

        // Finished parsing server include: step past the '>' and discard the text.
        ++src;
        state.setInServer(false);
        scriptCodeSize = 0;
        break;
    }
    return state;
}
// Skips over a processing instruction. The instruction ends at "?>" or, for
// compatibility with pages that omit the '?', at any '>' seen while no quote is
// open. Nothing is emitted for the instruction; on completion
// inProcessingInstruction is cleared and discardLF is set in the returned state.
HTMLTokenizer::State HTMLTokenizer::parseProcessingInstruction(SegmentedString &src, State state)
{
    char previous = 0;
    for (; !src.isEmpty(); ++src) {
        const unsigned char current = src->latin1();
        switch (current) {
        case '\'':
            tquote = (tquote == SingleQuote) ? NoQuote : SingleQuote;
            break;
        case '\"':
            tquote = (tquote == DoubleQuote) ? NoQuote : DoubleQuote;
            break;
        case '>':
            // Look for '?>'
            // some crappy sites omit the "?" before it, so
            // we look for an unquoted '>' instead. (IE compatible)
            if (!tquote || previous == '?') {
                state.setInProcessingInstruction(false);
                ++src;
                state.setDiscardLF(true);
                return state; // Finished parsing the processing instruction
            }
            break;
        default:
            break;
        }
        previous = current;
    }
    return state;
}
// Copies plain text from src into the token text buffer (dest).
// A '\r' is written as '\n' and a '\n' immediately following it is dropped, so
// "\r\n" and "\r" both become a single '\n'. All other characters go through
// fixUpChar() before being stored.
// Returns the updated state (skipLF flag).
HTMLTokenizer::State HTMLTokenizer::parseText(SegmentedString &src, State state)
{
    for (; !src.isEmpty(); ++src) {
        const unsigned short cc = src->unicode();

        // Second half of a CRLF pair: already emitted as '\n', so skip it.
        if (state.skipLF()) {
            state.setSkipLF(false);
            if (cc == '\n')
                continue;
        }

        // do we need to enlarge the buffer?
        checkBuffer();

        if (cc != '\r') {
            *dest++ = fixUpChar(cc);
            continue;
        }

        state.setSkipLF(true);
        *dest++ = '\n';
    }
    return state;
}
// Parses a character reference; the caller has already consumed the '&'.
//
// The reference is recognized by a small state machine stored in 'state', so
// parsing can stop when src runs dry and continue on the next call:
//   SearchEntity    - decide between "&#..." (numeric) and "&name".
//   NumericSearch   - decide between hexadecimal ("&#x"), decimal, or neither.
//   Hexadecimal     - collect hex digits (cBuffer holds at most 10 characters).
//   Decimal         - collect decimal digits (cBuffer holds at most 9 characters).
//   EntityName      - collect up to 9 ASCII letters/digits, then look the name up.
//   SearchSemicolon - produce the result and consume an optional ';'.
//
// src        - input positioned right after the '&'.
// dest       - output cursor, advanced when the reference turns out not to be
//              valid and is copied as plain text. This parameter hides the
//              member of the same name.
// state      - current tokenizer state; the updated copy is returned, with the
//              entity state back at NoEntity once the reference is complete.
// cBufferPos - number of reference characters collected in cBuffer so far.
// start      - true for a new reference (resets cBufferPos, the entity state
//              and EntityUnicodeValue); false when resuming.
// parsingTag - true inside attribute values, where named entities above 255 are
//              only honoured when followed by ';' (IE compatibility).
//
// A valid reference is not written to dest directly: the decoded character(s)
// are handed to src.push().
// NOTE(review): presumably push() places them at the front of src so that the
// caller's loop consumes them next -- confirm against SegmentedString.
// NOTE(review): checkBuffer() checks the member buffer, not the 'dest' passed in
// (parseSpecial() passes a pointer into scriptCode) -- confirm this is intended.
HTMLTokenizer::State HTMLTokenizer::parseEntity(SegmentedString &src, QChar *&dest, State state, unsigned &cBufferPos, bool start, bool parsingTag)
{
    if (start)
    {
        cBufferPos = 0;
        state.setEntityState(SearchEntity);
        EntityUnicodeValue = 0;
    }
    while(!src.isEmpty())
    {
        unsigned short cc = src->unicode();
        switch(state.entityState()) {
        case NoEntity:
            ASSERT(state.entityState() != NoEntity);
            return state;
        case SearchEntity:
            if(cc == '#') {
                cBuffer[cBufferPos++] = cc;
                ++src;
                state.setEntityState(NumericSearch);
            }
            else
                state.setEntityState(EntityName);
            break;
        case NumericSearch:
            if(cc == 'x' || cc == 'X') {
                cBuffer[cBufferPos++] = cc;
                ++src;
                state.setEntityState(Hexadecimal);
            }
            else if(cc >= '0' && cc <= '9')
                state.setEntityState(Decimal);
            else
                state.setEntityState(SearchSemicolon);
            break;
        case Hexadecimal:
        {
            // cBuffer already holds "#x", so at most 8 hex digits are accepted.
            int ll = kMin(src.length(), 10-cBufferPos);
            while(ll--) {
                QChar csrc(src->lower());
                cc = csrc.cell();
                if(csrc.row() || !((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f'))) {
                    state.setEntityState(SearchSemicolon);
                    break;
                }
                EntityUnicodeValue = EntityUnicodeValue*16 + (cc - ( cc < 'a' ? '0' : 'a' - 10));
                cBuffer[cBufferPos++] = cc;
                ++src;
            }
            if (cBufferPos == 10)
                state.setEntityState(SearchSemicolon);
            break;
        }
        case Decimal:
        {
            // cBuffer already holds "#", so at most 8 decimal digits are accepted.
            int ll = kMin(src.length(), 9-cBufferPos);
            while(ll--) {
                cc = src->cell();
                if(src->row() || !(cc >= '0' && cc <= '9')) {
                    state.setEntityState(SearchSemicolon);
                    break;
                }
                EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0');
                cBuffer[cBufferPos++] = cc;
                ++src;
            }
            if (cBufferPos == 9)
                state.setEntityState(SearchSemicolon);
            break;
        }
        case EntityName:
        {
            int ll = kMin(src.length(), 9-cBufferPos);
            while(ll--) {
                QChar csrc = *src;
                cc = csrc.cell();
                if(csrc.row() || !((cc >= 'a' && cc <= 'z') ||
                                   (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
                    state.setEntityState(SearchSemicolon);
                    break;
                }
                cBuffer[cBufferPos++] = cc;
                ++src;
            }
            if (cBufferPos == 9)
                state.setEntityState(SearchSemicolon);
            if (state.entityState() == SearchSemicolon) {
                if(cBufferPos > 1) {
                    const Entity *e = findEntity(cBuffer, cBufferPos);
                    if(e)
                        EntityUnicodeValue = e->code;
                    // be IE compatible
                    if(parsingTag && EntityUnicodeValue > 255 && *src != ';')
                        EntityUnicodeValue = 0;
                }
            }
            else
                break;
        }
        // The name is complete: fall through and emit the result right away.
        case SearchSemicolon:
            // Only accept real Unicode code points (U+0001..U+10FFFF). Larger values
            // cannot be represented in UTF-16: the surrogate computation below would
            // produce a code unit outside the lead-surrogate range for them.
            if (EntityUnicodeValue > 0 && EntityUnicodeValue <= 0x10FFFF) {
                if (*src == ';')
                    ++src;
                if (EntityUnicodeValue <= 0xFFFF) {
                    checkBuffer();
                    src.push(fixUpChar(EntityUnicodeValue));
                } else {
                    // Convert to UTF-16, using surrogate code points.
                    // (EntityUnicodeValue >> 16) is 1..16 here, so the lead surrogate
                    // stays within 0xD800..0xDBFF.
                    QChar c1(0xD800 | (((EntityUnicodeValue >> 16) - 1) << 6) | ((EntityUnicodeValue >> 10) & 0x3F));
                    QChar c2(0xDC00 | (EntityUnicodeValue & 0x3FF));
                    checkBuffer(2);
                    src.push(c1);
                    src.push(c2);
                }
            } else {
                checkBuffer(10);
                // ignore the sequence, add it to the buffer as plaintext
                *dest++ = '&';
                for(unsigned int i = 0; i < cBufferPos; i++)
                    dest[i] = cBuffer[i];
                dest += cBufferPos;
            }
            state.setEntityState(NoEntity);
            return state;
        }
    }
    return state;
}
HTMLTokenizer::State HTMLTokenizer::parseTag(SegmentedString &src, State state)
{
ASSERT(!state.hasEntityState());
unsigned cBufferPos = m_cBufferPos;
while (!src.isEmpty())
{
checkBuffer();
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
uint l = 0;
while(l < src.length() && (*(src.operator->()+l)).latin1() != '>')
l++;
qDebug("src is now: *%s*, tquote: %d",
QConstString((QChar*)src.operator->(), l).qstring().latin1(), tquote);
#endif
switch(state.tagState()) {
case NoTag:
{
m_cBufferPos = cBufferPos;
return state;
}
case TagName:
{
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
qDebug("TagName");
#endif
if (searchCount > 0)
{
if (*src == commentStart[searchCount])
{
searchCount++;
if (searchCount == 4)
{
#ifdef TOKEN_DEBUG
kdDebug( 6036 ) << "Found comment" << endl;
#endif
// Found '<!--' sequence
++src;
dest = buffer; // ignore the previous part of this tag
state.setInComment(true);
state.setTagState(NoTag);
// Fix bug 34302 at kde.bugs.org. Go ahead and treat
// <!--> as a valid comment, since both mozilla and IE on windows
// can handle this case. Only do this in quirks mode. -dwh
if (!src.isEmpty() && *src == '>' && parser->doc()->inCompatMode()) {
state.setInComment(false);
++src;
if (!src.isEmpty())
cBuffer[cBufferPos++] = src->cell();
}
else
state = parseComment(src, state);
m_cBufferPos = cBufferPos;
return state; // Finished parsing tag!
}
// cuts of high part, is okay
cBuffer[cBufferPos++] = src->cell();
++src;
break;
}
else
searchCount = 0; // Stop looking for '<!--' sequence
}
bool finish = false;
unsigned int ll = kMin(src.length(), CBUFLEN-cBufferPos);
while(ll--) {
unsigned short curchar = *src;
if(curchar <= ' ' || curchar == '>' ) {
finish = true;
break;
}
// tolower() shows up on profiles. This is faster!
if (curchar >= 'A' && curchar <= 'Z')
cBuffer[cBufferPos++] = curchar + ('a' - 'A');
else
cBuffer[cBufferPos++] = curchar;
++src;
}
// Disadvantage: we add the possible rest of the tag
// as attribute names. ### judge if this causes problems
if(finish || CBUFLEN == cBufferPos) {
bool beginTag;
char* ptr = cBuffer;
unsigned int len = cBufferPos;
cBuffer[cBufferPos] = '\0';
if ((cBufferPos > 0) && (*ptr == '/'))
{
// End Tag
beginTag = false;
ptr++;
len--;
}
else
// Start Tag
beginTag = true;
// Ignore the / in fake xml tags like <br/>. We trim off the "/" so that we'll get "br" as the tag name and not "br/".
if (len > 1 && ptr[len-1] == '/')
ptr[--len] = '\0';
// Now that we've shaved off any invalid / that might have followed the name), make the tag.
if (ptr[0] != '!' && strcmp(ptr, "!doctype") != 0) {
currToken.tagName = AtomicString(ptr);
currToken.beginTag = beginTag;
}
dest = buffer;
state.setTagState(SearchAttribute);
cBufferPos = 0;
}
break;
}
case SearchAttribute:
{
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
qDebug("SearchAttribute");
#endif
bool atespace = false;
unsigned short curchar;
while(!src.isEmpty()) {
curchar = *src;
// In this mode just ignore any quotes we encounter and treat them like spaces.
if (curchar > ' ' && curchar != '\'' && curchar != '"') {
if (curchar == '<' || curchar == '>')
state.setTagState(SearchEnd);
else
state.setTagState(AttributeName);
cBufferPos = 0;
break;
}
atespace = true;
++src;
}
break;
}
case AttributeName:
{
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
qDebug("AttributeName");
#endif
unsigned short curchar;
int ll = kMin(src.length(), CBUFLEN-cBufferPos);
while(ll--) {
curchar = *src;
if (curchar <= '>' && (curchar >= '=' || curchar <= ' ')) {
cBuffer[cBufferPos] = '\0';
attrName = AtomicString(cBuffer);
dest = buffer;
*dest++ = 0;
state.setTagState(SearchEqual);
// This is a deliberate quirk to match Mozilla and Opera. We have to do this
// since sites that use the "standards-compliant" path sometimes send
// <script src="foo.js"/>. Both Moz and Opera will honor this, despite it
// being bogus HTML. They do not honor the "/" for other tags. This behavior
// also deviates from WinIE, but in this case we'll just copy Moz and Opera.
if (currToken.tagName == scriptTag && curchar == '>' && attrName == "/")
currToken.flat = true;
break;
}
// tolower() shows up on profiles. This is faster!
if (curchar >= 'A' && curchar <= 'Z')
cBuffer[cBufferPos++] = curchar + ('a' - 'A');
else
cBuffer[cBufferPos++] = curchar;
++src;
}
if ( cBufferPos == CBUFLEN ) {
cBuffer[cBufferPos] = '\0';
attrName = AtomicString(cBuffer);
dest = buffer;
*dest++ = 0;
state.setTagState(SearchEqual);
}
break;
}
case SearchEqual:
{
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
qDebug("SearchEqual");
#endif
unsigned short curchar;
bool atespace = false;
while(!src.isEmpty()) {
curchar = src->unicode();
// In this mode just ignore any quotes we encounter and treat them like spaces.
if (curchar > ' ' && curchar != '\'' && curchar != '"') {
if(curchar == '=') {
#ifdef TOKEN_DEBUG
kdDebug(6036) << "found equal" << endl;
#endif
state.setTagState(SearchValue);
++src;
}
else {
currToken.addAttribute(parser->doc(), attrName, emptyAtom);
dest = buffer;
state.setTagState(SearchAttribute);
}
break;
}
atespace = true;
++src;
}
break;
}
case SearchValue:
{
unsigned short curchar;
while(!src.isEmpty()) {
curchar = src->unicode();
if(curchar > ' ') {
if(( curchar == '\'' || curchar == '\"' )) {
tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
state.setTagState(QuotedValue);
++src;
} else
state.setTagState(Value);
break;
}
++src;
}
break;
}
case QuotedValue:
{
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
qDebug("QuotedValue");
#endif
unsigned short curchar;
while(!src.isEmpty()) {
checkBuffer();
curchar = src->unicode();
if (curchar == '>' && attrName.isEmpty()) {
// Handle a case like <img '>. Just go ahead and be willing
// to close the whole tag. Don't consume the character and
// just go back into SearchEnd while ignoring the whole
// value.
// FIXME: Note that this is actually not a very good solution. It's
// an interim hack and doesn't handle the general case of
// unmatched quotes among attributes that have names. -dwh
while(dest > buffer+1 && (*(dest-1) == '\n' || *(dest-1) == '\r'))
dest--; // remove trailing newlines
AtomicString v(buffer+1, dest-buffer-1);
attrName = v; // Just make the name/value match. (FIXME: Is this some WinIE quirk?)
currToken.addAttribute(parser->doc(), attrName, v);
state.setTagState(SearchAttribute);
dest = buffer;
tquote = NoQuote;
break;
}
if(curchar <= '\'' && !src.escaped()) {
// ### attributes like '&{blaa....};' are supposed to be treated as jscript.
if ( curchar == '&' )
{
++src;
state = parseEntity(src, dest, state, cBufferPos, true, true);
break;
}
else if ( (tquote == SingleQuote && curchar == '\'') ||
(tquote == DoubleQuote && curchar == '\"') )
{
// some <input type=hidden> rely on trailing spaces. argh
while(dest > buffer+1 && (*(dest-1) == '\n' || *(dest-1) == '\r'))
dest--; // remove trailing newlines
AtomicString v(buffer+1, dest-buffer-1);
if (attrName.isEmpty())
attrName = v; // Make the name match the value. (FIXME: Is this a WinIE quirk?)
currToken.addAttribute(parser->doc(), attrName, v);
dest = buffer;
state.setTagState(SearchAttribute);
tquote = NoQuote;
++src;
break;
}
}
*dest++ = fixUpChar(*src);
++src;
}
break;
}
case Value:
{
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
qDebug("Value");
#endif
unsigned short curchar;
while(!src.isEmpty()) {
checkBuffer();
curchar = src->unicode();
if(curchar <= '>' && !src.escaped()) {
// parse Entities
if ( curchar == '&' )
{
++src;
state = parseEntity(src, dest, state, cBufferPos, true, true);
break;
}
// no quotes. Every space means end of value
// '/' does not delimit in IE!
if ( curchar <= ' ' || curchar == '>' )
{
AtomicString v(buffer+1, dest-buffer-1);
currToken.addAttribute(parser->doc(), attrName, v);
dest = buffer;
state.setTagState(SearchAttribute);
break;
}
}
*dest++ = fixUpChar(*src);
++src;
}
break;
}
case SearchEnd:
{
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
qDebug("SearchEnd");
#endif
while(!src.isEmpty()) {
if (*src == '>' || *src == '<')
break;
if (*src == '/')
currToken.flat = true;
++src;
}
if (src.isEmpty()) break;
searchCount = 0; // Stop looking for '<!--' sequence
state.setTagState(NoTag);
tquote = NoQuote;
if (*src != '<')
++src;
if (currToken.tagName == nullAtom) { //stop if tag is unknown
m_cBufferPos = cBufferPos;
return state;
}
AtomicString tagName = currToken.tagName;
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 0
kdDebug( 6036 ) << "appending Tag: " << tagName.qstring() << endl;
#endif
// Handle <script src="foo"/> like Mozilla/Opera. We have to do this now for Dashboard
// compatibility.
bool isSelfClosingScript = currToken.flat && currToken.beginTag && currToken.tagName == scriptTag;
bool beginTag = !currToken.flat && currToken.beginTag;
if (currToken.beginTag && currToken.tagName == scriptTag) {
AttributeImpl* a = 0;
bool foundTypeAttribute = false;
scriptSrc = QString::null;
scriptSrcCharset = QString::null;
if ( currToken.attrs && /* potentially have a ATTR_SRC ? */
parser->doc()->frame() &&
parser->doc()->frame()->jScriptEnabled() && /* jscript allowed at all? */
!m_fragment /* are we a regular tokenizer or just for innerHTML ? */
) {
if ((a = currToken.attrs->getAttributeItem(srcAttr)))
scriptSrc = parser->doc()->completeURL(parseURL(a->value()).qstring());
if ((a = currToken.attrs->getAttributeItem(charsetAttr)))
scriptSrcCharset = a->value().qstring().stripWhiteSpace();
if ( scriptSrcCharset.isEmpty() )
scriptSrcCharset = parser->doc()->frame()->encoding();
/* Check type before language, since language is deprecated */
if ((a = currToken.attrs->getAttributeItem(typeAttr)) != 0 && !a->value().isEmpty())
foundTypeAttribute = true;
else
a = currToken.attrs->getAttributeItem(languageAttr);
}
javascript = true;
if( foundTypeAttribute ) {
/*
Mozilla 1.5 accepts application/x-javascript, and some web references claim it is the only
correct variation, but WinIE 6 doesn't accept it.
Neither Mozilla 1.5 nor WinIE 6 accept application/javascript, application/ecmascript, or
application/x-ecmascript.
Mozilla 1.5 doesn't accept the text/javascript1.x formats, but WinIE 6 does.
Mozilla 1.5 doesn't accept text/jscript, text/ecmascript, and text/livescript, but WinIE 6 does.
Mozilla 1.5 allows leading and trailing whitespace, but WinIE 6 doesn't.
Mozilla 1.5 and WinIE 6 both accept the empty string, but neither accept a whitespace-only string.
We want to accept all the values that either of these browsers accept, but not other values.
*/
QString type = a->value().qstring().stripWhiteSpace().lower();
if( type.compare("application/x-javascript") != 0 &&
type.compare("text/javascript") != 0 &&
type.compare("text/javascript1.0") != 0 &&
type.compare("text/javascript1.1") != 0 &&
type.compare("text/javascript1.2") != 0 &&
type.compare("text/javascript1.3") != 0 &&
type.compare("text/javascript1.4") != 0 &&
type.compare("text/javascript1.5") != 0 &&
type.compare("text/jscript") != 0 &&
type.compare("text/ecmascript") != 0 &&
type.compare("text/livescript") )
javascript = false;
} else if( a ) {
/*
Mozilla 1.5 doesn't accept jscript or ecmascript, but WinIE 6 does.
Mozilla 1.5 accepts javascript1.0, javascript1.4, and javascript1.5, but WinIE 6 accepts only 1.1 - 1.3.
Neither Mozilla 1.5 nor WinIE 6 accept leading or trailing whitespace.
We want to accept all the values that either of these browsers accept, but not other values.
*/
DOMString lang = a->value().domString().lower();
if( lang != "" &&
lang != "javascript" &&
lang != "javascript1.0" &&
lang != "javascript1.1" &&
lang != "javascript1.2" &&
lang != "javascript1.3" &&
lang != "javascript1.4" &&
lang != "javascript1.5" &&
lang != "ecmascript" &&
lang != "livescript" &&
lang != "jscript")
javascript = false;
}
}
RefPtr<NodeImpl> n = processToken();
if (tagName == preTag) {
if (beginTag)
state.setDiscardLF(true); // Discard the first LF after we open a pre.
} else if (tagName == scriptTag) {
ASSERT(!scriptNode);
scriptNode = n;
if (beginTag) {
searchStopper = scriptEnd;
searchStopperLen = 8;
state.setInScript(true);
state = parseSpecial(src, state);
} else if (isSelfClosingScript) { // Handle <script src="foo"/>
state.setInScript(true);
state = scriptHandler(state);
}
} else if (tagName == styleTag) {
if (beginTag) {
searchStopper = styleEnd;
searchStopperLen = 7;
state.setInStyle(true);
state = parseSpecial(src, state);
}
} else if (tagName == textareaTag) {
if (beginTag) {
searchStopper = textareaEnd;
searchStopperLen = 10;
state.setInTextArea(true);
state = parseSpecial(src, state);
}
} else if (tagName == titleTag) {
if (beginTag) {
searchStopper = titleEnd;
searchStopperLen = 7;
State savedState = state;
SegmentedString savedSrc = src;
long savedLineno = lineno;
state.setInTitle(true);
state = parseSpecial(src, state);
if (state.inTitle() && src.isEmpty()) {
// We just ate the rest of the document as the title #text node!
// Reset the state then retokenize without special title handling.
// Let the parser clean up the missing </title> tag.
// FIXME: This is incorrect, because src.isEmpty() doesn't mean we're
// at the end of the document unless noMoreData is also true. We need
// to detect this case elsewhere, and save the state somewhere other
// than a local variable.
state = savedState;
src = savedSrc;
lineno = savedLineno;
scriptCodeSize = 0;
}
}
} else if (tagName == xmpTag) {
if (beginTag) {
searchStopper = xmpEnd;
searchStopperLen = 5;
state.setInXmp(true);
state = parseSpecial(src, state);
}
} else if (tagName == selectTag)
state.setInSelect(beginTag);
else if (tagName == plaintextTag)
state.setInPlainText(beginTag);
m_cBufferPos = cBufferPos;
return state; // Finished parsing tag!
}
} // end switch
}
m_cBufferPos = cBufferPos;
return state;
}
// Decides whether write() may keep tokenizing or should hand control back to the event loop.
// processedCount: characters handled since the last clock check; reset or incremented here.
// startTime:      currentTime() value captured when the current write() began.
// state:          tokenizer state; its allow-yield flag is always cleared by this call.
// Returns false after scheduling m_timer to resume later, true to keep going.
inline bool HTMLTokenizer::continueProcessing(int& processedCount, double startTime, State &state)
{
    // The allow-yield flag is one-shot: read it, then clear it.
    const bool yieldRequested = state.allowYield();
    state.setAllowYield(false);

    // Reading the clock for every character would be too costly, so the elapsed time is only
    // examined once a chunk of characters has been processed or a yield was explicitly allowed,
    // and never while an external script is loading, parsing is forced synchronous, or a
    // script is executing.
    const bool yieldingPermitted = !state.loadingExtScript() && !state.forceSynchronous() && !m_executingScript;
    const bool checkIsDue = yieldingPermitted && (processedCount > TOKENIZER_CHUNK_SIZE || yieldRequested);
    if (!checkIsDue) {
        ++processedCount;
        return true;
    }

    processedCount = 0;
    const bool budgetExhausted = currentTime() - startTime > tokenizerTimeDelay;
    if (!budgetExhausted) {
        ++processedCount;
        return true;
    }

    /* FIXME: We'd like to yield aggressively to give stylesheets the opportunity to
       load, but this hurts overall performance on slower machines. For now turn this
       off.
       || (!parser->doc()->haveStylesheetsLoaded() &&
       (parser->doc()->documentElement()->id() != ID_HTML || parser->doc()->body()))) {*/

    // Fire the timer as soon as possible so that tokenizing resumes from the event loop.
    m_timer.startOneShot(0);
#if INSTRUMENT_LAYOUT_SCHEDULING
    if (currentTime() - startTime > tokenizerTimeDelay)
        printf("Deferring processing of data because 500ms elapsed away from event loop.\n");
#endif
    return false;
}
// Feeds |str| to the tokenizer and tokenizes as much of the accumulated input as it can.
//
// - If there is no token buffer or the parser has been stopped, |str| is dropped.
// - If a script is executing (and appendData is set) or external scripts are still pending,
//   |str| is queued on currentPrependingSrc / pendingSrc and nothing is tokenized now.
// - Otherwise |str| is appended to (or becomes) src. If the resume timer is active the data
//   simply waits in src; if not, the main loop below runs until src is empty, a scheduled
//   location change is pending, or continueProcessing() asks to yield.
//
// Returns true only when this call ended by invoking end(); false in every other case.
bool HTMLTokenizer::write(const SegmentedString &str, bool appendData)
{
#ifdef TOKEN_DEBUG
kdDebug( 6036 ) << this << " Tokenizer::write(\"" << str.toString() << "\"," << appendData << ")" << endl;
#endif
// end() frees the buffer and sets it to 0, so a null buffer means there is nowhere to tokenize into.
if (!buffer)
return false;
if (m_parserStopped)
return false;
if ( ( m_executingScript && appendData ) || !pendingScripts.isEmpty() ) {
// don't parse; we will do this later
if (currentPrependingSrc) {
currentPrependingSrc->append(str);
} else {
pendingSrc.append(str);
}
return false;
}
// setSrc() also folds the old source's line count into lineno; append() keeps the current source.
if (!src.isEmpty())
src.append(str);
else
setSrc(str);
// Once a timer is set, it has control of when the tokenizer continues.
if (m_timer.isActive())
return false;
// Remember whether we were already inside a write() so the flag can be restored on exit;
// end() is only called below when the restored flag is false.
bool wasInWrite = inWrite;
inWrite = true;
#if INSTRUMENT_LAYOUT_SCHEDULING
if (!parser->doc()->ownerElement())
printf("Beginning write at time %d\n", parser->doc()->elapsedTime());
#endif
int processedCount = 0;
double startTime = currentTime();
Frame *frame = parser->doc()->frame();
// Work on a local copy of the state; it is written back to m_state after the loop.
State state = m_state;
while (!src.isEmpty() && (!frame || !frame->isScheduledLocationChangePending())) {
// Bail out (a resume timer has been scheduled) when we have been running for too long.
if (!continueProcessing(processedCount, startTime, state))
break;
// do we need to enlarge the buffer?
checkBuffer();
unsigned short cc = src->unicode();
// skipLF is set after a '\r' (see below); it is consumed here so it only affects one character.
bool wasSkipLF = state.skipLF();
if (wasSkipLF)
state.setSkipLF(false);
// Second half of a CRLF pair: the newline was already emitted for the '\r'.
if (wasSkipLF && (cc == '\n'))
++src;
else if (state.needsSpecialWriteHandling()) {
// it's important to keep needsSpecialWriteHandling with the flags this block tests
if (state.hasEntityState())
state = parseEntity(src, dest, state, m_cBufferPos, false, state.hasTagState());
else if (state.inPlainText())
state = parseText(src, state);
else if (state.inAnySpecial())
state = parseSpecial(src, state);
else if (state.inComment())
state = parseComment(src, state);
else if (state.inServer())
state = parseServer(src, state);
else if (state.inProcessingInstruction())
state = parseProcessingInstruction(src, state);
else if (state.hasTagState())
state = parseTag(src, state);
else if (state.startTag()) {
// The previous character was an unescaped '<'; cc decides what kind of markup follows.
state.setStartTag(false);
switch(cc) {
case '/':
// Leaves the switch and continues with the tag-parsing code after it.
break;
case '!': {
// <!-- comment -->
searchCount = 1; // Look for '<!--' sequence to start comment
break;
}
case '?': {
// xml processing instruction
state.setInProcessingInstruction(true);
tquote = NoQuote;
state = parseProcessingInstruction(src, state);
continue;
// Not reached: the continue above already left the switch.
break;
}
case '%':
if (!brokenServer) {
// <% server stuff, handle as comment %>
state.setInServer(true);
tquote = NoQuote;
state = parseServer(src, state);
continue;
}
// else fall through
default: {
if( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z'))) {
// Start of a Start-Tag
} else {
// Invalid tag
// Add as is
// cc itself is not consumed here; since startTag was cleared above, the next
// loop iteration handles it as an ordinary character.
*dest = '<';
dest++;
continue;
}
}
}; // end case
// Hand any text accumulated in the buffer to the parser before the tag is parsed
// (processToken() emits a text token when dest > buffer and resets dest).
processToken();
m_cBufferPos = 0;
state.setTagState(TagName);
state = parseTag(src, state);
}
} else if (cc == '&' && !src.escaped()) {
++src;
state = parseEntity(src, dest, state, m_cBufferPos, true, state.hasTagState());
} else if (cc == '<' && !src.escaped()) {
// Record the line the tag starts on, then let the next iteration classify the tag.
tagStartLineno = lineno+src.lineCount();
++src;
state.setStartTag(true);
} else if (cc == '\n' || cc == '\r') {
if (state.discardLF())
// Ignore this LF
state.setDiscardLF(false); // We have discarded 1 LF
else
// Process this LF
*dest++ = '\n';
/* Check for MS-DOS CRLF sequence */
if (cc == '\r')
state.setSkipLF(true);
++src;
} else {
// Ordinary character: copy it (after fixUpChar) into the token buffer.
state.setDiscardLF(false);
*dest++ = fixUpChar(*src);
++src;
}
}
#if INSTRUMENT_LAYOUT_SCHEDULING
if (!parser->doc()->ownerElement())
printf("Ending write at time %d\n", parser->doc()->elapsedTime());
#endif
inWrite = wasInWrite;
m_state = state;
// Finish up only when no more data is expected, this is the outermost write(), and nothing
// (external script load, executing script, resume timer) is still due to continue tokenizing.
if (noMoreData && !inWrite && !state.loadingExtScript() && !m_executingScript && !m_timer.isActive()) {
end(); // this actually causes us to be deleted
return true;
}
return false;
}
// Stops tokenizing: runs the base-class stop logic, cancels the resume timer and, for a
// non-fragment tokenizer whose document has a frame, tells that frame its data is processed.
void HTMLTokenizer::stopParsing()
{
    Tokenizer::stopParsing();
    m_timer.stop();

    // Fragment tokenizers never notify a frame.
    if (m_fragment)
        return;
    if (!m_doc->frame())
        return;

    // Whether parsing ended naturally or was interrupted by hand, the frame has to be told
    // that the tokenizer is done with its data.
    m_doc->frame()->tokenizerProcessedData();
}
// Reports whether the resume timer is currently active.
bool HTMLTokenizer::processingData() const
{
    const bool resumeTimerActive = m_timer.isActive();
    return resumeTimerActive;
}
// Timer callback that resumes tokenizing after continueProcessing() yielded.
void HTMLTokenizer::timerFired(Timer<HTMLTokenizer>*)
{
#if INSTRUMENT_LAYOUT_SCHEDULING
    if (!parser->doc()->ownerElement())
        printf("Beginning timer write at time %d\n", parser->doc()->elapsedTime());
#endif

    // Give a pending layout precedence over tokenizing: re-arm our own timer and bail out,
    // which effectively gives the layout timer a higher priority than this one.
    const bool layoutGoesFirst = parser->doc()->view()
        && parser->doc()->view()->layoutPending()
        && !parser->doc()->minimumLayoutDelay();
    if (layoutGoesFirst) {
        m_timer.startOneShot(0);
        return;
    }

    // Hold a reference to the frame (none for fragment tokenizers) before calling write().
    RefPtr<Frame> protectedFrame = m_fragment ? 0 : m_doc->frame();

    // Behave as if an empty chunk of data had just arrived.
    const bool tokenizerEnded = write(SegmentedString(), true);
    if (!tokenizerEnded)
        return;

    // end() ran during the write, so WebKit must be told that the data has been processed.
    if (protectedFrame)
        protectedFrame->tokenizerProcessedData();
}
// Flushes any remaining token, releases the token and script buffers, and notifies the parser
// that tokenizing has finished.
void HTMLTokenizer::end()
{
    ASSERT(!m_timer.isActive());
    m_timer.stop(); // Only matters if the assertion above would have fired; stop it regardless.

    if (buffer) {
        // While a tag is being parsed, parseTag uses the buffer for its own purposes,
        // so the buffer contents are only flushed as a token outside of a tag.
        const bool insideTag = m_state.hasTagState();
        if (!insideTag)
            processToken();

        if (scriptCode) {
            KHTML_DELETE_QCHAR_VEC(scriptCode);
        }
        scriptCode = 0;
        scriptCodeResync = 0;
        scriptCodeMaxSize = 0;
        scriptCodeSize = 0;

        KHTML_DELETE_QCHAR_VEC(buffer);
        buffer = 0;
    }

    parser->finished();
}
// Called when no further data will arrive. If input ended inside an unterminated comment or
// server (<% ... %>) section, the text collected in scriptCode is fed back through write()
// with the corresponding "broken" flag set. Afterwards noMoreData is set and end() is called,
// unless a write() is in progress, an external script is loading, a script is executing, or
// the resume timer is active.
void HTMLTokenizer::finish()
{
// do this as long as we don't find matching comment ends
while((m_state.inComment() || m_state.inServer()) && scriptCode && scriptCodeSize) {
// we've found an unmatched comment start
if (m_state.inComment())
brokenComments = true;
else
brokenServer = true;
checkScriptBuffer();
// Terminate the collected text with two zero characters.
// NOTE(review): presumably checkScriptBuffer() guarantees room for both writes - confirm.
scriptCode[scriptCodeSize] = 0;
scriptCode[scriptCodeSize + 1] = 0;
int pos;
// |food| is the text that gets re-tokenized below.
QString food;
if (m_state.inScript() || m_state.inStyle())
food.setUnicode(scriptCode, scriptCodeSize);
else if (m_state.inServer()) {
// Re-feed the server section with a leading '<' prepended.
food = "<";
food += QString(scriptCode, scriptCodeSize);
} else {
// Re-feed only what follows the first '>' in the collected text.
// NOTE(review): presumably find() yields -1 when there is no '>', which would make
// this copy the whole text - confirm against QString::find.
pos = QConstString(scriptCode, scriptCodeSize).string().find('>');
food.setUnicode(scriptCode+pos+1, scriptCodeSize-pos-1); // deep copy
}
// Drop the script buffer and leave comment/server mode before re-tokenizing; the loop
// condition is re-evaluated afterwards against the state left by write().
KHTML_DELETE_QCHAR_VEC(scriptCode);
scriptCode = 0;
scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
m_state.setInComment(false);
m_state.setInServer(false);
if (!food.isEmpty())
write(food, true);
}
// this indicates we will not receive any more data... but if we are waiting on
// an external script to load, we can't finish parsing until that is done
noMoreData = true;
if (!inWrite && !m_state.loadingExtScript() && !m_executingScript && !m_timer.isActive())
end(); // this actually causes us to be deleted
}
// Hands the current token (buffered text and/or currToken) to the parser and resets it.
// Returns the node the parser produced, or null when there was nothing to emit or the
// parser has been stopped.
PassRefPtr<NodeImpl> HTMLTokenizer::processToken()
{
    // Only a non-fragment tokenizer whose document has a frame has a script proxy to inform
    // about the line number of the tag being processed.
    KJSProxyImpl* scriptProxy = 0;
    if (!m_fragment && m_doc->frame())
        scriptProxy = m_doc->frame()->jScript();
    if (scriptProxy)
        scriptProxy->setEventHandlerLineno(tagStartLineno);

    const bool hasBufferedText = dest > buffer;
    if (!hasBufferedText && currToken.tagName == nullAtom) {
        // Neither text nor a tag is pending: there is no token to emit.
        currToken.reset();
        if (scriptProxy)
            scriptProxy->setEventHandlerLineno(lineno + src.lineCount());
        return 0;
    }

    if (hasBufferedText) {
#ifdef TOKEN_DEBUG
        if (currToken.tagName.length()) {
            qDebug( "unexpected token: %s, str: *%s*", currToken.tagName.qstring().latin1(),QConstString( buffer,dest-buffer ).qstring().latin1() );
            ASSERT(0);
        }
#endif
        // Buffered characters become the token's text; unless this is a comment token,
        // it is emitted as a text token.
        currToken.text = new DOMStringImpl(buffer, dest - buffer);
        if (currToken.tagName != commentAtom)
            currToken.tagName = textAtom;
    }

    // The buffer contents now belong to the token; start writing from the beginning again.
    dest = buffer;

#ifdef TOKEN_DEBUG
    QString tokenName = currToken.tagName.qstring();
    QString tokenText;
    if (currToken.text)
        tokenText = QConstString(currToken.text->s, currToken.text->l).qstring();
    kdDebug( 6036 ) << "Token --> " << tokenName << endl;
    if (currToken.flat)
        kdDebug( 6036 ) << "Token is FLAT!" << endl;
    if (!tokenText.isNull())
        kdDebug( 6036 ) << "text: \"" << tokenText << "\"" << endl;
    unsigned attrCount = currToken.attrs ? currToken.attrs->length() : 0;
    if (attrCount) {
        kdDebug( 6036 ) << "Attributes: " << attrCount << endl;
        for (unsigned idx = 0; idx < attrCount; ++idx) {
            AttributeImpl* attr = currToken.attrs->attributeItem(idx);
            kdDebug( 6036 ) << " " << attr->localName().qstring()
                << "=\"" << attr->value().qstring() << "\"" << endl;
        }
    }
    kdDebug( 6036 ) << endl;
#endif

    RefPtr<NodeImpl> createdNode;
    if (!m_parserStopped) {
        // The parser receives the token but does NOT delete it.
        createdNode = parser->parseToken(&currToken);
    }
    currToken.reset();

    if (scriptProxy)
        scriptProxy->setEventHandlerLineno(0);
    return createdNode.release();
}
// Resets the tokenizer and deletes the parser it owns.
HTMLTokenizer::~HTMLTokenizer()
{
    // The tokenizer must not be destroyed while a write() is still running.
    ASSERT(!inWrite);

    reset();

    // The parser is owned by this tokenizer.
    delete parser;
}
void HTMLTokenizer::enlargeBuffer(int len)
{
int newsize = kMax(size*2, size+len);
int oldoffs = (dest - buffer);
buffer = (QChar*)fastRealloc(buffer, newsize*sizeof(QChar));
dest = buffer + oldoffs;
size = newsize;
}
void HTMLTokenizer::enlargeScriptBuffer(int len)
{
int newsize = kMax(scriptCodeMaxSize*2, scriptCodeMaxSize+len);
scriptCode = (QChar*)fastRealloc(scriptCode, newsize*sizeof(QChar));
scriptCodeMaxSize = newsize;
}
// Callback for a finished cached object (the argument itself is not used). Processes the
// queue of pending external scripts in order for as long as the script at the head of the
// queue is loaded: each one is dequeued, executed (or gets an error event if loading failed),
// and then the source that was queued up while waiting is tokenized.
void HTMLTokenizer::notifyFinished(CachedObject */*finishedObj*/)
{
#if INSTRUMENT_LAYOUT_SCHEDULING
if (!parser->doc()->ownerElement())
printf("script loaded at %d\n", parser->doc()->elapsedTime());
#endif
ASSERT(!pendingScripts.isEmpty());
bool finished = false;
while (!finished && pendingScripts.head()->isLoaded()) {
#ifdef TOKEN_DEBUG
kdDebug( 6036 ) << "Finished loading an external script" << endl;
#endif
CachedScript* cs = pendingScripts.dequeue();
ASSERT(cs->accessCount() > 0);
DOMString scriptSource = cs->script();
#ifdef TOKEN_DEBUG
kdDebug( 6036 ) << "External script is:" << endl << scriptSource.qstring() << endl;
#endif
setSrc(SegmentedString());
// make sure we forget about the script before we execute the new one
// infinite recursion might happen otherwise
// Everything still needed from |cs| is copied out before the reference is dropped.
QString cachedScriptUrl( cs->url().qstring() );
bool errorOccurred = cs->errorOccurred();
cs->deref(this);
// Take over the script node locally and clear the member before running the script.
RefPtr<NodeImpl> n = scriptNode;
scriptNode = 0;
#if INSTRUMENT_LAYOUT_SCHEDULING
if (!parser->doc()->ownerElement())
printf("external script beginning execution at %d\n", parser->doc()->elapsedTime());
#endif
// A failed load dispatches an error event; a successful one runs the script and then
// dispatches a load event.
if (errorOccurred)
EventTargetNodeCast(n.get())->dispatchHTMLEvent(errorEvent, false, false);
else {
m_state = scriptExecution(scriptSource.qstring(), m_state, cachedScriptUrl);
EventTargetNodeCast(n.get())->dispatchHTMLEvent(loadEvent, false, false);
}
// The state of pendingScripts.isEmpty() can change inside the scriptExecution()
// call above, so test afterwards.
finished = pendingScripts.isEmpty();
if (finished) {
m_state.setLoadingExtScript(false);
#if INSTRUMENT_LAYOUT_SCHEDULING
if (!parser->doc()->ownerElement())
printf("external script finished execution at %d\n", parser->doc()->elapsedTime());
#endif
}
// 'inScript' is true when we are called synchronously from
// parseScript(). In that case parseScript() will take care
// of 'scriptOutput'.
if (!m_state.inScript()) {
// Copy and clear the queued source first, because write() may queue more of it.
SegmentedString rest = pendingSrc;
pendingSrc.clear();
write(rest, false);
// we might be deleted at this point, do not
// access any members.
// NOTE(review): when |finished| is false the loop condition still reads
// pendingScripts after this write(); presumably write() cannot reach end() in that
// case because the loading-external-script flag is still set - confirm.
}
}
}
// Reports whether the tokenizer state is flagged as loading an external script.
bool HTMLTokenizer::isWaitingForScripts() const
{
    const bool loadingExternalScript = m_state.loadingExtScript();
    return loadingExternalScript;
}
// Replaces the current input with |source|. The line count of the old input is added to
// lineno first, and the new input starts with a reset line count.
void HTMLTokenizer::setSrc(const SegmentedString &source)
{
    // Bank the lines counted on the outgoing source before it is overwritten.
    lineno = lineno + src.lineCount();

    src = source;
    src.resetLineCount();
}
void parseHTMLDocumentFragment(const DOMString &source, DocumentFragmentImpl *fragment)
{
HTMLTokenizer tok(fragment);
tok.setForceSynchronous(true);
tok.write(source.qstring(), true);
tok.finish();
ASSERT(!tok.processingData()); // make sure we're done (see 3963151)
}
unsigned short decodeNamedEntity(const char* name)
{
const Entity* e = findEntity(name, strlen(name));
return e ? e->code : 0;
}
}