WebCore/khtml/html/htmltokenizer.cpp - WebKit - Git at Google

 /*
     This file is part of the KDE libraries

     Copyright (C) 1997 Martin Jones (mjones@kde.org)
               (C) 1997 Torben Weis (weis@kde.org)
               (C) 1998 Waldo Bastian (bastian@kde.org)
               (C) 1999 Lars Knoll (knoll@kde.org)
               (C) 1999 Antti Koivisto (koivisto@kde.org)
               (C) 2001 Dirk Mueller (mueller@kde.org)
     Copyright (C) 2004, 2005, 2006 Apple Computer, Inc.

     This library is free software; you can redistribute it and/or
     modify it under the terms of the GNU Library General Public
     License as published by the Free Software Foundation; either
     version 2 of the License, or (at your option) any later version.

     This library is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     Library General Public License for more details.

     You should have received a copy of the GNU Library General Public License
     along with this library; see the file COPYING.LIB.  If not, write to
     the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
     Boston, MA 02111-1307, USA.
 */

 #include "config.h"
 #include "htmltokenizer.h"

 #include "CachedScript.h"
 #include "DocLoader.h"
 #include "DocumentFragmentImpl.h"
 #include "EventNames.h"
 #include "Frame.h"
 #include "FrameView.h"
 #include "HTMLElementImpl.h"
 #include "SystemTime.h"
 #include "csshelper.h"
 #include "html_documentimpl.h"
 #include "htmlnames.h"
 #include "htmlparser.h"
 #include "kjs_proxy.h"
 #include <ctype.h>
 #include <stdlib.h>

 #include "kentities.c"

 // #define INSTRUMENT_LAYOUT_SCHEDULING 1

 #define TOKENIZER_CHUNK_SIZE  4096

 namespace WebCore {

 using namespace HTMLNames;
 using namespace EventNames;

 // FIXME: We would like this constant to be 200ms.
 // Yielding more aggressively results in increased responsiveness and better incremental rendering.
 // It slows down overall page-load on slower machines, though, so for now we set a value of 500.
 const double tokenizerTimeDelay = 0.500;

 static const char commentStart [] = "<!--";
 static const char scriptEnd [] = "</script";
 static const char xmpEnd [] = "</xmp";
 static const char styleEnd [] =  "</style";
 static const char textareaEnd [] = "</textarea";
 static const char titleEnd [] = "</title";

 #define KHTML_ALLOC_QCHAR_VEC( N ) (QChar*) fastMalloc( sizeof(QChar)*( N ) )
 #define KHTML_DELETE_QCHAR_VEC( P ) fastFree((char*)( P ))

 // Full support for MS Windows extensions to Latin-1.
 // Technically these extensions should only be activated for pages
 // marked "windows-1252" or "cp1252", but
 // in the standard Microsoft way, these extensions infect hundreds of thousands
 // of web pages.  Note that people with non-latin-1 Microsoft extensions
 // are SOL.
 //
 // See: http://www.microsoft.com/globaldev/reference/WinCP.asp
 //      http://www.bbsinc.com/iso8859.html
 //      http://www.obviously.com/
 //
 // There may be better equivalents

 // We need this for entities at least. For non-entity text, we could
 // handle this in the text encoding.

 // To cover non-entity text, I think this function would need to be called
 // in more places. There seem to be some places that don't call fixUpChar.

 static const unsigned short windowsLatin1ExtensionArray[32] = {
     0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
     0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
     0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
     0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178  // 98-9F
 };

 static inline QChar fixUpChar(QChar c)
 {
     unsigned short code = c.unicode();
     if ((code & ~0x1F) != 0x0080)
         return c;
     return windowsLatin1ExtensionArray[code - 0x80];
 }

 inline bool tagMatch(const char *s1, const QChar *s2, uint length)
 {
     for (uint i = 0; i != length; ++i) {
         char c1 = s1[i];
         char uc1 = toupper(c1);
         QChar c2 = s2[i];
         if (c1 != c2 && uc1 != c2)
             return false;
     }
     return true;
 }

 void Token::addAttribute(DocumentImpl* doc, const AtomicString& attrName, const AtomicString& v)
 {
     AttributeImpl* a = 0;
     if (!attrName.isEmpty() && attrName != "/") {
         a = new MappedAttributeImpl(attrName, v);
         if (!attrs)
             attrs = new NamedMappedAttrMapImpl(0);
         attrs->insertAttribute(a);
     }
 }

 // ----------------------------------------------------------------------------

 HTMLTokenizer::HTMLTokenizer(DocumentImpl* doc)
     : buffer(0)
     , scriptCode(0)
     , scriptCodeSize(0)
     , scriptCodeMaxSize(0)
     , scriptCodeResync(0)
     , m_executingScript(0)
     , m_timer(this, &HTMLTokenizer::timerFired)
     , m_doc(doc)
     , inWrite(false)
     , m_fragment(false)
 {
     parser = new HTMLParser(doc);
     begin();
 }

 HTMLTokenizer::HTMLTokenizer(DocumentFragmentImpl* frag)
     : buffer(0)
     , scriptCode(0)
     , scriptCodeSize(0)
     , scriptCodeMaxSize(0)
     , scriptCodeResync(0)
     , m_executingScript(0)
     , m_timer(this, &HTMLTokenizer::timerFired)
     , m_doc(frag->getDocument())
     , inWrite(false)
     , m_fragment(true)
 {
     parser = new HTMLParser(frag);
     begin();
 }

 void HTMLTokenizer::reset()
 {
     ASSERT(m_executingScript == 0);

     while (!pendingScripts.isEmpty()) {
       CachedScript *cs = pendingScripts.dequeue();
       ASSERT(cs->accessCount() > 0);
       cs->deref(this);
     }

     if (buffer)
         KHTML_DELETE_QCHAR_VEC(buffer);
     buffer = dest = 0;
     size = 0;

     if (scriptCode)
         KHTML_DELETE_QCHAR_VEC(scriptCode);
     scriptCode = 0;
     scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;

     m_timer.stop();
     m_state.setAllowYield(false);
     m_state.setForceSynchronous(false);

     currToken.reset();
 }

 void HTMLTokenizer::begin()
 {
     m_executingScript = 0;
     m_state.setLoadingExtScript(false);
     reset();
     size = 254;
     buffer = KHTML_ALLOC_QCHAR_VEC( 255 );
     dest = buffer;
     tquote = NoQuote;
     searchCount = 0;
     m_state.setEntityState(NoEntity);
     scriptSrc = QString::null;
     pendingSrc.clear();
     currentPrependingSrc = 0;
     noMoreData = false;
     brokenComments = false;
     brokenServer = false;
     lineno = 0;
     scriptStartLineno = 0;
     tagStartLineno = 0;
     m_state.setForceSynchronous(false);
 }

 void HTMLTokenizer::setForceSynchronous(bool force)
 {
     m_state.setForceSynchronous(force);
 }

 HTMLTokenizer::State HTMLTokenizer::processListing(SegmentedString list, State state)
 {
     // This function adds the listing 'list' as
     // preformatted text-tokens to the token-collection
     while (!list.isEmpty()) {
         if (state.skipLF()) {
             state.setSkipLF(false);
             if (*list == '\n') {
                 ++list;
                 continue;
             }
         }

         checkBuffer();

         if (*list == '\n' || *list == '\r') {
             if (state.discardLF())
                 // Ignore this LF
                 state.setDiscardLF(false); // We have discarded 1 LF
             else
                 *dest++ = '\n';

             /* Check for MS-DOS CRLF sequence */
             if (*list == '\r')
                 state.setSkipLF(true);

             ++list;
         } else {
             state.setDiscardLF(false);
             *dest++ = *list;
             ++list;
         }
     }

     return state;
 }

 HTMLTokenizer::State HTMLTokenizer::parseSpecial(SegmentedString &src, State state)
 {
     ASSERT(state.inTextArea() || state.inTitle() || !state.hasEntityState());
     ASSERT(!state.hasTagState());
     ASSERT(state.inXmp() + state.inTextArea() + state.inTitle() + state.inStyle() + state.inScript() == 1 );
     if (state.inScript())
         scriptStartLineno = lineno + src.lineCount();

     if (state.inComment())
         state = parseComment(src, state);

     while ( !src.isEmpty() ) {
         checkScriptBuffer();
         unsigned char ch = src->latin1();
         if (!scriptCodeResync && !brokenComments && !state.inTextArea() && !state.inXmp() && !state.inTitle() && ch == '-' && scriptCodeSize >= 3 && !src.escaped() && scriptCode[scriptCodeSize-3] == '<' && scriptCode[scriptCodeSize-2] == '!' && scriptCode[scriptCodeSize-1] == '-') {
             state.setInComment(true);
             state = parseComment(src, state);
             continue;
         }
         if ( scriptCodeResync && !tquote && ( ch == '>' ) ) {
             ++src;
             scriptCodeSize = scriptCodeResync-1;
             scriptCodeResync = 0;
             scriptCode[ scriptCodeSize ] = scriptCode[ scriptCodeSize + 1 ] = 0;
             if (state.inScript())
                 state = scriptHandler(state);
             else {
                 state = processListing(SegmentedString(scriptCode, scriptCodeSize), state);
                 processToken();
                 if (state.inStyle()) {
                     currToken.tagName = styleTag.localName();
                     currToken.beginTag = false;
                 } else if (state.inTextArea()) {
                     currToken.tagName = textareaTag.localName();
                     currToken.beginTag = false;
                 } else if (state.inTitle()) {
                     currToken.tagName = titleTag.localName();
                     currToken.beginTag = false;
                 } else if (state.inXmp()) {
                     currToken.tagName = xmpTag.localName();
                     currToken.beginTag = false;
                 }
                 processToken();
                 state.setInStyle(false);
                 state.setInScript(false);
                 state.setInTextArea(false);
                 state.setInTitle(false);
                 state.setInXmp(false);
                 tquote = NoQuote;
                 scriptCodeSize = scriptCodeResync = 0;
             }
             return state;
         }
         // possible end of tagname, lets check.
         if ( !scriptCodeResync && !state.escaped() && !src.escaped() && ( ch == '>' || ch == '/' || ch <= ' ' ) && ch &&
              scriptCodeSize >= searchStopperLen &&
              tagMatch( searchStopper, scriptCode+scriptCodeSize-searchStopperLen, searchStopperLen )) {
             scriptCodeResync = scriptCodeSize-searchStopperLen+1;
             tquote = NoQuote;
             continue;
         }
         if ( scriptCodeResync && !state.escaped() ) {
             if(ch == '\"')
                 tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
             else if(ch == '\'')
                 tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
             else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
                 tquote = NoQuote;
         }
         state.setEscaped(!state.escaped() && ch == '\\');
         if (!scriptCodeResync && (state.inTextArea() || state.inTitle()) && !src.escaped() && ch == '&') {
             QChar *scriptCodeDest = scriptCode+scriptCodeSize;
             ++src;
             state = parseEntity(src, scriptCodeDest, state, m_cBufferPos, true, false);
             scriptCodeSize = scriptCodeDest-scriptCode;
         }
         else {
             scriptCode[scriptCodeSize++] = fixUpChar(*src);
             ++src;
         }
     }

     return state;
 }

 HTMLTokenizer::State HTMLTokenizer::scriptHandler(State state)
 {
     // We are inside a <script>
     bool doScriptExec = false;

     // (Bugzilla 3837) Scripts following a frameset element should not execute or,
     // in the case of extern scripts, even load.
     bool followingFrameset = (parser->doc()->body() && parser->doc()->body()->hasTagName(framesetTag));

     CachedScript* cs = 0;
     // don't load external scripts for standalone documents (for now)
     if (!scriptSrc.isEmpty() && parser->doc()->frame()) {
         // forget what we just got; load from src url instead
         if (!parser->skipMode() && !followingFrameset) {
 #if INSTRUMENT_LAYOUT_SCHEDULING
             if (!parser->doc()->ownerElement())
                 printf("Requesting script at time %d\n", parser->doc()->elapsedTime());
 #endif
             if ( (cs = parser->doc()->docLoader()->requestScript(scriptSrc, scriptSrcCharset) ))
                 pendingScripts.enqueue(cs);
             else
                 scriptNode = 0;
         } else
             scriptNode = 0;
         scriptSrc=QString::null;
     }
     else {
 #ifdef TOKEN_DEBUG
         kdDebug( 6036 ) << "---START SCRIPT---" << endl;
         kdDebug( 6036 ) << QString(scriptCode, scriptCodeSize) << endl;
         kdDebug( 6036 ) << "---END SCRIPT---" << endl;
 #endif
         scriptNode = 0;
         // Parse scriptCode containing <script> info
         doScriptExec = true;
     }
     state = processListing(SegmentedString(scriptCode, scriptCodeSize), state);
     QString exScript( buffer, dest-buffer );
     processToken();
     currToken.tagName = scriptTag.localName();
     currToken.beginTag = false;
     processToken();

     SegmentedString *savedPrependingSrc = currentPrependingSrc;
     SegmentedString prependingSrc;
     currentPrependingSrc = &prependingSrc;
     if (!parser->skipMode() && !followingFrameset) {
         if (cs) {
             if (savedPrependingSrc) {
                 savedPrependingSrc->append(src);
             } else {
                 pendingSrc.prepend(src);
             }
             setSrc(SegmentedString());
             scriptCodeSize = scriptCodeResync = 0;

             // the ref() call below may call notifyFinished if the script is already in cache,
             // and that mucks with the state directly, so we must write it back to the object.
             m_state = state;
             cs->ref(this);
             state = m_state;
             // will be 0 if script was already loaded and ref() executed it
             if (!pendingScripts.isEmpty())
                 state.setLoadingExtScript(true);
         }
         else if (!m_fragment && doScriptExec && javascript ) {
             if (!m_executingScript)
                 pendingSrc.prepend(src);
             else
                 prependingSrc = src;
             setSrc(SegmentedString());
             scriptCodeSize = scriptCodeResync = 0;
             state = scriptExecution(exScript, state, QString::null, scriptStartLineno);
         }
     }

     state.setInScript(false);
     scriptCodeSize = scriptCodeResync = 0;

     if (!m_executingScript && !state.loadingExtScript()) {
         src.append(pendingSrc);
         pendingSrc.clear();
     } else if (!prependingSrc.isEmpty()) {
         // restore first so that the write appends in the right place
         // (does not hurt to do it again below)
         currentPrependingSrc = savedPrependingSrc;

         // we need to do this slightly modified bit of one of the write() cases
         // because we want to prepend to pendingSrc rather than appending
         // if there's no previous prependingSrc
         if (state.loadingExtScript()) {
             if (currentPrependingSrc) {
                 currentPrependingSrc->append(prependingSrc);
             } else {
                 pendingSrc.prepend(prependingSrc);
             }
         } else {
             m_state = state;
             write(prependingSrc, false);
             state = m_state;
         }
     }

     currentPrependingSrc = savedPrependingSrc;

     return state;
 }

 HTMLTokenizer::State HTMLTokenizer::scriptExecution(const QString& str, State state, QString scriptURL, int baseLine)
 {
     if (m_fragment || !m_doc->frame())
         return state;
     bool oldscript = state.inScript();
     m_executingScript++;
     state.setInScript(false);
     QString url = scriptURL.isNull() ? m_doc->frame()->document()->URL() : scriptURL;

     SegmentedString *savedPrependingSrc = currentPrependingSrc;
     SegmentedString prependingSrc;
     currentPrependingSrc = &prependingSrc;

 #if INSTRUMENT_LAYOUT_SCHEDULING
     if (!parser->doc()->ownerElement())
         printf("beginning script execution at %d\n", parser->doc()->elapsedTime());
 #endif

     m_state = state;
     m_doc->frame()->executeScript(url,baseLine,0,str);
     state = m_state;

     state.setAllowYield(true);

 #if INSTRUMENT_LAYOUT_SCHEDULING
     if (!parser->doc()->ownerElement())
         printf("ending script execution at %d\n", parser->doc()->elapsedTime());
 #endif

     m_executingScript--;
     state.setInScript(oldscript);

     if (!m_executingScript && !state.loadingExtScript()) {
         src.append(pendingSrc);
         pendingSrc.clear();
     } else if (!prependingSrc.isEmpty()) {
         // restore first so that the write appends in the right place
         // (does not hurt to do it again below)
         currentPrependingSrc = savedPrependingSrc;

         // we need to do this slightly modified bit of one of the write() cases
         // because we want to prepend to pendingSrc rather than appending
         // if there's no previous prependingSrc
         if (state.loadingExtScript()) {
             if (currentPrependingSrc) {
                 currentPrependingSrc->append(prependingSrc);
             } else {
                 pendingSrc.prepend(prependingSrc);
             }
         } else {
             m_state = state;
             write(prependingSrc, false);
             state = m_state;
         }
     }

     currentPrependingSrc = savedPrependingSrc;

     return state;
 }

 HTMLTokenizer::State HTMLTokenizer::parseComment(SegmentedString &src, State state)
 {
     // FIXME: Why does this code even run for comments inside <script> and <style>? This seems bogus.
     bool strict = !parser->doc()->inCompatMode() && !state.inScript() && !state.inStyle();
     int delimiterCount = 0;
     bool canClose = false;
     checkScriptBuffer(src.length());
     while ( !src.isEmpty() ) {
         scriptCode[ scriptCodeSize++ ] = *src;
 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
         qDebug("comment is now: *%s*",
                QConstString((QChar*)src.operator->(), kMin(16U, src.length())).qstring().latin1());
 #endif

         if (strict) {
             if (src->unicode() == '-') {
                 delimiterCount++;
                 if (delimiterCount == 2) {
                     delimiterCount = 0;
                     canClose = !canClose;
                 }
             }
             else
                 delimiterCount = 0;
         }

         if ((!strict || canClose) && src->unicode() == '>') {
             bool handleBrokenComments = brokenComments && !(state.inScript() || state.inStyle());
             int endCharsCount = 1; // start off with one for the '>' character
             if (!strict) {
                 // In quirks mode just check for -->
                 if (scriptCodeSize > 2 && scriptCode[scriptCodeSize-3] == '-' && scriptCode[scriptCodeSize-2] == '-') {
                     endCharsCount = 3;
                 }
                 else if (scriptCodeSize > 3 && scriptCode[scriptCodeSize-4] == '-' && scriptCode[scriptCodeSize-3] == '-' &&
                     scriptCode[scriptCodeSize-2] == '!') {
                     // Other browsers will accept --!> as a close comment, even though it's
                     // not technically valid.
                     endCharsCount = 4;
                 }
             }
             if (canClose || handleBrokenComments || endCharsCount > 1) {
                 ++src;
                 if (!(state.inScript() || state.inXmp() || state.inTextArea() || state.inStyle())) {
 #ifdef INCLUDE_COMMENTS_IN_DOM // FIXME: Turn this on soon.
                     checkScriptBuffer();
                     scriptCode[scriptCodeSize] = 0;
                     scriptCode[scriptCodeSize + 1] = 0;
                     currToken.tagName = commentAtom;
                     currToken.beginTag = true;
                     state = processListing(SegmentedString(scriptCode, scriptCodeSize - endCharsCount), state);
                     processToken();
                     currToken.tagName = commentAtom;
                     currToken.beginTag = false;
                     processToken();
 #endif
                     scriptCodeSize = 0;
                 }
                 state.setInComment(false);
                 return state; // Finished parsing comment
             }
         }
         ++src;
     }

     return state;
 }

 HTMLTokenizer::State HTMLTokenizer::parseServer(SegmentedString& src, State state)
 {
     checkScriptBuffer(src.length());
     while (!src.isEmpty()) {
         scriptCode[scriptCodeSize++] = *src;
         if (src->unicode() == '>' &&
             scriptCodeSize > 1 && scriptCode[scriptCodeSize-2] == '%') {
             ++src;
             state.setInServer(false);
             scriptCodeSize = 0;
             return state; // Finished parsing server include
         }
         ++src;
     }
     return state;
 }

 HTMLTokenizer::State HTMLTokenizer::parseProcessingInstruction(SegmentedString &src, State state)
 {
     char oldchar = 0;
     while ( !src.isEmpty() )
     {
         unsigned char chbegin = src->latin1();
         if(chbegin == '\'') {
             tquote = tquote == SingleQuote ? NoQuote : SingleQuote;
         }
         else if(chbegin == '\"') {
             tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;
         }
         // Look for '?>'
         // some crappy sites omit the "?" before it, so
         // we look for an unquoted '>' instead. (IE compatible)
         else if ( chbegin == '>' && ( !tquote || oldchar == '?' ) )
         {
             // We got a '?>' sequence
             state.setInProcessingInstruction(false);
             ++src;
             state.setDiscardLF(true);
             return state; // Finished parsing comment!
         }
         ++src;
         oldchar = chbegin;
     }

     return state;
 }

 HTMLTokenizer::State HTMLTokenizer::parseText(SegmentedString &src, State state)
 {
     while (!src.isEmpty()) {
         unsigned short cc = src->unicode();

         if (state.skipLF()) {
             state.setSkipLF(false);
             if (cc == '\n') {
                 ++src;
                 continue;
             }
         }

         // do we need to enlarge the buffer?
         checkBuffer();

         if (cc == '\r') {
             state.setSkipLF(true);
             *dest++ = '\n';
         } else
             *dest++ = fixUpChar(cc);
         ++src;
     }

     return state;
 }


 HTMLTokenizer::State HTMLTokenizer::parseEntity(SegmentedString &src, QChar *&dest, State state, unsigned &cBufferPos, bool start, bool parsingTag)
 {
     if (start)
     {
         cBufferPos = 0;
         state.setEntityState(SearchEntity);
         EntityUnicodeValue = 0;
     }

     while(!src.isEmpty())
     {
         unsigned short cc = src->unicode();
         switch(state.entityState()) {
         case NoEntity:
             ASSERT(state.entityState() != NoEntity);
             return state;

         case SearchEntity:
             if(cc == '#') {
                 cBuffer[cBufferPos++] = cc;
                 ++src;
                 state.setEntityState(NumericSearch);
             }
             else
                 state.setEntityState(EntityName);

             break;

         case NumericSearch:
             if(cc == 'x' || cc == 'X') {
                 cBuffer[cBufferPos++] = cc;
                 ++src;
                 state.setEntityState(Hexadecimal);
             }
             else if(cc >= '0' && cc <= '9')
                 state.setEntityState(Decimal);
             else
                 state.setEntityState(SearchSemicolon);

             break;

         case Hexadecimal:
         {
             int ll = kMin(src.length(), 10-cBufferPos);
             while(ll--) {
                 QChar csrc(src->lower());
                 cc = csrc.cell();

                 if(csrc.row() || !((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f'))) {
                     state.setEntityState(SearchSemicolon);
                     break;
                 }
                 EntityUnicodeValue = EntityUnicodeValue*16 + (cc - ( cc < 'a' ? '0' : 'a' - 10));
                 cBuffer[cBufferPos++] = cc;
                 ++src;
             }
             if (cBufferPos == 10)
                 state.setEntityState(SearchSemicolon);
             break;
         }
         case Decimal:
         {
             int ll = kMin(src.length(), 9-cBufferPos);
             while(ll--) {
                 cc = src->cell();

                 if(src->row() || !(cc >= '0' && cc <= '9')) {
                     state.setEntityState(SearchSemicolon);
                     break;
                 }

                 EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0');
                 cBuffer[cBufferPos++] = cc;
                 ++src;
             }
             if (cBufferPos == 9)
                 state.setEntityState(SearchSemicolon);
             break;
         }
         case EntityName:
         {
             int ll = kMin(src.length(), 9-cBufferPos);
             while(ll--) {
                 QChar csrc = *src;
                 cc = csrc.cell();

                 if(csrc.row() || !((cc >= 'a' && cc <= 'z') ||
                                    (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
                     state.setEntityState(SearchSemicolon);
                     break;
                 }

                 cBuffer[cBufferPos++] = cc;
                 ++src;
             }
             if (cBufferPos == 9)
                 state.setEntityState(SearchSemicolon);
             if (state.entityState() == SearchSemicolon) {
                 if(cBufferPos > 1) {
                     const Entity *e = findEntity(cBuffer, cBufferPos);
                     if(e)
                         EntityUnicodeValue = e->code;

                     // be IE compatible
                     if(parsingTag && EntityUnicodeValue > 255 && *src != ';')
                         EntityUnicodeValue = 0;
                 }
             }
             else
                 break;
         }
         case SearchSemicolon:
             // Don't allow values that are more than 21 bits.
             if (EntityUnicodeValue > 0 && EntityUnicodeValue <= 0x1FFFFF) {

                 if (*src == ';')
                     ++src;

                 if (EntityUnicodeValue <= 0xFFFF) {
                     checkBuffer();
                     src.push(fixUpChar(EntityUnicodeValue));
                 } else {
                     // Convert to UTF-16, using surrogate code points.
                     QChar c1(0xD800 | (((EntityUnicodeValue >> 16) - 1) << 6) | ((EntityUnicodeValue >> 10) & 0x3F));
                     QChar c2(0xDC00 | (EntityUnicodeValue & 0x3FF));
                     checkBuffer(2);
                     src.push(c1);
                     src.push(c2);
                 }
             } else {
                 checkBuffer(10);
                 // ignore the sequence, add it to the buffer as plaintext
                 *dest++ = '&';
                 for(unsigned int i = 0; i < cBufferPos; i++)
                     dest[i] = cBuffer[i];
                 dest += cBufferPos;
             }

             state.setEntityState(NoEntity);
             return state;
         }
     }

     return state;
 }

 HTMLTokenizer::State HTMLTokenizer::parseTag(SegmentedString &src, State state)
 {
     ASSERT(!state.hasEntityState());

     unsigned cBufferPos = m_cBufferPos;

     while (!src.isEmpty())
     {
         checkBuffer();
 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
         uint l = 0;
         while(l < src.length() && (*(src.operator->()+l)).latin1() != '>')
             l++;
         qDebug("src is now: *%s*, tquote: %d",
                QConstString((QChar*)src.operator->(), l).qstring().latin1(), tquote);
 #endif
         switch(state.tagState()) {
         case NoTag:
         {
             m_cBufferPos = cBufferPos;
             return state;
         }
         case TagName:
         {
 #if defined(TOKEN_DEBUG) &&  TOKEN_DEBUG > 1
             qDebug("TagName");
 #endif
             if (searchCount > 0)
             {
                 if (*src == commentStart[searchCount])
                 {
                     searchCount++;
                     if (searchCount == 4)
                     {
 #ifdef TOKEN_DEBUG
                         kdDebug( 6036 ) << "Found comment" << endl;
 #endif
                         // Found '<!--' sequence
                         ++src;
                         dest = buffer; // ignore the previous part of this tag
                         state.setInComment(true);
                         state.setTagState(NoTag);

                         // Fix bug 34302 at kde.bugs.org.  Go ahead and treat
                         // <!--> as a valid comment, since both mozilla and IE on windows
                         // can handle this case.  Only do this in quirks mode. -dwh
                         if (!src.isEmpty() && *src == '>' && parser->doc()->inCompatMode()) {
                           state.setInComment(false);
                           ++src;
                           if (!src.isEmpty())
                               cBuffer[cBufferPos++] = src->cell();
                         }
                         else
                           state = parseComment(src, state);

                         m_cBufferPos = cBufferPos;
                         return state; // Finished parsing tag!
                     }
                     // cuts of high part, is okay
                     cBuffer[cBufferPos++] = src->cell();
                     ++src;
                     break;
                 }
                 else
                     searchCount = 0; // Stop looking for '<!--' sequence
             }

             bool finish = false;
             unsigned int ll = kMin(src.length(), CBUFLEN-cBufferPos);
             while(ll--) {
                 unsigned short curchar = *src;
                 if(curchar <= ' ' || curchar == '>' ) {
                     finish = true;
                     break;
                 }

                 // tolower() shows up on profiles. This is faster!
                 if (curchar >= 'A' && curchar <= 'Z')
                     cBuffer[cBufferPos++] = curchar + ('a' - 'A');
                 else
                     cBuffer[cBufferPos++] = curchar;
                 ++src;
             }

             // Disadvantage: we add the possible rest of the tag
             // as attribute names. ### judge if this causes problems
             if(finish || CBUFLEN == cBufferPos) {
                 bool beginTag;
                 char* ptr = cBuffer;
                 unsigned int len = cBufferPos;
                 cBuffer[cBufferPos] = '\0';
                 if ((cBufferPos > 0) && (*ptr == '/'))
                 {
                     // End Tag
                     beginTag = false;
                     ptr++;
                     len--;
                 }
                 else
                     // Start Tag
                     beginTag = true;

                 // Ignore the / in fake xml tags like <br/>.  We trim off the "/" so that we'll get "br" as the tag name and not "br/".
                 if (len > 1 && ptr[len-1] == '/')
                     ptr[--len] = '\0';

                 // Now that we've shaved off any invalid / that might have followed the name), make the tag.
                 if (ptr[0] != '!' && strcmp(ptr, "!doctype") != 0) {
                     currToken.tagName = AtomicString(ptr);
                     currToken.beginTag = beginTag;
                 }
                 dest = buffer;
                 state.setTagState(SearchAttribute);
                 cBufferPos = 0;
             }
             break;
         }
         case SearchAttribute:
         {
 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
                 qDebug("SearchAttribute");
 #endif
             bool atespace = false;
             unsigned short curchar;
             while(!src.isEmpty()) {
                 curchar = *src;
                 // In this mode just ignore any quotes we encounter and treat them like spaces.
                 if (curchar > ' ' && curchar != '\'' && curchar != '"') {
                     if (curchar == '<' || curchar == '>')
                         state.setTagState(SearchEnd);
                     else
                         state.setTagState(AttributeName);

                     cBufferPos = 0;
                     break;
                 }
                 atespace = true;
                 ++src;
             }
             break;
         }
         case AttributeName:
         {
 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
                 qDebug("AttributeName");
 #endif
             unsigned short curchar;
             int ll = kMin(src.length(), CBUFLEN-cBufferPos);

             while(ll--) {
                 curchar = *src;
                 if (curchar <= '>' && (curchar >= '=' || curchar <= ' ')) {
                     cBuffer[cBufferPos] = '\0';
                     attrName = AtomicString(cBuffer);
                     dest = buffer;
                     *dest++ = 0;
                     state.setTagState(SearchEqual);
                     // This is a deliberate quirk to match Mozilla and Opera.  We have to do this
                     // since sites that use the "standards-compliant" path sometimes send
                     // <script src="foo.js"/>.  Both Moz and Opera will honor this, despite it
                     // being bogus HTML.  They do not honor the "/" for other tags.  This behavior
                     // also deviates from WinIE, but in this case we'll just copy Moz and Opera.
                     if (currToken.tagName == scriptTag && curchar == '>' && attrName == "/")
                         currToken.flat = true;
                     break;
                 }

                 // tolower() shows up on profiles. This is faster!
                 if (curchar >= 'A' && curchar <= 'Z')
                     cBuffer[cBufferPos++] = curchar + ('a' - 'A');
                 else
                     cBuffer[cBufferPos++] = curchar;
                 ++src;
             }
             if ( cBufferPos == CBUFLEN ) {
                 cBuffer[cBufferPos] = '\0';
                 attrName = AtomicString(cBuffer);
                 dest = buffer;
                 *dest++ = 0;
                 state.setTagState(SearchEqual);
             }
             break;
         }
         case SearchEqual:
         {
 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
                 qDebug("SearchEqual");
 #endif
             unsigned short curchar;
             bool atespace = false;
             while(!src.isEmpty()) {
                 curchar = src->unicode();
                 // In this mode just ignore any quotes we encounter and treat them like spaces.
                 if (curchar > ' ' && curchar != '\'' && curchar != '"') {
                     if(curchar == '=') {
 #ifdef TOKEN_DEBUG
                         kdDebug(6036) << "found equal" << endl;
 #endif
                         state.setTagState(SearchValue);
                         ++src;
                     }
                     else {
                         currToken.addAttribute(parser->doc(), attrName, emptyAtom);
                         dest = buffer;
                         state.setTagState(SearchAttribute);
                     }
                     break;
                 }
                 atespace = true;
                 ++src;
             }
             break;
         }
         case SearchValue:
         {
             unsigned short curchar;
             while(!src.isEmpty()) {
                 curchar = src->unicode();
                 if(curchar > ' ') {
                     if(( curchar == '\'' || curchar == '\"' )) {
                         tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
                         state.setTagState(QuotedValue);
                         ++src;
                     } else
                         state.setTagState(Value);

                     break;
                 }
                 ++src;
             }
             break;
         }
         case QuotedValue:
         {
 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
                 qDebug("QuotedValue");
 #endif
             unsigned short curchar;
             while(!src.isEmpty()) {
                 checkBuffer();

                 curchar = src->unicode();
                 if (curchar == '>' && attrName.isEmpty()) {
                     // Handle a case like <img '>.  Just go ahead and be willing
                     // to close the whole tag.  Don't consume the character and
                     // just go back into SearchEnd while ignoring the whole
                     // value.
                     // FIXME: Note that this is actually not a very good solution. It's
                     // an interim hack and doesn't handle the general case of
                     // unmatched quotes among attributes that have names. -dwh
                     while(dest > buffer+1 && (*(dest-1) == '\n' || *(dest-1) == '\r'))
                         dest--; // remove trailing newlines
                     AtomicString v(buffer+1, dest-buffer-1);
                     attrName = v; // Just make the name/value match. (FIXME: Is this some WinIE quirk?)
                     currToken.addAttribute(parser->doc(), attrName, v);
                     state.setTagState(SearchAttribute);
                     dest = buffer;
                     tquote = NoQuote;
                     break;
                 }

                 if(curchar <= '\'' && !src.escaped()) {
                     // ### attributes like '&{blaa....};' are supposed to be treated as jscript.
                     if ( curchar == '&' )
                     {
                         ++src;
                         state = parseEntity(src, dest, state, cBufferPos, true, true);
                         break;
                     }
                     else if ( (tquote == SingleQuote && curchar == '\'') ||
                               (tquote == DoubleQuote && curchar == '\"') )
                     {
                         // some <input type=hidden> rely on trailing spaces. argh
                         while(dest > buffer+1 && (*(dest-1) == '\n' || *(dest-1) == '\r'))
                             dest--; // remove trailing newlines
                         AtomicString v(buffer+1, dest-buffer-1);
                         if (attrName.isEmpty())
                             attrName = v; // Make the name match the value. (FIXME: Is this a WinIE quirk?)
                         currToken.addAttribute(parser->doc(), attrName, v);

                         dest = buffer;
                         state.setTagState(SearchAttribute);
                         tquote = NoQuote;
                         ++src;
                         break;
                     }
                 }
                 *dest++ = fixUpChar(*src);
                 ++src;
             }
             break;
         }
         case Value:
         {
 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
             qDebug("Value");
 #endif
             unsigned short curchar;
             while(!src.isEmpty()) {
                 checkBuffer();
                 curchar = src->unicode();
                 if(curchar <= '>' && !src.escaped()) {
                     // parse Entities
                     if ( curchar == '&' )
                     {
                         ++src;
                         state = parseEntity(src, dest, state, cBufferPos, true, true);
                         break;
                     }
                     // no quotes. Every space means end of value
                     // '/' does not delimit in IE!
                     if ( curchar <= ' ' || curchar == '>' )
                     {
                         AtomicString v(buffer+1, dest-buffer-1);
                         currToken.addAttribute(parser->doc(), attrName, v);
                         dest = buffer;
                         state.setTagState(SearchAttribute);
                         break;
                     }
                 }

                 *dest++ = fixUpChar(*src);
                 ++src;
             }
             break;
         }
         case SearchEnd:
         {
 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
                 qDebug("SearchEnd");
 #endif
             while(!src.isEmpty()) {
                 if (*src == '>' || *src == '<')
                     break;

                 if (*src == '/')
                     currToken.flat = true;

                 ++src;
             }
             if (src.isEmpty()) break;

             searchCount = 0; // Stop looking for '<!--' sequence
             state.setTagState(NoTag);
             tquote = NoQuote;

             if (*src != '<')
                 ++src;

             if (currToken.tagName == nullAtom) { //stop if tag is unknown
                 m_cBufferPos = cBufferPos;
                 return state;
             }

             AtomicString tagName = currToken.tagName;
 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 0
             kdDebug( 6036 ) << "appending Tag: " << tagName.qstring() << endl;
 #endif

             // Handle <script src="foo"/> like Mozilla/Opera. We have to do this now for Dashboard
             // compatibility.
             bool isSelfClosingScript = currToken.flat && currToken.beginTag && currToken.tagName == scriptTag;
             bool beginTag = !currToken.flat && currToken.beginTag;
             if (currToken.beginTag && currToken.tagName == scriptTag) {
                 AttributeImpl* a = 0;
                 bool foundTypeAttribute = false;
                 scriptSrc = QString::null;
                 scriptSrcCharset = QString::null;
                 if ( currToken.attrs && /* potentially have a ATTR_SRC ? */
                      parser->doc()->frame() &&
                      parser->doc()->frame()->jScriptEnabled() && /* jscript allowed at all? */
                      !m_fragment /* are we a regular tokenizer or just for innerHTML ? */
                     ) {
                     if ((a = currToken.attrs->getAttributeItem(srcAttr)))
                         scriptSrc = parser->doc()->completeURL(parseURL(a->value()).qstring());
                     if ((a = currToken.attrs->getAttributeItem(charsetAttr)))
                         scriptSrcCharset = a->value().qstring().stripWhiteSpace();
                     if ( scriptSrcCharset.isEmpty() )
                         scriptSrcCharset = parser->doc()->frame()->encoding();
                     /* Check type before language, since language is deprecated */
                     if ((a = currToken.attrs->getAttributeItem(typeAttr)) != 0 && !a->value().isEmpty())
                         foundTypeAttribute = true;
                     else
                         a = currToken.attrs->getAttributeItem(languageAttr);
                 }
                 javascript = true;

                 if( foundTypeAttribute ) {
                     /*
                         Mozilla 1.5 accepts application/x-javascript, and some web references claim it is the only
                         correct variation, but WinIE 6 doesn't accept it.
                         Neither Mozilla 1.5 nor WinIE 6 accept application/javascript, application/ecmascript, or
                         application/x-ecmascript.
                         Mozilla 1.5 doesn't accept the text/javascript1.x formats, but WinIE 6 does.
                         Mozilla 1.5 doesn't accept text/jscript, text/ecmascript, and text/livescript, but WinIE 6 does.
                         Mozilla 1.5 allows leading and trailing whitespace, but WinIE 6 doesn't.
                         Mozilla 1.5 and WinIE 6 both accept the empty string, but neither accept a whitespace-only string.
                         We want to accept all the values that either of these browsers accept, but not other values.
                      */
                     QString type = a->value().qstring().stripWhiteSpace().lower();
                     if( type.compare("application/x-javascript") != 0 &&
                         type.compare("text/javascript") != 0 &&
                         type.compare("text/javascript1.0") != 0 &&
                         type.compare("text/javascript1.1") != 0 &&
                         type.compare("text/javascript1.2") != 0 &&
                         type.compare("text/javascript1.3") != 0 &&
                         type.compare("text/javascript1.4") != 0 &&
                         type.compare("text/javascript1.5") != 0 &&
                         type.compare("text/jscript") != 0 &&
                         type.compare("text/ecmascript") != 0 &&
                         type.compare("text/livescript") )
                         javascript = false;
                 } else if( a ) {
                     /*
                      Mozilla 1.5 doesn't accept jscript or ecmascript, but WinIE 6 does.
                      Mozilla 1.5 accepts javascript1.0, javascript1.4, and javascript1.5, but WinIE 6 accepts only 1.1 - 1.3.
                      Neither Mozilla 1.5 nor WinIE 6 accept leading or trailing whitespace.
                      We want to accept all the values that either of these browsers accept, but not other values.
                      */
                     DOMString lang = a->value().domString().lower();
                     if( lang != "" &&
                         lang != "javascript" &&
                         lang != "javascript1.0" &&
                         lang != "javascript1.1" &&
                         lang != "javascript1.2" &&
                         lang != "javascript1.3" &&
                         lang != "javascript1.4" &&
                         lang != "javascript1.5" &&
                         lang != "ecmascript" &&
                         lang != "livescript" &&
                         lang != "jscript")
                         javascript = false;
                 }
             }

             RefPtr<NodeImpl> n = processToken();

             if (tagName == preTag) {
                 if (beginTag)
                     state.setDiscardLF(true); // Discard the first LF after we open a pre.
             } else if (tagName == scriptTag) {
                 ASSERT(!scriptNode);
                 scriptNode = n;
                 if (beginTag) {
                     searchStopper = scriptEnd;
                     searchStopperLen = 8;
                     state.setInScript(true);
                     state = parseSpecial(src, state);
                 } else if (isSelfClosingScript) { // Handle <script src="foo"/>
                     state.setInScript(true);
                     state = scriptHandler(state);
                 }
             } else if (tagName == styleTag) {
                 if (beginTag) {
                     searchStopper = styleEnd;
                     searchStopperLen = 7;
                     state.setInStyle(true);
                     state = parseSpecial(src, state);
                 }
             } else if (tagName == textareaTag) {
                 if (beginTag) {
                     searchStopper = textareaEnd;
                     searchStopperLen = 10;
                     state.setInTextArea(true);
                     state = parseSpecial(src, state);
                 }
             } else if (tagName == titleTag) {
                 if (beginTag) {
                     searchStopper = titleEnd;
                     searchStopperLen = 7;
                     State savedState = state;
                     SegmentedString savedSrc = src;
                     long savedLineno = lineno;
                     state.setInTitle(true);
                     state = parseSpecial(src, state);
                     if (state.inTitle() && src.isEmpty()) {
                         // We just ate the rest of the document as the title #text node!
                         // Reset the state then retokenize without special title handling.
                         // Let the parser clean up the missing </title> tag.
                         // FIXME: This is incorrect, because src.isEmpty() doesn't mean we're
                         // at the end of the document unless noMoreData is also true. We need
                         // to detect this case elsewhere, and save the state somewhere other
                         // than a local variable.
                         state = savedState;
                         src = savedSrc;
                         lineno = savedLineno;
                         scriptCodeSize = 0;
                     }
                 }
             } else if (tagName == xmpTag) {
                 if (beginTag) {
                     searchStopper = xmpEnd;
                     searchStopperLen = 5;
                     state.setInXmp(true);
                     state = parseSpecial(src, state);
                 }
             } else if (tagName == selectTag)
                 state.setInSelect(beginTag);
             else if (tagName == plaintextTag)
                 state.setInPlainText(beginTag);
             m_cBufferPos = cBufferPos;
             return state; // Finished parsing tag!
         }
         } // end switch
     }
     m_cBufferPos = cBufferPos;
     return state;
 }

 inline bool HTMLTokenizer::continueProcessing(int& processedCount, double startTime, State &state)
 {
     // We don't want to be checking elapsed time with every character, so we only check after we've
     // processed a certain number of characters.
     bool allowedYield = state.allowYield();
     state.setAllowYield(false);
     if (!state.loadingExtScript() && !state.forceSynchronous() && !m_executingScript && (processedCount > TOKENIZER_CHUNK_SIZE || allowedYield)) {
         processedCount = 0;
         if (currentTime() - startTime > tokenizerTimeDelay) {
             /* FIXME: We'd like to yield aggressively to give stylesheets the opportunity to
                load, but this hurts overall performance on slower machines.  For now turn this
                off.
             || (!parser->doc()->haveStylesheetsLoaded() &&
                 (parser->doc()->documentElement()->id() != ID_HTML || parser->doc()->body()))) {*/
             // Schedule the timer to keep processing as soon as possible.
             m_timer.startOneShot(0);
 #if INSTRUMENT_LAYOUT_SCHEDULING
             if (currentTime() - startTime > tokenizerTimeDelay)
                 printf("Deferring processing of data because 500ms elapsed away from event loop.\n");
 #endif
             return false;
         }
     }

     processedCount++;
     return true;
 }

 bool HTMLTokenizer::write(const SegmentedString &str, bool appendData)
 {
 #ifdef TOKEN_DEBUG
     kdDebug( 6036 ) << this << " Tokenizer::write(\"" << str.toString() << "\"," << appendData << ")" << endl;
 #endif

     if (!buffer)
         return false;

     if (m_parserStopped)
         return false;

     if ( ( m_executingScript && appendData ) || !pendingScripts.isEmpty() ) {
         // don't parse; we will do this later
         if (currentPrependingSrc) {
             currentPrependingSrc->append(str);
         } else {
             pendingSrc.append(str);
         }
         return false;
     }

     if (!src.isEmpty())
         src.append(str);
     else
         setSrc(str);

     // Once a timer is set, it has control of when the tokenizer continues.
     if (m_timer.isActive())
         return false;

     bool wasInWrite = inWrite;
     inWrite = true;

 #if INSTRUMENT_LAYOUT_SCHEDULING
     if (!parser->doc()->ownerElement())
         printf("Beginning write at time %d\n", parser->doc()->elapsedTime());
 #endif

     int processedCount = 0;
     double startTime = currentTime();

     Frame *frame = parser->doc()->frame();

     State state = m_state;

     while (!src.isEmpty() && (!frame || !frame->isScheduledLocationChangePending())) {
         if (!continueProcessing(processedCount, startTime, state))
             break;

         // do we need to enlarge the buffer?
         checkBuffer();

         unsigned short cc = src->unicode();

         bool wasSkipLF = state.skipLF();
         if (wasSkipLF)
             state.setSkipLF(false);

         if (wasSkipLF && (cc == '\n'))
             ++src;
         else if (state.needsSpecialWriteHandling()) {
             // it's important to keep needsSpecialWriteHandling with the flags this block tests
             if (state.hasEntityState())
                 state = parseEntity(src, dest, state, m_cBufferPos, false, state.hasTagState());
             else if (state.inPlainText())
                 state = parseText(src, state);
             else if (state.inAnySpecial())
                 state = parseSpecial(src, state);
             else if (state.inComment())
                 state = parseComment(src, state);
             else if (state.inServer())
                 state = parseServer(src, state);
             else if (state.inProcessingInstruction())
                 state = parseProcessingInstruction(src, state);
             else if (state.hasTagState())
                 state = parseTag(src, state);
             else if (state.startTag()) {
                 state.setStartTag(false);

                 switch(cc) {
                 case '/':
                     break;
                 case '!': {
                     // <!-- comment -->
                     searchCount = 1; // Look for '<!--' sequence to start comment

                     break;
                 }
                 case '?': {
                     // xml processing instruction
                     state.setInProcessingInstruction(true);
                     tquote = NoQuote;
                     state = parseProcessingInstruction(src, state);
                     continue;

                     break;
                 }
                 case '%':
                     if (!brokenServer) {
                         // <% server stuff, handle as comment %>
                         state.setInServer(true);
                         tquote = NoQuote;
                         state = parseServer(src, state);
                         continue;
                     }
                     // else fall through
                 default: {
                     if( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z'))) {
                         // Start of a Start-Tag
                     } else {
                         // Invalid tag
                         // Add as is
                         *dest = '<';
                         dest++;
                         continue;
                     }
                 }
                 }; // end case

                 processToken();

                 m_cBufferPos = 0;
                 state.setTagState(TagName);
                 state = parseTag(src, state);
             }
         } else if (cc == '&' && !src.escaped()) {
             ++src;
             state = parseEntity(src, dest, state, m_cBufferPos, true, state.hasTagState());
         } else if (cc == '<' && !src.escaped()) {
             tagStartLineno = lineno+src.lineCount();
             ++src;
             state.setStartTag(true);
         } else if (cc == '\n' || cc == '\r') {
             if (state.discardLF())
                 // Ignore this LF
                 state.setDiscardLF(false); // We have discarded 1 LF
             else
                 // Process this LF
                 *dest++ = '\n';

             /* Check for MS-DOS CRLF sequence */
             if (cc == '\r')
                 state.setSkipLF(true);
             ++src;
         } else {
             state.setDiscardLF(false);
             *dest++ = fixUpChar(*src);
             ++src;
         }
     }

 #if INSTRUMENT_LAYOUT_SCHEDULING
     if (!parser->doc()->ownerElement())
         printf("Ending write at time %d\n", parser->doc()->elapsedTime());
 #endif

     inWrite = wasInWrite;

     m_state = state;

     if (noMoreData && !inWrite && !state.loadingExtScript() && !m_executingScript && !m_timer.isActive()) {
         end(); // this actually causes us to be deleted
         return true;
     }
     return false;
 }

 void HTMLTokenizer::stopParsing()
 {
     Tokenizer::stopParsing();
     m_timer.stop();

     // The part needs to know that the tokenizer has finished with its data,
     // regardless of whether it happened naturally or due to manual intervention.
     if (!m_fragment && m_doc->frame())
         m_doc->frame()->tokenizerProcessedData();
 }

 bool HTMLTokenizer::processingData() const
 {
     return m_timer.isActive();
 }

 void HTMLTokenizer::timerFired(Timer<HTMLTokenizer>*)
 {
 #if INSTRUMENT_LAYOUT_SCHEDULING
     if (!parser->doc()->ownerElement())
         printf("Beginning timer write at time %d\n", parser->doc()->elapsedTime());
 #endif

     if (parser->doc()->view() && parser->doc()->view()->layoutPending() && !parser->doc()->minimumLayoutDelay()) {
         // Restart the timer and let layout win.  This is basically a way of ensuring that the layout
         // timer has higher priority than our timer.
         m_timer.startOneShot(0);
         return;
     }

     RefPtr<Frame> frame = m_fragment ? 0 : m_doc->frame();

     // Invoke write() as though more data came in.
     bool didCallEnd = write(SegmentedString(), true);

     // If we called end() during the write,  we need to let WebKit know that we're done processing the data.
     if (didCallEnd && frame)
         frame->tokenizerProcessedData();
 }

 void HTMLTokenizer::end()
 {
     ASSERT(!m_timer.isActive());
     m_timer.stop(); // Only helps if assertion above fires, but do it anyway.

     if (buffer) {
         // parseTag is using the buffer for different matters
         if (!m_state.hasTagState())
             processToken();

         if (scriptCode)
             KHTML_DELETE_QCHAR_VEC(scriptCode);
         scriptCode = 0;
         scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;

         KHTML_DELETE_QCHAR_VEC(buffer);
         buffer = 0;
     }

     parser->finished();
 }

 void HTMLTokenizer::finish()
 {
     // do this as long as we don't find matching comment ends
     while((m_state.inComment() || m_state.inServer()) && scriptCode && scriptCodeSize) {
         // we've found an unmatched comment start
         if (m_state.inComment())
             brokenComments = true;
         else
             brokenServer = true;
         checkScriptBuffer();
         scriptCode[scriptCodeSize] = 0;
         scriptCode[scriptCodeSize + 1] = 0;
         int pos;
         QString food;
         if (m_state.inScript() || m_state.inStyle())
             food.setUnicode(scriptCode, scriptCodeSize);
         else if (m_state.inServer()) {
             food = "<";
             food += QString(scriptCode, scriptCodeSize);
         } else {
             pos = QConstString(scriptCode, scriptCodeSize).string().find('>');
             food.setUnicode(scriptCode+pos+1, scriptCodeSize-pos-1); // deep copy
         }
         KHTML_DELETE_QCHAR_VEC(scriptCode);
         scriptCode = 0;
         scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
         m_state.setInComment(false);
         m_state.setInServer(false);
         if (!food.isEmpty())
             write(food, true);
     }
     // this indicates we will not receive any more data... but if we are waiting on
     // an external script to load, we can't finish parsing until that is done
     noMoreData = true;
     if (!inWrite && !m_state.loadingExtScript() && !m_executingScript && !m_timer.isActive())
         end(); // this actually causes us to be deleted
 }

 PassRefPtr<NodeImpl> HTMLTokenizer::processToken()
 {
     KJSProxyImpl* jsProxy = (!m_fragment && m_doc->frame()) ? m_doc->frame()->jScript() : 0;
     if (jsProxy)
         jsProxy->setEventHandlerLineno(tagStartLineno);
     if (dest > buffer) {
 #ifdef TOKEN_DEBUG
         if(currToken.tagName.length()) {
             qDebug( "unexpected token: %s, str: *%s*", currToken.tagName.qstring().latin1(),QConstString( buffer,dest-buffer ).qstring().latin1() );
             ASSERT(0);
         }

 #endif
         currToken.text = new DOMStringImpl( buffer, dest - buffer );
         if (currToken.tagName != commentAtom)
             currToken.tagName = textAtom;
     } else if (currToken.tagName == nullAtom) {
         currToken.reset();
         if (jsProxy)
             jsProxy->setEventHandlerLineno(lineno+src.lineCount());
         return 0;
     }

     dest = buffer;

 #ifdef TOKEN_DEBUG
     QString name = currToken.tagName.qstring();
     QString text;
     if(currToken.text)
         text = QConstString(currToken.text->s, currToken.text->l).qstring();

     kdDebug( 6036 ) << "Token --> " << name << endl;
     if (currToken.flat)
         kdDebug( 6036 ) << "Token is FLAT!" << endl;
     if(!text.isNull())
         kdDebug( 6036 ) << "text: \"" << text << "\"" << endl;
     unsigned l = currToken.attrs ? currToken.attrs->length() : 0;
     if(l) {
         kdDebug( 6036 ) << "Attributes: " << l << endl;
         for (unsigned i = 0; i < l; ++i) {
             AttributeImpl* c = currToken.attrs->attributeItem(i);
             kdDebug( 6036 ) << "    " << c->localName().qstring()
                             << "=\"" << c->value().qstring() << "\"" << endl;
         }
     }
     kdDebug( 6036 ) << endl;
 #endif

     RefPtr<NodeImpl> n;

     if (!m_parserStopped)
         // pass the token over to the parser, the parser DOES NOT delete the token
         n = parser->parseToken(&currToken);

     currToken.reset();
     if (jsProxy)
         jsProxy->setEventHandlerLineno(0);

     return n.release();
 }

 HTMLTokenizer::~HTMLTokenizer()
 {
     ASSERT(!inWrite);
     reset();
     delete parser;
 }


 void HTMLTokenizer::enlargeBuffer(int len)
 {
     int newsize = kMax(size*2, size+len);
     int oldoffs = (dest - buffer);

     buffer = (QChar*)fastRealloc(buffer, newsize*sizeof(QChar));
     dest = buffer + oldoffs;
     size = newsize;
 }

 void HTMLTokenizer::enlargeScriptBuffer(int len)
 {
     int newsize = kMax(scriptCodeMaxSize*2, scriptCodeMaxSize+len);
     scriptCode = (QChar*)fastRealloc(scriptCode, newsize*sizeof(QChar));
     scriptCodeMaxSize = newsize;
 }

 void HTMLTokenizer::notifyFinished(CachedObject */*finishedObj*/)
 {
 #if INSTRUMENT_LAYOUT_SCHEDULING
     if (!parser->doc()->ownerElement())
         printf("script loaded at %d\n", parser->doc()->elapsedTime());
 #endif

     ASSERT(!pendingScripts.isEmpty());
     bool finished = false;
     while (!finished && pendingScripts.head()->isLoaded()) {
 #ifdef TOKEN_DEBUG
         kdDebug( 6036 ) << "Finished loading an external script" << endl;
 #endif
         CachedScript* cs = pendingScripts.dequeue();
         ASSERT(cs->accessCount() > 0);

         DOMString scriptSource = cs->script();
 #ifdef TOKEN_DEBUG
         kdDebug( 6036 ) << "External script is:" << endl << scriptSource.qstring() << endl;
 #endif
         setSrc(SegmentedString());

         // make sure we forget about the script before we execute the new one
         // infinite recursion might happen otherwise
         QString cachedScriptUrl( cs->url().qstring() );
         bool errorOccurred = cs->errorOccurred();
         cs->deref(this);
         RefPtr<NodeImpl> n = scriptNode;
         scriptNode = 0;

 #if INSTRUMENT_LAYOUT_SCHEDULING
         if (!parser->doc()->ownerElement())
             printf("external script beginning execution at %d\n", parser->doc()->elapsedTime());
 #endif

         if (errorOccurred)
             EventTargetNodeCast(n.get())->dispatchHTMLEvent(errorEvent, false, false);
         else {
             m_state = scriptExecution(scriptSource.qstring(), m_state, cachedScriptUrl);
             EventTargetNodeCast(n.get())->dispatchHTMLEvent(loadEvent, false, false);
         }

         // The state of pendingScripts.isEmpty() can change inside the scriptExecution()
         // call above, so test afterwards.
         finished = pendingScripts.isEmpty();
         if (finished) {
             m_state.setLoadingExtScript(false);
 #if INSTRUMENT_LAYOUT_SCHEDULING
             if (!parser->doc()->ownerElement())
                 printf("external script finished execution at %d\n", parser->doc()->elapsedTime());
 #endif
         }

         // 'inScript' is true when we are called synchronously from
         // parseScript(). In that case parseScript() will take care
         // of 'scriptOutput'.
         if (!m_state.inScript()) {
             SegmentedString rest = pendingSrc;
             pendingSrc.clear();
             write(rest, false);
             // we might be deleted at this point, do not
             // access any members.
         }
     }
 }

 bool HTMLTokenizer::isWaitingForScripts() const
 {
     return m_state.loadingExtScript();
 }

 void HTMLTokenizer::setSrc(const SegmentedString &source)
 {
     lineno += src.lineCount();
     src = source;
     src.resetLineCount();
 }

 void parseHTMLDocumentFragment(const DOMString &source, DocumentFragmentImpl *fragment)
 {
     HTMLTokenizer tok(fragment);
     tok.setForceSynchronous(true);
     tok.write(source.qstring(), true);
     tok.finish();
     ASSERT(!tok.processingData());      // make sure we're done (see 3963151)
 }

 unsigned short decodeNamedEntity(const char* name)
 {
     const Entity* e = findEntity(name, strlen(name));
     return e ? e->code : 0;
 }

 }