// blob: ca6a18c2dfbf042697ecac85570cae74dbfb063a
// Copyright 2015 The Chromium Authors. All rights reserved.
// Copyright (C) 2016 Apple Inc. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "config.h"
#include "CSSTokenizer.h"
#include "CSSParserIdioms.h"
#include "CSSParserObserverWrapper.h"
#include "CSSParserTokenRange.h"
#include "CSSTokenizerInputStream.h"
#include "HTMLParserIdioms.h"
#include <wtf/text/StringBuilder.h>
#include <wtf/unicode/CharacterNames.h>
namespace WebCore {
// Tokenizes |string| eagerly. Every token produced by nextToken() is stored in
// m_tokens except comment tokens, which are dropped; tokenization stops at the
// first EOF token (which is not stored).
CSSTokenizer::CSSTokenizer(const String& string)
    : m_input(string)
{
    // According to the spec, we should perform preprocessing here.
    // See: http://dev.w3.org/csswg/css-syntax/#input-preprocessing
    //
    // However, we can skip this step since:
    // * We're using HTML spaces (which accept \r and \f as valid white space)
    // * We do not count white spaces
    // * CSSTokenizerInputStream::nextInputChar() replaces NULs with replacement characters
    if (string.isEmpty())
        return;

    // To avoid resizing we err on the side of reserving too much space.
    // Most strings we tokenize have about 3.5 to 5 characters per token.
    m_tokens.reserveInitialCapacity(string.length() / 3);

    for (;;) {
        CSSParserToken token = nextToken();
        const auto tokenType = token.type();
        if (tokenType == EOFToken)
            return;
        if (tokenType != CommentToken)
            m_tokens.append(token);
    }
}
// Tokenizes |string| like the single-argument constructor, additionally
// reporting positions to |wrapper|: the start offset of every stored token via
// addToken(), and the start/end offsets plus the current token count for every
// comment via addComment(). After the EOF token is seen, one final offset is
// reported and finalizeConstruction() is called with m_tokens.begin().
CSSTokenizer::CSSTokenizer(const String& string, CSSParserObserverWrapper& wrapper)
    : m_input(string)
{
    if (string.isEmpty())
        return;

    // Input offset at which the token currently being produced started.
    unsigned tokenStart = 0;
    for (;;) {
        CSSParserToken token = nextToken();
        const auto tokenType = token.type();
        if (tokenType == EOFToken)
            break;

        if (tokenType != CommentToken) {
            m_tokens.append(token);
            wrapper.addToken(tokenStart);
        } else
            wrapper.addComment(tokenStart, m_input.offset(), m_tokens.size());

        tokenStart = m_input.offset();
    }

    wrapper.addToken(tokenStart);
    wrapper.finalizeConstruction(m_tokens.begin());
}
// Returns a CSSParserTokenRange constructed from the stored token vector.
CSSParserTokenRange CSSTokenizer::tokenRange() const
{
    return CSSParserTokenRange(m_tokens);
}
// Returns the number of tokens currently stored in m_tokens.
unsigned CSSTokenizer::tokenCount()
{
return m_tokens.size();
}
// Returns true for the code points this tokenizer treats as a newline.
// \r and \f are checked here as well as \n, since there is no preprocessing
// stage that would have normalized them.
static bool isNewLine(UChar cc)
{
    switch (cc) {
    case '\n':
    case '\f':
    case '\r':
        return true;
    default:
        return false;
    }
}
// http://dev.w3.org/csswg/css-syntax/#check-if-two-code-points-are-a-valid-escape
// A valid escape is a backslash that is not immediately followed by a newline.
static bool twoCharsAreValidEscape(UChar first, UChar second)
{
    if (first != '\\')
        return false;
    return !isNewLine(second);
}
// Hands |c| back to the input stream by delegating to m_input.pushBack().
// NOTE(review): presumably |c| must be the character most recently returned by
// consume() -- confirm against CSSTokenizerInputStream::pushBack().
void CSSTokenizer::reconsume(UChar c)
{
m_input.pushBack(c);
}
// Reads the next input character via m_input.nextInputChar(), advances the
// input by one position, and returns the character that was read.
UChar CSSTokenizer::consume()
{
    const UChar consumed = m_input.nextInputChar();
    m_input.advance();
    return consumed;
}
// Handler for white space code points: advances the input with
// advanceUntilNonWhitespace() and returns a WhitespaceToken. The triggering
// character itself is not needed.
CSSParserToken CSSTokenizer::whiteSpace(UChar /*cc*/)
{
m_input.advanceUntilNonWhitespace();
return CSSParserToken(WhitespaceToken);
}
// Builds a token of |type| flagged as BlockStart and records |type| on the
// stack of currently open blocks.
CSSParserToken CSSTokenizer::blockStart(CSSParserTokenType type)
{
    CSSParserToken opener(type, CSSParserToken::BlockStart);
    m_blockStack.append(type);
    return opener;
}
// Builds a named token of |type| flagged as BlockStart and records |blockType|
// (not |type|) on the stack of currently open blocks.
CSSParserToken CSSTokenizer::blockStart(CSSParserTokenType blockType, CSSParserTokenType type, StringView name)
{
    CSSParserToken opener(type, name, CSSParserToken::BlockStart);
    m_blockStack.append(blockType);
    return opener;
}
// Builds the closing token |type|. It is flagged as BlockEnd, and the block
// stack is popped, only when the innermost open block is |startType|;
// otherwise a plain token of |type| is returned and the stack is untouched.
CSSParserToken CSSTokenizer::blockEnd(CSSParserTokenType type, CSSParserTokenType startType)
{
    const bool closesInnermostBlock = !m_blockStack.isEmpty() && m_blockStack.last() == startType;
    if (!closesInnermostBlock)
        return CSSParserToken(type);

    m_blockStack.removeLast();
    return CSSParserToken(type, CSSParserToken::BlockEnd);
}
// Handler for '(': opens a LeftParenthesisToken block via blockStart().
CSSParserToken CSSTokenizer::leftParenthesis(UChar /*cc*/)
{
return blockStart(LeftParenthesisToken);
}
// Handler for ')': returns a RightParenthesisToken via blockEnd(), which closes
// the innermost block only if that block was opened by LeftParenthesisToken.
CSSParserToken CSSTokenizer::rightParenthesis(UChar /*cc*/)
{
return blockEnd(RightParenthesisToken, LeftParenthesisToken);
}
// Handler for '[': opens a LeftBracketToken block via blockStart().
CSSParserToken CSSTokenizer::leftBracket(UChar /*cc*/)
{
return blockStart(LeftBracketToken);
}
// Handler for ']': returns a RightBracketToken via blockEnd(), which closes the
// innermost block only if that block was opened by LeftBracketToken.
CSSParserToken CSSTokenizer::rightBracket(UChar /*cc*/)
{
return blockEnd(RightBracketToken, LeftBracketToken);
}
// Handler for '{': opens a LeftBraceToken block via blockStart().
CSSParserToken CSSTokenizer::leftBrace(UChar /*cc*/)
{
return blockStart(LeftBraceToken);
}
// Handler for '}': returns a RightBraceToken via blockEnd(), which closes the
// innermost block only if that block was opened by LeftBraceToken.
CSSParserToken CSSTokenizer::rightBrace(UChar /*cc*/)
{
return blockEnd(RightBraceToken, LeftBraceToken);
}
// Handler for '+' and '.': if |cc| together with the following input starts a
// number, |cc| is pushed back and a numeric token is consumed; otherwise |cc|
// becomes a delimiter token.
CSSParserToken CSSTokenizer::plusOrFullStop(UChar cc)
{
    if (!nextCharsAreNumber(cc))
        return CSSParserToken(DelimiterToken, cc);

    reconsume(cc);
    return consumeNumericToken();
}
// Handler for '*': "*=" yields a SubstringMatchToken, a lone '*' a delimiter.
CSSParserToken CSSTokenizer::asterisk(UChar cc)
{
    ASSERT_UNUSED(cc, cc == '*');
    return consumeIfNext('=')
        ? CSSParserToken(SubstringMatchToken)
        : CSSParserToken(DelimiterToken, '*');
}
// Handler for '<': when followed by "!--" the three characters are skipped and
// a CDOToken is returned; otherwise '<' is a delimiter.
CSSParserToken CSSTokenizer::lessThan(UChar cc)
{
    ASSERT_UNUSED(cc, cc == '<');

    const bool opensCDO = m_input.peekWithoutReplacement(0) == '!'
        && m_input.peekWithoutReplacement(1) == '-'
        && m_input.peekWithoutReplacement(2) == '-';
    if (!opensCDO)
        return CSSParserToken(DelimiterToken, '<');

    m_input.advance(3);
    return CSSParserToken(CDOToken);
}
// Handler for ',': always yields a CommaToken.
CSSParserToken CSSTokenizer::comma(UChar /*cc*/)
{
return CSSParserToken(CommaToken);
}
// Handler for '-'. Tried in this order:
//   1. a number (e.g. "-1", "-.5")  -> numeric token
//   2. the remainder of "-->"       -> CDOToken's counterpart, CDCToken
//   3. an identifier start          -> ident-like token
//   4. anything else                -> delimiter
CSSParserToken CSSTokenizer::hyphenMinus(UChar cc)
{
    if (nextCharsAreNumber(cc)) {
        reconsume(cc);
        return consumeNumericToken();
    }

    const bool closesCDC = m_input.peekWithoutReplacement(0) == '-'
        && m_input.peekWithoutReplacement(1) == '>';
    if (closesCDC) {
        m_input.advance(2);
        return CSSParserToken(CDCToken);
    }

    if (!nextCharsAreIdentifier(cc))
        return CSSParserToken(DelimiterToken, cc);

    reconsume(cc);
    return consumeIdentLikeToken();
}
// Handler for '/': "/*" starts a comment, which is consumed up to its end (or
// the end of input) and reported as a CommentToken; a lone '/' is a delimiter.
CSSParserToken CSSTokenizer::solidus(UChar cc)
{
    if (!consumeIfNext('*'))
        return CSSParserToken(DelimiterToken, cc);

    // Comments get ignored by the callers, but we need a value to return.
    consumeUntilCommentEndFound();
    return CSSParserToken(CommentToken);
}
// Handler for ':': always yields a ColonToken.
CSSParserToken CSSTokenizer::colon(UChar /*cc*/)
{
return CSSParserToken(ColonToken);
}
// Handler for ';': always yields a SemicolonToken.
CSSParserToken CSSTokenizer::semiColon(UChar /*cc*/)
{
return CSSParserToken(SemicolonToken);
}
// Handler for '#'. When the following input is a name code point or a valid
// escape, a hash token is produced from the consumed name; its type is
// HashTokenId if that input would start an identifier and
// HashTokenUnrestricted otherwise. In all other cases '#' is a delimiter.
CSSParserToken CSSTokenizer::hash(UChar cc)
{
    const UChar following = m_input.peekWithoutReplacement(0);
    const bool startsName = isNameCodePoint(following)
        || twoCharsAreValidEscape(following, m_input.peekWithoutReplacement(1));
    if (!startsName)
        return CSSParserToken(DelimiterToken, cc);

    // The type must be decided before the name is consumed.
    HashTokenType hashType = HashTokenUnrestricted;
    if (nextCharsAreIdentifier())
        hashType = HashTokenId;
    return CSSParserToken(hashType, consumeName());
}
// Handler for '^': "^=" yields a PrefixMatchToken, a lone '^' a delimiter.
CSSParserToken CSSTokenizer::circumflexAccent(UChar cc)
{
    ASSERT_UNUSED(cc, cc == '^');
    return consumeIfNext('=')
        ? CSSParserToken(PrefixMatchToken)
        : CSSParserToken(DelimiterToken, '^');
}
// Handler for '$': "$=" yields a SuffixMatchToken, a lone '$' a delimiter.
CSSParserToken CSSTokenizer::dollarSign(UChar cc)
{
    ASSERT_UNUSED(cc, cc == '$');
    return consumeIfNext('=')
        ? CSSParserToken(SuffixMatchToken)
        : CSSParserToken(DelimiterToken, '$');
}
// Handler for '|': "|=" yields a DashMatchToken, "||" a ColumnToken, and a lone
// '|' a delimiter.
CSSParserToken CSSTokenizer::verticalLine(UChar cc)
{
    ASSERT_UNUSED(cc, cc == '|');
    if (consumeIfNext('='))
        return CSSParserToken(DashMatchToken);
    return consumeIfNext('|')
        ? CSSParserToken(ColumnToken)
        : CSSParserToken(DelimiterToken, '|');
}
// Handler for '~': "~=" yields an IncludeMatchToken, a lone '~' a delimiter.
CSSParserToken CSSTokenizer::tilde(UChar cc)
{
    ASSERT_UNUSED(cc, cc == '~');
    return consumeIfNext('=')
        ? CSSParserToken(IncludeMatchToken)
        : CSSParserToken(DelimiterToken, '~');
}
// Handler for '@': when the following input starts an identifier, the name is
// consumed into an AtKeywordToken; otherwise '@' is a delimiter.
CSSParserToken CSSTokenizer::commercialAt(UChar cc)
{
    ASSERT_UNUSED(cc, cc == '@');
    if (!nextCharsAreIdentifier())
        return CSSParserToken(DelimiterToken, '@');
    return CSSParserToken(AtKeywordToken, consumeName());
}
// Handler for a backslash: if it begins a valid escape it is pushed back and an
// ident-like token is consumed; otherwise it is a delimiter.
CSSParserToken CSSTokenizer::reverseSolidus(UChar cc)
{
    if (!twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0)))
        return CSSParserToken(DelimiterToken, cc);

    reconsume(cc);
    return consumeIdentLikeToken();
}
// Handler for '0'..'9': pushes the digit back so that consumeNumericToken()
// sees the whole number from its first character.
CSSParserToken CSSTokenizer::asciiDigit(UChar cc)
{
reconsume(cc);
return consumeNumericToken();
}
// Handler for 'U' and 'u'. "u+" followed by a hex digit or '?' starts a
// unicode-range token (the '+' is skipped before consuming it); any other 'u'
// is pushed back and consumed as an ident-like token.
CSSParserToken CSSTokenizer::letterU(UChar cc)
{
    if (m_input.peekWithoutReplacement(0) == '+') {
        const UChar afterPlus = m_input.peekWithoutReplacement(1);
        if (isASCIIHexDigit(afterPlus) || afterPlus == '?') {
            m_input.advance();
            return consumeUnicodeRange();
        }
    }

    reconsume(cc);
    return consumeIdentLikeToken();
}
// Handler for name-start characters: pushes |cc| back so that
// consumeIdentLikeToken() sees the whole name from its first character.
CSSParserToken CSSTokenizer::nameStart(UChar cc)
{
reconsume(cc);
return consumeIdentLikeToken();
}
// Handler for the quote characters: consumes a string token that is terminated
// by the same character |cc| that opened it.
CSSParserToken CSSTokenizer::stringStart(UChar cc)
{
return consumeStringTokenUntil(cc);
}
// Handler for code point 0 in the dispatch table: yields an EOFToken.
CSSParserToken CSSTokenizer::endOfFile(UChar /*cc*/)
{
return CSSParserToken(EOFToken);
}
// Dispatch table indexed by ASCII code point. Each entry is the member function
// that nextToken() invokes for that character; a null entry means the character
// has no dedicated handler and is emitted as a delimiter token.
const CSSTokenizer::CodePoint CSSTokenizer::codePoints[128] = {
    &CSSTokenizer::endOfFile,        // 0x00 NUL
    nullptr,                         // 0x01
    nullptr,                         // 0x02
    nullptr,                         // 0x03
    nullptr,                         // 0x04
    nullptr,                         // 0x05
    nullptr,                         // 0x06
    nullptr,                         // 0x07
    nullptr,                         // 0x08
    &CSSTokenizer::whiteSpace,       // 0x09 tab
    &CSSTokenizer::whiteSpace,       // 0x0A line feed
    nullptr,                         // 0x0B
    &CSSTokenizer::whiteSpace,       // 0x0C form feed
    &CSSTokenizer::whiteSpace,       // 0x0D carriage return
    nullptr,                         // 0x0E
    nullptr,                         // 0x0F
    nullptr,                         // 0x10
    nullptr,                         // 0x11
    nullptr,                         // 0x12
    nullptr,                         // 0x13
    nullptr,                         // 0x14
    nullptr,                         // 0x15
    nullptr,                         // 0x16
    nullptr,                         // 0x17
    nullptr,                         // 0x18
    nullptr,                         // 0x19
    nullptr,                         // 0x1A
    nullptr,                         // 0x1B
    nullptr,                         // 0x1C
    nullptr,                         // 0x1D
    nullptr,                         // 0x1E
    nullptr,                         // 0x1F
    &CSSTokenizer::whiteSpace,       // 0x20 space
    nullptr,                         // 0x21 '!'
    &CSSTokenizer::stringStart,      // 0x22 double quote
    &CSSTokenizer::hash,             // 0x23 '#'
    &CSSTokenizer::dollarSign,       // 0x24 '$'
    nullptr,                         // 0x25 '%'
    nullptr,                         // 0x26 '&'
    &CSSTokenizer::stringStart,      // 0x27 single quote
    &CSSTokenizer::leftParenthesis,  // 0x28 '('
    &CSSTokenizer::rightParenthesis, // 0x29 ')'
    &CSSTokenizer::asterisk,         // 0x2A '*'
    &CSSTokenizer::plusOrFullStop,   // 0x2B '+'
    &CSSTokenizer::comma,            // 0x2C ','
    &CSSTokenizer::hyphenMinus,      // 0x2D '-'
    &CSSTokenizer::plusOrFullStop,   // 0x2E '.'
    &CSSTokenizer::solidus,          // 0x2F '/'
    &CSSTokenizer::asciiDigit,       // 0x30 '0'
    &CSSTokenizer::asciiDigit,       // 0x31 '1'
    &CSSTokenizer::asciiDigit,       // 0x32 '2'
    &CSSTokenizer::asciiDigit,       // 0x33 '3'
    &CSSTokenizer::asciiDigit,       // 0x34 '4'
    &CSSTokenizer::asciiDigit,       // 0x35 '5'
    &CSSTokenizer::asciiDigit,       // 0x36 '6'
    &CSSTokenizer::asciiDigit,       // 0x37 '7'
    &CSSTokenizer::asciiDigit,       // 0x38 '8'
    &CSSTokenizer::asciiDigit,       // 0x39 '9'
    &CSSTokenizer::colon,            // 0x3A ':'
    &CSSTokenizer::semiColon,        // 0x3B ';'
    &CSSTokenizer::lessThan,         // 0x3C '<'
    nullptr,                         // 0x3D '='
    nullptr,                         // 0x3E '>'
    nullptr,                         // 0x3F question mark
    &CSSTokenizer::commercialAt,     // 0x40 '@'
    &CSSTokenizer::nameStart,        // 0x41 'A'
    &CSSTokenizer::nameStart,        // 0x42 'B'
    &CSSTokenizer::nameStart,        // 0x43 'C'
    &CSSTokenizer::nameStart,        // 0x44 'D'
    &CSSTokenizer::nameStart,        // 0x45 'E'
    &CSSTokenizer::nameStart,        // 0x46 'F'
    &CSSTokenizer::nameStart,        // 0x47 'G'
    &CSSTokenizer::nameStart,        // 0x48 'H'
    &CSSTokenizer::nameStart,        // 0x49 'I'
    &CSSTokenizer::nameStart,        // 0x4A 'J'
    &CSSTokenizer::nameStart,        // 0x4B 'K'
    &CSSTokenizer::nameStart,        // 0x4C 'L'
    &CSSTokenizer::nameStart,        // 0x4D 'M'
    &CSSTokenizer::nameStart,        // 0x4E 'N'
    &CSSTokenizer::nameStart,        // 0x4F 'O'
    &CSSTokenizer::nameStart,        // 0x50 'P'
    &CSSTokenizer::nameStart,        // 0x51 'Q'
    &CSSTokenizer::nameStart,        // 0x52 'R'
    &CSSTokenizer::nameStart,        // 0x53 'S'
    &CSSTokenizer::nameStart,        // 0x54 'T'
    &CSSTokenizer::letterU,          // 0x55 'U'
    &CSSTokenizer::nameStart,        // 0x56 'V'
    &CSSTokenizer::nameStart,        // 0x57 'W'
    &CSSTokenizer::nameStart,        // 0x58 'X'
    &CSSTokenizer::nameStart,        // 0x59 'Y'
    &CSSTokenizer::nameStart,        // 0x5A 'Z'
    &CSSTokenizer::leftBracket,      // 0x5B '['
    &CSSTokenizer::reverseSolidus,   // 0x5C backslash
    &CSSTokenizer::rightBracket,     // 0x5D ']'
    &CSSTokenizer::circumflexAccent, // 0x5E '^'
    &CSSTokenizer::nameStart,        // 0x5F '_'
    nullptr,                         // 0x60 '`'
    &CSSTokenizer::nameStart,        // 0x61 'a'
    &CSSTokenizer::nameStart,        // 0x62 'b'
    &CSSTokenizer::nameStart,        // 0x63 'c'
    &CSSTokenizer::nameStart,        // 0x64 'd'
    &CSSTokenizer::nameStart,        // 0x65 'e'
    &CSSTokenizer::nameStart,        // 0x66 'f'
    &CSSTokenizer::nameStart,        // 0x67 'g'
    &CSSTokenizer::nameStart,        // 0x68 'h'
    &CSSTokenizer::nameStart,        // 0x69 'i'
    &CSSTokenizer::nameStart,        // 0x6A 'j'
    &CSSTokenizer::nameStart,        // 0x6B 'k'
    &CSSTokenizer::nameStart,        // 0x6C 'l'
    &CSSTokenizer::nameStart,        // 0x6D 'm'
    &CSSTokenizer::nameStart,        // 0x6E 'n'
    &CSSTokenizer::nameStart,        // 0x6F 'o'
    &CSSTokenizer::nameStart,        // 0x70 'p'
    &CSSTokenizer::nameStart,        // 0x71 'q'
    &CSSTokenizer::nameStart,        // 0x72 'r'
    &CSSTokenizer::nameStart,        // 0x73 's'
    &CSSTokenizer::nameStart,        // 0x74 't'
    &CSSTokenizer::letterU,          // 0x75 'u'
    &CSSTokenizer::nameStart,        // 0x76 'v'
    &CSSTokenizer::nameStart,        // 0x77 'w'
    &CSSTokenizer::nameStart,        // 0x78 'x'
    &CSSTokenizer::nameStart,        // 0x79 'y'
    &CSSTokenizer::nameStart,        // 0x7A 'z'
    &CSSTokenizer::leftBrace,        // 0x7B '{'
    &CSSTokenizer::verticalLine,     // 0x7C '|'
    &CSSTokenizer::rightBrace,       // 0x7D '}'
    &CSSTokenizer::tilde,            // 0x7E '~'
    nullptr,                         // 0x7F DEL
};
// Number of entries in CSSTokenizer::codePoints. Only used by the bounds
// assertion in nextToken(), hence only defined when that assertion is enabled.
#if !ASSERT_WITH_SECURITY_IMPLICATION_DISABLED
const unsigned codePointsNumber = 128;
#endif
// Consumes one character and dispatches on it: non-ASCII characters are handled
// as name-start characters, ASCII characters through the codePoints table, and
// ASCII characters without a table entry become delimiter tokens.
CSSParserToken CSSTokenizer::nextToken()
{
    // Unlike the HTMLTokenizer, the CSS Syntax spec is written
    // as a stateless, (fixed-size) look-ahead tokenizer.
    // We could move to the stateful model and instead create
    // states for all the "next 3 codepoints are X" cases.
    // State-machine tokenizers are easier to write to handle
    // incremental tokenization of partial sources.
    // However, for now we follow the spec exactly.
    const UChar cc = consume();
    if (!isASCII(cc))
        return nameStart(cc);

    ASSERT_WITH_SECURITY_IMPLICATION(cc < codePointsNumber);
    const CodePoint handler = codePoints[cc];
    if (!handler)
        return CSSParserToken(DelimiterToken, cc);
    return (this->*handler)(cc);
}
// This method merges the following spec sections for efficiency
// http://www.w3.org/TR/css3-syntax/#consume-a-number
// http://www.w3.org/TR/css3-syntax/#convert-a-string-to-a-number
//
// The extent of the number is measured by look-ahead only (optional sign,
// integer digits, optional fraction, optional exponent); the input is advanced
// once at the end, after the value has been converted with getDouble().
CSSParserToken CSSTokenizer::consumeNumber()
{
    ASSERT(nextCharsAreNumber());

    NumericValueType type = IntegerValueType;
    NumericSign sign = NoSign;
    unsigned length = 0;

    // Optional sign.
    const UChar leading = m_input.peekWithoutReplacement(0);
    if (leading == '+' || leading == '-') {
        sign = leading == '+' ? PlusSign : MinusSign;
        length = 1;
    }

    // Integer part.
    length = m_input.skipWhilePredicate<isASCIIDigit>(length);

    // Fraction: only taken when the '.' is followed by at least one digit.
    UChar lookahead = m_input.peekWithoutReplacement(length);
    if (lookahead == '.' && isASCIIDigit(m_input.peekWithoutReplacement(length + 1))) {
        type = NumberValueType;
        length = m_input.skipWhilePredicate<isASCIIDigit>(length + 2);
        lookahead = m_input.peekWithoutReplacement(length);
    }

    // Exponent: 'e'/'E', an optional sign, and at least one digit.
    if (lookahead == 'e' || lookahead == 'E') {
        const UChar afterE = m_input.peekWithoutReplacement(length + 1);
        // Characters known to belong to the exponent before scanning its
        // remaining digits; zero means there is no valid exponent.
        unsigned exponentPrefix = 0;
        if (isASCIIDigit(afterE))
            exponentPrefix = 1;
        else if ((afterE == '+' || afterE == '-') && isASCIIDigit(m_input.peekWithoutReplacement(length + 2)))
            exponentPrefix = 3;

        if (exponentPrefix) {
            type = NumberValueType;
            length = m_input.skipWhilePredicate<isASCIIDigit>(length + exponentPrefix);
        }
    }

    const double value = m_input.getDouble(0, length);
    m_input.advance(length);
    return CSSParserToken(NumberToken, value, type, sign);
}
// http://www.w3.org/TR/css3-syntax/#consume-a-numeric-token
// Consumes a number and then upgrades it to a dimension when an identifier
// follows, or to a percentage when a '%' follows.
CSSParserToken CSSTokenizer::consumeNumericToken()
{
    CSSParserToken number = consumeNumber();

    if (nextCharsAreIdentifier()) {
        number.convertToDimensionWithUnit(consumeName());
        return number;
    }

    if (consumeIfNext('%'))
        number.convertToPercentage();
    return number;
}
// http://dev.w3.org/csswg/css-syntax/#consume-ident-like-token
// Consumes a name and classifies it: a plain identifier when no '(' follows, a
// url token for an unquoted url(...), and a function token (which also opens a
// parenthesis block) otherwise.
CSSParserToken CSSTokenizer::consumeIdentLikeToken()
{
    StringView name = consumeName();
    if (!consumeIfNext('('))
        return CSSParserToken(IdentToken, name);

    if (equalIgnoringASCIICase(name, "url")) {
        // The spec is slightly different so as to avoid dropping whitespace
        // tokens, but they wouldn't be used and this is easier.
        m_input.advanceUntilNonWhitespace();
        const UChar firstArgumentChar = m_input.peekWithoutReplacement(0);
        const bool isQuoted = firstArgumentChar == '"' || firstArgumentChar == '\'';
        if (!isQuoted)
            return consumeUrlToken();
    }

    return blockStart(LeftParenthesisToken, FunctionToken, name);
}
// http://dev.w3.org/csswg/css-syntax/#consume-a-string-token
// Consumes a string whose opening quote has already been consumed and which is
// terminated by |endingCodePoint|. An unescaped newline produces a
// BadStringToken and is left in the input.
CSSParserToken CSSTokenizer::consumeStringTokenUntil(UChar endingCodePoint)
{
    // Strings without escapes get handled without allocations: scan ahead and,
    // if the closing quote is found first, return a view into the input.
    unsigned size = 0;
    for (;;) {
        const UChar cc = m_input.peekWithoutReplacement(size);
        if (cc == endingCodePoint) {
            const unsigned startOffset = m_input.offset();
            m_input.advance(size + 1);
            return CSSParserToken(StringToken, m_input.rangeAt(startOffset, size));
        }
        if (isNewLine(cc)) {
            m_input.advance(size);
            return CSSParserToken(BadStringToken);
        }
        // A NUL (embedded, or the end of input) or an escape needs the slow path.
        if (cc == '\\' || cc == '\0')
            break;
        ++size;
    }

    // Slow path: re-read from the start of the string, building the value.
    StringBuilder output;
    for (;;) {
        const UChar cc = consume();
        if (cc == kEndOfFileMarker || cc == endingCodePoint)
            return CSSParserToken(StringToken, registerString(output.toString()));

        if (isNewLine(cc)) {
            reconsume(cc);
            return CSSParserToken(BadStringToken);
        }

        if (cc != '\\') {
            output.append(cc);
            continue;
        }

        // A backslash at the very end of the input contributes nothing.
        if (m_input.nextInputChar() == kEndOfFileMarker)
            continue;

        if (isNewLine(m_input.peekWithoutReplacement(0)))
            consumeSingleWhitespaceIfNext(); // This handles \r\n for us
        else
            output.appendCharacter(consumeEscape());
    }
}
// Consumes the part of a unicode-range token that follows "u+": up to six hex
// digits, optionally padded with '?' wildcards (still at most six characters
// in total), or alternatively followed by '-' and up to six hex digits for the
// end of the range.
CSSParserToken CSSTokenizer::consumeUnicodeRange()
{
    ASSERT(isASCIIHexDigit(m_input.peekWithoutReplacement(0)) || m_input.peekWithoutReplacement(0) == '?');

    int digitsLeft = 6;
    UChar32 start = 0;
    for (; digitsLeft && isASCIIHexDigit(m_input.peekWithoutReplacement(0)); --digitsLeft)
        start = start * 16 + toASCIIHexValue(consume());

    UChar32 end = start;
    if (digitsLeft && consumeIfNext('?')) {
        // Each wildcard stands for 0 in the start and F in the end of the range.
        do {
            start *= 16;
            end = end * 16 + 0xF;
            --digitsLeft;
        } while (digitsLeft && consumeIfNext('?'));
    } else if (m_input.peekWithoutReplacement(0) == '-' && isASCIIHexDigit(m_input.peekWithoutReplacement(1))) {
        m_input.advance();
        end = 0;
        digitsLeft = 6;
        do {
            end = end * 16 + toASCIIHexValue(consume());
            --digitsLeft;
        } while (digitsLeft && isASCIIHexDigit(m_input.peekWithoutReplacement(0)));
    }

    return CSSParserToken(UnicodeRangeToken, start, end);
}
// http://dev.w3.org/csswg/css-syntax/#non-printable-code-point
// True for U+0000..U+0008, U+000B, U+000E..U+001F and U+007F.
static bool isNonPrintableCodePoint(UChar cc)
{
    if (cc <= 0x08)
        return true;
    if (cc == 0x0B || cc == 0x7F)
        return true;
    return cc >= 0x0E && cc <= 0x1F;
}
// http://dev.w3.org/csswg/css-syntax/#consume-url-token
// Consumes the contents of an unquoted url(...) up to and including the closing
// ')'. Input that cannot be part of a url token makes the rest of the url be
// discarded and a BadUrlToken be returned.
CSSParserToken CSSTokenizer::consumeUrlToken()
{
    m_input.advanceUntilNonWhitespace();

    // URL tokens without escapes get handled without allocations: scan ahead
    // and, if ')' is reached first, return a view into the input.
    unsigned size = 0;
    for (;;) {
        const UChar cc = m_input.peekWithoutReplacement(size);
        if (cc == ')') {
            const unsigned startOffset = m_input.offset();
            m_input.advance(size + 1);
            return CSSParserToken(UrlToken, m_input.rangeAt(startOffset, size));
        }

        const bool needsSlowPath = cc <= ' ' || cc == '\\' || cc == '"' || cc == '\'' || cc == '(' || cc == '\x7f';
        if (needsSlowPath)
            break;
        ++size;
    }

    // Slow path: re-read from the start of the url, building the value.
    StringBuilder result;
    for (;;) {
        const UChar cc = consume();
        if (cc == ')' || cc == kEndOfFileMarker)
            return CSSParserToken(UrlToken, registerString(result.toString()));

        if (isHTMLSpace(cc)) {
            // Trailing white space is fine only if the url ends right after it.
            m_input.advanceUntilNonWhitespace();
            if (consumeIfNext(')') || m_input.nextInputChar() == kEndOfFileMarker)
                return CSSParserToken(UrlToken, registerString(result.toString()));
            break;
        }

        if (cc == '"' || cc == '\'' || cc == '(' || isNonPrintableCodePoint(cc))
            break;

        if (cc == '\\') {
            if (!twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0)))
                break;
            result.appendCharacter(consumeEscape());
            continue;
        }

        result.append(cc);
    }

    consumeBadUrlRemnants();
    return CSSParserToken(BadUrlToken);
}
// http://dev.w3.org/csswg/css-syntax/#consume-the-remnants-of-a-bad-url
void CSSTokenizer::consumeBadUrlRemnants()
{
while (true) {
UChar cc = consume();
if (cc == ')' || cc == kEndOfFileMarker)
return;
if (twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0)))
consumeEscape();
}
}
void CSSTokenizer::consumeSingleWhitespaceIfNext()
{
// We check for \r\n and HTML spaces since we don't do preprocessing
UChar next = m_input.peekWithoutReplacement(0);
if (next == '\r' && m_input.peekWithoutReplacement(1) == '\n')
m_input.advance(2);
else if (isHTMLSpace(next))
m_input.advance();
}
// Consumes input up to and including the next "*/", or up to the end of input
// if the comment is never closed.
void CSSTokenizer::consumeUntilCommentEndFound()
{
    UChar c = consume();
    while (c != kEndOfFileMarker) {
        const bool previousWasAsterisk = c == '*';
        c = consume();
        if (previousWasAsterisk && c == '/')
            return;
    }
}
// Consumes the next input character if, and only if, it equals |character|.
// Returns whether a character was consumed.
bool CSSTokenizer::consumeIfNext(UChar character)
{
    // Since we're not doing replacement we can't tell the difference from
    // a NUL in the middle and the kEndOfFileMarker, so character must not be
    // NUL.
    ASSERT(character);

    if (m_input.peekWithoutReplacement(0) != character)
        return false;

    m_input.advance();
    return true;
}
// http://www.w3.org/TR/css3-syntax/#consume-a-name
// Consumes the longest run of name code points and valid escapes at the
// current position and returns it.
StringView CSSTokenizer::consumeName()
{
    // Names without escapes get handled without allocations: measure the run
    // of name code points and look at what terminates it.
    unsigned size = 0;
    UChar terminator = m_input.peekWithoutReplacement(0);
    while (isNameCodePoint(terminator))
        terminator = m_input.peekWithoutReplacement(++size);

    // peekWithoutReplacement will return NUL when we hit the end of the
    // input. In that case we want to still use the rangeAt() fast path
    // below; only a NUL inside the input forces the slow path.
    const bool isEmbeddedNul = terminator == '\0' && m_input.offset() + size < m_input.length();
    if (!isEmbeddedNul && terminator != '\\') {
        const unsigned startOffset = m_input.offset();
        m_input.advance(size);
        return m_input.rangeAt(startOffset, size);
    }

    // Slow path: re-read from the start of the name, resolving escapes.
    StringBuilder result;
    for (;;) {
        const UChar cc = consume();
        if (isNameCodePoint(cc))
            result.append(cc);
        else if (twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0)))
            result.appendCharacter(consumeEscape());
        else {
            reconsume(cc);
            return registerString(result.toString());
        }
    }
}
// http://dev.w3.org/csswg/css-syntax/#consume-an-escaped-code-point
// Consumes an escape whose backslash has already been consumed and returns the
// code point it denotes.
//  * One to six hex digits, optionally followed by a single white space unit,
//    denote the code point with that value. Zero, surrogates and values above
//    U+10FFFF yield the replacement character instead.
//  * The end of input yields the replacement character.
//  * Any other character denotes itself.
// The caller must not invoke this when the next character is a newline.
UChar32 CSSTokenizer::consumeEscape()
{
    UChar cc = consume();
    ASSERT(!isNewLine(cc));
    if (isASCIIHexDigit(cc)) {
        // Accumulate the value directly instead of collecting the digits in a
        // string and parsing it afterwards. At most six hex digits are read, so
        // the value is at most 0xFFFFFF and cannot overflow a UChar32.
        UChar32 codePoint = toASCIIHexValue(cc);
        for (unsigned consumedHexDigits = 1; consumedHexDigits < 6 && isASCIIHexDigit(m_input.peekWithoutReplacement(0)); ++consumedHexDigits)
            codePoint = codePoint * 16 + toASCIIHexValue(consume());

        consumeSingleWhitespaceIfNext();

        if (!codePoint || (0xD800 <= codePoint && codePoint <= 0xDFFF) || codePoint > 0x10FFFF)
            return replacementCharacter;
        return codePoint;
    }

    if (cc == kEndOfFileMarker)
        return replacementCharacter;
    return cc;
}
bool CSSTokenizer::nextTwoCharsAreValidEscape()
{
return twoCharsAreValidEscape(m_input.peekWithoutReplacement(0), m_input.peekWithoutReplacement(1));
}
// http://www.w3.org/TR/css3-syntax/#starts-with-a-number
// Returns whether |first| (already consumed) followed by the upcoming input
// starts a number: a digit, a sign followed by a digit or by ".digit", or a
// '.' followed by a digit.
bool CSSTokenizer::nextCharsAreNumber(UChar first)
{
    const UChar second = m_input.peekWithoutReplacement(0);
    if (isASCIIDigit(first))
        return true;

    switch (first) {
    case '+':
    case '-':
        if (isASCIIDigit(second))
            return true;
        return second == '.' && isASCIIDigit(m_input.peekWithoutReplacement(1));
    case '.':
        return isASCIIDigit(second);
    default:
        return false;
    }
}
bool CSSTokenizer::nextCharsAreNumber()
{
UChar first = consume();
bool areNumber = nextCharsAreNumber(first);
reconsume(first);
return areNumber;
}
// http://dev.w3.org/csswg/css-syntax/#would-start-an-identifier
// Returns whether |first| (already consumed) followed by the upcoming input
// would start an identifier.
bool CSSTokenizer::nextCharsAreIdentifier(UChar first)
{
    const UChar second = m_input.peekWithoutReplacement(0);
    if (isNameStartCodePoint(first))
        return true;
    if (twoCharsAreValidEscape(first, second))
        return true;
    if (first != '-')
        return false;

    // After a leading '-': a name-start code point, another '-', or a valid
    // escape formed by the two characters following the '-'.
    return isNameStartCodePoint(second) || second == '-' || nextTwoCharsAreValidEscape();
}
bool CSSTokenizer::nextCharsAreIdentifier()
{
UChar first = consume();
bool areIdentifier = nextCharsAreIdentifier(first);
reconsume(first);
return areIdentifier;
}
// Appends |string| to m_stringPool and returns a view of it.
// NOTE(review): presumably the pool entry is what keeps the characters behind
// the returned view alive for the tokenizer's lifetime -- confirm against the
// declaration of m_stringPool.
StringView CSSTokenizer::registerString(const String& string)
{
    m_stringPool.append(string);
    return StringView(string);
}
} // namespace WebCore