blob: 23f6f04270d23c9f6d106d7cfd256a966b52daed [file] [log] [blame]
/*
* Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
* Copyright (C) 2006, 2007, 2008 Apple Inc. All Rights Reserved.
* Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public License
* along with this library; see the file COPYING.LIB. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
* Boston, MA 02110-1301, USA.
*
*/
#include "config.h"
#include "lexer.h"
#include "dtoa.h"
#include "JSFunction.h"
#include "nodes.h"
#include "NodeInfo.h"
#include "JSGlobalObjectFunctions.h"
#include <ctype.h>
#include <limits.h>
#include <string.h>
#include <wtf/Assertions.h>
#include <wtf/unicode/Unicode.h>
using namespace WTF;
using namespace Unicode;
// we can't specify the namespace in yacc's C output, so do it here
using namespace JSC;
#ifndef KDE_USE_FINAL
#include "grammar.h"
#endif
#include "lookup.h"
#include "lexer.lut.h"
// a bridge for yacc from the C world to C++
int kjsyylex(void* lvalp, void* llocp, void* globalData)
{
return static_cast<JSGlobalData*>(globalData)->lexer->lex(lvalp, llocp);
}
namespace JSC {
// Forward declaration; the definition appears further down in this file.
static bool isDecimalDigit(int);
// Capacity reserved up front for the m_buffer8 and m_buffer16 scratch buffers.
static const size_t initialReadBufferCapacity = 32;
// Capacity reserved up front for the m_strings and m_identifiers tables.
static const size_t initialStringTableCapacity = 64;
// Constructs a lexer bound to globalData with no source attached: the
// look-ahead window and offsets are zeroed, the line counter starts at 1 and
// m_mainTable is copy-initialised from JSC::mainTable. setCode() attaches a
// source and primes the look-ahead window.
Lexer::Lexer(JSGlobalData* globalData)
    : yylineno(1)
    , m_restrKeyword(false)
    , m_eatNextIdentifier(false)
    , m_stackToken(-1)
    , m_lastToken(-1)
    , m_position(0)
    , m_code(0)
    , m_length(0)
    , m_atLineStart(true)
    , m_current(0)
    , m_next1(0)
    , m_next2(0)
    , m_next3(0)
    , m_currentOffset(0)
    , m_nextOffset1(0)
    , m_nextOffset2(0)
    , m_nextOffset3(0)
    , m_globalData(globalData)
    , m_mainTable(JSC::mainTable)
{
    // These members are otherwise first written by setCode(), lex() or
    // setDone(); give them defined values so that nothing observes an
    // indeterminate value on a freshly constructed lexer. They are assigned
    // here rather than in the initializer list because the member declaration
    // order is not visible from this file.
    m_delimited = false;
    m_skipLF = false;
    m_skipCR = false;
    m_error = false;
    m_done = false;
    m_terminator = false;
    m_state = Start;

    m_buffer8.reserveCapacity(initialReadBufferCapacity);
    m_buffer16.reserveCapacity(initialReadBufferCapacity);
    m_strings.reserveCapacity(initialStringTableCapacity);
    m_identifiers.reserveCapacity(initialStringTableCapacity);
}
// Calls deleteTable() on m_mainTable, the lexer's own copy of JSC::mainTable
// that lex() consults for keyword lookup.
Lexer::~Lexer()
{
m_mainTable.deleteTable();
}
// Attaches a new source to the lexer and resets all per-source scanning state.
// startingLineNumber becomes the current line number; the four-character
// look-ahead window is then filled from the start of the source.
void Lexer::setCode(int startingLineNumber, PassRefPtr<SourceProvider> source)
{
    // Input buffer.
    m_source = source;
    m_code = m_source->data();
    m_length = m_source->length();
    m_position = 0;

    // Line tracking.
    yylineno = startingLineNumber;
    m_atLineStart = true;
    m_skipLF = false;
    m_skipCR = false;

    // Token history and semicolon-insertion flags.
    m_stackToken = -1;
    m_lastToken = -1;
    m_restrKeyword = false;
    m_delimited = false;
    m_eatNextIdentifier = false;
    m_error = false;

    // Four shifts push the first characters through m_next3..m_current.
    shift(4);
}
// Advances the look-ahead window (m_current, m_next1..m_next3 and the matching
// offsets) by p characters. Past the end of the source the incoming character
// is -1 while m_position keeps counting up.
void Lexer::shift(unsigned p)
{
    // ECMA-262 calls for stripping Cf characters here, but we only do this for BOM,
    // see <https://bugs.webkit.org/show_bug.cgi?id=4931>.
    for (; p; --p) {
        // Slide each character and its offset one slot towards the front.
        m_current = m_next1;
        m_currentOffset = m_nextOffset1;
        m_next1 = m_next2;
        m_nextOffset1 = m_nextOffset2;
        m_next2 = m_next3;
        m_nextOffset2 = m_nextOffset3;

        // Pull in the next character, skipping any run of BOMs (0xFEFF).
        for (;;) {
            m_nextOffset3 = m_position;
            if (m_position >= m_length) {
                m_position++;
                m_next3 = -1;
                break;
            }
            m_next3 = m_code[m_position++];
            if (m_next3 != 0xFEFF)
                break;
        }
    }
}
// called on each new line
// Bookkeeping for a consumed line terminator: the following character is at
// the start of a line and the line counter moves forward by one.
void Lexer::nextLine()
{
    m_atLineStart = true;
    ++yylineno;
}
// Ends the current scan: flags the scanning loop in lex() to stop and records
// s as the final state of the token.
void Lexer::setDone(State s)
{
    m_done = true;
    m_state = s;
}
// Scans the next token from the current source and returns its token code.
// p1 is cast to YYSTYPE* and receives the token's value (ident, doubleValue,
// or intValue filled in by matchPunctuator); p2 is cast to YYLTYPE* and
// receives the line number and the start/end offsets of the token.
// Returns 0 at end of input, and -1 (after setting m_error) when the input
// cannot be tokenized.
int Lexer::lex(void* p1, void* p2)
{
YYSTYPE* lvalp = static_cast<YYSTYPE*>(p1);
YYLTYPE* llocp = static_cast<YYLTYPE*>(p2);
// Per-token scratch state is reset on every call.
int token = 0;
m_state = Start;
unsigned short stringType = 0; // either single or double quotes
m_buffer8.clear();
m_buffer16.clear();
m_done = false;
m_terminator = false;
m_skipLF = false;
m_skipCR = false;
// did we push a token on the stack previously ?
// (after an automatic semicolon insertion)
// NOTE(review): m_stackToken is left at 0 rather than -1, and 0 >= 0, so this
// branch is taken again on every later call until setCode() resets it --
// presumably so that end-of-input keeps being reported; confirm.
if (m_stackToken >= 0) {
setDone(Other);
token = m_stackToken;
m_stackToken = 0;
}
int startOffset = m_currentOffset;
// State machine: each iteration looks at m_current (plus look-ahead) and either
// changes m_state or calls setDone(); unless done, one character is consumed at
// the bottom of the loop.
while (!m_done) {
if (m_skipLF && m_current != '\n') // found \r but not \n afterwards
m_skipLF = false;
if (m_skipCR && m_current != '\r') // found \n but not \r afterwards
m_skipCR = false;
if (m_skipLF || m_skipCR) { // found \r\n or \n\r -> eat the second one
m_skipLF = false;
m_skipCR = false;
shift(1);
}
switch (m_state) {
case Start:
// Re-captured on every pass through Start, so skipped whitespace and
// comments are not included in the token's start offset.
startOffset = m_currentOffset;
if (isWhiteSpace()) {
// do nothing
} else if (m_current == '/' && m_next1 == '/') {
shift(1);
m_state = InSingleLineComment;
} else if (m_current == '/' && m_next1 == '*') {
shift(1);
m_state = InMultiLineComment;
} else if (m_current == -1) {
// -1 is the end-of-input marker produced by shift().
if (!m_terminator && !m_delimited) {
// automatic semicolon insertion if program incomplete
token = ';';
// Queue token 0 so the call after the inserted ';' returns end of input.
m_stackToken = 0;
setDone(Other);
} else
setDone(Eof);
} else if (isLineTerminator()) {
nextLine();
m_terminator = true;
// A line break after continue/break/return/throw yields a ';' token.
if (m_restrKeyword) {
token = ';';
setDone(Other);
}
} else if (m_current == '"' || m_current == '\'') {
m_state = InString;
// Remember which quote opened the string so the matching one closes it.
stringType = static_cast<unsigned short>(m_current);
} else if (isIdentStart(m_current)) {
record16(m_current);
m_state = InIdentifierOrKeyword;
} else if (m_current == '\\')
m_state = InIdentifierStartUnicodeEscapeStart;
else if (m_current == '0') {
record8(m_current);
m_state = InNum0;
} else if (isDecimalDigit(m_current)) {
record8(m_current);
m_state = InNum;
} else if (m_current == '.' && isDecimalDigit(m_next1)) {
record8(m_current);
m_state = InDecimal;
// <!-- marks the beginning of a line comment (for www usage)
} else if (m_current == '<' && m_next1 == '!' && m_next2 == '-' && m_next3 == '-') {
shift(3);
m_state = InSingleLineComment;
// same for -->
} else if (m_atLineStart && m_current == '-' && m_next1 == '-' && m_next2 == '>') {
shift(2);
m_state = InSingleLineComment;
} else {
// matchPunctuator() consumes the characters itself when it finds a match.
token = matchPunctuator(lvalp->intValue, m_current, m_next1, m_next2, m_next3);
if (token != -1)
setDone(Other);
else
setDone(Bad);
}
break;
case InString:
if (m_current == stringType) {
// Closing quote: consume it and finish the string.
shift(1);
setDone(String);
} else if (isLineTerminator() || m_current == -1)
// Unterminated string literal.
setDone(Bad);
else if (m_current == '\\')
m_state = InEscapeSequence;
else
record16(m_current);
break;
// Escape Sequences inside of strings
case InEscapeSequence:
if (isOctalDigit(m_current)) {
// Octal escape: three digits when the first is 0-3, otherwise two if
// available, otherwise one.
if (m_current >= '0' && m_current <= '3' &&
isOctalDigit(m_next1) && isOctalDigit(m_next2)) {
record16(convertOctal(m_current, m_next1, m_next2));
shift(2);
m_state = InString;
} else if (isOctalDigit(m_current) && isOctalDigit(m_next1)) {
record16(convertOctal('0', m_current, m_next1));
shift(1);
m_state = InString;
} else if (isOctalDigit(m_current)) {
record16(convertOctal('0', '0', m_current));
m_state = InString;
} else
// Not reachable: the enclosing test already established isOctalDigit(m_current).
setDone(Bad);
} else if (m_current == 'x')
m_state = InHexEscape;
else if (m_current == 'u')
m_state = InUnicodeEscape;
else if (isLineTerminator()) {
// Backslash followed by a line terminator: nothing is recorded.
nextLine();
m_state = InString;
} else {
record16(singleEscape(static_cast<unsigned short>(m_current)));
m_state = InString;
}
break;
case InHexEscape:
if (isHexDigit(m_current) && isHexDigit(m_next1)) {
m_state = InString;
record16(convertHex(m_current, m_next1));
shift(1);
} else if (m_current == stringType) {
// "\x" directly before the closing quote is kept as a literal 'x'.
record16('x');
shift(1);
setDone(String);
} else {
// "\x" without two hex digits: keep the 'x' and the current character literally.
record16('x');
record16(m_current);
m_state = InString;
}
break;
case InUnicodeEscape:
if (isHexDigit(m_current) && isHexDigit(m_next1) && isHexDigit(m_next2) && isHexDigit(m_next3)) {
record16(convertUnicode(m_current, m_next1, m_next2, m_next3));
shift(3);
m_state = InString;
} else if (m_current == stringType) {
// "\u" directly before the closing quote is kept as a literal 'u'.
record16('u');
shift(1);
setDone(String);
} else
// Any other malformed \u escape is a lexical error.
setDone(Bad);
break;
case InSingleLineComment:
if (isLineTerminator()) {
nextLine();
m_terminator = true;
if (m_restrKeyword) {
token = ';';
setDone(Other);
} else
m_state = Start;
} else if (m_current == -1)
setDone(Eof);
break;
case InMultiLineComment:
if (m_current == -1)
// Input ended inside a block comment.
setDone(Bad);
else if (isLineTerminator())
nextLine();
else if (m_current == '*' && m_next1 == '/') {
m_state = Start;
shift(1);
}
break;
case InIdentifierOrKeyword:
case InIdentifier:
// Both states scan identifier characters; they differ only in the final
// state reported, which decides whether a keyword lookup is performed.
if (isIdentPart(m_current))
record16(m_current);
else if (m_current == '\\')
m_state = InIdentifierPartUnicodeEscapeStart;
else
setDone(m_state == InIdentifierOrKeyword ? IdentifierOrKeyword : Identifier);
break;
case InNum0:
// A leading '0' has been recorded; the next character selects hex,
// octal, decimal fraction, exponent, or a plain zero.
if (m_current == 'x' || m_current == 'X') {
record8(m_current);
m_state = InHex;
} else if (m_current == '.') {
record8(m_current);
m_state = InDecimal;
} else if (m_current == 'e' || m_current == 'E') {
record8(m_current);
m_state = InExponentIndicator;
} else if (isOctalDigit(m_current)) {
record8(m_current);
m_state = InOctal;
} else if (isDecimalDigit(m_current)) {
record8(m_current);
m_state = InDecimal;
} else
setDone(Number);
break;
case InHex:
if (isHexDigit(m_current))
record8(m_current);
else
setDone(Hex);
break;
case InOctal:
if (isOctalDigit(m_current))
record8(m_current);
else if (isDecimalDigit(m_current)) {
// An 8 or 9 switches the literal over to decimal scanning.
record8(m_current);
m_state = InDecimal;
} else
setDone(Octal);
break;
case InNum:
if (isDecimalDigit(m_current))
record8(m_current);
else if (m_current == '.') {
record8(m_current);
m_state = InDecimal;
} else if (m_current == 'e' || m_current == 'E') {
record8(m_current);
m_state = InExponentIndicator;
} else
setDone(Number);
break;
case InDecimal:
if (isDecimalDigit(m_current))
record8(m_current);
else if (m_current == 'e' || m_current == 'E') {
record8(m_current);
m_state = InExponentIndicator;
} else
setDone(Number);
break;
case InExponentIndicator:
// After 'e'/'E': signs are recorded and the state is kept; a digit moves on
// to InExponent; anything else is an error.
if (m_current == '+' || m_current == '-')
record8(m_current);
else if (isDecimalDigit(m_current)) {
record8(m_current);
m_state = InExponent;
} else
setDone(Bad);
break;
case InExponent:
if (isDecimalDigit(m_current))
record8(m_current);
else
setDone(Number);
break;
case InIdentifierStartUnicodeEscapeStart:
// A backslash starting an identifier must be followed by 'u'.
if (m_current == 'u')
m_state = InIdentifierStartUnicodeEscape;
else
setDone(Bad);
break;
case InIdentifierPartUnicodeEscapeStart:
// A backslash inside an identifier must be followed by 'u'.
if (m_current == 'u')
m_state = InIdentifierPartUnicodeEscape;
else
setDone(Bad);
break;
case InIdentifierStartUnicodeEscape:
if (!isHexDigit(m_current) || !isHexDigit(m_next1) || !isHexDigit(m_next2) || !isHexDigit(m_next3)) {
setDone(Bad);
break;
}
// token is used here as scratch storage for the decoded character.
token = convertUnicode(m_current, m_next1, m_next2, m_next3);
shift(3);
if (!isIdentStart(token)) {
setDone(Bad);
break;
}
record16(token);
m_state = InIdentifier;
break;
case InIdentifierPartUnicodeEscape:
if (!isHexDigit(m_current) || !isHexDigit(m_next1) || !isHexDigit(m_next2) || !isHexDigit(m_next3)) {
setDone(Bad);
break;
}
// token is used here as scratch storage for the decoded character.
token = convertUnicode(m_current, m_next1, m_next2, m_next3);
shift(3);
if (!isIdentPart(token)) {
setDone(Bad);
break;
}
record16(token);
m_state = InIdentifier;
break;
default:
ASSERT(!"Unhandled state in switch statement");
}
// move on to the next character
if (!m_done)
shift(1);
// m_atLineStart survives only while whitespace and single-line comments are
// being skipped; it gates the "-->" comment check in the Start state.
if (m_state != Start && m_state != InSingleLineComment)
m_atLineStart = false;
}
// no identifiers allowed directly after numeric literal, e.g. "3in" is bad
if ((m_state == Number || m_state == Octal || m_state == Hex) && isIdentStart(m_current))
m_state = Bad;
// terminate string
m_buffer8.append('\0');
#ifdef KJS_DEBUG_LEX
fprintf(stderr, "line: %d ", lineNo());
fprintf(stderr, "yytext (%x): ", m_buffer8[0]);
fprintf(stderr, "%s ", m_buffer8.data());
#endif
// Convert the text collected in m_buffer8 to a double; Hex and Octal are
// folded into the Number state afterwards.
double dval = 0;
if (m_state == Number)
dval = strtod(m_buffer8.data(), 0L);
else if (m_state == Hex) { // scan hex numbers
// Skip the two-character "0x"/"0X" prefix.
const char* p = m_buffer8.data() + 2;
while (char c = *p++) {
dval *= 16;
dval += convertHex(c);
}
// p now points one past the terminating NUL, so p - (data + 3) is the
// number of hex digits handed to parseIntOverflow.
if (dval >= mantissaOverflowLowerBound)
dval = parseIntOverflow(m_buffer8.data() + 2, p - (m_buffer8.data() + 3), 16);
m_state = Number;
} else if (m_state == Octal) { // scan octal number
// Skip the leading '0'.
const char* p = m_buffer8.data() + 1;
while (char c = *p++) {
dval *= 8;
dval += c - '0';
}
// As above: p - (data + 2) is the number of octal digits.
if (dval >= mantissaOverflowLowerBound)
dval = parseIntOverflow(m_buffer8.data() + 1, p - (m_buffer8.data() + 2), 8);
m_state = Number;
}
#ifdef KJS_DEBUG_LEX
switch (m_state) {
case Eof:
printf("(EOF)\n");
break;
case Other:
printf("(Other)\n");
break;
case Identifier:
printf("(Identifier)/(Keyword)\n");
break;
case String:
printf("(String)\n");
break;
case Number:
printf("(Number)\n");
break;
default:
printf("(unknown)");
}
#endif
// NOTE(review): an identifier scanned without a unicode escape finishes in
// state IdentifierOrKeyword, not Identifier, so this also clears the flag for
// such identifiers -- confirm that is intended.
if (m_state != Identifier)
m_eatNextIdentifier = false;
m_restrKeyword = false;
m_delimited = false;
// Report the token's position: current line, plus start and end offsets.
llocp->first_line = yylineno;
llocp->last_line = yylineno;
llocp->first_column = startOffset;
llocp->last_column = m_currentOffset;
// Map the final scanner state to a token code and semantic value.
switch (m_state) {
case Eof:
token = 0;
break;
case Other:
// Remember a trailing '}' or ';' so that no extra ';' is inserted at end of input.
if (token == '}' || token == ';')
m_delimited = true;
break;
case Identifier:
// Apply anonymous-function hack below (eat the identifier).
if (m_eatNextIdentifier) {
m_eatNextIdentifier = false;
token = lex(lvalp, llocp);
break;
}
lvalp->ident = makeIdentifier(m_buffer16);
token = IDENT;
break;
case IdentifierOrKeyword: {
lvalp->ident = makeIdentifier(m_buffer16);
const HashEntry* entry = m_mainTable.entry(m_globalData, *lvalp->ident);
if (!entry) {
// Lookup for keyword failed, means this is an identifier.
token = IDENT;
break;
}
token = entry->integerValue;
// Hack for "f = function somename() { ... }"; too hard to get into the grammar.
m_eatNextIdentifier = token == FUNCTION && m_lastToken == '=';
// These keywords make a following line terminator produce a ';' token.
if (token == CONTINUE || token == BREAK || token == RETURN || token == THROW)
m_restrKeyword = true;
break;
}
case String:
// Atomize constant strings in case they're later used in property lookup.
lvalp->ident = makeIdentifier(m_buffer16);
token = STRING;
break;
case Number:
lvalp->doubleValue = dval;
token = NUMBER;
break;
case Bad:
#ifdef KJS_DEBUG_LEX
fprintf(stderr, "yylex: ERROR.\n");
#endif
m_error = true;
return -1;
default:
ASSERT(!"unhandled numeration value in switch");
m_error = true;
return -1;
}
// Error returns above deliberately bypass this; only real tokens are remembered.
m_lastToken = token;
return token;
}
// True when the current character is a tab, vertical tab (0x0b), form feed
// (0x0c), or anything isSeparatorSpace() accepts.
bool Lexer::isWhiteSpace() const
{
    switch (m_current) {
    case '\t':
    case 0x0b:
    case 0x0c:
        return true;
    default:
        return isSeparatorSpace(m_current);
    }
}
// True when the current character is CR, LF, U+2028 or U+2029.
// Side effect: a CR arms m_skipLF and an LF arms m_skipCR, so that lex() can
// swallow the second half of a two-character line ending.
bool Lexer::isLineTerminator()
{
    switch (m_current) {
    case '\r':
        m_skipLF = true;
        return true;
    case '\n':
        m_skipCR = true;
        return true;
    case 0x2028:
    case 0x2029:
        return true;
    default:
        return false;
    }
}
// True when c may begin an identifier: any character in one of the Unicode
// letter categories below, or '$' or '_'.
bool Lexer::isIdentStart(int c)
{
    static const int startCategories = Letter_Uppercase | Letter_Lowercase | Letter_Titlecase
        | Letter_Modifier | Letter_Other;
    if (category(c) & startCategories)
        return true;
    return c == '$' || c == '_';
}
// True when c may continue an identifier: the letter categories, combining
// marks, decimal digits and connector punctuation, or '$' or '_'.
bool Lexer::isIdentPart(int c)
{
    static const int partCategories = Letter_Uppercase | Letter_Lowercase | Letter_Titlecase
        | Letter_Modifier | Letter_Other
        | Mark_NonSpacing | Mark_SpacingCombining
        | Number_DecimalDigit | Punctuation_Connector;
    if (category(c) & partCategories)
        return true;
    return c == '$' || c == '_';
}
// True for the ASCII digits '0' through '9' only.
static bool isDecimalDigit(int c)
{
    if (c < '0')
        return false;
    return c <= '9';
}
// True for ASCII hexadecimal digits: 0-9, a-f and A-F.
bool Lexer::isHexDigit(int c)
{
    if (c >= '0' && c <= '9')
        return true;
    if (c >= 'a' && c <= 'f')
        return true;
    return c >= 'A' && c <= 'F';
}
// True for the ASCII digits '0' through '7'.
bool Lexer::isOctalDigit(int c)
{
    if (c < '0')
        return false;
    return c <= '7';
}
// Tries to read a punctuator that starts at c1, with c2..c4 as look-ahead.
// Longer operators are tried before their shorter prefixes. On a match the
// characters are consumed with shift() and the token code is returned; for
// '{' and '}' charPos additionally receives m_position - 4, computed before
// the brace is consumed. Returns -1, consuming nothing, when c1 does not start
// a punctuator.
int Lexer::matchPunctuator(int& charPos, int c1, int c2, int c3, int c4)
{
    struct ThreeCharOperator {
        int first;
        int second;
        int third;
        int token;
    };
    struct TwoCharOperator {
        int first;
        int second;
        int token;
    };

    static const ThreeCharOperator threeCharOperators[] = {
        { '=', '=', '=', STREQ },
        { '!', '=', '=', STRNEQ },
        { '>', '>', '>', URSHIFT },
        { '<', '<', '=', LSHIFTEQUAL },
        { '>', '>', '=', RSHIFTEQUAL },
    };
    static const TwoCharOperator twoCharOperators[] = {
        { '<', '=', LE },
        { '>', '=', GE },
        { '!', '=', NE },
        { '=', '=', EQEQ },
        { '+', '=', PLUSEQUAL },
        { '-', '=', MINUSEQUAL },
        { '*', '=', MULTEQUAL },
        { '/', '=', DIVEQUAL },
        { '&', '=', ANDEQUAL },
        { '^', '=', XOREQUAL },
        { '%', '=', MODEQUAL },
        { '|', '=', OREQUAL },
        { '<', '<', LSHIFT },
        { '>', '>', RSHIFT },
        { '&', '&', AND },
        { '|', '|', OR },
    };

    // The only four-character operator.
    if (c1 == '>' && c2 == '>' && c3 == '>' && c4 == '=') {
        shift(4);
        return URSHIFTEQUAL;
    }

    for (size_t i = 0; i < sizeof(threeCharOperators) / sizeof(threeCharOperators[0]); ++i) {
        const ThreeCharOperator& candidate = threeCharOperators[i];
        if (c1 == candidate.first && c2 == candidate.second && c3 == candidate.third) {
            shift(3);
            return candidate.token;
        }
    }

    // "++" and "--" map to the AUTO variants when a line terminator preceded them.
    if ((c1 == '+' || c1 == '-') && c2 == c1) {
        shift(2);
        if (c1 == '+')
            return m_terminator ? AUTOPLUSPLUS : PLUSPLUS;
        return m_terminator ? AUTOMINUSMINUS : MINUSMINUS;
    }

    for (size_t i = 0; i < sizeof(twoCharOperators) / sizeof(twoCharOperators[0]); ++i) {
        const TwoCharOperator& candidate = twoCharOperators[i];
        if (c1 == candidate.first && c2 == candidate.second) {
            shift(2);
            return candidate.token;
        }
    }

    // Single-character punctuators.
    switch (c1) {
    case '{':
        charPos = m_position - 4;
        shift(1);
        return OPENBRACE;
    case '}':
        charPos = m_position - 4;
        shift(1);
        return CLOSEBRACE;
    case '(':
    case ')':
    case '[':
    case ']':
    case ';':
    case ',':
    case '.':
    case '?':
    case ':':
    case '~':
    case '!':
    case '=':
    case '<':
    case '>':
    case '+':
    case '-':
    case '*':
    case '/':
    case '%':
    case '&':
    case '|':
    case '^':
        // These are returned as their own character code.
        shift(1);
        return c1;
    default:
        return -1;
    }
}
// Maps the character following a backslash in a string literal to the
// character it denotes. Characters without a special meaning are returned
// unchanged.
unsigned short Lexer::singleEscape(unsigned short c)
{
    static const unsigned short escapes[][2] = {
        { 'b', 0x08 },
        { 't', 0x09 },
        { 'n', 0x0A },
        { 'v', 0x0B },
        { 'f', 0x0C },
        { 'r', 0x0D },
        { '"', 0x22 },
        { '\'', 0x27 },
        { '\\', 0x5C },
    };
    for (size_t i = 0; i < sizeof(escapes) / sizeof(escapes[0]); ++i) {
        if (escapes[i][0] == c)
            return escapes[i][1];
    }
    return c;
}
// Combines three octal digit characters (most significant first) into their
// numeric value. The arguments are not validated here.
unsigned short Lexer::convertOctal(int c1, int c2, int c3)
{
    const int high = (c1 - '0') * 64;
    const int middle = (c2 - '0') * 8;
    return static_cast<unsigned short>(high + middle + c3 - '0');
}
// Converts one hexadecimal digit character to its value (0-15). Anything that
// is not 0-9 or a-f is treated as an upper-case hex letter without validation.
unsigned char Lexer::convertHex(int c)
{
    int value;
    if (c >= '0' && c <= '9')
        value = c - '0';
    else if (c >= 'a' && c <= 'f')
        value = c - 'a' + 10;
    else
        value = c - 'A' + 10;
    return static_cast<unsigned char>(value);
}
// Combines two hexadecimal digit characters into one byte: c1 supplies the
// high nibble and c2 the low nibble.
unsigned char Lexer::convertHex(int c1, int c2)
{
    const unsigned char highNibble = convertHex(c1);
    const unsigned char lowNibble = convertHex(c2);
    return (highNibble << 4) + lowNibble;
}
// Builds a 16-bit code unit from four hexadecimal digit characters, most
// significant digit first.
UChar Lexer::convertUnicode(int c1, int c2, int c3, int c4)
{
    const unsigned char upper = convertHex(c1, c2);
    const unsigned char lower = convertHex(c3, c4);
    return (upper << 8 | lower);
}
// Appends c to the 8-bit buffer. c is asserted to fit in a single byte.
void Lexer::record8(int c)
{
    ASSERT(c >= 0);
    ASSERT(c <= 0xff);
    const char narrowed = static_cast<char>(c);
    m_buffer8.append(narrowed);
}
void Lexer::record16(int c)
{
ASSERT(c >= 0);
ASSERT(c <= USHRT_MAX);
record16(UChar(static_cast<unsigned short>(c)));
}
// Appends one 16-bit code unit to m_buffer16, the buffer that lex() and
// scanRegExp() use to collect identifier, string and regular-expression text.
void Lexer::record16(UChar c)
{
m_buffer16.append(c);
}
// Scans the remainder of a regular expression literal starting at the current
// character. The body up to the closing '/' is stored in m_pattern and the
// identifier characters that follow are stored in m_flags. A '/' does not end
// the literal when it is escaped by a backslash or sits inside a [...] class.
// Returns false if a line terminator or the end of input is reached first.
bool Lexer::scanRegExp()
{
    m_buffer16.clear();

    bool escaped = false;
    bool inClass = false;
    for (;;) {
        if (isLineTerminator() || m_current == -1)
            return false;

        if (m_current == '/' && !escaped && !inClass) {
            // end of regexp
            m_pattern = UString(m_buffer16);
            m_buffer16.clear();
            shift(1);
            break;
        }

        if (escaped)
            escaped = false;
        else {
            // keep track of '[' and ']'
            if (m_current == '[')
                inClass = true;
            else if (m_current == ']')
                inClass = false;
            escaped = m_current == '\\';
        }
        record16(m_current);
        shift(1);
    }

    // Everything that can be part of an identifier is taken as a flag.
    for (; isIdentPart(m_current); shift(1))
        record16(m_current);
    m_flags = UString(m_buffer16);
    return true;
}
void Lexer::clear()
{
deleteAllValues(m_strings);
Vector<UString*> newStrings;
newStrings.reserveCapacity(initialStringTableCapacity);
m_strings.swap(newStrings);
deleteAllValues(m_identifiers);
Vector<JSC::Identifier*> newIdentifiers;
newIdentifiers.reserveCapacity(initialStringTableCapacity);
m_identifiers.swap(newIdentifiers);
Vector<char> newBuffer8;
newBuffer8.reserveCapacity(initialReadBufferCapacity);
m_buffer8.swap(newBuffer8);
Vector<UChar> newBuffer16;
newBuffer16.reserveCapacity(initialReadBufferCapacity);
m_buffer16.swap(newBuffer16);
m_pattern = 0;
m_flags = 0;
}
// Creates a heap-allocated Identifier from the characters in buffer and
// registers it in m_identifiers, whose values clear() later deletes. The
// returned pointer is the registered object.
Identifier* Lexer::makeIdentifier(const Vector<UChar>& buffer)
{
    m_identifiers.append(new JSC::Identifier(m_globalData, buffer.data(), buffer.size()));
    return m_identifiers.last();
}
} // namespace JSC