// WebCore/src/kdelibs/kjs/lexer.cpp - WebKit - Git at Google

 /*
  *  This file is part of the KDE libraries
  *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
  *
  *  This library is free software; you can redistribute it and/or
  *  modify it under the terms of the GNU Library General Public
  *  License as published by the Free Software Foundation; either
  *  version 2 of the License, or (at your option) any later version.
  *
  *  This library is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  Library General Public License for more details.
  *
  *  You should have received a copy of the GNU Library General Public License
  *  along with this library; see the file COPYING.LIB.  If not, write to
  *  the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  *  Boston, MA 02111-1307, USA.
  */

 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif

 #include <ctype.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
 #include <assert.h>

 #include "kjs.h"
 #include "nodes.h"
 #include "lexer.h"
 #include "ustring.h"
 #include "lookup.h"
 #include "internal.h"

 // we can't specify the namespace in yacc's C output, so do it here
 using namespace KJS;

 #ifndef KDE_USE_FINAL
 #include "grammar.h"
 #endif

 #include "lexer.lut.h"

 #ifdef KJS_DEBUGGER
 extern YYLTYPE yylloc;	// global bison variable holding token info
 #endif

 // a bridge for yacc from the C world to C++
 int kjsyylex()
 {
   return Lexer::curr()->lex();
 }

 // Creates a lexer with no source text attached (code == 0, length == 0) and
 // allocates the two token buffers: buffer8 collects the characters of
 // numeric literals, buffer16 collects identifier/string/regexp characters.
 // Both start at 128 elements and are grown by record8()/record16().
 Lexer::Lexer()
   : yylineno(0),
     size8(128), size16(128), restrKeyword(false),
     stackToken(-1), pos(0),
     code(0), length(0),
 #ifndef KJS_PURE_ECMA
     bol(true),
 #endif
     current(0), next1(0), next2(0), next3(0)
 {
   // lex() consults 'delimited' when it reaches the end of the input, and
   // that flag is otherwise first assigned in setCode(). Give it (and the
   // other per-token scanning fields) a defined value so that calling lex()
   // on a freshly constructed lexer does not read an indeterminate member.
   // These are assigned here rather than in the initializer list so this
   // definition does not depend on the declaration order in lexer.h.
   delimited = false;
   terminator = false;
   done = false;
   state = Start;
   pos8 = pos16 = 0;

   // allocate space for read buffers
   buffer8 = new char[size8];
   buffer16 = new UChar[size16];
 }

 // Frees the token buffers that the constructor allocated and that
 // record8()/record16() may have replaced with larger ones.
 Lexer::~Lexer()
 {
   delete [] buffer16;
   delete [] buffer8;
 }

 // Returns the lexer held in the 'lex' member of the object returned by
 // KJScriptImp::current(). Asserts that KJScriptImp::current() is non-null;
 // with assertions compiled out a null value would be dereferenced.
 Lexer *Lexer::curr()
 {
   assert(KJScriptImp::current());
   return KJScriptImp::current()->lex;
 }

 // Attaches a new piece of source text to the lexer and rewinds all scanning
 // state. Only the pointer 'c' is stored (the characters are not copied);
 // 'len' is the number of UChar elements available behind it.
 void Lexer::setCode(const UChar *c, unsigned int len)
 {
   // remember the input
   code = c;
   length = len;
   pos = 0;

   // forget everything left over from a previous script
   yylineno = 0;
   stackToken = -1;
   restrKeyword = false;
   delimited = false;
 #ifndef KJS_PURE_ECMA
   bol = true;
 #endif

   // fill the four-character window; positions past the end read as 0
   current = next1 = next2 = next3 = 0;
   if (length > 0)
     current = code[0].unicode();
   if (length > 1)
     next1 = code[1].unicode();
   if (length > 2)
     next2 = code[2].unicode();
   if (length > 3)
     next3 = code[3].unicode();
 }

 // Advances the input position by 'p' characters, sliding the
 // current/next1/next2/next3 window along. Slots that would lie beyond the
 // end of the input are filled with 0.
 void Lexer::shift(unsigned int p)
 {
   for (unsigned int step = 0; step < p; ++step) {
     ++pos;
     current = next1;
     next1 = next2;
     next2 = next3;
     if (pos + 3 < length)
       next3 = code[pos + 3].unicode();
     else
       next3 = 0;
   }
 }

 // Ends the scanning loop in lex(): raises the 'done' flag and records 's'
 // as the resulting state.
 void Lexer::setDone(State s)
 {
   done = true;
   state = s;
 }

 // Scans the attached source text and returns the next token for the parser.
 //
 // The scanner is a state machine: it starts in Start and, once per loop
 // iteration, inspects 'current' (with next1..next3 as lookahead), then
 // either changes state or finishes through setDone(). Unless a branch
 // finished the token, one character is consumed at the bottom of the loop.
 //
 // Return value:
 //   0        end of input (state Eof, or a pushed-back token value of 0)
 //   IDENT    identifier; kjsyylval.ustr points to a newly allocated UString
 //   STRING   string literal; kjsyylval.ustr as above
 //   NUMBER   decimal, hex or octal literal; kjsyylval.dval holds the value
 //   -1       lexical error (state Bad); a message is written to stderr
 //   other    the keyword code found in mainTable, or the value produced by
 //            matchPunctuator()
 //
 // Side effects: updates yylineno, restrKeyword, delimited, stackToken and
 // (with KJS_DEBUGGER) yylloc.
 int Lexer::lex()
 {
   int token = 0;
   state = Start;
   unsigned short stringType = 0; // either single or double quotes
   pos8 = pos16 = 0;
   done = false;
   terminator = false;

   // did we push a token on the stack previously ?
   // (after an automatic semicolon insertion)
   // NOTE(review): stackToken is reset to 0 here, not to -1, so the following
   // call takes this branch again and returns 0. That fits the end-of-input
   // push below (which pushes 0), but after a PLUSPLUS/MINUSMINUS pushed by
   // matchPunctuator() the call after the pushed token would also return 0
   // -- confirm whether that is intended.
   if (stackToken >= 0) {
     setDone(Other);
     token = stackToken;
     stackToken = 0;
   }

   while (!done) {
     switch (state) {
     case Start:
       if (isWhiteSpace()) {
 	// do nothing
       } else if (current == '/' && next1 == '/') {
 	// skip the first '/'; the second one is consumed at the loop bottom
 	shift(1);
 	state = InSingleLineComment;
       } else if (current == '/' && next1 == '*') {
 	shift(1);
 	state = InMultiLineComment;
       } else if (current == 0) {
 	// 0 is what the lookahead window holds past the end of the input
 	if (!terminator && !delimited) {
 	  // automatic semicolon insertion if program incomplete
 	  token = ';';
 	  stackToken = 0;
 	  setDone(Other);
 	} else
 	  setDone(Eof);
       } else if (isLineTerminator()) {
 	// NOTE(review): '\r' and '\n' are counted separately, so a "\r\n"
 	// pair advances yylineno by two.
 	yylineno++;
 #ifndef KJS_PURE_ECMA
 	bol = true;
 #endif
 	terminator = true;
 	// a line break directly after continue/break/return/throw
 	// ends the statement
 	if (restrKeyword) {
 	  token = ';';
 	  setDone(Other);
 	}
       } else if (current == '"' || current == '\'') {
 	state = InString;
 	stringType = current;
       } else if (isIdentLetter(current)) {
 	record16(current);
 	state = InIdentifier;
       } else if (current == '0') {
 	record8(current);
 	state = InNum0;
       } else if (isDecimalDigit(current)) {
 	record8(current);
 	state = InNum;
       } else if (current == '.' && isDecimalDigit(next1)) {
 	record8(current);
 	state = InDecimal;
 #ifndef KJS_PURE_ECMA
 	// <!-- marks the beginning of a line comment (for www usage)
       } else if (bol && current == '<' && next1 == '!' &&
 		 next2 == '-' && next3 == '-') {
 	shift(3);
 	state = InSingleLineComment;
 	// same of -->
       } else if (bol && current == '-' && next1 == '-' &&  next2 == '>') {
 	shift(2);
 	state = InSingleLineComment;
 #endif
       } else {
 	// matchPunctuator() consumes the characters of the punctuator itself
 	token = matchPunctuator(current, next1, next2, next3);
 	if (token != -1) {
 	  setDone(Other);
 	} else {
 	  //	  cerr << "encountered unknown character" << endl;
 	  setDone(Bad);
 	}
       }
       break;
     case InString:
       if (current == stringType) {
 	// closing quote: consume it and finish the literal
 	shift(1);
 	setDone(String);
       } else if (current == 0 || isLineTerminator()) {
 	// unterminated string literal
 	setDone(Bad);
       } else if (current == '\\') {
 	state = InEscapeSequence;
       } else {
 	record16(current);
       }
       break;
     // Escape Sequences inside of strings
     case InEscapeSequence:
       if (isOctalDigit(current)) {
 	// octal escape of three, two or one digit(s); a three-digit
 	// escape must start with 0..3
 	if (current >= '0' && current <= '3' &&
 	    isOctalDigit(next1) && isOctalDigit(next2)) {
 	  record16(convertOctal(current, next1, next2));
 	  shift(2);
 	  state = InString;
 	} else if (isOctalDigit(current) && isOctalDigit(next1)) {
 	  record16(convertOctal('0', current, next1));
 	  shift(1);
 	  state = InString;
 	} else if (isOctalDigit(current)) {
 	  record16(convertOctal('0', '0', current));
 	  state = InString;
 	} else {
 	  // not reachable: isOctalDigit(current) was already established
 	  // by the enclosing condition
 	  setDone(Bad);
 	}
       } else if (current == 'x')
 	state = InHexEscape;
       else if (current == 'u')
 	state = InUnicodeEscape;
       else {
 	record16(singleEscape(current));
 	state = InString;
       }
       break;
     case InHexEscape:
       if (isHexDigit(current) && isHexDigit(next1)) {
 	state = InString;
 	record16(convertHex(current, next1));
 	shift(1);
       } else if (current == stringType) {
 	// "\x" directly before the closing quote: keep a literal 'x'
 	record16('x');
 	shift(1);
 	setDone(String);
       } else {
 	// "\x" without two hex digits: keep 'x' and the character after it
 	record16('x');
 	record16(current);
 	state = InString;
       }
       break;
     case InUnicodeEscape:
       if (isHexDigit(current) && isHexDigit(next1) &&
 	  isHexDigit(next2) && isHexDigit(next3)) {
 	record16(convertUnicode(current, next1, next2, next3));
 	shift(3);
 	state = InString;
       } else if (current == stringType) {
 	// "\u" directly before the closing quote: keep a literal 'u'
 	record16('u');
 	shift(1);
 	setDone(String);
       } else {
 	setDone(Bad);
       }
       break;
     case InSingleLineComment:
       if (isLineTerminator()) {
 	yylineno++;
 	terminator = true;
 #ifndef KJS_PURE_ECMA
 	bol = true;
 #endif
 	if (restrKeyword) {
 	  token = ';';
 	  setDone(Other);
 	} else
 	  state = Start;
       } else if (current == 0) {
 	// input ends inside the comment; no semicolon is inserted here
 	setDone(Eof);
       }
       break;
     case InMultiLineComment:
       if (current == 0) {
 	// unterminated comment
 	setDone(Bad);
       } else if (isLineTerminator()) {
 	// counted, but 'terminator' is not set inside a multi-line comment
 	yylineno++;
       } else if (current == '*' && next1 == '/') {
 	state = Start;
 	shift(1);
       }
       break;
     case InIdentifier:
       if (isIdentLetter(current) || isDecimalDigit(current)) {
 	record16(current);
 	break;
       }
       setDone(Identifier);
       break;
     case InNum0:
       // a leading '0' has been recorded; decide between hex, octal,
       // a fraction, an exponent or the plain number 0
       if (current == 'x' || current == 'X') {
 	record8(current);
 	state = InHex;
       } else if (current == '.') {
 	record8(current);
 	state = InDecimal;
       } else if (current == 'e' || current == 'E') {
 	record8(current);
 	state = InExponentIndicator;
       } else if (isOctalDigit(current)) {
 	record8(current);
 	state = InOctal;
       } else {
 	setDone(Number);
       }
       break;
     case InHex:
       if (isHexDigit(current)) {
 	record8(current);
       } else {
 	setDone(Hex);
       }
       break;
     case InOctal:
       if (isOctalDigit(current)) {
 	record8(current);
       } else
 	setDone(Octal);
       break;
     case InNum:
       if (isDecimalDigit(current)) {
 	record8(current);
       } else if (current == '.') {
 	record8(current);
 	state = InDecimal;
       } else if (current == 'e' || current == 'E') {
 	record8(current);
 	state = InExponentIndicator;
       } else
 	setDone(Number);
       break;
     case InDecimal:
       if (isDecimalDigit(current)) {
 	record8(current);
       } else if (current == 'e' || current == 'E') {
 	record8(current);
 	state = InExponentIndicator;
       } else
 	setDone(Number);
       break;
     case InExponentIndicator:
       // the sign branch stays in this state, so at least one digit must
       // follow before the exponent is accepted
       if (current == '+' || current == '-') {
 	record8(current);
       } else if (isDecimalDigit(current)) {
 	record8(current);
 	state = InExponent;
       } else
 	setDone(Bad);
       break;
     case InExponent:
       if (isDecimalDigit(current)) {
 	record8(current);
       } else
 	setDone(Number);
       break;
     default:
       assert(!"Unhandled state in switch statement");
     }

     // move on to the next character
     if (!done)
       shift(1);
 #ifndef KJS_PURE_ECMA
     // bol survives only while nothing but whitespace, line breaks and
     // single-line comments has been seen
     if (state != Start && state != InSingleLineComment)
       bol = false;
 #endif
   }

   // no identifiers allowed directly after numeric literal, e.g. "3in" is bad
   if ((state == Number || state == Octal || state == Hex)
       && isIdentLetter(current))
     state = Bad;

   // terminate string
   buffer8[pos8] = '\0';

 #ifdef KJS_DEBUG_LEX
   fprintf(stderr, "line: %d ", lineNo());
   fprintf(stderr, "yytext (%x): ", buffer8[0]);
   fprintf(stderr, "%s ", buffer8);
 #endif

   // convert the recorded digits; Hex and Octal are folded into Number
   double dval = 0;
   if (state == Number) {
     dval = strtod(buffer8, 0L);
   } else if (state == Hex) { // scan hex numbers
     // TODO: support long unsigned int
     unsigned int i;
     sscanf(buffer8, "%x", &i);
     dval = i;
     state = Number;
   } else if (state == Octal) {   // scan octal number
     unsigned int ui;
     sscanf(buffer8, "%o", &ui);
     dval = ui;
     state = Number;
   }

 #ifdef KJS_DEBUG_LEX
   switch (state) {
   case Eof:
     printf("(EOF)\n");
     break;
   case Other:
     printf("(Other)\n");
     break;
   case Identifier:
     printf("(Identifier)/(Keyword)\n");
     break;
   case String:
     printf("(String)\n");
     break;
   case Number:
     printf("(Number)\n");
     break;
   default:
     printf("(unknown)");
   }
 #endif

   // both flags describe only the token returned by this call
   restrKeyword = false;
   delimited = false;
 #ifdef KJS_DEBUGGER
   yylloc.first_line = yylineno; // ???
   yylloc.last_line = yylineno;
 #endif

   switch (state) {
   case Eof:
     return 0;
   case Other:
     // after '}' or ';' no semicolon is inserted at the end of the input
     if(token == '}' || token == ';') {
       delimited = true;
     }
     return token;
   case Identifier:
     // a negative lookup result means the word is not in mainTable
     if ((token = Lookup::find(&mainTable, buffer16, pos16)) < 0) {
       /* TODO: close leak on parse error. same holds true for String */
       kjsyylval.ustr = new UString(buffer16, pos16);
       return IDENT;
     }
     // a line break after these keywords terminates the statement (see Start)
     if (token == CONTINUE || token == BREAK ||
 	token == RETURN || token == THROW)
       restrKeyword = true;
     return token;
   case String:
     kjsyylval.ustr = new UString(buffer16, pos16); return STRING;
   case Number:
     kjsyylval.dval = dval;
     return NUMBER;
   case Bad:
     fprintf(stderr, "yylex: ERROR.\n");
     return -1;
   default:
     assert(!"unhandled numeration value in switch");
     return -1;
   }
 }

 // True when the current character is a space, horizontal tab,
 // vertical tab (0x0b) or form feed (0x0c).
 bool Lexer::isWhiteSpace() const
 {
   switch (current) {
   case ' ':
   case '\t':
   case 0x0b:
   case 0x0c:
     return true;
   default:
     return false;
   }
 }

 // True when the current character is a line feed or a carriage return.
 bool Lexer::isLineTerminator() const
 {
   if (current == '\n')
     return true;
   return current == '\r';
 }

 // True when 'c' may appear in an identifier: an ASCII letter, '$' or '_'.
 // Digits are not included; callers test for them separately.
 bool Lexer::isIdentLetter(unsigned short c)
 {
   /* TODO: allow other legitimate unicode chars */
   const bool lowerCase = (c >= 'a' && c <= 'z');
   const bool upperCase = (c >= 'A' && c <= 'Z');
   return lowerCase || upperCase || c == '$' || c == '_';
 }

 // True when 'c' is one of the ASCII digits '0'..'9'.
 bool Lexer::isDecimalDigit(unsigned short c)
 {
   return !(c < '0' || c > '9');
 }

 // True when 'c' is a hexadecimal digit: '0'..'9', 'a'..'f' or 'A'..'F'.
 bool Lexer::isHexDigit(unsigned short c) const
 {
   if (c >= '0' && c <= '9')
     return true;
   if (c >= 'a' && c <= 'f')
     return true;
   return (c >= 'A' && c <= 'F');
 }

 // True when 'c' is one of the octal digits '0'..'7'.
 bool Lexer::isOctalDigit(unsigned short c) const
 {
   return !(c < '0' || c > '7');
 }

 // Recognises the punctuator that starts with c1 (c2..c4 are the characters
 // following it), always preferring the longest match. On success the
 // matched characters are consumed with shift() and the token is returned:
 // a named token code for multi-character punctuators, the character value
 // itself for single-character ones. Returns -1, consuming nothing, when c1
 // does not start a punctuator.
 //
 // "++" and "--" are special: when a line terminator was seen before them in
 // the current lex() call, AUTO is returned and the real operator is stored
 // in stackToken so that the next lex() call delivers it.
 int Lexer::matchPunctuator(unsigned short c1, unsigned short c2,
                            unsigned short c3, unsigned short c4)
 {
   // the only four-character punctuator
   if (c1 == '>' && c2 == '>' && c3 == '>' && c4 == '=') {
     shift(4);
     return URSHIFTEQUAL;
   }

   // three-character punctuators; all but ">>>" end in '='
   if (c3 == '=') {
     if (c1 == '=' && c2 == '=') {
       shift(3);
       return STREQ;
     }
     if (c1 == '!' && c2 == '=') {
       shift(3);
       return STRNEQ;
     }
     if (c1 == '<' && c2 == '<') {
       shift(3);
       return LSHIFTEQUAL;
     }
     if (c1 == '>' && c2 == '>') {
       shift(3);
       return RSHIFTEQUAL;
     }
   } else if (c1 == '>' && c2 == '>' && c3 == '>') {
     shift(3);
     return URSHIFT;
   }

   // two-character punctuators: either "<op>=" or a doubled character
   if (c2 == '=') {
     int op = 0;
     bool known = true;
     switch (c1) {
     case '<': op = LE; break;
     case '>': op = GE; break;
     case '!': op = NE; break;
     case '=': op = EQEQ; break;
     case '+': op = PLUSEQUAL; break;
     case '-': op = MINUSEQUAL; break;
     case '*': op = MULTEQUAL; break;
     case '/': op = DIVEQUAL; break;
     case '&': op = ANDEQUAL; break;
     case '^': op = XOREQUAL; break;
     case '%': op = MODEQUAL; break;
     case '|': op = OREQUAL; break;
     default:  known = false; break;
     }
     if (known) {
       shift(2);
       return op;
     }
   } else if (c1 == c2) {
     switch (c1) {
     case '+':
     case '-': {
       shift(2);
       const int op = (c1 == '+') ? PLUSPLUS : MINUSMINUS;
       if (!terminator)
         return op;
       // automatic semicolon insertion
       stackToken = op;
       return AUTO;
     }
     case '<':
       shift(2);
       return LSHIFT;
     case '>':
       shift(2);
       return RSHIFT;
     case '&':
       shift(2);
       return AND;
     case '|':
       shift(2);
       return OR;
     default:
       break;
     }
   }

   // single-character punctuators are returned as their character value
   static const char singles[] = "=><,!~?:.+-*/&|^%(){}[];";
   if (c1 != 0 && c1 < 0x80 && strchr(singles, static_cast<char>(c1)) != 0) {
     shift(1);
     return static_cast<int>(c1);
   }
   return -1;
 }

 // Maps the character following a backslash in a string literal to the
 // character it stands for (e.g. 'n' -> 0x0A). Characters without a special
 // meaning are returned unchanged.
 unsigned short Lexer::singleEscape(unsigned short c) const
 {
   static const struct {
     char letter;
     unsigned short value;
   } escapes[] = {
     { 'b',  0x08 },
     { 't',  0x09 },
     { 'n',  0x0A },
     { 'v',  0x0B },
     { 'f',  0x0C },
     { 'r',  0x0D },
     { '"',  0x22 },
     { '\'', 0x27 },
     { '\\', 0x5C }
   };

   for (unsigned int i = 0; i < sizeof(escapes) / sizeof(escapes[0]); ++i) {
     if (c == escapes[i].letter)
       return escapes[i].value;
   }
   return c;
 }

 // Combines three octal digit characters (most significant first) into the
 // value they denote. The arguments are not validated.
 unsigned short Lexer::convertOctal(unsigned short c1, unsigned short c2,
                                    unsigned short c3) const
 {
   const int high = c1 - '0';
   const int middle = c2 - '0';
   const int low = c3 - '0';
   return static_cast<unsigned short>((high * 8 + middle) * 8 + low);
 }

 // Returns the numeric value (0..15) of one hexadecimal digit character.
 // The argument is not validated: anything that is neither a decimal digit
 // nor 'a'..'f' is treated as if it were an upper-case hex letter.
 unsigned char Lexer::convertHex(unsigned short c)
 {
   int value;
   if (c >= '0' && c <= '9')
     value = c - '0';
   else if (c >= 'a' && c <= 'f')
     value = c - 'a' + 10;
   else
     value = c - 'A' + 10;
   return static_cast<unsigned char>(value);
 }

 // Combines two hexadecimal digit characters (high nibble first) into one
 // byte value.
 unsigned char Lexer::convertHex(unsigned short c1, unsigned short c2)
 {
   const unsigned char high = convertHex(c1);
   const unsigned char low = convertHex(c2);
   return static_cast<unsigned char>(high * 16 + low);
 }

 // Builds a UChar from the four hexadecimal digits of a \uXXXX escape:
 // c1/c2 form the first constructor argument, c3/c4 the second.
 UChar Lexer::convertUnicode(unsigned short c1, unsigned short c2,
                             unsigned short c3, unsigned short c4)
 {
   const unsigned char first = convertHex(c1, c2);
   const unsigned char second = convertHex(c3, c4);
   return UChar(first, second);
 }

 // Appends 'c' (which must fit into one byte) to buffer8, the buffer used
 // for numeric literals. The buffer is doubled while one slot is still free,
 // so lex() always has room for the terminating '\0'.
 void Lexer::record8(unsigned short c)
 {
   assert(c <= 0xff);

   if (pos8 >= size8 - 1) {
     // out of room: move the contents into a buffer twice the size
     char *bigger = new char[size8 * 2];
     memcpy(bigger, buffer8, size8 * sizeof(char));
     size8 *= 2;
     delete [] buffer8;
     buffer8 = bigger;
   }

   buffer8[pos8] = static_cast<char>(c);
   pos8++;
 }

 // Appends 'c' to buffer16, the buffer used for identifiers, string literals
 // and regular expressions. The buffer is doubled once only one slot is left.
 void Lexer::record16(UChar c)
 {
   if (pos16 >= size16 - 1) {
     // out of room: move the contents into a buffer twice the size
     UChar *bigger = new UChar[size16 * 2];
     memcpy(bigger, buffer16, size16 * sizeof(UChar));
     size16 *= 2;
     delete [] buffer16;
     buffer16 = bigger;
   }

   buffer16[pos16] = c;
   pos16++;
 }

 // Scans the remainder of a regular expression literal, starting at the
 // current character. Everything up to the first '/' that is not preceded
 // by an unescaped backslash is stored in 'pattern'; the identifier letters
 // that follow are stored in 'flags'.
 //
 // Returns false (leaving 'pattern' and 'flags' untouched) when a line
 // terminator or the end of the input is reached before the closing '/'.
 bool Lexer::scanRegExp()
 {
   pos16 = 0;
   bool escaped = false;   // was the previous character an active backslash?

   for (;;) {
     if (current == 0 || isLineTerminator())
       return false;
     if (current == '/' && !escaped)
       break;
     record16(current);
     // a backslash escapes the next character unless it is escaped itself
     escaped = !escaped && (current == '\\');
     shift(1);
   }

   pattern = UString(buffer16, pos16);
   pos16 = 0;
   shift(1);   // step over the closing '/'

   for (; isIdentLetter(current); shift(1))
     record16(current);
   flags = UString(buffer16, pos16);

   return true;
 }
	/*
	* This file is part of the KDE libraries
	* Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
	*
	* This library is free software; you can redistribute it and/or
	* modify it under the terms of the GNU Library General Public
	* License as published by the Free Software Foundation; either
	* version 2 of the License, or (at your option) any later version.
	*
	* This library is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* Library General Public License for more details.
	*
	* You should have received a copy of the GNU Library General Public License
	* along with this library; see the file COPYING.LIB. If not, write to
	* the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
	* Boston, MA 02111-1307, USA.
	*/

	#ifdef HAVE_CONFIG_H
	#include <config.h>
	#endif

	#include <ctype.h>
	#include <stdlib.h>
	#include <stdio.h>
	#include <string.h>
	#include <assert.h>

	#include "kjs.h"
	#include "nodes.h"
	#include "lexer.h"
	#include "ustring.h"
	#include "lookup.h"
	#include "internal.h"

	// we can't specify the namespace in yacc's C output, so do it here
	using namespace KJS;

	#ifndef KDE_USE_FINAL
	#include "grammar.h"
	#endif

	#include "lexer.lut.h"

	#ifdef KJS_DEBUGGER
	extern YYLTYPE yylloc; // global bison variable holding token info
	#endif

	// a bridge for yacc from the C world to C++
	int kjsyylex()
	{
	return Lexer::curr()->lex();
	}

	// Creates a lexer with no source text attached (code == 0, length == 0) and
	// allocates the two token buffers: buffer8 collects the characters of
	// numeric literals, buffer16 collects identifier/string/regexp characters.
	// Both start at 128 elements and are grown by record8()/record16().
	Lexer::Lexer()
	  : yylineno(0),
	    size8(128), size16(128), restrKeyword(false),
	    stackToken(-1), pos(0),
	    code(0), length(0),
	#ifndef KJS_PURE_ECMA
	    bol(true),
	#endif
	    current(0), next1(0), next2(0), next3(0)
	{
	  // lex() consults 'delimited' when it reaches the end of the input, and
	  // that flag is otherwise first assigned in setCode(). Give it (and the
	  // other per-token scanning fields) a defined value so that calling lex()
	  // on a freshly constructed lexer does not read an indeterminate member.
	  // These are assigned here rather than in the initializer list so this
	  // definition does not depend on the declaration order in lexer.h.
	  delimited = false;
	  terminator = false;
	  done = false;
	  state = Start;
	  pos8 = pos16 = 0;

	  // allocate space for read buffers
	  buffer8 = new char[size8];
	  buffer16 = new UChar[size16];
	}

	// Frees the token buffers that the constructor allocated and that
	// record8()/record16() may have replaced with larger ones.
	Lexer::~Lexer()
	{
	  delete [] buffer16;
	  delete [] buffer8;
	}

	// Returns the lexer held in the 'lex' member of the object returned by
	// KJScriptImp::current(). Asserts that KJScriptImp::current() is non-null;
	// with assertions compiled out a null value would be dereferenced.
	Lexer *Lexer::curr()
	{
	assert(KJScriptImp::current());
	return KJScriptImp::current()->lex;
	}

	// Attaches a new piece of source text to the lexer and rewinds all scanning
	// state. Only the pointer 'c' is stored (the characters are not copied);
	// 'len' is the number of UChar elements available behind it.
	void Lexer::setCode(const UChar *c, unsigned int len)
	{
	  // remember the input
	  code = c;
	  length = len;
	  pos = 0;

	  // forget everything left over from a previous script
	  yylineno = 0;
	  stackToken = -1;
	  restrKeyword = false;
	  delimited = false;
	#ifndef KJS_PURE_ECMA
	  bol = true;
	#endif

	  // fill the four-character window; positions past the end read as 0
	  current = next1 = next2 = next3 = 0;
	  if (length > 0)
	    current = code[0].unicode();
	  if (length > 1)
	    next1 = code[1].unicode();
	  if (length > 2)
	    next2 = code[2].unicode();
	  if (length > 3)
	    next3 = code[3].unicode();
	}

	// Advances the input position by 'p' characters, sliding the
	// current/next1/next2/next3 window along. Slots that would lie beyond the
	// end of the input are filled with 0.
	void Lexer::shift(unsigned int p)
	{
	  for (unsigned int step = 0; step < p; ++step) {
	    ++pos;
	    current = next1;
	    next1 = next2;
	    next2 = next3;
	    if (pos + 3 < length)
	      next3 = code[pos + 3].unicode();
	    else
	      next3 = 0;
	  }
	}

	// Ends the scanning loop in lex(): raises the 'done' flag and records 's'
	// as the resulting state.
	void Lexer::setDone(State s)
	{
	  done = true;
	  state = s;
	}

	int Lexer::lex()
	{
	int token = 0;
	state = Start;
	unsigned short stringType = 0; // either single or double quotes
	pos8 = pos16 = 0;
	done = false;
	terminator = false;

	// did we push a token on the stack previously ?
	// (after an automatic semicolon insertion)
	if (stackToken >= 0) {
	setDone(Other);
	token = stackToken;
	stackToken = 0;
	}

	while (!done) {
	switch (state) {
	case Start:
	if (isWhiteSpace()) {
	// do nothing
	} else if (current == '/' && next1 == '/') {
	shift(1);
	state = InSingleLineComment;
	} else if (current == '/' && next1 == '*') {
	shift(1);
	state = InMultiLineComment;
	} else if (current == 0) {
	if (!terminator && !delimited) {
	// automatic semicolon insertion if program incomplete
	token = ';';
	stackToken = 0;
	setDone(Other);
	} else
	setDone(Eof);
	} else if (isLineTerminator()) {
	yylineno++;
	#ifndef KJS_PURE_ECMA
	bol = true;
	#endif
	terminator = true;
	if (restrKeyword) {
	token = ';';
	setDone(Other);
	}
	} else if (current == '"' \|\| current == '\'') {
	state = InString;
	stringType = current;
	} else if (isIdentLetter(current)) {
	record16(current);
	state = InIdentifier;
	} else if (current == '0') {
	record8(current);
	state = InNum0;
	} else if (isDecimalDigit(current)) {
	record8(current);
	state = InNum;
	} else if (current == '.' && isDecimalDigit(next1)) {
	record8(current);
	state = InDecimal;
	#ifndef KJS_PURE_ECMA
	// <!-- marks the beginning of a line comment (for www usage)
	} else if (bol && current == '<' && next1 == '!' &&
	next2 == '-' && next3 == '-') {
	shift(3);
	state = InSingleLineComment;
	// same of -->
	} else if (bol && current == '-' && next1 == '-' && next2 == '>') {
	shift(2);
	state = InSingleLineComment;
	#endif
	} else {
	token = matchPunctuator(current, next1, next2, next3);
	if (token != -1) {
	setDone(Other);
	} else {
	// cerr << "encountered unknown character" << endl;
	setDone(Bad);
	}
	}
	break;
	case InString:
	if (current == stringType) {
	shift(1);
	setDone(String);
	} else if (current == 0 \|\| isLineTerminator()) {
	setDone(Bad);
	} else if (current == '\\') {
	state = InEscapeSequence;
	} else {
	record16(current);
	}
	break;
	// Escape Sequences inside of strings
	case InEscapeSequence:
	if (isOctalDigit(current)) {
	if (current >= '0' && current <= '3' &&
	isOctalDigit(next1) && isOctalDigit(next2)) {
	record16(convertOctal(current, next1, next2));
	shift(2);
	state = InString;
	} else if (isOctalDigit(current) && isOctalDigit(next1)) {
	record16(convertOctal('0', current, next1));
	shift(1);
	state = InString;
	} else if (isOctalDigit(current)) {
	record16(convertOctal('0', '0', current));
	state = InString;
	} else {
	setDone(Bad);
	}
	} else if (current == 'x')
	state = InHexEscape;
	else if (current == 'u')
	state = InUnicodeEscape;
	else {
	record16(singleEscape(current));
	state = InString;
	}
	break;
	case InHexEscape:
	if (isHexDigit(current) && isHexDigit(next1)) {
	state = InString;
	record16(convertHex(current, next1));
	shift(1);
	} else if (current == stringType) {
	record16('x');
	shift(1);
	setDone(String);
	} else {
	record16('x');
	record16(current);
	state = InString;
	}
	break;
	case InUnicodeEscape:
	if (isHexDigit(current) && isHexDigit(next1) &&
	isHexDigit(next2) && isHexDigit(next3)) {
	record16(convertUnicode(current, next1, next2, next3));
	shift(3);
	state = InString;
	} else if (current == stringType) {
	record16('u');
	shift(1);
	setDone(String);
	} else {
	setDone(Bad);
	}
	break;
	case InSingleLineComment:
	if (isLineTerminator()) {
	yylineno++;
	terminator = true;
	#ifndef KJS_PURE_ECMA
	bol = true;
	#endif
	if (restrKeyword) {
	token = ';';
	setDone(Other);
	} else
	state = Start;
	} else if (current == 0) {
	setDone(Eof);
	}
	break;
	case InMultiLineComment:
	if (current == 0) {
	setDone(Bad);
	} else if (isLineTerminator()) {
	yylineno++;
	} else if (current == '*' && next1 == '/') {
	state = Start;
	shift(1);
	}
	break;
	case InIdentifier:
	if (isIdentLetter(current) \|\| isDecimalDigit(current)) {
	record16(current);
	break;
	}
	setDone(Identifier);
	break;
	case InNum0:
	if (current == 'x' \|\| current == 'X') {
	record8(current);
	state = InHex;
	} else if (current == '.') {
	record8(current);
	state = InDecimal;
	} else if (current == 'e' \|\| current == 'E') {
	record8(current);
	state = InExponentIndicator;
	} else if (isOctalDigit(current)) {
	record8(current);
	state = InOctal;
	} else {
	setDone(Number);
	}
	break;
	case InHex:
	if (isHexDigit(current)) {
	record8(current);
	} else {
	setDone(Hex);
	}
	break;
	case InOctal:
	if (isOctalDigit(current)) {
	record8(current);
	} else
	setDone(Octal);
	break;
	case InNum:
	if (isDecimalDigit(current)) {
	record8(current);
	} else if (current == '.') {
	record8(current);
	state = InDecimal;
	} else if (current == 'e' \|\| current == 'E') {
	record8(current);
	state = InExponentIndicator;
	} else
	setDone(Number);
	break;
	case InDecimal:
	if (isDecimalDigit(current)) {
	record8(current);
	} else if (current == 'e' \|\| current == 'E') {
	record8(current);
	state = InExponentIndicator;
	} else
	setDone(Number);
	break;
	case InExponentIndicator:
	if (current == '+' \|\| current == '-') {
	record8(current);
	} else if (isDecimalDigit(current)) {
	record8(current);
	state = InExponent;
	} else
	setDone(Bad);
	break;
	case InExponent:
	if (isDecimalDigit(current)) {
	record8(current);
	} else
	setDone(Number);
	break;
	default:
	assert(!"Unhandled state in switch statement");
	}

	// move on to the next character
	if (!done)
	shift(1);
	#ifndef KJS_PURE_ECMA
	if (state != Start && state != InSingleLineComment)
	bol = false;
	#endif
	}

	// no identifiers allowed directly after numeric literal, e.g. "3in" is bad
	if ((state == Number \|\| state == Octal \|\| state == Hex)
	&& isIdentLetter(current))
	state = Bad;

	// terminate string
	buffer8[pos8] = '\0';

	#ifdef KJS_DEBUG_LEX
	fprintf(stderr, "line: %d ", lineNo());
	fprintf(stderr, "yytext (%x): ", buffer8[0]);
	fprintf(stderr, "%s ", buffer8);
	#endif

	double dval = 0;
	if (state == Number) {
	dval = strtod(buffer8, 0L);
	} else if (state == Hex) { // scan hex numbers
	// TODO: support long unsigned int
	unsigned int i;
	sscanf(buffer8, "%x", &i);
	dval = i;
	state = Number;
	} else if (state == Octal) { // scan octal number
	unsigned int ui;
	sscanf(buffer8, "%o", &ui);
	dval = ui;
	state = Number;
	}

	#ifdef KJS_DEBUG_LEX
	switch (state) {
	case Eof:
	printf("(EOF)\n");
	break;
	case Other:
	printf("(Other)\n");
	break;
	case Identifier:
	printf("(Identifier)/(Keyword)\n");
	break;
	case String:
	printf("(String)\n");
	break;
	case Number:
	printf("(Number)\n");
	break;
	default:
	printf("(unknown)");
	}
	#endif

	restrKeyword = false;
	delimited = false;
	#ifdef KJS_DEBUGGER
	yylloc.first_line = yylineno; // ???
	yylloc.last_line = yylineno;
	#endif

	switch (state) {
	case Eof:
	return 0;
	case Other:
	if(token == '}' \|\| token == ';') {
	delimited = true;
	}
	return token;
	case Identifier:
	if ((token = Lookup::find(&mainTable, buffer16, pos16)) < 0) {
	/* TODO: close leak on parse error. same holds true for String */
	kjsyylval.ustr = new UString(buffer16, pos16);
	return IDENT;
	}
	if (token == CONTINUE \|\| token == BREAK \|\|
	token == RETURN \|\| token == THROW)
	restrKeyword = true;
	return token;
	case String:
	kjsyylval.ustr = new UString(buffer16, pos16); return STRING;
	case Number:
	kjsyylval.dval = dval;
	return NUMBER;
	case Bad:
	fprintf(stderr, "yylex: ERROR.\n");
	return -1;
	default:
	assert(!"unhandled numeration value in switch");
	return -1;
	}
	}

	// True when the current character is a space, horizontal tab,
	// vertical tab (0x0b) or form feed (0x0c).
	bool Lexer::isWhiteSpace() const
	{
	  switch (current) {
	  case ' ':
	  case '\t':
	  case 0x0b:
	  case 0x0c:
	    return true;
	  default:
	    return false;
	  }
	}

	// True when the current character is a line feed or a carriage return.
	bool Lexer::isLineTerminator() const
	{
	  return (current == '\n' || current == '\r');
	}

	// True when 'c' may appear in an identifier: an ASCII letter, '$' or '_'.
	// Digits are not included; callers test for them separately.
	bool Lexer::isIdentLetter(unsigned short c)
	{
	  /* TODO: allow other legitimate unicode chars */
	  return ((c >= 'a' && c <= 'z') ||
	          (c >= 'A' && c <= 'Z') ||
	          c == '$' || c == '_');
	}

	// True when 'c' is one of the ASCII digits '0'..'9'.
	bool Lexer::isDecimalDigit(unsigned short c)
	{
	  return !(c < '0' || c > '9');
	}

	// True when 'c' is a hexadecimal digit: '0'..'9', 'a'..'f' or 'A'..'F'.
	bool Lexer::isHexDigit(unsigned short c) const
	{
	  return ((c >= '0' && c <= '9') ||
	          (c >= 'a' && c <= 'f') ||
	          (c >= 'A' && c <= 'F'));
	}

	// True when 'c' is one of the octal digits '0'..'7'.
	bool Lexer::isOctalDigit(unsigned short c) const
	{
	  return !(c < '0' || c > '7');
	}

	// Recognises the punctuator that starts with c1 (c2..c4 are the characters
	// following it), always preferring the longest match. On success the
	// matched characters are consumed with shift() and the token is returned:
	// a named token code for multi-character punctuators, the character value
	// itself for single-character ones. Returns -1, consuming nothing, when c1
	// does not start a punctuator.
	//
	// "++" and "--" are special: when a line terminator was seen before them in
	// the current lex() call, AUTO is returned and the real operator is stored
	// in stackToken so that the next lex() call delivers it.
	int Lexer::matchPunctuator(unsigned short c1, unsigned short c2,
	                           unsigned short c3, unsigned short c4)
	{
	  // the only four-character punctuator
	  if (c1 == '>' && c2 == '>' && c3 == '>' && c4 == '=') {
	    shift(4);
	    return URSHIFTEQUAL;
	  }

	  // three-character punctuators; all but ">>>" end in '='
	  if (c3 == '=') {
	    if (c1 == '=' && c2 == '=') {
	      shift(3);
	      return STREQ;
	    }
	    if (c1 == '!' && c2 == '=') {
	      shift(3);
	      return STRNEQ;
	    }
	    if (c1 == '<' && c2 == '<') {
	      shift(3);
	      return LSHIFTEQUAL;
	    }
	    if (c1 == '>' && c2 == '>') {
	      shift(3);
	      return RSHIFTEQUAL;
	    }
	  } else if (c1 == '>' && c2 == '>' && c3 == '>') {
	    shift(3);
	    return URSHIFT;
	  }

	  // two-character punctuators: either "<op>=" or a doubled character
	  if (c2 == '=') {
	    int op = 0;
	    bool known = true;
	    switch (c1) {
	    case '<': op = LE; break;
	    case '>': op = GE; break;
	    case '!': op = NE; break;
	    case '=': op = EQEQ; break;
	    case '+': op = PLUSEQUAL; break;
	    case '-': op = MINUSEQUAL; break;
	    case '*': op = MULTEQUAL; break;
	    case '/': op = DIVEQUAL; break;
	    case '&': op = ANDEQUAL; break;
	    case '^': op = XOREQUAL; break;
	    case '%': op = MODEQUAL; break;
	    case '|': op = OREQUAL; break;
	    default:  known = false; break;
	    }
	    if (known) {
	      shift(2);
	      return op;
	    }
	  } else if (c1 == c2) {
	    switch (c1) {
	    case '+':
	    case '-': {
	      shift(2);
	      const int op = (c1 == '+') ? PLUSPLUS : MINUSMINUS;
	      if (!terminator)
	        return op;
	      // automatic semicolon insertion
	      stackToken = op;
	      return AUTO;
	    }
	    case '<':
	      shift(2);
	      return LSHIFT;
	    case '>':
	      shift(2);
	      return RSHIFT;
	    case '&':
	      shift(2);
	      return AND;
	    case '|':
	      shift(2);
	      return OR;
	    default:
	      break;
	    }
	  }

	  // single-character punctuators are returned as their character value
	  switch (c1) {
	  case '=': case '>': case '<': case ',': case '!': case '~':
	  case '?': case ':': case '.': case '+': case '-': case '*':
	  case '/': case '&': case '|': case '^': case '%': case '(':
	  case ')': case '{': case '}': case '[': case ']': case ';':
	    shift(1);
	    return static_cast<int>(c1);
	  default:
	    return -1;
	  }
	}

	// Maps the character following a backslash in a string literal to the
	// character it stands for (e.g. 'n' -> 0x0A). Characters without a special
	// meaning are returned unchanged.
	unsigned short Lexer::singleEscape(unsigned short c) const
	{
	  static const struct {
	    char letter;
	    unsigned short value;
	  } escapes[] = {
	    { 'b',  0x08 },
	    { 't',  0x09 },
	    { 'n',  0x0A },
	    { 'v',  0x0B },
	    { 'f',  0x0C },
	    { 'r',  0x0D },
	    { '"',  0x22 },
	    { '\'', 0x27 },
	    { '\\', 0x5C }
	  };

	  for (unsigned int i = 0; i < sizeof(escapes) / sizeof(escapes[0]); ++i) {
	    if (c == escapes[i].letter)
	      return escapes[i].value;
	  }
	  return c;
	}

	// Combines three octal digit characters (most significant first) into the
	// value they denote. The arguments are not validated.
	unsigned short Lexer::convertOctal(unsigned short c1, unsigned short c2,
	                                   unsigned short c3) const
	{
	  const int high = c1 - '0';
	  const int middle = c2 - '0';
	  const int low = c3 - '0';
	  return static_cast<unsigned short>((high * 8 + middle) * 8 + low);
	}

	// Returns the numeric value (0..15) of one hexadecimal digit character.
	// The argument is not validated: anything that is neither a decimal digit
	// nor 'a'..'f' is treated as if it were an upper-case hex letter.
	unsigned char Lexer::convertHex(unsigned short c)
	{
	  int value;
	  if (c >= '0' && c <= '9')
	    value = c - '0';
	  else if (c >= 'a' && c <= 'f')
	    value = c - 'a' + 10;
	  else
	    value = c - 'A' + 10;
	  return static_cast<unsigned char>(value);
	}

	// Combines two hexadecimal digit characters (high nibble first) into one
	// byte value.
	unsigned char Lexer::convertHex(unsigned short c1, unsigned short c2)
	{
	  const unsigned char high = convertHex(c1);
	  const unsigned char low = convertHex(c2);
	  return static_cast<unsigned char>(high * 16 + low);
	}

	// Builds a UChar from the four hexadecimal digits of a \uXXXX escape:
	// c1/c2 form the first constructor argument, c3/c4 the second.
	UChar Lexer::convertUnicode(unsigned short c1, unsigned short c2,
	                            unsigned short c3, unsigned short c4)
	{
	  const unsigned char first = convertHex(c1, c2);
	  const unsigned char second = convertHex(c3, c4);
	  return UChar(first, second);
	}

	// Appends 'c' (which must fit into one byte) to buffer8, the buffer used
	// for numeric literals. The buffer is doubled while one slot is still free,
	// so lex() always has room for the terminating '\0'.
	void Lexer::record8(unsigned short c)
	{
	  assert(c <= 0xff);

	  // enlarge buffer if full
	  if (pos8 >= size8 - 1) {
	    char *tmp = new char[2 * size8];
	    memcpy(tmp, buffer8, size8 * sizeof(char));
	    delete [] buffer8;
	    buffer8 = tmp;
	    size8 *= 2;
	  }

	  buffer8[pos8++] = (char) c;
	}

	// Appends 'c' to buffer16, the buffer used for identifiers, string literals
	// and regular expressions. The buffer is doubled once only one slot is left.
	void Lexer::record16(UChar c)
	{
	  // enlarge buffer if full
	  if (pos16 >= size16 - 1) {
	    UChar *tmp = new UChar[2 * size16];
	    memcpy(tmp, buffer16, size16 * sizeof(UChar));
	    delete [] buffer16;
	    buffer16 = tmp;
	    size16 *= 2;
	  }

	  buffer16[pos16++] = c;
	}

	// Scans the remainder of a regular expression literal, starting at the
	// current character. Everything up to the first '/' that is not preceded
	// by an unescaped backslash is stored in 'pattern'; the identifier letters
	// that follow are stored in 'flags'.
	//
	// Returns false (leaving 'pattern' and 'flags' untouched) when a line
	// terminator or the end of the input is reached before the closing '/'.
	bool Lexer::scanRegExp()
	{
	  pos16 = 0;
	  bool lastWasEscape = false;

	  while (1) {
	    if (isLineTerminator() || current == 0)
	      return false;
	    else if (current != '/' || lastWasEscape == true) {
	      record16(current);
	      // a backslash escapes the next character unless it is escaped itself
	      lastWasEscape = !lastWasEscape && (current == '\\');
	    } else {
	      pattern = UString(buffer16, pos16);
	      pos16 = 0;
	      shift(1);   // step over the closing '/'
	      break;
	    }
	    shift(1);
	  }

	  while (isIdentLetter(current)) {
	    record16(current);
	    shift(1);
	  }
	  flags = UString(buffer16, pos16);

	  return true;
	}