blob: 9796f5cbc4846ff347e21bd67655652e206a2dec [file] [log] [blame]
/*
* tokenizer.cc - Copyright 2005 Maksim Orlovich <maksim@kde.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "tokenizer.h"
struct AxisNameMapping
{
AxisNameMapping( const char *_name, Step::AxisType _type ) :
name( _name ), type( _type )
{
}
const char *name;
Step::AxisType type;
};
// Table of every axis name the tokenizer recognises, together with its
// Step::AxisType. Tokenizer::isAxisName() copies it into a dictionary.
static AxisNameMapping axisNames[] = {
    AxisNameMapping("ancestor",           Step::AncestorAxis),
    AxisNameMapping("ancestor-or-self",   Step::AncestorOrSelfAxis),
    AxisNameMapping("attribute",          Step::AttributeAxis),
    AxisNameMapping("child",              Step::ChildAxis),
    AxisNameMapping("descendant",         Step::DescendantAxis),
    AxisNameMapping("descendant-or-self", Step::DescendantOrSelfAxis),
    AxisNameMapping("following",          Step::FollowingAxis),
    AxisNameMapping("following-sibling",  Step::FollowingSiblingAxis),
    AxisNameMapping("namespace",          Step::NamespaceAxis),
    AxisNameMapping("parent",             Step::ParentAxis),
    AxisNameMapping("preceding",          Step::PrecedingAxis),
    AxisNameMapping("preceding-sibling",  Step::PrecedingSiblingAxis),
    AxisNameMapping("self",               Step::SelfAxis)
};

// Number of entries in axisNames.
static unsigned int axisNamesCount = sizeof(axisNames) / sizeof(*axisNames);
#ifndef APPLE_COMPILE_HACK
// Names that Tokenizer::isNodeTypeName() accepts as node types.
// The list is terminated by a null pointer rather than carrying a count.
static const char* const nodeTypeNames[] = {
    "comment", "text", "processing-instruction", "node",
    0 // end-of-list sentinel
};
#endif
// Definitions of Tokenizer's static members. All three start out null:
//  - s_instance is created on first use by Tokenizer::self();
//  - s_axisNamesDict is filled on the first call to isAxisName();
//  - s_nodeTypeNamesDict is filled on the first call to isNodeTypeName().
// TokenizerDeleter's destructor deletes all of them.
Tokenizer* Tokenizer::s_instance = 0;
Q3Dict<Step::AxisType>* Tokenizer::s_axisNamesDict = 0;
Q3Dict<char>* Tokenizer::s_nodeTypeNamesDict = 0;
// Returns the shared Tokenizer, creating it the first time it is asked for.
Tokenizer &Tokenizer::self()
{
    if (s_instance == 0) {
        s_instance = new Tokenizer();
    }
    return *s_instance;
}
// Classifies a character by the role it can play in a name:
//  NameStart     - may begin a name (and may also continue one)
//  NameCont      - may only appear after the first character
//  NotPartOfName - cannot appear in a name at all
Tokenizer::XMLCat Tokenizer::charCat(QChar aChar)
{
    //### might need to add some special cases from the XML spec.
    // Punctuation that is decided by code point rather than Unicode category.
    const unsigned short codePoint = aChar.unicode();
    switch (codePoint) {
        case '_':
            return NameStart;
        case '.':
        case '-':
            return NameCont;
        default:
            break;
    }

    XMLCat result = NotPartOfName;
#ifndef APPLE_COMPILE_HACK
    switch (aChar.category()) {
        // Letters (except modifier letters) and letter-numbers start a name.
        case QChar::Letter_Uppercase: //Lu
        case QChar::Letter_Lowercase: //Ll
        case QChar::Letter_Titlecase: //Lt
        case QChar::Letter_Other:     //Lo
        case QChar::Number_Letter:    //Nl
            result = NameStart;
            break;
        // Marks, modifier letters and decimal digits only continue a name.
        case QChar::Mark_NonSpacing:       //Mn
        case QChar::Mark_SpacingCombining: //Mc
        case QChar::Mark_Enclosing:        //Me
        case QChar::Letter_Modifier:       //Lm
        case QChar::Number_DecimalDigit:   //Nd
            result = NameCont;
            break;
        default:
            break;
    }
#endif
    return result;
}
// Tells whether 'name' is one of the axis names listed in axisNames.
// If it is and 'type' is non-null, the matching axis is stored in *type;
// otherwise *type is left untouched.
bool Tokenizer::isAxisName(QString name, Step::AxisType *type)
{
    // The lookup dictionary is built from the axisNames table on first use.
    if (s_axisNamesDict == 0) {
        Q3Dict<Step::AxisType>* dict = new Q3Dict<Step::AxisType>;
        // The values are heap copies, so let the dictionary delete them.
        dict->setAutoDelete( true );
        for (unsigned int i = 0; i != axisNamesCount; ++i) {
            const AxisNameMapping& entry = axisNames[i];
            dict->insert(QString::fromLatin1(entry.name),
                         new Step::AxisType(entry.type));
        }
        s_axisNamesDict = dict;
    }

    Step::AxisType *found = s_axisNamesDict->find(name);
    if (!found)
        return false;
    if (type)
        *type = *found;
    return true;
}
// Tells whether 'name' is one of the entries of nodeTypeNames.
// When APPLE_COMPILE_HACK is defined the table is compiled out and this
// always returns false.
bool Tokenizer::isNodeTypeName(QString name)
{
#ifndef APPLE_COMPILE_HACK
    // The dictionary is used purely as a set and is filled on first use.
    if (!s_nodeTypeNamesDict) {
        s_nodeTypeNamesDict = new Q3Dict<char>;
        for (int p = 0; nodeTypeNames[p]; ++p) {
            // Only the key matters; any non-null item will do. The entry's own
            // name string is stored because it has the 'const char*' type that
            // a Q3Dict<char> item requires (the array itself does not).
            s_nodeTypeNamesDict->insert(QString::fromLatin1(nodeTypeNames[p]),
                                        nodeTypeNames[p] /*dummy*/);
        }
    }
    return s_nodeTypeNamesDict->find(name) != 0;
#else
    return false;
#endif
}
/* Returns whether the tokenizer is currently in an "operator context",
 * i.e. whether a following '*' has to be read as the multiply operator and
 * a following NCName as an operator name (and, or, mod, div).
 *
 * According to the disambiguation rules in
 * http://www.w3.org/TR/xpath#exprlex that is the case only if there is a
 * preceding token and that token is not one of '@', '::', '(', '[', ','
 * or an Operator (rule [32]). In every one of those excluded situations
 * this function returns false.
 */
bool Tokenizer::isOperatorContext()
{
    switch ( m_lastTokenType ) {
        // No preceding token: reset() sets m_lastTokenType to 0.
        case 0:
        // Operators.
        case AND: case OR: case MULOP:
        case '/': case SLASHSLASH: case '|': case PLUS: case MINUS:
        case EQOP: case RELOP:
        // '@', '::' (the tokenizer consumes it as part of AXISNAME), '(', '['
        // and ','.
        case '@': case AXISNAME: case '(': case '[': case ',':
            return false;
        default:
            return true;
    }
}
// Moves m_nextPos forward past any run of whitespace; stops at the first
// non-space character or at the end of the input.
void Tokenizer::skipWS()
{
    for (; m_nextPos < m_data.length(); ++m_nextPos) {
        if (!m_data[m_nextPos].isSpace())
            break;
    }
}
// Builds a payload-free token of type 'code' and moves the read position
// forward by 'advance' characters.
Token Tokenizer::makeTokenAndAdvance(int code, int advance)
{
    Token result(code);
    m_nextPos += advance;
    return result;
}
// Builds a token of type 'code' carrying the integer 'val' and moves the
// read position forward by 'advance' characters.
Token Tokenizer::makeIntTokenAndAdvance(int code, int val, int advance)
{
    Token result(code, val);
    m_nextPos += advance;
    return result;
}
//Returns next char if it's there and interesting, 0 otherwise
char Tokenizer::peekAheadHelper()
{
if (m_nextPos + 1 >= m_data.length())
return 0;
QChar next = m_data[m_nextPos + 1];
if (next.row() != 0)
return 0;
else
return next.cell();
}
char Tokenizer::peekCurHelper()
{
if (m_nextPos >= m_data.length())
return 0;
QChar next = m_data[m_nextPos];
if (next.row() != 0)
return 0;
else
return next.cell();
}
// Lexes a quoted string. The character at m_nextPos is taken as the quote
// and the text up to the next occurrence of that same character becomes a
// LITERAL token (quotes excluded, closing quote consumed).
// If the closing quote is missing, an ERROR token is returned and the read
// position is left at the end of the input.
Token Tokenizer::lexString()
{
    const QChar quote = m_data[m_nextPos];
    const int contentStart = m_nextPos + 1;

    m_nextPos = contentStart;
    while (m_nextPos < m_data.length()) {
        if (m_data[m_nextPos] == quote) {
            QString text = m_data.mid(contentStart, m_nextPos - contentStart);
            ++m_nextPos; // step over the closing quote
            return Token(LITERAL, text);
        }
        ++m_nextPos;
    }

    // Ran off the end without finding the closing quote.
    return Token(ERROR);
}
// Lexes a number starting at m_nextPos: a run of ASCII digits containing at
// most one '.'. Scanning stops at the end of input, at a second '.', or at
// any other character. The scanned text is returned in a NUMBER token.
Token Tokenizer::lexNumber()
{
    const int begin = m_nextPos;
    bool dotSeen = false;

    while (m_nextPos < m_data.length()) {
        const QChar ch = m_data[m_nextPos];
        if (ch.row() != 0)
            break; // not in the first 256 code points, so not a digit or '.'

        const unsigned char c = ch.cell();
        const bool isDigit = (c >= '0' && c <= '9');
        if (!isDigit) {
            // Only the first '.' belongs to the number.
            if (c != '.' || dotSeen)
                break;
            dotSeen = true;
        }
        ++m_nextPos;
    }

    QString text = m_data.mid(begin, m_nextPos - begin);
    return Token(NUMBER, text);
}
// Lexes a name without a colon at m_nextPos. The first character must be
// classified as NameStart by charCat(); the name then extends up to (not
// including) the first NotPartOfName character. Returns a token built from
// the name text alone, or an ERROR token (via makeTokenAndAdvance) when no
// name starts here.
Token Tokenizer::lexNCName()
{
    const bool startsName = m_nextPos < m_data.length()
                            && charCat(m_data[m_nextPos]) == NameStart;
    if (!startsName)
        return makeTokenAndAdvance(ERROR);

    const int begin = m_nextPos;
    // The first character is already known to be acceptable; take every
    // following character that may be part of a name.
    ++m_nextPos;
    while (m_nextPos < m_data.length()
           && charCat(m_data[m_nextPos]) != NotPartOfName)
        ++m_nextPos;

    QString text = m_data.mid(begin, m_nextPos - begin);
    return Token(text);
}
// Lexes a (possibly prefixed) name at m_nextPos: either "name" or
// "prefix:name". Returns a token whose value is the full name text, or the
// ERROR token produced by lexNCName() when a name part is missing.
Token Tokenizer::lexQName()
{
    Token prefix = lexNCName();
    if (prefix.type == ERROR)
        return prefix;
    skipWS();

    // lexNCName() leaves m_nextPos on the first character after the name, so
    // a ':' separator, if present, is the *current* character. If it is not
    // there, what we just read is the whole name.
    if (peekCurHelper() != ':')
        return prefix;

    // Step over the ':' so that the local part can be lexed.
    ++m_nextPos;
    Token local = lexNCName();
    if (local.type == ERROR)
        return local;
    return Token(prefix.value + ":" + local.value);
}
// Produces the next token from m_data starting at m_nextPos, without
// updating m_lastTokenType (nextToken() does that).
//
// Returns Token(0) at end of input and an ERROR token for input that
// cannot be lexed. Punctuation, strings, numbers and variable references
// are handled by the switch below; everything else is treated as a name
// and classified afterwards as operator name, axis name, name test, node
// type or function name.
Token Tokenizer::nextTokenInternal()
{
skipWS();
if (m_nextPos >= m_data.length()) {
return Token(0);
}
// peekCurHelper() yields 0 for characters whose row() is non-zero, so those
// match no case label and fall through to the name handling after the switch.
char code = peekCurHelper();
switch (code) {
// Single-character tokens that use their own character code as token type.
case '(': case ')': case '[': case ']':
case '@': case ',': case '|':
return makeTokenAndAdvance(code);
case '\'':
case '\"':
return lexString();
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
return lexNumber();
// '.' can be "..", the start of a number such as ".5", or a lone '.'.
case '.': {
char next = peekAheadHelper();
if (next == '.')
return makeTokenAndAdvance(DOTDOT, 2);
else if (next >= '0' && next <= '9')
return lexNumber();
else
return makeTokenAndAdvance('.');
}
case '/':
if (peekAheadHelper() == '/')
return makeTokenAndAdvance(SLASHSLASH, 2);
else
return makeTokenAndAdvance('/');
case '+':
return makeTokenAndAdvance(PLUS);
case '-':
return makeTokenAndAdvance(MINUS);
case '=':
return makeIntTokenAndAdvance(EQOP, EqTestOp::OP_EQ);
// '!' is only accepted as part of "!="; on its own it yields ERROR
// (note that m_nextPos is not advanced in that case).
case '!':
if (peekAheadHelper() == '=')
return makeIntTokenAndAdvance(EQOP, EqTestOp::OP_NE, 2);
else {
return Token(ERROR);
}
case '<':
if (peekAheadHelper() == '=')
return makeIntTokenAndAdvance(RELOP, NumericOp::OP_LE, 2);
else
return makeIntTokenAndAdvance(RELOP, NumericOp::OP_LT);
case '>':
if (peekAheadHelper() == '=')
return makeIntTokenAndAdvance(RELOP, NumericOp::OP_GE, 2);
else
return makeIntTokenAndAdvance(RELOP, NumericOp::OP_GT);
// '*' is the multiply operator in an operator context and the
// match-anything name test otherwise.
case '*':
if (isOperatorContext())
return makeIntTokenAndAdvance(MULOP, NumericOp::OP_Mul);
else {
++m_nextPos;
return Token(NAMETEST, "*");
}
case '$': {//$ QName
m_nextPos++;
Token par = lexQName();
if (par.type == ERROR)
return par;
else
return Token(VARIABLEREFERENCE, par.value);
}
}
// Not punctuation, a string or a number: it has to start with a name.
Token t1 = lexNCName();
if (t1.type == ERROR) return t1;
skipWS();
//If we're in an operator context, check for any operator names
if (isOperatorContext()) {
if (t1.value == QString::fromLatin1("and")) //### hash?
return Token(AND);
if (t1.value == QString::fromLatin1("or"))
return Token(OR);
if (t1.value == QString::fromLatin1("mod"))
return Token(MULOP, NumericOp::OP_Mod);
if (t1.value == QString::fromLatin1("div"))
return Token(MULOP, NumericOp::OP_Div);
}
//See whether we are at a :
if (peekCurHelper() == ':') {
m_nextPos++;
//Any chance it's an axis name?
if (peekCurHelper() == ':') {
m_nextPos++;
//It might be an axis name.
Step::AxisType axisType;
if (isAxisName(t1.value, &axisType))
return Token(AXISNAME, axisType);
//Ugh, :: is only valid in axis names -> error
return Token(ERROR);
}
//Seems like this is a fully qualified qname, or perhaps the * modified one from NameTest
skipWS();
if (peekCurHelper() == '*') {
m_nextPos++;
return Token(NAMETEST, t1.value + ":*");
}
//Make a full qname: t1 was the prefix, the local part follows.
Token t2 = lexNCName();
if (t2.type == ERROR) return t2;
t1.value = t1.value + ':' + t2.value;
}
skipWS();
if (peekCurHelper() == '(') {
//note: we don't swallow the ( here!
//either a node type or a function name
if (isNodeTypeName(t1.value)) {
// processing-instruction gets its own token type (PI).
if (t1.value == "processing-instruction")
return Token(PI, t1.value);
else
return Token(NODETYPE, t1.value);
}
//must be a function name.
return Token(FUNCTIONNAME, t1.value);
}
//At this point, it must be NAMETEST
return Token(NAMETEST, t1.value);
}
// Returns the next token and remembers its type in m_lastTokenType, which
// isOperatorContext() consults when lexing the token after it.
Token Tokenizer::nextToken()
{
    Token result = nextTokenInternal();
    m_lastTokenType = result.type;
    return result;
}
// Creates a tokenizer with no input; reset() puts it in its initial state.
Tokenizer::Tokenizer()
{
    const QString noInput;
    reset(noInput);
}
// Starts tokenizing 'data' from scratch: the read position goes back to the
// beginning and the remembered last token type is cleared to 0.
void Tokenizer::reset(QString data)
{
    m_data = data;
    m_lastTokenType = 0;
    m_nextPos = 0;
}
int xpathyylex()
{
Token tok = Tokenizer::self().nextToken();
if (!tok.value.isEmpty()) {
xpathyylval.str = new DomString(tok.value);
} else if (tok.intValue) {
xpathyylval.num = tok.intValue;
}
return tok.type;
}
// Points the shared Tokenizer at a new input string, restarting it.
void initTokenizer(QString string)
{
    Tokenizer &tokenizer = Tokenizer::self();
    tokenizer.reset(string);
}
// Error callback for the parser: writes the message to stderr, prefixed
// with "error: " and followed by a newline.
void xpathyyerror(const char *str)
{
    static const char format[] = "error: %s\n";
    fprintf(stderr, format, str);
}
// Helper whose destructor deletes the Tokenizer instance and the two lookup
// dictionaries. A single file-scope object of this type is defined right
// after the class, so the destructor runs when that object is destroyed.
class TokenizerDeleter
{
public:
~TokenizerDeleter()
{
// Deleting a null pointer is a no-op, so members that were never
// created need no special handling.
delete Tokenizer::s_instance;
delete Tokenizer::s_axisNamesDict;
delete Tokenizer::s_nodeTypeNamesDict;
}
};
static TokenizerDeleter tokenizerDeleter;
// kate: indent-width 4; replace-tabs off; tab-width 4; indent-spaces: off;