| /* |
| * tokenizer.cc - Copyright 2005 Maksim Orlovich <maksim@kde.org> |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR |
| * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
| * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
| * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, |
| * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
| * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
| * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| #include "config.h" |
| #include "tokenizer.h" |
| |
| struct AxisNameMapping |
| { |
| AxisNameMapping( const char *_name, Step::AxisType _type ) : |
| name( _name ), type( _type ) |
| { |
| } |
| |
| const char *name; |
| Step::AxisType type; |
| }; |
| |
| static AxisNameMapping axisNames[] = { |
| AxisNameMapping("ancestor", Step::AncestorAxis), |
| AxisNameMapping("ancestor-or-self", Step::AncestorOrSelfAxis), |
| AxisNameMapping("attribute", Step::AttributeAxis), |
| AxisNameMapping("child", Step::ChildAxis), |
| AxisNameMapping("descendant", Step::DescendantAxis), |
| AxisNameMapping("descendant-or-self", Step::DescendantOrSelfAxis), |
| AxisNameMapping("following", Step::FollowingAxis), |
| AxisNameMapping("following-sibling", Step::FollowingSiblingAxis), |
| AxisNameMapping("namespace", Step::NamespaceAxis), |
| AxisNameMapping("parent", Step::ParentAxis), |
| AxisNameMapping("preceding", Step::PrecedingAxis), |
| AxisNameMapping("preceding-sibling", Step::PrecedingSiblingAxis), |
| AxisNameMapping("self", Step::SelfAxis) |
| }; |
| static unsigned int axisNamesCount = sizeof(axisNames) / sizeof(axisNames[0]); |
| |
#ifndef APPLE_COMPILE_HACK
/*
 * Names that denote a node type (rather than a function) when they are
 * followed by '('. The list ends with a null pointer, which is what
 * Tokenizer::isNodeTypeName() uses as its loop terminator.
 */
static const char* const nodeTypeNames[] = {
    "comment", "text", "processing-instruction", "node",
    0 // terminator
};
#endif
| |
| Tokenizer* Tokenizer::s_instance = 0; |
| |
| Q3Dict<Step::AxisType>* Tokenizer::s_axisNamesDict = 0; |
| Q3Dict<char>* Tokenizer::s_nodeTypeNamesDict = 0; |
| |
| Tokenizer &Tokenizer::self() |
| { |
| if (!s_instance) |
| s_instance = new Tokenizer; |
| return *s_instance; |
| } |
| |
| Tokenizer::XMLCat Tokenizer::charCat(QChar aChar) |
| { |
| //### might need to add some special cases from the XML spec. |
| |
| if (aChar.unicode() == '_') |
| return NameStart; |
| |
| if (aChar.unicode() == '.' || aChar.unicode() == '-') |
| return NameCont; |
| |
| #ifndef APPLE_COMPILE_HACK |
| switch (aChar.category()) { |
| case QChar::Letter_Lowercase: //Ll |
| case QChar::Letter_Uppercase: //Lu |
| case QChar::Letter_Other: //Lo |
| case QChar::Letter_Titlecase: //Lt |
| case QChar::Number_Letter: //Nl |
| return NameStart; |
| |
| case QChar::Mark_SpacingCombining: //Mc |
| case QChar::Mark_Enclosing: //Me |
| case QChar::Mark_NonSpacing: //Mn |
| case QChar::Letter_Modifier: //Lm |
| case QChar::Number_DecimalDigit: //Nd |
| return NameCont; |
| |
| default: |
| return NotPartOfName; |
| } |
| #else |
| return NotPartOfName; |
| #endif |
| } |
| |
| bool Tokenizer::isAxisName(QString name, Step::AxisType *type) |
| { |
| if (!s_axisNamesDict) { |
| s_axisNamesDict = new Q3Dict<Step::AxisType>; |
| s_axisNamesDict->setAutoDelete( true ); |
| for (unsigned int p = 0; p < axisNamesCount; ++p) |
| s_axisNamesDict->insert(QString::fromLatin1(axisNames[p].name), |
| new Step::AxisType(axisNames[p].type)); |
| } |
| |
| Step::AxisType *t = s_axisNamesDict->find(name); |
| if ( t && type ) { |
| *type = *t; |
| } |
| return t != 0; |
| } |
| |
| bool Tokenizer::isNodeTypeName(QString name) |
| { |
| #ifndef APPLE_COMPILE_HACK |
| if (!s_nodeTypeNamesDict) { |
| s_nodeTypeNamesDict = new Q3Dict<char>; |
| for (int p = 0; nodeTypeNames[p]; ++p) |
| s_nodeTypeNamesDict->insert(QString::fromLatin1(nodeTypeNames[p]), |
| nodeTypeNames /*dummy*/); |
| } |
| return s_nodeTypeNamesDict->find(name); |
| #else |
| return false; |
| #endif |
| } |
| |
| /* Returns whether the last parsed token matches the [32] Operator rule |
| * (check http://www.w3.org/TR/xpath#exprlex). Necessary to disambiguate |
| * the tokens. |
| */ |
| bool Tokenizer::isOperatorContext() |
| { |
| switch ( m_lastTokenType ) { |
| case AND: case OR: case MULOP: |
| case '/': case SLASHSLASH: case '|': case PLUS: case MINUS: |
| case EQOP: case RELOP: |
| case '@': case AXISNAME: case '(': case '[': |
| return false; |
| default: |
| return true; |
| } |
| } |
| |
| void Tokenizer::skipWS() |
| { |
| while (m_nextPos < m_data.length() && m_data[m_nextPos].isSpace()) |
| ++m_nextPos; |
| } |
| |
| Token Tokenizer::makeTokenAndAdvance(int code, int advance) |
| { |
| m_nextPos += advance; |
| return Token(code); |
| } |
| |
| Token Tokenizer::makeIntTokenAndAdvance(int code, int val, int advance) |
| { |
| m_nextPos += advance; |
| return Token(code, val); |
| } |
| |
| //Returns next char if it's there and interesting, 0 otherwise |
| char Tokenizer::peekAheadHelper() |
| { |
| if (m_nextPos + 1 >= m_data.length()) |
| return 0; |
| QChar next = m_data[m_nextPos + 1]; |
| if (next.row() != 0) |
| return 0; |
| else |
| return next.cell(); |
| } |
| |
| char Tokenizer::peekCurHelper() |
| { |
| if (m_nextPos >= m_data.length()) |
| return 0; |
| QChar next = m_data[m_nextPos]; |
| if (next.row() != 0) |
| return 0; |
| else |
| return next.cell(); |
| } |
| |
| Token Tokenizer::lexString() |
| { |
| QChar delimiter = m_data[m_nextPos]; |
| int startPos = m_nextPos + 1; |
| |
| for (m_nextPos = startPos; m_nextPos < m_data.length(); ++m_nextPos) { |
| if (m_data[m_nextPos] == delimiter) { |
| QString value = m_data.mid(startPos, m_nextPos - startPos); |
| ++m_nextPos; //Consume the char; |
| return Token(LITERAL, value); |
| } |
| } |
| |
| //Ouch, went off the end -- report error |
| return Token(ERROR); |
| } |
| |
| Token Tokenizer::lexNumber() |
| { |
| int startPos = m_nextPos; |
| bool seenDot = false; |
| |
| //Go until end or a non-digits character |
| for (; m_nextPos < m_data.length(); ++m_nextPos) { |
| QChar aChar = m_data[m_nextPos]; |
| if (aChar.row() != 0) break; |
| |
| if (aChar.cell() < '0' || aChar.cell() > '9') { |
| if (aChar.cell() == '.' && !seenDot) |
| seenDot = true; |
| else |
| break; |
| } |
| } |
| |
| QString value = m_data.mid(startPos, m_nextPos - startPos); |
| return Token(NUMBER, value); |
| } |
| |
| Token Tokenizer::lexNCName() |
| { |
| int startPos = m_nextPos; |
| if (m_nextPos < m_data.length() && charCat(m_data[m_nextPos]) == NameStart) |
| { |
| //Keep going until we get a character that's not good for names. |
| for (; m_nextPos < m_data.length(); ++m_nextPos) { |
| if (charCat(m_data[m_nextPos]) == NotPartOfName) |
| break; |
| } |
| |
| QString value = m_data.mid(startPos, m_nextPos - startPos); |
| return Token(value); |
| } |
| else |
| return makeTokenAndAdvance(ERROR); |
| } |
| |
| Token Tokenizer::lexQName() |
| { |
| Token t1 = lexNCName(); |
| if (t1.type == ERROR) return t1; |
| skipWS(); |
| //If the next character is :, what we just got it the prefix, if not, |
| //it's the whole thing |
| if (peekAheadHelper() != ':') |
| return t1; |
| |
| Token t2 = lexNCName(); |
| if (t2.type == ERROR) return t2; |
| |
| return Token(t1.value + ":" + t2.value); |
| } |
| |
| Token Tokenizer::nextTokenInternal() |
| { |
| skipWS(); |
| |
| if (m_nextPos >= m_data.length()) { |
| return Token(0); |
| } |
| |
| char code = peekCurHelper(); |
| switch (code) { |
| case '(': case ')': case '[': case ']': |
| case '@': case ',': case '|': |
| return makeTokenAndAdvance(code); |
| case '\'': |
| case '\"': |
| return lexString(); |
| case '0': case '1': case '2': case '3': case '4': |
| case '5': case '6': case '7': case '8': case '9': |
| return lexNumber(); |
| case '.': { |
| char next = peekAheadHelper(); |
| if (next == '.') |
| return makeTokenAndAdvance(DOTDOT, 2); |
| else if (next >= '0' && next <= '9') |
| return lexNumber(); |
| else |
| return makeTokenAndAdvance('.'); |
| } |
| case '/': |
| if (peekAheadHelper() == '/') |
| return makeTokenAndAdvance(SLASHSLASH, 2); |
| else |
| return makeTokenAndAdvance('/'); |
| case '+': |
| return makeTokenAndAdvance(PLUS); |
| case '-': |
| return makeTokenAndAdvance(MINUS); |
| case '=': |
| return makeIntTokenAndAdvance(EQOP, EqTestOp::OP_EQ); |
| case '!': |
| if (peekAheadHelper() == '=') |
| return makeIntTokenAndAdvance(EQOP, EqTestOp::OP_NE, 2); |
| else { |
| return Token(ERROR); |
| } |
| case '<': |
| if (peekAheadHelper() == '=') |
| return makeIntTokenAndAdvance(RELOP, NumericOp::OP_LE, 2); |
| else |
| return makeIntTokenAndAdvance(RELOP, NumericOp::OP_LT); |
| case '>': |
| if (peekAheadHelper() == '=') |
| return makeIntTokenAndAdvance(RELOP, NumericOp::OP_GE, 2); |
| else |
| return makeIntTokenAndAdvance(RELOP, NumericOp::OP_GT); |
| case '*': |
| if (isOperatorContext()) |
| return makeIntTokenAndAdvance(MULOP, NumericOp::OP_Mul); |
| else { |
| ++m_nextPos; |
| return Token(NAMETEST, "*"); |
| } |
| case '$': {//$ QName |
| m_nextPos++; |
| Token par = lexQName(); |
| if (par.type == ERROR) |
| return par; |
| else |
| return Token(VARIABLEREFERENCE, par.value); |
| } |
| } |
| |
| Token t1 = lexNCName(); |
| if (t1.type == ERROR) return t1; |
| |
| skipWS(); |
| |
| //If we're in an operator context, check for any operator names |
| if (isOperatorContext()) { |
| if (t1.value == QString::fromLatin1("and")) //### hash? |
| return Token(AND); |
| if (t1.value == QString::fromLatin1("or")) |
| return Token(OR); |
| if (t1.value == QString::fromLatin1("mod")) |
| return Token(MULOP, NumericOp::OP_Mod); |
| if (t1.value == QString::fromLatin1("div")) |
| return Token(MULOP, NumericOp::OP_Div); |
| } |
| |
| //See whether we are at a : |
| if (peekCurHelper() == ':') { |
| m_nextPos++; |
| //Any chance it's an axis name? |
| if (peekCurHelper() == ':') { |
| m_nextPos++; |
| |
| //It might be an axis name. |
| Step::AxisType axisType; |
| if (isAxisName(t1.value, &axisType)) |
| return Token(AXISNAME, axisType); |
| //Ugh, :: is only valid in axis names -> error |
| return Token(ERROR); |
| } |
| |
| //Seems like this is a fully qualified qname, or perhaps the * modified one from NameTest |
| skipWS(); |
| if (peekCurHelper() == '*') { |
| m_nextPos++; |
| return Token(NAMETEST, t1.value + ":*"); |
| } |
| |
| //Make a full qname.. |
| Token t2 = lexNCName(); |
| if (t2.type == ERROR) return t2; |
| |
| t1.value = t1.value + ':' + t2.value; |
| } |
| |
| skipWS(); |
| if (peekCurHelper() == '(') { |
| //note: we don't swallow the ( here! |
| |
| //either node type of function name |
| if (isNodeTypeName(t1.value)) { |
| if (t1.value == "processing-instruction") |
| return Token(PI, t1.value); |
| else |
| return Token(NODETYPE, t1.value); |
| } |
| //must be a function name. |
| return Token(FUNCTIONNAME, t1.value); |
| } |
| |
| //At this point, it must be NAMETEST |
| return Token(NAMETEST, t1.value); |
| } |
| |
| Token Tokenizer::nextToken() |
| { |
| Token toRet = nextTokenInternal(); |
| m_lastTokenType = toRet.type; |
| return toRet; |
| } |
| |
| Tokenizer::Tokenizer() |
| { |
| reset(QString()); |
| } |
| |
| void Tokenizer::reset(QString data) |
| { |
| m_nextPos = 0; |
| m_data = data; |
| m_lastTokenType = 0; |
| } |
| |
| int xpathyylex() |
| { |
| Token tok = Tokenizer::self().nextToken(); |
| if (!tok.value.isEmpty()) { |
| xpathyylval.str = new DomString(tok.value); |
| } else if (tok.intValue) { |
| xpathyylval.num = tok.intValue; |
| } |
| return tok.type; |
| } |
| |
| void initTokenizer(QString string) |
| { |
| Tokenizer::self().reset(string); |
| } |
| |
/*
 * Error callback: writes `str` to stderr on one line, prefixed with
 * "error: ".
 */
void xpathyyerror(const char *str)
{
    fprintf(stderr, "error: %s\n", str);
}
| |
| class TokenizerDeleter |
| { |
| public: |
| ~TokenizerDeleter() |
| { |
| delete Tokenizer::s_instance; |
| delete Tokenizer::s_axisNamesDict; |
| delete Tokenizer::s_nodeTypeNamesDict; |
| } |
| }; |
| |
| static TokenizerDeleter tokenizerDeleter; |
| |
| // kate: indent-width 4; replace-tabs off; tab-width 4; indent-spaces: off; |