Context Navigation

source: trunk/src/xmlpatterns/parser/qxquerytokenizer.cpp@ 686

Last change on this file since 686 was 651, checked in by Dmitry A. Kuminov, 15 years ago
trunk: Merged in qt 4.6.2 sources.
File size: 68.5 KB

Rev	Line
[2]	1	/****************************************************************************
	2	**
[651]	3	** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).
[561]	4	** All rights reserved.
	5	** Contact: Nokia Corporation (qt-info@nokia.com)
[2]	6	**
	7	** This file is part of the QtXmlPatterns module of the Qt Toolkit.
	8	**
	9	** $QT_BEGIN_LICENSE:LGPL$
	10	** Commercial Usage
	11	** Licensees holding valid Qt Commercial licenses may use this file in
	12	** accordance with the Qt Commercial License Agreement provided with the
	13	** Software or, alternatively, in accordance with the terms contained in
	14	** a written agreement between you and Nokia.
	15	**
	16	** GNU Lesser General Public License Usage
	17	** Alternatively, this file may be used under the terms of the GNU Lesser
	18	** General Public License version 2.1 as published by the Free Software
	19	** Foundation and appearing in the file LICENSE.LGPL included in the
	20	** packaging of this file. Please review the following information to
	21	** ensure the GNU Lesser General Public License version 2.1 requirements
	22	** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
	23	**
[561]	24	** In addition, as a special exception, Nokia gives you certain additional
	25	** rights. These rights are described in the Nokia Qt LGPL Exception
	26	** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
[2]	27	**
	28	** GNU General Public License Usage
	29	** Alternatively, this file may be used under the terms of the GNU
	30	** General Public License version 3.0 as published by the Free Software
	31	** Foundation and appearing in the file LICENSE.GPL included in the
	32	** packaging of this file. Please review the following information to
	33	** ensure the GNU General Public License version 3.0 requirements will be
	34	** met: http://www.gnu.org/copyleft/gpl.html.
	35	**
[561]	36	** If you have questions regarding the use of this file, please contact
	37	** Nokia at qt-info@nokia.com.
[2]	38	** $QT_END_LICENSE$
	39	**
	40	****************************************************************************/
	41
	42	#include <QByteArray>
	43
	44	#include "qquerytransformparser_p.h"
	45
	46	#include "qxquerytokenizer_p.h"
	47
	48	#include "qtokenlookup.cpp"
	49
	50	QT_BEGIN_NAMESPACE
	51
	52	namespace QPatternist
	53	{
	54
	/* Runs consumeWhitespace() and, when it reports anything other than
	 * SUCCESS, makes the enclosing function return a Token carrying that
	 * result. Expands to a plain block, so it is only usable inside functions
	 * that return Token. */
	#define handleWhitespace()                                      \
	{                                                               \
	    const TokenType whitespaceResult = consumeWhitespace();     \
	    if(whitespaceResult != SUCCESS)                             \
	        return Token(whitespaceResult);                         \
	}
	61
	62	XQueryTokenizer::XQueryTokenizer(const QString &query,
	63	const QUrl &location,
	64	const State startingState) : Tokenizer(location)
	65	, m_data(query)
	66	, m_length(query.length())
	67	, m_state(startingState)
	68	, m_pos(0)
	69	, m_line(1)
	70	, m_columnOffset(0)
	71	, m_scanOnly(false)
	72	{
	73	Q_ASSERT(location.isValid() \|\| location.isEmpty());
	74	}
	75
	76	const QChar XQueryTokenizer::current() const
	77	{
	78	if(m_pos < m_length)
	79	return m_data.at(m_pos);
	80	else
	81	return QChar();
	82	}
	83
	84	char XQueryTokenizer::peekCurrent() const
	85	{
	86	return current().toAscii();
	87	}
	88
	89	int XQueryTokenizer::peekForColonColon() const
	90	{
	91	/* Note, we don't modify m_pos in this function, so we need to do offset
	92	* calculations. */
	93	int pos = m_pos;
	94
	95	while(pos < m_length)
	96	{
	97	switch(m_data.at(pos).toAscii())
	98	{
	99	/* Fallthrough these four. */
	100	case ' ':
	101	case '\t':
	102	case '\n':
	103	case '\r':
	104	break;
	105	case ':':
	106	{
	107	if(peekAhead((pos - m_pos) + 1) == ':')
	108	return pos - m_pos;
	109	/* Fallthrough. */
	110	}
	111	default:
	112	return -1;
	113	}
	114	++pos;
	115	}
	116
	117	return -1;
	118	}
	119
	120	Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code,
	121	const State s,
	122	const int advance)
	123	{
	124	Q_ASSERT(advance >= 0);
	125	m_pos += advance;
	126	setState(s);
	127	return Token(code);
	128	}
	129
	130	Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code,
	131	const QString &value,
	132	const State s)
	133	{
	134	setState(s);
	135	return Token(code, value);
	136	}
	137
	138	Tokenizer::Token XQueryTokenizer::tokenAndAdvance(const TokenType code,
	139	const int advance)
	140	{
	141	Q_ASSERT(advance >= 0);
	142	m_pos += advance;
	143	return Token(code);
	144	}
	145
	146	QString XQueryTokenizer::normalizeEOL(const QString &input,
	147	const CharacterSkips &characterSkips)
	148	{
	149	const int len = input.count();
	150	QString result;
	151
	152	/* The likely hood is rather high it'll be the same content. */
	153	result.reserve(len);
	154
	155	for(int i = 0; i < len; ++i)
	156	{
	157	const QChar &at = input.at(i);
	158
	159	if(characterSkips.contains(i))
	160	{
	161	result.append(at);
	162	continue;
	163	}
	164	switch(input.at(i).unicode())
	165	{
	166	case '\r':
	167	{
	168	if(i + 1 < len && input.at(i + 1) == QLatin1Char('\n'))
	169	++i;
	170
	171	/* Else, fallthrough. */
	172	}
	173	case '\n':
	174	{
	175	result.append(QLatin1Char('\n'));
	176	continue;
	177	}
	178	default:
	179	{
	180	result.append(at);
	181	}
	182	}
	183	}
	184
	185	return result;
	186	}
	187
	188	Tokenizer::TokenType XQueryTokenizer::consumeComment()
	189	{
	190	/* Below, we return ERROR instead of END_OF_FILE such that the parser
	191	* sees an invalid comment. */
	192	while(m_pos < m_length)
	193	{
	194	switch(peekCurrent())
	195	{
	196	case ':':
	197	{
	198	++m_pos; /* Consume ':' */
	199	if(atEnd())
	200	return ERROR;
	201
	202	if(peekCurrent() == ')')
	203	{
	204	++m_pos; /* Consume ')' */
	205	return SUCCESS; /* The comment closed nicely. */
	206	}
	207	continue; /* We don't want to increment m_pos twice. */
	208	}
	209	case '(':
	210	{ /* It looks like the start of a comment. */
	211	++m_pos;
	212
	213	if(atEnd())
	214	return END_OF_FILE;
	215	else if(peekCurrent() == ':')
	216	{
	217	/* And it is a nested comment -- parse it. */
	218	const TokenType retval = consumeComment();
	219	if(retval == SUCCESS)
	220	continue; /* Continue with our "own" comment. */
	221	else
	222	return retval; /* Return the error in the nested comment. */
	223	}
	224	break;
	225	}
	226	case '\n':
	227	/* Fallthrough. */
	228	case '\r':
	229	{
	230	/* We want to count \r\n as a single line break. */
	231	if(peekAhead() == '\n')
	232	++m_pos;
	233
	234	m_columnOffset = m_pos;
	235	++m_line;
	236
	237	break;
	238	}
	239	}
	240	++m_pos;
	241	}
	242
	243	return ERROR; /* Error: we reached the end while inside a comment. */
	244	}
	245
	246	bool XQueryTokenizer::consumeRawWhitespace()
	247	{
	248	while(m_pos < m_length)
	249	{
	250	switch(peekCurrent())
	251	{
	252	case ' ':
	253	case '\t':
	254	break;
	255	case '\n':
	256	case '\r':
	257	{
	258	if(peekAhead() == '\n')
	259	++m_pos;
	260
	261	m_columnOffset = m_pos;
	262	++m_line;
	263
	264	break;
	265	}
	266	default:
	267	return false;
	268	}
	269	++m_pos;
	270	}
	271	return true;
	272	}
	273
	274	Tokenizer::TokenType XQueryTokenizer::consumeWhitespace()
	275	{
	276	while(m_pos < m_length)
	277	{
	278	switch(peekCurrent())
	279	{
	280	case ' ':
	281	case '\t':
	282	break;
	283	case '\n':
	284	case '\r':
	285	{
	286	/* We want to count \r\n as a single line break. */
	287	if(peekAhead() == '\n')
	288	++m_pos;
	289
	290	m_columnOffset = m_pos;
	291	++m_line;
	292
	293	break;
	294	}
	295	case '(':
	296	{
	297	if(peekAhead() == ':')
	298	{
	299	m_pos += 2; /* Consume "(:" */
	300
	301	const TokenType comment = consumeComment();
	302	if(comment == SUCCESS)
	303	continue;
	304	else
	305	return comment;
	306	}
	307	}
	308	default:
	309	return SUCCESS;
	310	}
	311	++m_pos;
	312	}
	313
	314	return END_OF_FILE;
	315	}
	316
	317	char XQueryTokenizer::peekAhead(const int length) const
	318	{
	319	if(m_pos + length < m_length)
	320	return m_data.at(m_pos + length).toAscii();
	321	else
	322	return 0;
	323	}
	324
	325	Tokenizer::Token XQueryTokenizer::error()
	326	{
	327	return Token(ERROR);
	328	}
	329
	330	bool XQueryTokenizer::isDigit(const char ch)
	331	{
	332	return ch >= '0' && ch <= '9';
	333	}
	334
	335	/* Replace with function in QXmlUtils. Write test cases for this. */
	336	bool XQueryTokenizer::isNCNameStart(const QChar ch)
	337	{
	338	if(ch == QLatin1Char('_'))
	339	return true;
	340
	341	switch(ch.category())
	342	{
	343	case QChar::Letter_Lowercase:
	344	case QChar::Letter_Uppercase:
	345	case QChar::Letter_Other:
	346	case QChar::Letter_Titlecase:
	347	case QChar::Number_Letter:
	348	return true;
	349	default:
	350	return false;
	351	}
	352	}
	353
	354	bool XQueryTokenizer::isNCNameBody(const QChar ch)
	355	{
	356	switch(ch.unicode())
	357	{
	358	case '.':
	359	case '_':
	360	case '-':
	361	return true;
	362	}
	363
	364	switch(ch.category())
	365	{
	366	case QChar::Letter_Lowercase:
	367	case QChar::Letter_Uppercase:
	368	case QChar::Letter_Other:
	369	case QChar::Letter_Titlecase:
	370	case QChar::Number_Letter:
	371	case QChar::Mark_SpacingCombining:
	372	case QChar::Mark_Enclosing:
	373	case QChar::Mark_NonSpacing:
	374	case QChar::Letter_Modifier:
	375	case QChar::Number_DecimalDigit:
	376	return true;
	377	default:
	378	return false;
	379	}
	380	}
	381
	382	bool XQueryTokenizer::isPhraseKeyword(const TokenType code)
	383	{
	384	switch(code)
	385	{
	386	/* Fallthrough all these. */
	387	case CASTABLE:
	388	case CAST:
	389	case COPY_NAMESPACES:
	390	case DECLARE:
	391	case EMPTY:
	392	case MODULE:
	393	case IMPORT:
	394	case INSTANCE:
	395	case ORDER:
	396	case ORDERING:
	397	case XQUERY:
	398	case STABLE:
	399	case TREAT:
	400	return true;
	401	default:
	402	return false;
	403	}
	404	}
	405
	406	bool XQueryTokenizer::isOperatorKeyword(const TokenType code)
	407	{
	408	switch(code)
	409	{
	410	/* Fallthrough all these. */
	411	case AS:
	412	case ASCENDING:
	413	case AT:
	414	case CASE:
	415	case CAST:
	416	case CASTABLE:
	417	case EQ:
	418	case EXTERNAL:
	419	case GE:
	420	case G_EQ:
	421	case G_GT:
	422	case G_LT:
	423	case G_NE:
	424	case GT:
	425	case IN:
	426	case INHERIT:
	427	case INSTANCE:
	428	case IS:
	429	case ITEM:
	430	case LE:
	431	case LT:
	432	case NE:
	433	case NO_INHERIT:
	434	case NO_PRESERVE:
	435	case OF:
	436	case PRESERVE:
	437	case RETURN:
	438	case STABLE:
	439	case TO:
	440	case TREAT:
	441	return true;
	442	default:
	443	return false;
	444	};
	445	}
	446
	447	bool XQueryTokenizer::isTypeToken(const TokenType t)
	448	{
	449	switch(t)
	450	{
	451	/* Fallthrough all these. */
	452	case ATTRIBUTE:
	453	case COMMENT:
	454	case DOCUMENT:
	455	case DOCUMENT_NODE:
	456	case ELEMENT:
	457	case ITEM:
	458	case NODE:
	459	case PROCESSING_INSTRUCTION:
	460	case SCHEMA_ATTRIBUTE:
	461	case SCHEMA_ELEMENT:
	462	case TEXT:
	463	return true;
	464	default:
	465	return false;
	466	}
	467	}
	468
	469	Tokenizer::Token XQueryTokenizer::tokenizeNCNameOrQName()
	470	{
	471	const int start = m_pos;
	472
	473	const Token t1 = tokenizeNCName();
	474	if(t1.hasError())
	475	return t1;
	476
	477	if(peekCurrent() != ':' \|\| peekAhead() == '=')
	478	return t1;
	479
	480	++m_pos;
	481
	482	const Token t2 = tokenizeNCName();
	483	if(t2.hasError())
	484	return t2;
	485	else
	486	return Token(QNAME, m_data.mid(start, m_pos - start));
	487	}
	488
	489	Tokenizer::Token XQueryTokenizer::tokenizeNumberLiteral()
	490	{
	491	setState(Operator);
	492	const int startPos = m_pos;
	493	bool hasDot = false;
	494	bool isXPath20 = false;
	495
	496	for(; m_pos < m_length; ++m_pos)
	497	{
	498	QChar ch(current());
	499
	500	char cell = ch.cell();
	501
	502	if(cell == 'e' \|\| cell == 'E')
	503	{
	504	isXPath20 = true;
	505	++m_pos;
	506	ch = current();
	507
	508	if(ch.row() != 0)
	509	break;
	510
	511	cell = ch.cell();
	512
	513	if(cell == '+' \|\| cell == '-')
	514	continue;
	515	}
	516
	517	if(isNCNameStart(ch))
	518	return error();
	519
	520	if(cell < '0' \|\| cell > '9')
	521	{
	522	if(cell == '.' && !hasDot)
	523	hasDot = true;
	524	else
	525	break;
	526	}
	527	}
	528
	529	return Token(isXPath20 ? XPATH2_NUMBER : NUMBER, m_data.mid(startPos, m_pos - startPos));
	530	}
	531
	532	QString XQueryTokenizer::tokenizeCharacterReference()
	533	{
	534	Q_ASSERT(peekCurrent() == '&');
	535
	536	const int theEnd = m_data.indexOf(QLatin1Char(';'), m_pos + 1);
	537
	538	if(theEnd == -1) /* No ';' found, a syntax error. i18n. */
	539	return QString();
	540
	541	QString content(m_data.mid(m_pos + 1, (theEnd - m_pos) - 1));
	542	m_pos = theEnd;
	543
	544	const QChar charRef(charForReference(content));
	545
	546	if(!charRef.isNull())
	547	return charRef;
	548	else if(content.startsWith(QLatin1Char('#')))
	549	{
	550	int base;
	551
	552	/* It is only '#' or '#x'. */
	553	if(content.length() < 2)
	554	return QString();
	555
	556	/* We got a hex number if it starts with 'x', otherwise it's a decimal. */
	557	if(content.at(1) == QLatin1Char('x'))
	558	{
	559	base = 16;
	560	content = content.mid(2); /* Remove "#x". */
	561	}
	562	else
	563	{
	564	base = 10;
	565	content = content.mid(1); /* Remove "#". */
	566	}
	567
	568	bool conversionOK = false;
	569	const int codepoint = content.toInt(&conversionOK, base);
	570
	571	if(conversionOK)
	572	{
	573	const QChar ch(codepoint);
	574
	575	if(ch.isNull())
	576	{
	577	/* We likely have something which require surrogate pairs. */
	578	QString result;
	579	result += QChar(QChar::highSurrogate(codepoint));
	580	result += QChar(QChar::lowSurrogate(codepoint));
	581	return result;
	582	}
	583	else
	584	return ch;
	585	}
	586	else
	587	return QString();
	588	}
	589	else
	590	return QString();
	591	}
	592
	593	int XQueryTokenizer::scanUntil(const char *const content)
	594	{
	595	const int end = m_data.indexOf(QString::fromLatin1(content), m_pos);
	596
	597	if(end == -1)
	598	return -1;
	599	else
	600	{
	601	const int len = end - m_pos;
	602	m_pos += len;
	603	return len;
	604	}
	605	}
	606
	607	QChar XQueryTokenizer::charForReference(const QString &reference)
	608	{
	609	if(m_charRefs.isEmpty())
	610	{
	611	/* Initialize. */
	612	m_charRefs.reserve(5);
	613	m_charRefs.insert(QLatin1String("lt"), QLatin1Char('<'));
	614	m_charRefs.insert(QLatin1String("gt"), QLatin1Char('>'));
	615	m_charRefs.insert(QLatin1String("amp"), QLatin1Char('&'));
	616	m_charRefs.insert(QLatin1String("quot"), QLatin1Char('"'));
	617	m_charRefs.insert(QLatin1String("apos"), QLatin1Char('\''));
	618	}
	619
	620	return m_charRefs.value(reference);
	621	}
	622
	623	Tokenizer::Token XQueryTokenizer::tokenizeStringLiteral()
	624	{
	625	const QChar delimiter(current());
	626	/* We cannot unfortunately just scan and then do mid(),
	627	* since we can encounter character references. */
	628	QString result;
	629
	630	/* This is more likely than QString's default allocation. */
	631	result.reserve(8);
	632
	633	CharacterSkips skipEOLNormalization;
	634
	635	/* Advance over the initial quote character. */
	636	++m_pos;
	637
	638	for(; m_pos < m_length; ++m_pos)
	639	{
	640	const QChar c(current());
	641
	642	if(c == QLatin1Char('&'))
	643	{
	644	const QString charRef(tokenizeCharacterReference());
	645
	646	if(charRef.isNull())
	647	return error();
	648	else
	649	{
	650	skipEOLNormalization.insert(result.count());
	651	result.append(charRef);
	652	}
	653
	654	}
	655	else if(c == delimiter)
	656	{
	657	/* Maybe the escaping mechanism is used. For instance, "s""s"
	658	* has the value `s"s'. */
	659	++m_pos;
	660
	661	if(current() == delimiter) /* Double quote. */
	662	result += delimiter;
	663	else
	664	return Token(STRING_LITERAL, normalizeEOL(result, skipEOLNormalization));
	665	}
	666	else
	667	result += c;
	668	}
	669
	670	return error();
	671	}
	672
	673	Tokenizer::Token XQueryTokenizer::tokenizeNCName()
	674	{
	675	const int startPos = m_pos;
	676
	677	if(m_pos < m_length && isNCNameStart(current()))
	678	{
	679	++m_pos;
	680
	681	for(; m_pos < m_length; ++m_pos)
	682	{
	683	if(!isNCNameBody(current()))
	684	break;
	685	}
	686
	687	return Token(NCNAME, m_data.mid(startPos, m_pos - startPos));
	688	}
	689	else
	690	return error();
	691	}
	692
	693	bool XQueryTokenizer::aheadEquals(const char *const chs,
	694	const int len,
	695	const int offset) const
	696	{
	697	Q_ASSERT(len > 0);
	698	Q_ASSERT(qstrlen(chs) == uint(len));
	699
	700	if(m_pos + len >= m_length)
	701	return false;
	702
	703	for(int i = offset; i < (len + offset); ++i)
	704	{
	705	if(m_data.at(m_pos + i).toAscii() != chs[i - offset])
	706	return false;
	707	}
	708
	709	return true;
	710	}
	711
	712	const TokenMap *XQueryTokenizer::lookupKeyword(const QString &keyword)
	713	{
	714	return TokenLookup::value(keyword.toAscii().constData(), keyword.length());
	715	}
	716
	717	XQueryTokenizer::State XQueryTokenizer::state() const
	718	{
	719	return m_state;
	720	}
	721
	722	void XQueryTokenizer::setState(const State s)
	723	{
	724	m_state = s;
	725	}
	726
	727	void XQueryTokenizer::pushState(const State s)
	728	{
	729	m_stateStack.push(s);
	730	}
	731
	732	void XQueryTokenizer::pushState()
	733	{
	734	m_stateStack.push(m_state);
	735	}
	736
	737	void XQueryTokenizer::popState()
	738	{
	739	/* QStack::pop() asserts if it's empty, so we need to check
	740	* it, since we might receive unbalanced curlies. */
	741	if(!m_stateStack.isEmpty())
	742	m_state = m_stateStack.pop();
	743	}
	744
	745	Tokenizer::Token XQueryTokenizer::nextToken()
	746	{
	747	switch(state())
	748	{
	749	/* We want to skip or do special whitespace handling for these
	750	* states. So fallthrough all of the following. */
	751	case AposAttributeContent:
	752	case Axis:
	753	case ElementContent:
	754	case EndTag:
	755	case Pragma:
	756	case PragmaContent:
	757	case ProcessingInstructionName:
	758	case QuotAttributeContent:
	759	case StartTag:
	760	case XMLComment:
	761	break;
	762	default:
	763	handleWhitespace();
	764	}
	765
	766	switch(state())
	767	{
	768	case XMLSpaceDecl:
	769	/* Fallthrough. */
	770	case NamespaceKeyword:
	771	{
	772	switch(peekCurrent())
	773	{
	774	case ',':
	775	return tokenAndAdvance(COMMA);
	776	case '"':
	777	/* Fallthrough. */
	778	case '\'':
	779	{
	780	setState(NamespaceDecl);
	781	return tokenizeStringLiteral();
	782	}
	783	}
	784
	785	const Token id(tokenizeNCName());
	786
	787	if(id.type != NCNAME)
	788	return id;
	789
	790	const TokenMap *const keyword = lookupKeyword(id.value);
	791	if(keyword)
	792	{
	793	switch(keyword->token)
	794	{
	795	case INHERIT:
	796	/* Fallthrough. */
	797	case NO_INHERIT:
	798	{
	799	setState(Default);
	800	break;
	801	}
	802	case NAMESPACE:
	803	{
	804	setState(NamespaceDecl);
	805	break;
	806	}
	807	case ORDERED:
	808	/* Fallthrough. */
	809	case UNORDERED:
	810	/* Fallthrough. */
	811	case STRIP:
	812	{
	813	setState(Default);
	814	break;
	815	}
	816	case PRESERVE:
	817	{
	818	if(state() != NamespaceKeyword)
	819	setState(Default);
	820	}
	821	default:
	822	break;
	823	}
	824
	825	return Token(keyword->token);
	826	}
	827	else
	828	return id;
	829
	830	Q_ASSERT(false);
	831	}
	832	case NamespaceDecl:
	833	{
	834	switch(peekCurrent())
	835	{
	836	case '=':
	837	return tokenAndAdvance(G_EQ);
	838	case ';':
	839	return tokenAndChangeState(SEMI_COLON, Default);
	840	case '\'':
	841	/* Fallthrough. */
	842	case '\"':
	843	return tokenizeStringLiteral();
	844	}
	845
	846	const Token nc(tokenizeNCName());
	847
	848	handleWhitespace();
	849
	850	const char pc = peekCurrent();
	851	const TokenMap* const t = lookupKeyword(nc.value);
	852
	853	if(pc == '\'' \|\| (pc == '"' && t))
	854	return tokenAndChangeState(t->token, Default, 0);
	855	else
	856	return nc;
	857
	858	Q_ASSERT(false);
	859	}
	860	case Axis:
	861	{
	862	if(peekCurrent() == ':')
	863	{
	864	Q_ASSERT(peekAhead() == ':');
	865	m_pos += 2;
	866	setState(AfterAxisSeparator);
	867	return Token(COLONCOLON);
	868	}
	869	/* Fallthrough. */
	870	}
	871	case AfterAxisSeparator:
	872	/* Fallthrough. */
	873	case Default:
	874	/* State Operator and state Default have a lot of tokens in common except
	875	* for minor differences. So we treat them the same way, and sprinkles logic
	876	* here and there to handle the small differences. */
	877	/* Fallthrough. */
	878	case Operator:
	879	{
	880	switch(peekCurrent())
	881	{
	882	case '=':
	883	return tokenAndChangeState(G_EQ, Default);
	884	case '-':
	885	return tokenAndChangeState(MINUS, Default);
	886	case '+':
	887	return tokenAndChangeState(PLUS, Default);
	888	case '[':
	889	return tokenAndChangeState(LBRACKET, Default);
	890	case ']':
	891	return tokenAndChangeState(RBRACKET, Operator);
	892	case ',':
	893	return tokenAndChangeState(COMMA, Default);
	894	case ';':
	895	return tokenAndChangeState(SEMI_COLON, Default);
	896	case '$':
	897	return tokenAndChangeState(DOLLAR, VarName);
	898	case '\|':
	899	return tokenAndChangeState(BAR, Default);
	900	case '?':
	901	return tokenAndChangeState(QUESTION, Operator);
	902	case ')':
	903	return tokenAndChangeState(RPAREN, Operator);
	904	case '@':
	905	return tokenAndChangeState(AT_SIGN, Default);
	906	/* Fallthrough all these. */
	907	case '1':
	908	case '2':
	909	case '3':
	910	case '4':
	911	case '5':
	912	case '6':
	913	case '7':
	914	case '8':
	915	case '9':
	916	case '0':
	917	return tokenizeNumberLiteral();
	918	case '.':
	919	{
	920	const char next = peekAhead();
	921	if(next == '.')
	922	return tokenAndChangeState(DOTDOT, Operator, 2);
	923	/* .5 is allowed, as short form for 0.5:
	924	* <tt>[142] DecimalLiteral ::= ("." Digits) \| (Digits "." [0-9]*)</tt>
	925	*/
	926	else if(isDigit(next))
	927	return tokenizeNumberLiteral();
	928	else
	929	return tokenAndChangeState(DOT, Operator);
	930	}
	931	case '\'':
	932	/* Fallthrough. */
	933	case '"':
	934	{
	935	setState(Operator);
	936	return tokenizeStringLiteral();
	937
	938	}
	939	case '(':
	940	{
	941	if(peekAhead() == '#')
	942	return tokenAndChangeState(PRAGMA_START, Pragma, 2);
	943	else
	944	return tokenAndChangeState(LPAREN, Default);
	945	}
	946	case '*':
	947	{
	948	if(peekAhead() == ':')
	949	{
	950	m_pos += 2; /* Consume *:. */
	951	const Token nc = tokenizeNCName();
	952
	953	if(nc.hasError())
	954	return error();
	955	else
	956	return tokenAndChangeState(ANY_PREFIX, nc.value, Operator);
	957	}
	958	else
	959	return tokenAndChangeState(STAR, state() == Default ? Operator : Default);
	960	}
	961	case ':':
	962	{
	963	switch(peekAhead())
	964	{
	965	case '=':
	966	return tokenAndChangeState(ASSIGN, Default, 2);
	967	case ':':
	968	return tokenAndChangeState(COLONCOLON, Default, 2);
	969	default:
	970	return error();
	971	}
	972	}
	973	case '!':
	974	{
	975	if(peekAhead() == '=')
	976	return tokenAndChangeState(G_NE, Default, 2);
	977	else
	978	return error();
	979	}
	980	case '<':
	981	{
	982	switch(peekAhead())
	983	{
	984	case '=':
	985	return tokenAndChangeState(G_LE, Default, 2);
	986	case '<':
	987	return tokenAndChangeState(PRECEDES, Default, 2);
	988	case '?':
	989	{
	990	pushState(Operator);
	991	return tokenAndChangeState(PI_START, ProcessingInstructionName, 2);
	992	}
	993	case '!':
	994	{
	995	if(aheadEquals("!--", 3))
	996	{
	997	m_pos += 3; /* Consume "!--". */
	998	pushState(Operator);
	999	return tokenAndChangeState(COMMENT_START, XMLComment);
	1000	}
	1001	/* Fallthrough. It's a syntax error, and this is a good way to report it. */
	1002	}
	1003	default:
	1004	{
	1005	if((m_pos + 1) < m_length && isNCNameStart(m_data.at(m_pos + 1)))
	1006	{
	1007	/* We assume it's an element constructor. */
	1008	pushState(Operator);
	1009	}
	1010
	1011	return tokenAndChangeState(G_LT, state() == Operator ? Default : StartTag);
	1012	}
	1013	}
	1014	}
	1015	case '>':
	1016	{
	1017	switch(peekAhead())
	1018	{
	1019	case '=':
	1020	return tokenAndChangeState(G_GE, Default, 2);
	1021	case '>':
	1022	return tokenAndChangeState(FOLLOWS, Default, 2);
	1023	default:
	1024	return tokenAndChangeState(G_GT, Default);
	1025	}
	1026	}
	1027	case '/':
	1028	{
	1029	if(peekAhead() == '/')
	1030	return tokenAndChangeState(SLASHSLASH, Default, 2);
	1031	else
	1032	return tokenAndChangeState(SLASH, Default);
	1033	}
	1034	case '{':
	1035	{
	1036	pushState(Operator);
	1037	return tokenAndChangeState(CURLY_LBRACE, Default);
	1038	}
	1039	case '}':
	1040	{
	1041	popState();
	1042
	1043	return tokenAndAdvance(CURLY_RBRACE);
	1044	}
	1045	}
	1046
	1047	/* Ok. We're in state Default or Operator, and it wasn't a simple
	1048	* character. */
	1049
	1050	const Token id(tokenizeNCName());
	1051
	1052	if(id.type != NCNAME)
	1053	return id;
	1054
	1055	const TokenMap *const keyword = lookupKeyword(id.value);
	1056
	1057	if(state() == Operator)
	1058	{
	1059	if(keyword)
	1060	{
	1061	if(keyword->token == DEFAULT \|\| keyword->token == ASCENDING \|\| keyword->token == DESCENDING)
	1062	setState(Operator);
	1063	else if(keyword->token == RETURN)
	1064	setState(Default);
	1065	else if(isPhraseKeyword(keyword->token))
	1066	{
	1067	const TokenType ws = consumeWhitespace();
	1068	if(ws == ERROR)
	1069	return error();
	1070
	1071	const Token id2(tokenizeNCName());
	1072	const TokenMap *const keyword2 = lookupKeyword(id2.value);
	1073
	1074	if(keyword2)
	1075	{
	1076	if(keyword->token == TREAT && keyword2->token == AS)
	1077	setState(ItemType);
	1078	else if (keyword->token == CAST \|\| (keyword->token == CASTABLE && keyword2->token == AS) \|\| keyword2->token == BY)
	1079	setState(Default);
	1080
	1081	m_tokenStack.push(Token(keyword2->token));
	1082	}
	1083	else
	1084	m_tokenStack.push(id2);
	1085
	1086	return Token(keyword->token);
	1087	}
	1088	else
	1089	{
	1090	/* Such that we tokenize the second token in "empty greatest". */
	1091	if(keyword->token != EMPTY)
	1092	setState(Default);
	1093	}
	1094
	1095	if(keyword->token == AS \|\| keyword->token == CASE)
	1096	setState(ItemType);
	1097
	1098	return Token(keyword->token);
	1099	}
	1100	else
	1101	return id;
	1102	}
	1103
	1104	Q_ASSERT(state() == Default \|\| state() == Axis \|\| state() == AfterAxisSeparator);
	1105
	1106	/*
	1107	* This is hard. Consider this:
	1108	*
	1109	* Valid: child ::nameTest
	1110	* Valid: child:: nameTest
	1111	* Syntax Error: child :localName
	1112	* Syntax Error: child: localName
	1113	*
	1114	* Consider "child ::name". Right now, we're here:
	1115	* ^
	1116	* We don't know whether "child" is a prefix and hence the whitespace is invalid,
	1117	* or whether it's an axis and hence skippable. */
	1118	{
	1119	const int wsLength = peekForColonColon();
	1120	/* We cannot call handleWhitespace() because it returns on
	1121	* END_OF_FILE, and we have parsed up keyword, and we need to
	1122	* deal with that.
	1123	*
	1124	* If we have a colon colon, which means the whitespace is
	1125	* allowed, we skip it. */
	1126	if(wsLength != -1)
	1127	m_pos += wsLength;
	1128	}
	1129
	1130	/* Handle name tests. */
	1131	if(peekCurrent() == ':')
	1132	{
	1133	switch(peekAhead())
	1134	{
	1135	case '=':
	1136	return id;
	1137	case '*':
	1138	{
	1139	m_pos += 2;
	1140	return tokenAndChangeState(ANY_LOCAL_NAME, id.value, Operator);
	1141	}
	1142	case ':':
	1143	{
	1144	/* We have an axis. */
	1145	setState(Axis);
	1146	return keyword ? Token(keyword->token) : id;
	1147	}
	1148	default:
	1149	{
	1150	/* It's a QName. */
	1151	++m_pos; /* Consume the colon. */
	1152
	1153	const Token id2(tokenizeNCName());
	1154
	1155	if(id2.type != NCNAME)
	1156	{
	1157	--m_pos;
	1158	return id;
	1159	}
	1160
	1161	setState(Operator);
	1162	const int qNameLen = id.value.length() + id2.value.length() + 1;
	1163	return Token(QNAME, m_data.mid(m_pos - qNameLen, qNameLen));
	1164	}
	1165	}
	1166	}
	1167
	1168	if(!keyword \|\| isOperatorKeyword(keyword->token))
	1169	{
	1170	setState(Operator);
	1171	return id;
	1172	}
	1173
	1174	const TokenType ws = consumeWhitespace();
	1175	if(ws == ERROR) // TODO this should test for success. Write test.
	1176	return Token(ERROR);
	1177
	1178	if(atEnd())
	1179	{
	1180	setState(Operator);
	1181	return id;
	1182	}
	1183
	1184	/* Let the if-body apply for constructors, and node type tests. */
	1185	if(isTypeToken(keyword->token) \|\|
	1186	keyword->token == TYPESWITCH \|\|
	1187	keyword->token == ORDERED \|\|
	1188	keyword->token == UNORDERED \|\|
	1189	keyword->token == IF)
	1190	{
	1191	switch(peekCurrent())
	1192	{
	1193	case '(':
	1194	{
	1195	// TODO See if we can remove DOCUMENT from isTypeToken.
	1196	if(isTypeToken(keyword->token) && keyword->token != DOCUMENT)
	1197	{
	1198	m_tokenStack.push(Token(LPAREN));
	1199	++m_pos; /* Consume '('. */
	1200	pushState(Operator);
	1201
	1202	if(keyword->token == PROCESSING_INSTRUCTION)
	1203	setState(KindTestForPI);
	1204	else
	1205	setState(KindTest);
	1206
	1207	return Token(keyword->token);
	1208	}
	1209	else if(keyword->token == TYPESWITCH \|\| keyword->token == IF)
	1210	return Token(keyword->token);
	1211	else /* It's a function call. */
	1212	return id;
	1213	}
	1214	case '{':
	1215	{
	1216	m_tokenStack.push(Token(CURLY_LBRACE));
	1217	++m_pos; /* Consume '{'. */
	1218	pushState(Operator);
	1219	/* Stay in state Default. */
	1220	return Token(keyword->token);
	1221	}
	1222	default:
	1223	{
	1224	/* We have read in a token which is for instance
	1225	* "return", and now it can be an element
	1226	* test("element") a node kind test("element()"), or a
	1227	* computed element constructor("element name {...").
	1228	* We need to do a two-token lookahead here, because
	1229	* "element return" can be an element test followed by
	1230	* the return keyword, but it can also be an element
	1231	* constructor("element return {"). */
	1232	if(isNCNameStart(current()))
	1233	{
	1234	const int currentPos = m_pos;
	1235	const Token token2 = tokenizeNCNameOrQName();
	1236
	1237	if(token2.hasError())
	1238	return token2;
	1239
	1240	handleWhitespace();
	1241
	1242	if(peekCurrent() == '{')
	1243	{
	1244	/* An element constructor. */
	1245	m_tokenStack.push(token2);
	1246	return Token(keyword->token);
	1247	}
	1248
	1249	/* We jump back in the stream, we need to tokenize token2 according
	1250	* to the state. */
	1251	m_pos = currentPos;
	1252	setState(Operator);
	1253	return Token(NCNAME, QLatin1String(keyword->name));
	1254	}
	1255	}
	1256	}
	1257	}
	1258
	1259	if(peekCurrent() == '$')
	1260	{
	1261	setState(VarName);
	1262	return Token(keyword->token);
	1263	}
	1264
	1265	/* It's not a node type, it's not the typeswitch expression, but it is a function callsite. */
	1266	if(peekCurrent() == '(')
	1267	return id;
	1268	else if(peekCurrent() == '{' && keyword->token == VALIDATE)
	1269	return Token(keyword->token);
	1270
	1271	if(!isNCNameStart(current()))
	1272	{
	1273	setState(Operator);
	1274	return id;
	1275	}
	1276
	1277	const Token id2(tokenizeNCName());
	1278	const TokenMap *const keyword2 = lookupKeyword(id2.value);
	1279
	1280	if(!keyword2)
	1281	{
	1282	/* It's a syntax error. All cases of two subsequent ncnames are keywords(e.g, declarations). */
	1283	setState(Operator);
	1284	return id;
	1285	}
	1286
	1287	switch(keyword->token)
	1288	{
	1289	case DECLARE:
	1290	{
	1291	switch(keyword2->token)
	1292	{
	1293	case VARIABLE:
	1294	/* Fallthrough. */
	1295	case FUNCTION:
	1296	{
	1297	m_tokenStack.push(Token(keyword2->token));
	1298	setState(Default);
	1299	return Token(keyword->token);
	1300	}
	1301	case OPTION:
	1302	{
	1303	m_tokenStack.push(Token(keyword2->token));
	1304	setState(Default);
	1305	return Token(keyword->token);
	1306	}
	1307	case COPY_NAMESPACES:
	1308	/* Fallthrough. */
	1309	case ORDERING:
	1310	{
	1311	m_tokenStack.push(Token(keyword2->token));
	1312	setState(NamespaceKeyword);
	1313	return Token(keyword->token);
	1314	}
	1315	case CONSTRUCTION:
	1316	{
	1317	// TODO identical to CONSTRUCTION?
	1318	m_tokenStack.push(Token(keyword2->token));
	1319	setState(Operator);
	1320	return Token(keyword->token);
	1321	}
	1322	case NAMESPACE:
	1323	/* Fallthrough. */
	1324	case BASEURI:
	1325	{
	1326	m_tokenStack.push(Token(keyword2->token));
	1327	setState(NamespaceDecl);
	1328	return Token(keyword->token);
	1329	}
	1330	case BOUNDARY_SPACE:
	1331	{
	1332	m_tokenStack.push(Token(keyword2->token));
	1333	setState(XMLSpaceDecl);
	1334	return Token(keyword->token);
	1335	}
	1336	case DEFAULT:
	1337	{
	1338	m_tokenStack.push(Token(keyword2->token));
	1339
	1340	const TokenType ws2 = consumeWhitespace();
	1341	if(ws2 != SUCCESS)
	1342	{
	1343	m_tokenStack.prepend(Token(ws2));
	1344	return Token(keyword->token);
	1345	}
	1346
	1347	const Token id3(tokenizeNCName());
	1348
	1349	if(id3.type != NCNAME)
	1350	{
	1351	m_tokenStack.prepend(id3);
	1352	return Token(keyword->token);
	1353	}
	1354
	1355	const TokenMap *const keyword3 = lookupKeyword(id3.value);
	1356	if(!keyword3)
	1357	{
	1358	m_tokenStack.prepend(id3);
	1359	return Token(keyword->token);
	1360	}
	1361	else
	1362	{
	1363	m_tokenStack.prepend(Token(keyword3->token));
	1364
	1365	if(keyword3->token == ORDER)
	1366	setState(Operator);
	1367	else
	1368	setState(NamespaceDecl);
	1369	}
	1370
	1371	return Token(keyword->token);
	1372	}
	1373	default:
	1374	{
	1375	m_tokenStack.push(Token(keyword2->token));
	1376	setState(Default);
	1377	return id;
	1378	}
	1379	}
	1380	}
	1381	case XQUERY:
	1382	{
	1383	m_tokenStack.push(Token(keyword2->token));
	1384
	1385	if(keyword2->token == VERSION)
	1386	{
	1387	setState(NamespaceDecl);
	1388	return Token(keyword->token);
	1389	}
	1390	else
	1391	{
	1392	setState(Operator);
	1393	return id;
	1394	}
	1395	}
	1396	case IMPORT:
	1397	{
	1398	m_tokenStack.push(Token(keyword2->token));
	1399
	1400	switch(keyword2->token)
	1401	{
	1402	case SCHEMA:
	1403	/* Fallthrough. */
	1404	case MODULE:
	1405	{
	1406	setState(NamespaceKeyword);
	1407	return Token(keyword->token);
	1408	}
	1409	default:
	1410	{
	1411	setState(Operator);
	1412	return id;
	1413	}
	1414	}
	1415	}
	1416	case VALIDATE:
	1417	{
	1418	m_tokenStack.push(Token(keyword2->token));
	1419
	1420	switch(keyword2->token)
	1421	{
	1422	case LAX:
	1423	case STRICT:
	1424	{
	1425	pushState(Operator);
	1426	return Token(keyword->token);
	1427	}
	1428	default:
	1429	{
	1430	setState(Operator);
	1431	return id;
	1432	}
	1433	}
	1434	}
	1435	default:
	1436	{
	1437	m_tokenStack.push(Token(keyword2->token));
	1438	setState(Operator);
	1439	return id;
	1440	}
	1441	}
	1442
	1443	Q_ASSERT(false);
	1444
	1445	}
	1446	case VarName:
	1447	{
	1448	if(peekCurrent() == '$')
	1449	return tokenAndAdvance(DOLLAR);
	1450
	1451	setState(Operator);
	1452	return tokenizeNCNameOrQName();
	1453	Q_ASSERT(false);
	1454	}
	1455	case ItemType:
	1456	{
	1457	switch(peekCurrent())
	1458	{
	1459	case '(':
	1460	return tokenAndChangeState(LPAREN, KindTest);
	1461	case '$':
	1462	return tokenAndChangeState(DOLLAR, VarName);
	1463	}
	1464
	1465	const Token name(tokenizeNCNameOrQName());
	1466
	1467	if(name.hasError())
	1468	return error();
	1469
	1470	else if(name.type == QNAME)
	1471	{
	1472	setState(OccurrenceIndicator);
	1473	return name;
	1474	}
	1475	else
	1476	{
	1477	const TokenMap *const keyword = lookupKeyword(name.value);
	1478
	1479	if(keyword)
	1480	{
	1481	pushState(OccurrenceIndicator);
	1482	return Token(keyword->token);
	1483	}
	1484	else
	1485	{
	1486	setState(Default);
	1487	return name;
	1488	}
	1489	}
	1490	Q_ASSERT(false);
	1491	}
	1492	case KindTest:
	1493	{
	1494	switch(peekCurrent())
	1495	{
	1496	case ')':
	1497	{
	1498	popState();
	1499	return tokenAndAdvance(RPAREN);
	1500	}
	1501	case '(':
	1502	return tokenAndAdvance(LPAREN);
	1503	case ',':
	1504	return tokenAndAdvance(COMMA);
	1505	case '*':
	1506	return tokenAndAdvance(STAR);
	1507	case '?':
	1508	return tokenAndAdvance(QUESTION);
	1509	case '\'':
	1510	/* Fallthrough. */
	1511	case '"':
	1512	return tokenizeStringLiteral();
	1513	}
	1514
	1515	const Token nc(tokenizeNCNameOrQName());
	1516	if(nc.hasError())
	1517	return nc;
	1518
	1519	const TokenType ws = consumeWhitespace();
	1520	if(ws == ERROR)
	1521	return error();
	1522
	1523	if(peekCurrent() == '(')
	1524	{
	1525	const TokenMap *const keyword = lookupKeyword(nc.value);
	1526	if(keyword)
	1527	{
	1528	pushState(KindTest);
	1529	return Token(keyword->token);
	1530	}
	1531	else
	1532	return nc;
	1533	}
	1534	else
	1535	return nc;
	1536	Q_ASSERT(false);
	1537	}
	1538	case KindTestForPI:
	1539	{
	1540	switch(peekCurrent())
	1541	{
	1542	case ')':
	1543	{
	1544	popState();
	1545	return tokenAndAdvance(RPAREN);
	1546	}
	1547	case '\'':
	1548	/* Fallthrough. */
	1549	case '"':
	1550	return tokenizeStringLiteral();
	1551	default:
	1552	return tokenizeNCName();
	1553	}
	1554	Q_ASSERT(false);
	1555	}
	1556	case OccurrenceIndicator:
	1557	{
	1558	switch(peekCurrent())
	1559	{
	1560	case '?':
	1561	return tokenAndChangeState(QUESTION, Operator);
	1562	case '*':
	1563	return tokenAndChangeState(STAR, Operator);
	1564	case '+':
	1565	return tokenAndChangeState(PLUS, Operator);
	1566	default:
	1567	{
	1568	setState(Operator);
	1569	return nextToken();
	1570	}
	1571	}
	1572	Q_ASSERT(false);
	1573	}
	1574	case XQueryVersion:
	1575	{
	1576	switch(peekCurrent())
	1577	{
	1578	case '\'':
	1579	/* Fallthrough. */
	1580	case '"':
	1581	return tokenizeStringLiteral();
	1582	case ';':
	1583	return tokenAndChangeState(SEMI_COLON, Default);
	1584	}
	1585
	1586	const Token id(tokenizeNCName());
	1587
	1588	if(id.type != NCNAME)
	1589	return id;
	1590
	1591	const TokenMap *const keyword = lookupKeyword(id.value);
	1592	if(keyword)
	1593	return tokenAndChangeState(keyword->token, Default);
	1594	else
	1595	return id;
	1596	Q_ASSERT(false);
	1597	}
	1598	case StartTag:
	1599	{
	1600	if(peekAhead(-1) == '<')
	1601	{
	1602	if(current().isSpace())
	1603	return Token(ERROR);
	1604	}
	1605	else
	1606	{
	1607	if(consumeRawWhitespace())
	1608	return Token(END_OF_FILE);
	1609	}
	1610
	1611	switch(peekCurrent())
	1612	{
	1613	case '/':
	1614	{
	1615	if(peekAhead() == '>')
	1616	{
	1617	m_pos += 2;
	1618
	1619	if(m_scanOnly)
	1620	return Token(POSITION_SET);
	1621	else
	1622	{
	1623	popState();
	1624	return Token(QUICK_TAG_END);
	1625	}
	1626	}
	1627	else
	1628	return error();
	1629	}
	1630	case '>':
	1631	{
	1632	if(m_scanOnly)
	1633	return tokenAndChangeState(POSITION_SET, StartTag);
	1634	else
	1635	return tokenAndChangeState(G_GT, ElementContent);
	1636	}
	1637	case '=':
	1638	return tokenAndAdvance(G_EQ);
	1639	case '\'':
	1640	return tokenAndChangeState(APOS, AposAttributeContent);
	1641	case '"':
	1642	return tokenAndChangeState(QUOTE, QuotAttributeContent);
	1643	default:
	1644	return tokenizeNCNameOrQName();
	1645	}
	1646	Q_ASSERT(false);
	1647	}
	1648	case AposAttributeContent:
	1649	/* Fallthrough. */
	1650	case QuotAttributeContent:
	1651	{
	1652	const QChar sep(state() == AposAttributeContent ? QLatin1Char('\'') : QLatin1Char('"'));
	1653	QString result;
	1654	result.reserve(20);
	1655
	1656	if(m_scanOnly)
	1657	{
	1658	int stack = 0;
	1659	return attributeAsRaw(sep, stack, m_pos, true, result);
	1660	}
	1661
	1662	Q_ASSERT(!m_scanOnly);
	1663	while(true)
	1664	{
	1665	if(atEnd())
	1666	{
	1667	/* In the case that the XSL-T tokenizer invokes us with
	1668	* default state QuotAttributeContent, we need to be able
	1669	* to return a single string, in case that is all we have
	1670	* accumulated. */
	1671	if(result.isEmpty())
	1672	return Token(END_OF_FILE);
	1673	else
	1674	return Token(STRING_LITERAL, result);
	1675	}
	1676
	1677	const QChar curr(current());
	1678
	1679	if(curr == sep)
	1680	{
	1681	if(m_pos + 1 == m_length)
	1682	return Token(END_OF_FILE);
	1683
	1684	if(m_data.at(m_pos + 1) == sep)
	1685	{
	1686	/* The quoting mechanism was used. */
	1687	m_pos += 2;
	1688	result.append(sep);
	1689	continue;
	1690	}
	1691
	1692	const QChar next(m_data.at(m_pos + 1));
	1693	if(!next.isSpace() && next != QLatin1Char('/') && next != QLatin1Char('>'))
	1694	return Token(ERROR); // i18n Space must separate attributes
	1695	else if(result.isEmpty())
	1696	{
	1697	return tokenAndChangeState(state() == AposAttributeContent ? APOS : QUOTE,
	1698	StartTag, 1);
	1699	}
	1700	else
	1701	{
	1702	/* Don't consume the sep, but leave it so we next time return a token for it. */
	1703	return Token(STRING_LITERAL, result);
	1704	}
	1705
	1706	++m_pos;
	1707	continue;
	1708	}
	1709	else if(curr == QLatin1Char('{'))
	1710	{
	1711	if(m_pos + 1 == m_length)
	1712	return Token(END_OF_FILE);
	1713	else if(peekAhead() == '{')
	1714	{
	1715	++m_pos;
	1716	result.append(QLatin1Char('{'));
	1717	}
	1718	else
	1719	{
	1720	if(result.isEmpty())
	1721	{
	1722	/* The Attribute Value Template appeared directly in the attribute. */
	1723	pushState();
	1724	return tokenAndChangeState(CURLY_LBRACE, Default);
	1725	}
	1726	else
	1727	{
	1728	/* We don't advance, keep '{' as next token. */
	1729	return Token(STRING_LITERAL, result);
	1730	}
	1731	}
	1732	}
	1733	else if(curr == QLatin1Char('}'))
	1734	{
	1735	if(m_pos + 1 == m_length)
	1736	return Token(END_OF_FILE);
	1737	else if(peekAhead() == '}')
	1738	{
	1739	++m_pos;
	1740	result.append(QLatin1Char('}'));
	1741	}
	1742	else
	1743	return Token(ERROR);
	1744	}
	1745	else if(curr == QLatin1Char('&'))
	1746	{
	1747	const QString ret(tokenizeCharacterReference());
	1748	if(ret.isNull())
	1749	return Token(ERROR);
	1750	else
	1751	result.append(ret);
	1752	}
	1753	else if(curr == QLatin1Char('<'))
	1754	return Token(STRING_LITERAL, result);
	1755	else
	1756	{
	1757	/* See Extensible Markup Language (XML) 1.0 (Fourth Edition),
	1758	* 3.3.3 Attribute-Value Normalization.
	1759	*
	1760	* However, it is complicated a bit by that AVN is defined on top of
	1761	* EOL normalization and we do those two in one go here. */
	1762	switch(curr.unicode())
	1763	{
	1764	case 0xD:
	1765	{
	1766	if(peekAhead() == '\n')
	1767	{
	1768	result.append(QLatin1Char(' '));
	1769	++m_pos;
	1770	break;
	1771	}
	1772	}
	1773	case 0xA:
	1774	/* Fallthrough. */
	1775	case 0x9:
	1776	{
	1777	result.append(QLatin1Char(' '));
	1778	break;
	1779	}
	1780	default:
	1781	result.append(curr);
	1782	}
	1783	}
	1784
	1785	++m_pos;
	1786	}
	1787	Q_ASSERT(false);
	1788	}
	1789	case ElementContent:
	1790	{
	1791	QString result;
	1792	result.reserve(20);
	1793
	1794	/* Whether the text node, result, may be whitespace only. Character references
	1795	* and CDATA sections disables that. */
	1796	bool mayBeWS = true;
	1797
	1798	CharacterSkips skipEOLNormalization;
	1799
	1800	while(true)
	1801	{
	1802	if(atEnd())
	1803	return Token(END_OF_FILE);
	1804
	1805	switch(peekCurrent())
	1806	{
	1807	case '<':
	1808	{
	1809	if(!result.isEmpty() && peekAhead(2) != '[')
	1810	{
	1811	/* We encountered the end, and it was not a CDATA section. */
	1812	/* We don't advance. Next time we'll handle the <... stuff. */
	1813	return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization));
	1814	}
	1815
	1816	++m_pos;
	1817	if(atEnd())
	1818	return Token(END_OF_FILE);
	1819
	1820	const QChar ahead(current());
	1821	if(ahead.isSpace())
	1822	return error();
	1823	else if(ahead == QLatin1Char('/'))
	1824	{
	1825	if(m_pos + 1 == m_length)
	1826	return Token(END_OF_FILE);
	1827	else if(m_data.at(m_pos + 1).isSpace())
	1828	return error();
	1829	else
	1830	return tokenAndChangeState(BEGIN_END_TAG, EndTag);
	1831	}
	1832	else if(isNCNameStart(ahead))
	1833	{
	1834	pushState();
	1835	return tokenAndChangeState(G_LT, StartTag, 0);
	1836	}
	1837	else if(aheadEquals("!--", 3, 0))
	1838	{
	1839	pushState();
	1840	m_pos += 3;
	1841	return tokenAndChangeState(COMMENT_START, XMLComment, 0);
	1842	}
	1843	else if(aheadEquals("![CDATA[", 8, 0))
	1844	{
	1845	mayBeWS = false;
	1846	m_pos += 8;
	1847	const int start = m_pos;
	1848	const int len = scanUntil("]]>");
	1849
	1850	if(len == -1)
	1851	return Token(END_OF_FILE);
	1852
	1853	m_pos += 2; /* Consume "]]>". Note that m_pos is on '!'. */
	1854	result.append(m_data.mid(start, len));
	1855	break;
	1856	}
	1857	else if(ahead == QLatin1Char('?'))
	1858	{
	1859	pushState();
	1860	return tokenAndChangeState(PI_START, ProcessingInstructionName);
	1861	}
	1862	else
	1863	return Token(G_LT);
	1864	}
	1865	case '&':
	1866	{
	1867	const QString ret(tokenizeCharacterReference());
	1868	if(ret.isNull())
	1869	return Token(ERROR);
	1870	else
	1871	{
	1872	skipEOLNormalization.insert(result.count());
	1873	result.append(ret);
	1874	mayBeWS = false;
	1875	break;
	1876	}
	1877	}
	1878	case '{':
	1879	{
	1880	// TODO remove this check, also below.
	1881	if(m_pos + 1 == m_length)
	1882	return Token(END_OF_FILE);
	1883	else if(peekAhead() == '{')
	1884	{
	1885	++m_pos;
	1886	result.append(QLatin1Char('{'));
	1887	}
	1888	else
	1889	{
	1890	if(result.isEmpty())
	1891	{
	1892	pushState();
	1893	return tokenAndChangeState(CURLY_LBRACE, Default);
	1894	}
	1895	else
	1896	{
	1897	/* We don't advance here. */
	1898	return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization));
	1899	}
	1900	}
	1901	break;
	1902	}
	1903	case '}':
	1904	{
	1905	if(m_pos + 1 == m_length)
	1906	return Token(END_OF_FILE);
	1907	else if(peekAhead() == '}')
	1908	{
	1909	++m_pos;
	1910	result.append(QLatin1Char('}'));
	1911	}
	1912	else
	1913	{
	1914	/* This is a parse error, and the grammar won't be able
	1915	* to reduce this CURLY_RBRACE. */
	1916	return tokenAndChangeState(CURLY_RBRACE, Default);
	1917	}
	1918	break;
	1919	}
	1920	case '\n':
	1921	{
	1922	/* We want to translate \r\n into \n. */
	1923	if(peekAhead(-1) == '\r')
	1924	break;
	1925	/* else, fallthrough. */
	1926	}
	1927	case '\r':
	1928	{
	1929	result.append(QLatin1Char('\n'));
	1930	break;
	1931	}
	1932	default:
	1933	{
	1934	result.append(current());
	1935	break;
	1936	}
	1937	}
	1938	++m_pos;
	1939	}
	1940	Q_ASSERT(false);
	1941	}
	1942	case ProcessingInstructionName:
	1943	{
	1944	const int start = m_pos;
	1945
	1946	while(true)
	1947	{
	1948	++m_pos;
	1949	if(m_pos >= m_length)
	1950	return Token(END_OF_FILE);
	1951
	1952	const QChar next(current());
	1953	if(next.isSpace() \|\| next == QLatin1Char('?'))
	1954	{
	1955	return tokenAndChangeState(PI_TARGET, m_data.mid(start, m_pos - start),
	1956	ProcessingInstructionContent);
	1957	}
	1958	}
	1959	Q_ASSERT(false);
	1960	}
	1961	case ProcessingInstructionContent:
	1962	{
	1963	/* Consume whitespace between the name and the content. */
	1964	if(consumeRawWhitespace())
	1965	return Token(END_OF_FILE);
	1966
	1967	const int start = m_pos;
	1968	const int len = scanUntil("?>");
	1969
	1970	if(len == -1)
	1971	return Token(END_OF_FILE);
	1972	else
	1973	{
	1974	m_pos += 2; /* Consume "?>" */
	1975	popState();
	1976	return Token(PI_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips()));
	1977	}
	1978	Q_ASSERT(false);
	1979	}
	1980	case EndTag:
	1981	{
	1982	if(consumeRawWhitespace())
	1983	return END_OF_FILE;
	1984
	1985	if(peekCurrent() == '>')
	1986	{
	1987	popState();
	1988	return tokenAndAdvance(G_GT);
	1989	}
	1990	else
	1991	return tokenizeNCNameOrQName();
	1992	Q_ASSERT(false);
	1993	}
	1994	case XMLComment:
	1995	{
	1996	const int start = m_pos;
	1997	const int len = scanUntil("--");
	1998
	1999	if(len == -1)
	2000	return END_OF_FILE;
	2001	else
	2002	{
	2003	m_pos += 2; /* Consume "--". */
	2004	popState();
	2005
	2006	if(peekCurrent() == '>')
	2007	{
	2008	++m_pos;
	2009	return Token(COMMENT_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips()));
	2010	}
	2011	else
	2012	return error();
	2013	}
	2014	Q_ASSERT(false);
	2015	}
	2016	case Pragma:
	2017	{
	2018	/* Consume whitespace. */
	2019	if(consumeRawWhitespace())
	2020	return Token(END_OF_FILE);
	2021
	2022	setState(PragmaContent);
	2023	return tokenizeNCNameOrQName();
	2024	}
	2025	case PragmaContent:
	2026	{
	2027	QString result;
	2028	result.reserve(20);
	2029
	2030	const bool hasWS = m_pos < m_length && current().isSpace();
	2031
	2032	/* Consume all whitespace up to the pragma content(if any). */
	2033	if(consumeRawWhitespace())
	2034	return Token(END_OF_FILE);
	2035
	2036	if(peekCurrent() == '#' && peekAhead() == ')')
	2037	{
	2038	/* We reached the end, and there's no pragma content. */
	2039	return tokenAndChangeState(PRAGMA_END, Default, 2);
	2040	}
	2041	else if(!hasWS)
	2042	{
	2043	/* A separating space is required if there's pragma content. */
	2044	return error(); /* i18n */
	2045	}
	2046
	2047	const int start = m_pos;
	2048	const int len = scanUntil("#)");
	2049	if(len == -1)
	2050	return Token(END_OF_FILE);
	2051
	2052	return Token(STRING_LITERAL, m_data.mid(start, len));
	2053	Q_ASSERT(false);
	2054	}
	2055	}
	2056
	2057	Q_ASSERT(false);
	2058	return error();
	2059	}
	2060
	2061	Tokenizer::Token XQueryTokenizer::attributeAsRaw(const QChar sep,
	2062	int &sepStack,
	2063	const int startPos,
	2064	const bool aInLiteral,
	2065	QString &result)
	2066	{
	2067	bool inLiteral = aInLiteral;
	2068	const char otherSep = (sep == QLatin1Char('"') ? '\'' : '"');
	2069
	2070	while(true)
	2071	{
	2072	if(atEnd())
	2073	return END_OF_FILE;
	2074
	2075	if(peekCurrent() == sep.unicode())
	2076	{
	2077	if(inLiteral)
	2078	inLiteral = false;
	2079	else
	2080	inLiteral = true;
	2081
	2082	if(peekAhead() == sep.unicode())
	2083	{
	2084	/* The quoting mechanism was used. */
	2085	result.append(current());
	2086	m_pos += 2;
	2087	continue;
	2088	}
	2089	else
	2090	{
	2091	/* Don't consume the separator, such that we
	2092	* return a token for it next time. */
	2093	if(m_pos == startPos)
	2094	{
	2095	++m_pos;
	2096	setState(StartTag);
	2097	return Token(sep == QLatin1Char('"') ? QUOTE : APOS);
	2098	}
	2099
	2100
	2101	if(sepStack == 0)
	2102	{
	2103	return Token(STRING_LITERAL, result);
	2104	}
	2105	else
	2106	{
	2107	result.append(current());
	2108	++m_pos;
	2109	continue;
	2110	}
	2111	}
	2112	}
	2113	else if(peekCurrent() == '&')
	2114	{
	2115	const QString ret(tokenizeCharacterReference());
	2116	if(ret.isNull())
	2117	return Token(ERROR);
	2118	else
	2119	{
	2120	result.append(ret);
	2121	++m_pos;
	2122	continue;
	2123	}
	2124	}
	2125	else if(peekCurrent() == otherSep)
	2126	{
	2127	result.append(current());
	2128	++m_pos;
	2129
	2130	if(peekCurrent() == otherSep)
	2131	++m_pos;
	2132
	2133	if(inLiteral)
	2134	inLiteral = false;
	2135	else
	2136	inLiteral = true;
	2137
	2138	continue;
	2139	}
	2140	else if(peekCurrent() == '{')
	2141	{
	2142	result.append(current());
	2143
	2144	if(peekAhead() == '{')
	2145	{
	2146	m_pos += 2;
	2147	continue;
	2148	}
	2149	else
	2150	{
	2151	++m_pos;
	2152	++sepStack;
	2153	const Token t(attributeAsRaw(sep, sepStack, startPos, false, result));
	2154	if(t.type != SUCCESS)
	2155	return t;
	2156	}
	2157
	2158	}
	2159	else if(peekCurrent() == '}')
	2160	{
	2161	if(inLiteral && peekAhead() == '}')
	2162	{
	2163	result.append(current());
	2164	m_pos += 2;
	2165	continue;
	2166	}
	2167	else
	2168	{
	2169	++m_pos;
	2170	--sepStack;
	2171	return Token(SUCCESS); /* The return value is arbitrary. */
	2172	}
	2173	}
	2174	else
	2175	{
	2176	result.append(current());
	2177	++m_pos;
	2178	}
	2179	}
	2180	}
	2181
	2182	Tokenizer::Token XQueryTokenizer::nextToken(YYLTYPE *const sourceLocator)
	2183	{
	2184	sourceLocator->first_line = m_line;
	2185	sourceLocator->first_column = m_pos - m_columnOffset + 1; /* Plus 1, since m_pos is 0-based. */
	2186
	2187	if(m_tokenStack.isEmpty())
	2188	return nextToken();
	2189	else
	2190	{
	2191	const Token retval(m_tokenStack.pop());
	2192
	2193	switch(retval.type)
	2194	{
	2195	case MODULE:
	2196	/* Fallthrough.*/
	2197	case SCHEMA:
	2198	/* Fallthrough.*/
	2199	case COPY_NAMESPACES:
	2200	{
	2201	setState(NamespaceKeyword);
	2202	break;
	2203	}
	2204	case VERSION:
	2205	{
	2206	setState(XQueryVersion);
	2207	break;
	2208	}
	2209	case AS:
	2210	/* Fallthrough. */
	2211	case OF:
	2212	{
	2213	setState(ItemType);
	2214	break;
	2215	}
	2216	default:
	2217	{
	2218	if(isOperatorKeyword(retval.type))
	2219	setState(Default);
	2220
	2221	break;
	2222	}
	2223	};
	2224
	2225	return retval;
	2226	}
	2227	}
	2228
	2229	int XQueryTokenizer::commenceScanOnly()
	2230	{
	2231	m_scanOnly = true;
	2232	return m_pos;
	2233	}
	2234
	2235	void XQueryTokenizer::resumeTokenizationFrom(const int pos)
	2236	{
	2237	m_scanOnly = false;
	2238	m_pos = pos;
	2239	}
	2240
	2241	void XQueryTokenizer::setParserContext(const ParserContext::Ptr &)
	2242	{
	2243	}
	2244
	2245	#undef handleWhitespace
	2246
	2247	} // namespace QPatternist
	2248
	2249	QT_END_NAMESPACE

Note: See TracBrowser for help on using the repository browser.

Download in other formats: