/****************************************************************************
**
** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
** Contact: Qt Software Information ([email protected])
**
** This file is part of the QtXmlPatterns module of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** Commercial Usage
** Licensees holding valid Qt Commercial licenses may use this file in
** accordance with the Qt Commercial License Agreement provided with the
** Software or, alternatively, in accordance with the terms contained in
** a written agreement between you and Nokia.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 2.1 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL included in the
** packaging of this file.  Please review the following information to
** ensure the GNU Lesser General Public License version 2.1 requirements
** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
**
** In addition, as a special exception, Nokia gives you certain
** additional rights. These rights are described in the Nokia Qt LGPL
** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this
** package.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU
** General Public License version 3.0 as published by the Free Software
** Foundation and appearing in the file LICENSE.GPL included in the
** packaging of this file.  Please review the following information to
** ensure the GNU General Public License version 3.0 requirements will be
** met: http://www.gnu.org/copyleft/gpl.html.
**
** If you are unsure which license is appropriate for your use, please
** contact the sales department at [email protected].
** $QT_END_LICENSE$
**
****************************************************************************/

//
//  W A R N I N G
//  -------------
//
// This file is not part of the Qt API.  It exists purely as an
// implementation detail.  This header file may change from version to
// version without notice, or even be removed.
//
// We mean it.
51 | #ifndef Patternist_XQueryTokenizer_H
|
---|
52 | #define Patternist_XQueryTokenizer_H
|
---|
53 |
|
---|
54 | #include <QHash>
|
---|
55 | #include <QSet>
|
---|
56 | #include <QStack>
|
---|
57 | #include <QString>
|
---|
58 | #include <QUrl>
|
---|
59 |
|
---|
60 | #include "qtokenizer_p.h"
|
---|
61 |
|
---|
62 | QT_BEGIN_HEADER
|
---|
63 |
|
---|
64 | QT_BEGIN_NAMESPACE
|
---|
65 |
|
---|
66 | namespace QPatternist
|
---|
67 | {
|
---|
68 | struct TokenMap;
|
---|
69 |
|
---|
70 | /**
|
---|
71 | * @short A hand-written tokenizer which tokenizes XQuery 1.0 & XPath 2.0,
|
---|
72 | * and delivers tokens to the Bison generated parser.
|
---|
73 | *
|
---|
74 | * @author Frans Englich <[email protected]>
|
---|
75 | */
|
---|
76 | class XQueryTokenizer : public Tokenizer
|
---|
77 | {
|
---|
78 | public:
|
---|
79 | /**
|
---|
80 | * Tokenizer states. Organized alphabetically.
|
---|
81 | */
|
---|
82 | enum State
|
---|
83 | {
|
---|
84 | AfterAxisSeparator,
|
---|
85 | AposAttributeContent,
|
---|
86 | Axis,
|
---|
87 | Default,
|
---|
88 | ElementContent,
|
---|
89 | EndTag,
|
---|
90 | ItemType,
|
---|
91 | KindTest,
|
---|
92 | KindTestForPI,
|
---|
93 | NamespaceDecl,
|
---|
94 | NamespaceKeyword,
|
---|
95 | OccurrenceIndicator,
|
---|
96 | Operator,
|
---|
97 | Pragma,
|
---|
98 | PragmaContent,
|
---|
99 | ProcessingInstructionContent,
|
---|
100 | ProcessingInstructionName,
|
---|
101 | QuotAttributeContent,
|
---|
102 | StartTag,
|
---|
103 | VarName,
|
---|
104 | XMLComment,
|
---|
105 | XMLSpaceDecl,
|
---|
106 | XQueryVersion
|
---|
107 | };
|
---|
108 |
|
---|
109 | XQueryTokenizer(const QString &query,
|
---|
110 | const QUrl &location,
|
---|
111 | const State startingState = Default);
|
---|
112 |
|
---|
113 | virtual Token nextToken(YYLTYPE *const sourceLocator);
|
---|
114 | virtual int commenceScanOnly();
|
---|
115 | virtual void resumeTokenizationFrom(const int position);
|
---|
116 |
|
---|
117 | /**
|
---|
118 | * Does nothing.
|
---|
119 | */
|
---|
120 | virtual void setParserContext(const ParserContext::Ptr &parseInfo);
|
---|
121 |
|
---|
122 | private:
|
---|
123 |
|
---|
124 | /**
|
---|
125 | * Returns the character corresponding to the builtin reference @p
|
---|
126 | * reference. For instance, passing @c gt will give you '>' in return.
|
---|
127 | *
|
---|
128 | * If @p reference is an invalid character reference, a null QChar is
|
---|
129 | * returned.
|
---|
130 | *
|
---|
131 | * @see QChar::isNull()
|
---|
132 | */
|
---|
133 | QChar charForReference(const QString &reference);
|
---|
134 |
|
---|
135 | inline Token tokenAndChangeState(const TokenType code,
|
---|
136 | const State state,
|
---|
137 | const int advance = 1);
|
---|
138 | inline Token tokenAndChangeState(const TokenType code,
|
---|
139 | const QString &value,
|
---|
140 | const State state);
|
---|
141 | inline Token tokenAndAdvance(const TokenType code,
|
---|
142 | const int advance = 1);
|
---|
143 | QString tokenizeCharacterReference();
|
---|
144 |
|
---|
145 | inline Token tokenizeStringLiteral();
|
---|
146 | inline Token tokenizeNumberLiteral();
|
---|
147 |
|
---|
148 | /**
|
---|
149 | * @returns the character @p length characters from the current
|
---|
150 | * position.
|
---|
151 | */
|
---|
152 | inline char peekAhead(const int length = 1) const;
|
---|
153 |
|
---|
154 | /**
|
---|
155 | * @returns whether the stream, starting from @p offset from the
|
---|
156 | * current position, matches @p chs. The length of @p chs is @p len.
|
---|
157 | */
|
---|
158 | inline bool aheadEquals(const char *const chs,
|
---|
159 | const int len,
|
---|
160 | const int offset = 1) const;
|
---|
161 |
|
---|
162 | inline Token tokenizeNCName();
|
---|
163 | static inline bool isOperatorKeyword(const TokenType);
|
---|
164 |
|
---|
165 | static inline bool isDigit(const char ch);
|
---|
166 | static inline Token error();
|
---|
167 | inline TokenType consumeWhitespace();
|
---|
168 |
|
---|
169 | /**
|
---|
170 | * @short Returns the character at the current position, converted to
|
---|
171 | * @c ASCII.
|
---|
172 | *
|
---|
173 | * Equivalent to calling:
|
---|
174 | *
|
---|
175 | * @code
|
---|
176 | * current().toAscii();
|
---|
177 | * @endcode
|
---|
178 | */
|
---|
179 | inline char peekCurrent() const;
|
---|
180 |
|
---|
181 | /**
|
---|
182 | * Disregarding encoding conversion, equivalent to calling:
|
---|
183 | *
|
---|
184 | * @code
|
---|
185 | * peekAhead(0);
|
---|
186 | * @endcode
|
---|
187 | */
|
---|
188 | inline const QChar current() const;
|
---|
189 |
|
---|
190 | /**
|
---|
191 | * @p hadWhitespace is always set to a proper value.
|
---|
192 | *
|
---|
193 | * @returns the length of whitespace scanned before reaching "::", or
|
---|
194 | * -1 if something else was found.
|
---|
195 | */
|
---|
196 | int peekForColonColon() const;
|
---|
197 |
|
---|
198 | static inline bool isNCNameStart(const QChar ch);
|
---|
199 | static inline bool isNCNameBody(const QChar ch);
|
---|
200 | static inline const TokenMap *lookupKeyword(const QString &keyword);
|
---|
201 | inline void popState();
|
---|
202 | inline void pushState(const State state);
|
---|
203 | inline State state() const;
|
---|
204 | inline void setState(const State s);
|
---|
205 | static bool isTypeToken(const TokenType t);
|
---|
206 |
|
---|
207 | inline Token tokenizeNCNameOrQName();
|
---|
208 | /**
|
---|
209 | * Advances m_pos until content is encountered.
|
---|
210 | *
|
---|
211 | * Returned is the length stretching from m_pos when starting, until
|
---|
212 | * @p content is encountered. @p content is not included in the length.
|
---|
213 | */
|
---|
214 | int scanUntil(const char *const content);
|
---|
215 |
|
---|
216 | /**
|
---|
217 | * Same as calling:
|
---|
218 | * @code
|
---|
219 | * pushState(currentState());
|
---|
220 | * @endcode
|
---|
221 | */
|
---|
222 | inline void pushState();
|
---|
223 |
|
---|
224 | /**
|
---|
225 | * Consumes only whitespace, in the traditional sense. The function exits
|
---|
226 | * if non-whitespace is encountered, such as the start of a comment.
|
---|
227 | *
|
---|
228 | * @returns @c true if the end was reached, otherwise @c false
|
---|
229 | */
|
---|
230 | inline bool consumeRawWhitespace();
|
---|
231 |
|
---|
232 | /**
|
---|
233 | * @short Parses comments: <tt>(: comment content :)</tt>. It recurses for
|
---|
234 | * parsing nested comments.
|
---|
235 | *
|
---|
236 | * It is assumed that the start token for the comment, "(:", has
|
---|
237 | * already been parsed.
|
---|
238 | *
|
---|
239 | * Typically, don't call this function, but ignoreWhitespace().
|
---|
240 | *
|
---|
241 | * @see <a href="http://www.w3.org/TR/xpath20/#comments">XML Path Language (XPath)
|
---|
242 | * 2.0, 2.6 Comments</a>
|
---|
243 | * @returns
|
---|
244 | * - SUCCESS if everything went ok
|
---|
245 | * - ERROR if there was an error in parsing one or more comments
|
---|
246 | * - END_OF_FILE if the end was reached
|
---|
247 | */
|
---|
248 | Tokenizer::TokenType consumeComment();
|
---|
249 |
|
---|
250 | /**
|
---|
251 | * Determines whether @p code is a keyword
|
---|
252 | * that is followed by a second keyword. For instance <tt>declare
|
---|
253 | * function</tt>.
|
---|
254 | */
|
---|
255 | static inline bool isPhraseKeyword(const TokenType code);
|
---|
256 |
|
---|
257 | /**
|
---|
258 | * A set of indexes into a QString, the one being passed to
|
---|
259 | * normalizeEOL() whose characters shouldn't be normalized. */
|
---|
260 | typedef QSet<int> CharacterSkips;
|
---|
261 |
|
---|
262 | /**
|
---|
263 | * Returns @p input, normalized according to
|
---|
264 | * <a href="http://www.w3.org/TR/xquery/#id-eol-handling">XQuery 1.0:
|
---|
265 | * An XML Query Language, A.2.3 End-of-Line Handling</a>
|
---|
266 | */
|
---|
267 | static QString normalizeEOL(const QString &input,
|
---|
268 | const CharacterSkips &characterSkips);
|
---|
269 |
|
---|
270 | inline bool atEnd() const
|
---|
271 | {
|
---|
272 | return m_pos == m_length;
|
---|
273 | }
|
---|
274 |
|
---|
275 | Token nextToken();
|
---|
276 | /**
|
---|
277 | * Instead of recognizing and tokenizing embedded expressions in
|
---|
278 | * direct attriute constructors, this function is essentially a mini
|
---|
279 | * recursive-descent parser that has the necessary logic to recognize
|
---|
280 | * embedded expressions and their potentially interfering string literals, in
|
---|
281 | * order to scan to the very end of the attribute value, and return the
|
---|
282 | * whole as a string.
|
---|
283 | *
|
---|
284 | * There is of course syntax errors this function will not detect, but
|
---|
285 | * that is ok since the attributes will be parsed once more.
|
---|
286 | *
|
---|
287 | * An inelegant solution, but which gets the job done.
|
---|
288 | *
|
---|
289 | * @see commenceScanOnly(), resumeTokenizationFrom()
|
---|
290 | */
|
---|
291 | Token attributeAsRaw(const QChar separator,
|
---|
292 | int &stack,
|
---|
293 | const int startPos,
|
---|
294 | const bool inLiteral,
|
---|
295 | QString &result);
|
---|
296 |
|
---|
297 | const QString m_data;
|
---|
298 | const int m_length;
|
---|
299 | State m_state;
|
---|
300 | QStack<State> m_stateStack;
|
---|
301 | int m_pos;
|
---|
302 |
|
---|
303 | /**
|
---|
304 | * The current line number.
|
---|
305 | *
|
---|
306 | * The line number and column number both starts at 1.
|
---|
307 | */
|
---|
308 | int m_line;
|
---|
309 |
|
---|
310 | /**
|
---|
311 | * The offset into m_length for where
|
---|
312 | * the current column starts. So m_length - m_columnOffset
|
---|
313 | * is the current column.
|
---|
314 | *
|
---|
315 | * The line number and column number both starts at 1.
|
---|
316 | */
|
---|
317 | int m_columnOffset;
|
---|
318 |
|
---|
319 | const NamePool::Ptr m_namePool;
|
---|
320 | QStack<Token> m_tokenStack;
|
---|
321 | QHash<QString, QChar> m_charRefs;
|
---|
322 | bool m_scanOnly;
|
---|
323 |
|
---|
324 | Q_DISABLE_COPY(XQueryTokenizer)
|
---|
325 | };
|
---|
326 | }
|
---|
327 |
|
---|
328 | QT_END_NAMESPACE
|
---|
329 |
|
---|
330 | QT_END_HEADER
|
---|
331 |
|
---|
332 | #endif
|
---|