source: trunk/tools/porting/src/tokenizer.cpp@ 348

Last change on this file since 348 was 2, checked in by Dmitry A. Kuminov, 16 years ago

Initially imported qt-all-opensource-src-4.5.1 from Trolltech.

File size: 11.2 KB
Line 
1/****************************************************************************
2**
3** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
4** Contact: Qt Software Information ([email protected])
5** Copyright (C) 2001-2004 Roberto Raggi
6**
7** This file is part of the qt3to4 porting application of the Qt Toolkit.
8**
9** $QT_BEGIN_LICENSE:LGPL$
10** Commercial Usage
11** Licensees holding valid Qt Commercial licenses may use this file in
12** accordance with the Qt Commercial License Agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and Nokia.
15**
16** GNU Lesser General Public License Usage
17** Alternatively, this file may be used under the terms of the GNU Lesser
18** General Public License version 2.1 as published by the Free Software
19** Foundation and appearing in the file LICENSE.LGPL included in the
20** packaging of this file. Please review the following information to
21** ensure the GNU Lesser General Public License version 2.1 requirements
22** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
23**
24** In addition, as a special exception, Nokia gives you certain
25** additional rights. These rights are described in the Nokia Qt LGPL
26** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this
27** package.
28**
29** GNU General Public License Usage
30** Alternatively, this file may be used under the terms of the GNU
31** General Public License version 3.0 as published by the Free Software
32** Foundation and appearing in the file LICENSE.GPL included in the
33** packaging of this file. Please review the following information to
34** ensure the GNU General Public License version 3.0 requirements will be
35** met: http://www.gnu.org/copyleft/gpl.html.
36**
37** If you are unsure which license is appropriate for your use, please
38** contact the sales department at [email protected].
39** $QT_END_LICENSE$
40**
41****************************************************************************/
42
43#include "tokenizer.h"
44#include "tokens.h"
45#include <QDateTime>
46#include <QHash>
47#include <ctype.h>
48
49QT_BEGIN_NAMESPACE
50
51using TokenEngine::Token;
52
53static QHash<QByteArray, bool> preprocessed;
54bool Tokenizer::s_initialized = false;
55Tokenizer::scan_fun_ptr Tokenizer::s_scan_table[128 + 1];
56int Tokenizer::s_attr_table[256];
57
58Tokenizer::Tokenizer()
59 : m_buffer(0), m_ptr(0)
60{
61 if (!s_initialized)
62 setupScanTable();
63}
64
65Tokenizer::~Tokenizer()
66{
67}
68
// Character class bit flags stored in Tokenizer::s_attr_table.
enum
{
    A_Alpha      = 1 << 0,              // letter or underscore
    A_Digit      = 1 << 1,              // decimal digit
    A_Whitespace = 1 << 2,              // white space other than '\r' and '\n'
    A_Alphanum   = A_Alpha | A_Digit    // mask matching either of the first two
};
76
77void Tokenizer::setupScanTable()
78{
79 s_initialized = true;
80
81 memset(s_attr_table, 0, 256);
82
83 for (int i=0; i<128; ++i) {
84 switch (i) {
85 case ':':
86 case '*':
87 case '%':
88 case '^':
89 case '=':
90 case '!':
91 case '&':
92 case '|':
93 case '+':
94 case '<':
95 case '>':
96 case '-':
97 case '.':
98 s_scan_table[i] = &Tokenizer::scanOperator;
99 break;
100
101 case '\r':
102 case '\n':
103 s_scan_table[i] = &Tokenizer::scanNewline;
104 break;
105
106 case '#':
107 s_scan_table[i] = &Tokenizer::scanPreprocessor;
108 break;
109
110 case '/':
111 s_scan_table[i] = &Tokenizer::scanComment;
112 break;
113
114 case '\'':
115 s_scan_table[i] = &Tokenizer::scanCharLiteral;
116 break;
117
118 case '"':
119 s_scan_table[i] = &Tokenizer::scanStringLiteral;
120 break;
121
122 default:
123 if (isspace(i)) {
124 s_scan_table[i] = &Tokenizer::scanWhiteSpaces;
125 s_attr_table[i] |= A_Whitespace;
126 } else if (isalpha(i) || i == '_') {
127 s_scan_table[i] = &Tokenizer::scanIdentifier;
128 s_attr_table[i] |= A_Alpha;
129 } else if (isdigit(i)) {
130 s_scan_table[i] = &Tokenizer::scanNumberLiteral;
131 s_attr_table[i] |= A_Digit;
132 } else
133 s_scan_table[i] = &Tokenizer::scanChar;
134 }
135 }
136
137 s_scan_table[128] = &Tokenizer::scanUnicodeChar;
138}
139
140QVector<TokenEngine::Token> Tokenizer::tokenize(QByteArray text)
141{
142 m_tokens.clear();
143
144 m_buffer = text;
145 m_ptr = 0;
146
147 // tokenize
148 for (;;) {
149 Token tk;
150 bool endOfFile = nextToken(tk);
151 if (endOfFile) {
152 break;
153 }
154 m_tokens.append(tk);
155 }
156
157 return m_tokens;
158}
159
160bool Tokenizer::nextToken(Token &tok)
161{
162 int start = m_ptr;
163 unsigned char ch = (unsigned char)m_buffer[m_ptr];
164
165 int kind = 0;
166 (this->*s_scan_table[ch < 128 ? ch : 128])(&kind);
167
168 tok.start = start;
169 tok.length = m_ptr - start;
170
171 return (kind == 0);
172}
173
174void Tokenizer::scanChar(int *kind)
175{
176 *kind = m_buffer[m_ptr++];
177}
178
179void Tokenizer::scanWhiteSpaces(int *kind)
180{
181 *kind = Token_whitespaces;
182 while (unsigned char ch = m_buffer[m_ptr]) {
183 if (s_attr_table[ch] & A_Whitespace)
184 ++m_ptr;
185 else
186 break;
187 }
188}
189
190void Tokenizer::scanNewline(int *kind)
191{
192 Q_UNUSED(kind);
193 const unsigned char ch = m_buffer[m_ptr++];
194 // Check for \n.
195 if (ch == '\n') {
196 *kind = '\n';
197 return;
198 }
199
200 // Check for \r\n.
201 if (ch == '\r' && m_buffer[m_ptr] == '\n') {
202 *kind = '\n';
203 ++ m_ptr;
204 return;
205 }
206
207 *kind = ch;
208}
209
210void Tokenizer::scanUnicodeChar(int *kind)
211{
212 *kind = m_buffer[m_ptr++];
213}
214
215void Tokenizer::scanCharLiteral(int *kind)
216{
217 ++m_ptr;
218 for (;;) {
219 unsigned char ch = m_buffer[m_ptr];
220 switch (ch) {
221 case '\0':
222 case '\n':
223 // ### error
224 *kind = Token_char_literal;
225 return;
226 case '\\':
227 if (m_buffer[m_ptr+1] == '\'' || m_buffer[m_ptr+1] == '\\')
228 m_ptr += 2;
229 else
230 ++m_ptr;
231 break;
232 case '\'':
233 ++m_ptr;
234 *kind = Token_char_literal;
235 return;
236 default:
237 ++m_ptr;
238 break;
239 }
240 }
241
242 // ### error
243 *kind = Token_char_literal;
244}
245
246void Tokenizer::scanStringLiteral(int *kind)
247{
248 ++m_ptr;
249 while (m_buffer[m_ptr]) {
250 switch (m_buffer[m_ptr]) {
251 case '\n':
252 // ### error
253 *kind = Token_string_literal;
254 return;
255 case '\\':
256 if (m_buffer[m_ptr+1] == '"' || m_buffer[m_ptr+1] == '\\')
257 m_ptr += 2;
258 else
259 ++m_ptr;
260 break;
261 case '"':
262 ++m_ptr;
263 *kind = Token_string_literal;
264 return;
265 default:
266 ++m_ptr;
267 break;
268 }
269 }
270
271 // ### error
272 *kind = Token_string_literal;
273}
274
275void Tokenizer::scanIdentifier(int *kind)
276{
277 unsigned char ch;
278 for (;;) {
279 ch = m_buffer[m_ptr];
280 if (s_attr_table[ch] & A_Alphanum)
281 ++m_ptr;
282 else
283 break;
284 }
285 *kind = Token_identifier;
286}
287
288void Tokenizer::scanNumberLiteral(int *kind)
289{
290 unsigned char ch;
291 for (;;) {
292 ch = m_buffer[m_ptr];
293 if (s_attr_table[ch] & A_Alphanum || ch == '.')
294 ++m_ptr;
295 else
296 break;
297 }
298
299 // ### finish to implement me!!
300 *kind = Token_number_literal;
301}
302
303void Tokenizer::scanComment(int *kind)
304{
305 if (!(m_buffer[m_ptr+1] == '/' || m_buffer[m_ptr+1] == '*')) {
306 scanOperator(kind);
307 return;
308 }
309
310 ++m_ptr; // skip '/'
311
312 bool multiLineComment = m_buffer[m_ptr++] == '*';
313
314 while (m_buffer[m_ptr]) {
315 switch (m_buffer[m_ptr]) {
316 case '\r':
317 case '\n':
318 if (!multiLineComment) {
319 *kind = Token_comment;
320 return;
321 }
322
323 (void) scanNewline(kind);
324 break;
325
326 case '*':
327 if (multiLineComment && m_buffer[m_ptr+1] == '/') {
328 m_ptr += 2;
329 *kind = Token_comment;
330 return;
331 }
332 ++m_ptr;
333 break;
334
335 default:
336 ++m_ptr;
337 }
338 }
339
340 // ### error
341 *kind = Token_comment;
342}
343
344
345void Tokenizer::scanPreprocessor(int *kind)
346{
347 ++m_ptr;
348 *kind = Token_preproc;
349}
350
351
352void Tokenizer::scanOperator(int *kind)
353{
354 switch (m_buffer[m_ptr]) {
355 case ':':
356 if (m_buffer[m_ptr+1] == ':') {
357 m_ptr += 2;
358 *kind = Token_scope;
359 return;
360 }
361 break;
362
363 case '*':
364 case '/':
365 case '%':
366 case '^':
367 if (m_buffer[m_ptr+1] == '=') {
368 m_ptr += 2;
369 *kind = Token_assign;
370 return;
371 }
372 break;
373
374 case '=':
375 case '!':
376 if (m_buffer[m_ptr+1] == '=') {
377 m_ptr += 2;
378 *kind = Token_eq;
379 return;
380 }
381 break;
382
383 case '&':
384 if (m_buffer[m_ptr+1] == '&') {
385 m_ptr += 2;
386 *kind = Token_and;
387 return;
388 } else if (m_buffer[m_ptr+1] == '=') {
389 m_ptr += 2;
390 *kind = Token_assign;
391 return;
392 }
393 break;
394
395 case '|':
396 if (m_buffer[m_ptr+1] == '|' ) {
397 m_ptr += 2;
398 *kind = Token_or;
399 return;
400 } else if (m_buffer[m_ptr+1] == '=') {
401 m_ptr += 2;
402 *kind = Token_assign;
403 return;
404 }
405 break;
406
407 case '+':
408 if (m_buffer[m_ptr+1] == '+' ) {
409 m_ptr += 2;
410 *kind = Token_incr;
411 return;
412 } else if (m_buffer[m_ptr+1] == '=') {
413 m_ptr += 2;
414 *kind = Token_assign;
415 return;
416 }
417 break;
418
419 case '<':
420 if (m_buffer[m_ptr+1] == '<') {
421 if (m_buffer[m_ptr+2] == '=') {
422 m_ptr += 3;
423 *kind = Token_assign;
424 return;
425 }
426 m_ptr += 2;
427 *kind = Token_shift;
428 return;
429 } else if (m_buffer[m_ptr+1] == '=') {
430 m_ptr += 2;
431 *kind = Token_leq;
432 return;
433 }
434 break;
435
436 case '>':
437 if (m_buffer[m_ptr+1] == '>') {
438 if (m_buffer[m_ptr+2] == '=') {
439 m_ptr += 3;
440 *kind = Token_assign;
441 return;
442 }
443 m_ptr += 2;
444 *kind = Token_shift;
445 return;
446 } else if (m_buffer[m_ptr+1] == '=') {
447 m_ptr += 2;
448 *kind = Token_geq;
449 return;
450 }
451 break;
452
453 case '-':
454 if (m_buffer[m_ptr+1] == '>') {
455 if (m_buffer[m_ptr+2] == '*') {
456 m_ptr += 3;
457 *kind = Token_ptrmem;
458 return;
459 }
460 m_ptr += 2;
461 *kind = Token_arrow;
462 return;
463 } else if (m_buffer[m_ptr+1] == '-') {
464 m_ptr += 2;
465 *kind = Token_decr;
466 return;
467 } else if (m_buffer[m_ptr+1] == '=') {
468 m_ptr += 2;
469 *kind = Token_assign;
470 return;
471 }
472 break;
473
474 case '.':
475 if (m_buffer[m_ptr+1] == '.' && m_buffer[m_ptr+2] == '.') {
476 m_ptr += 3;
477 *kind = Token_ellipsis;
478 return;
479 } else if (m_buffer[m_ptr+1] == '*') {
480 m_ptr += 2;
481 *kind = Token_ptrmem;
482 return;
483 }
484 break;
485
486 }
487
488 *kind = m_buffer[m_ptr++];
489}
490
491QT_END_NAMESPACE
Note: See TracBrowser for help on using the repository browser.