/****************************************************************************
**
** Copyright (C) 2001-2004 Roberto Raggi
** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
** All rights reserved.
** Contact: Nokia Corporation (qt-info@nokia.com)
**
** This file is part of the qt3to4 porting application of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** Commercial Usage
** Licensees holding valid Qt Commercial licenses may use this file in
** accordance with the Qt Commercial License Agreement provided with the
** Software or, alternatively, in accordance with the terms contained in
** a written agreement between you and Nokia.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 2.1 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL included in the
** packaging of this file.  Please review the following information to
** ensure the GNU Lesser General Public License version 2.1 requirements
** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
**
** In addition, as a special exception, Nokia gives you certain additional
** rights.  These rights are described in the Nokia Qt LGPL Exception
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU
** General Public License version 3.0 as published by the Free Software
** Foundation and appearing in the file LICENSE.GPL included in the
** packaging of this file.  Please review the following information to
** ensure the GNU General Public License version 3.0 requirements will be
** met: http://www.gnu.org/copyleft/gpl.html.
**
** If you have questions regarding the use of this file, please contact
** Nokia at qt-info@nokia.com.
** $QT_END_LICENSE$
**
****************************************************************************/
| 42 |
|
---|
| 43 | #include "tokenizer.h"
|
---|
| 44 | #include "tokens.h"
|
---|
| 45 | #include <QDateTime>
|
---|
| 46 | #include <QHash>
|
---|
| 47 | #include <ctype.h>
|
---|
| 48 |
|
---|
| 49 | QT_BEGIN_NAMESPACE
|
---|
| 50 |
|
---|
| 51 | using TokenEngine::Token;
|
---|
| 52 |
|
---|
| 53 | static QHash<QByteArray, bool> preprocessed;
|
---|
| 54 | bool Tokenizer::s_initialized = false;
|
---|
| 55 | Tokenizer::scan_fun_ptr Tokenizer::s_scan_table[128 + 1];
|
---|
| 56 | int Tokenizer::s_attr_table[256];
|
---|
| 57 |
|
---|
| 58 | Tokenizer::Tokenizer()
|
---|
| 59 | : m_buffer(0), m_ptr(0)
|
---|
| 60 | {
|
---|
| 61 | if (!s_initialized)
|
---|
| 62 | setupScanTable();
|
---|
| 63 | }
|
---|
| 64 |
|
---|
| 65 | Tokenizer::~Tokenizer()
|
---|
| 66 | {
|
---|
| 67 | }
|
---|
| 68 |
|
---|
| 69 | enum
|
---|
| 70 | {
|
---|
| 71 | A_Alpha = 0x01,
|
---|
| 72 | A_Digit = 0x02,
|
---|
| 73 | A_Alphanum = A_Alpha | A_Digit,
|
---|
| 74 | A_Whitespace = 0x04
|
---|
| 75 | };
|
---|
| 76 |
|
---|
| 77 | void Tokenizer::setupScanTable()
|
---|
| 78 | {
|
---|
| 79 | s_initialized = true;
|
---|
| 80 |
|
---|
| 81 | memset(s_attr_table, 0, 256);
|
---|
| 82 |
|
---|
| 83 | for (int i=0; i<128; ++i) {
|
---|
| 84 | switch (i) {
|
---|
| 85 | case ':':
|
---|
| 86 | case '*':
|
---|
| 87 | case '%':
|
---|
| 88 | case '^':
|
---|
| 89 | case '=':
|
---|
| 90 | case '!':
|
---|
| 91 | case '&':
|
---|
| 92 | case '|':
|
---|
| 93 | case '+':
|
---|
| 94 | case '<':
|
---|
| 95 | case '>':
|
---|
| 96 | case '-':
|
---|
| 97 | case '.':
|
---|
| 98 | s_scan_table[i] = &Tokenizer::scanOperator;
|
---|
| 99 | break;
|
---|
| 100 |
|
---|
| 101 | case '\r':
|
---|
| 102 | case '\n':
|
---|
| 103 | s_scan_table[i] = &Tokenizer::scanNewline;
|
---|
| 104 | break;
|
---|
| 105 |
|
---|
| 106 | case '#':
|
---|
| 107 | s_scan_table[i] = &Tokenizer::scanPreprocessor;
|
---|
| 108 | break;
|
---|
| 109 |
|
---|
| 110 | case '/':
|
---|
| 111 | s_scan_table[i] = &Tokenizer::scanComment;
|
---|
| 112 | break;
|
---|
| 113 |
|
---|
| 114 | case '\'':
|
---|
| 115 | s_scan_table[i] = &Tokenizer::scanCharLiteral;
|
---|
| 116 | break;
|
---|
| 117 |
|
---|
| 118 | case '"':
|
---|
| 119 | s_scan_table[i] = &Tokenizer::scanStringLiteral;
|
---|
| 120 | break;
|
---|
| 121 |
|
---|
| 122 | default:
|
---|
| 123 | if (isspace(i)) {
|
---|
| 124 | s_scan_table[i] = &Tokenizer::scanWhiteSpaces;
|
---|
| 125 | s_attr_table[i] |= A_Whitespace;
|
---|
| 126 | } else if (isalpha(i) || i == '_') {
|
---|
| 127 | s_scan_table[i] = &Tokenizer::scanIdentifier;
|
---|
| 128 | s_attr_table[i] |= A_Alpha;
|
---|
| 129 | } else if (isdigit(i)) {
|
---|
| 130 | s_scan_table[i] = &Tokenizer::scanNumberLiteral;
|
---|
| 131 | s_attr_table[i] |= A_Digit;
|
---|
| 132 | } else
|
---|
| 133 | s_scan_table[i] = &Tokenizer::scanChar;
|
---|
| 134 | }
|
---|
| 135 | }
|
---|
| 136 |
|
---|
| 137 | s_scan_table[128] = &Tokenizer::scanUnicodeChar;
|
---|
| 138 | }
|
---|
| 139 |
|
---|
| 140 | QVector<TokenEngine::Token> Tokenizer::tokenize(QByteArray text)
|
---|
| 141 | {
|
---|
| 142 | m_tokens.clear();
|
---|
| 143 |
|
---|
| 144 | m_buffer = text;
|
---|
| 145 | m_ptr = 0;
|
---|
| 146 |
|
---|
| 147 | // tokenize
|
---|
| 148 | for (;;) {
|
---|
| 149 | Token tk;
|
---|
| 150 | bool endOfFile = nextToken(tk);
|
---|
| 151 | if (endOfFile) {
|
---|
| 152 | break;
|
---|
| 153 | }
|
---|
| 154 | m_tokens.append(tk);
|
---|
| 155 | }
|
---|
| 156 |
|
---|
| 157 | return m_tokens;
|
---|
| 158 | }
|
---|
| 159 |
|
---|
| 160 | bool Tokenizer::nextToken(Token &tok)
|
---|
| 161 | {
|
---|
| 162 | int start = m_ptr;
|
---|
| 163 | unsigned char ch = (unsigned char)m_buffer[m_ptr];
|
---|
| 164 |
|
---|
| 165 | int kind = 0;
|
---|
| 166 | (this->*s_scan_table[ch < 128 ? ch : 128])(&kind);
|
---|
| 167 |
|
---|
| 168 | tok.start = start;
|
---|
| 169 | tok.length = m_ptr - start;
|
---|
| 170 |
|
---|
| 171 | return (kind == 0);
|
---|
| 172 | }
|
---|
| 173 |
|
---|
| 174 | void Tokenizer::scanChar(int *kind)
|
---|
| 175 | {
|
---|
| 176 | *kind = m_buffer[m_ptr++];
|
---|
| 177 | }
|
---|
| 178 |
|
---|
| 179 | void Tokenizer::scanWhiteSpaces(int *kind)
|
---|
| 180 | {
|
---|
| 181 | *kind = Token_whitespaces;
|
---|
| 182 | while (unsigned char ch = m_buffer[m_ptr]) {
|
---|
| 183 | if (s_attr_table[ch] & A_Whitespace)
|
---|
| 184 | ++m_ptr;
|
---|
| 185 | else
|
---|
| 186 | break;
|
---|
| 187 | }
|
---|
| 188 | }
|
---|
| 189 |
|
---|
| 190 | void Tokenizer::scanNewline(int *kind)
|
---|
| 191 | {
|
---|
| 192 | Q_UNUSED(kind);
|
---|
| 193 | const unsigned char ch = m_buffer[m_ptr++];
|
---|
| 194 | // Check for \n.
|
---|
| 195 | if (ch == '\n') {
|
---|
| 196 | *kind = '\n';
|
---|
| 197 | return;
|
---|
| 198 | }
|
---|
| 199 |
|
---|
| 200 | // Check for \r\n.
|
---|
| 201 | if (ch == '\r' && m_buffer[m_ptr] == '\n') {
|
---|
| 202 | *kind = '\n';
|
---|
| 203 | ++ m_ptr;
|
---|
| 204 | return;
|
---|
| 205 | }
|
---|
| 206 |
|
---|
| 207 | *kind = ch;
|
---|
| 208 | }
|
---|
| 209 |
|
---|
| 210 | void Tokenizer::scanUnicodeChar(int *kind)
|
---|
| 211 | {
|
---|
| 212 | *kind = m_buffer[m_ptr++];
|
---|
| 213 | }
|
---|
| 214 |
|
---|
| 215 | void Tokenizer::scanCharLiteral(int *kind)
|
---|
| 216 | {
|
---|
| 217 | ++m_ptr;
|
---|
| 218 | for (;;) {
|
---|
| 219 | unsigned char ch = m_buffer[m_ptr];
|
---|
| 220 | switch (ch) {
|
---|
| 221 | case '\0':
|
---|
| 222 | case '\n':
|
---|
| 223 | // ### error
|
---|
| 224 | *kind = Token_char_literal;
|
---|
| 225 | return;
|
---|
| 226 | case '\\':
|
---|
| 227 | if (m_buffer[m_ptr+1] == '\'' || m_buffer[m_ptr+1] == '\\')
|
---|
| 228 | m_ptr += 2;
|
---|
| 229 | else
|
---|
| 230 | ++m_ptr;
|
---|
| 231 | break;
|
---|
| 232 | case '\'':
|
---|
| 233 | ++m_ptr;
|
---|
| 234 | *kind = Token_char_literal;
|
---|
| 235 | return;
|
---|
| 236 | default:
|
---|
| 237 | ++m_ptr;
|
---|
| 238 | break;
|
---|
| 239 | }
|
---|
| 240 | }
|
---|
| 241 |
|
---|
| 242 | // ### error
|
---|
| 243 | *kind = Token_char_literal;
|
---|
| 244 | }
|
---|
| 245 |
|
---|
| 246 | void Tokenizer::scanStringLiteral(int *kind)
|
---|
| 247 | {
|
---|
| 248 | ++m_ptr;
|
---|
| 249 | while (m_buffer[m_ptr]) {
|
---|
| 250 | switch (m_buffer[m_ptr]) {
|
---|
| 251 | case '\n':
|
---|
| 252 | // ### error
|
---|
| 253 | *kind = Token_string_literal;
|
---|
| 254 | return;
|
---|
| 255 | case '\\':
|
---|
| 256 | if (m_buffer[m_ptr+1] == '"' || m_buffer[m_ptr+1] == '\\')
|
---|
| 257 | m_ptr += 2;
|
---|
| 258 | else
|
---|
| 259 | ++m_ptr;
|
---|
| 260 | break;
|
---|
| 261 | case '"':
|
---|
| 262 | ++m_ptr;
|
---|
| 263 | *kind = Token_string_literal;
|
---|
| 264 | return;
|
---|
| 265 | default:
|
---|
| 266 | ++m_ptr;
|
---|
| 267 | break;
|
---|
| 268 | }
|
---|
| 269 | }
|
---|
| 270 |
|
---|
| 271 | // ### error
|
---|
| 272 | *kind = Token_string_literal;
|
---|
| 273 | }
|
---|
| 274 |
|
---|
| 275 | void Tokenizer::scanIdentifier(int *kind)
|
---|
| 276 | {
|
---|
| 277 | unsigned char ch;
|
---|
| 278 | for (;;) {
|
---|
| 279 | ch = m_buffer[m_ptr];
|
---|
| 280 | if (s_attr_table[ch] & A_Alphanum)
|
---|
| 281 | ++m_ptr;
|
---|
| 282 | else
|
---|
| 283 | break;
|
---|
| 284 | }
|
---|
| 285 | *kind = Token_identifier;
|
---|
| 286 | }
|
---|
| 287 |
|
---|
| 288 | void Tokenizer::scanNumberLiteral(int *kind)
|
---|
| 289 | {
|
---|
| 290 | unsigned char ch;
|
---|
| 291 | for (;;) {
|
---|
| 292 | ch = m_buffer[m_ptr];
|
---|
| 293 | if (s_attr_table[ch] & A_Alphanum || ch == '.')
|
---|
| 294 | ++m_ptr;
|
---|
| 295 | else
|
---|
| 296 | break;
|
---|
| 297 | }
|
---|
| 298 |
|
---|
| 299 | // ### finish to implement me!!
|
---|
| 300 | *kind = Token_number_literal;
|
---|
| 301 | }
|
---|
| 302 |
|
---|
| 303 | void Tokenizer::scanComment(int *kind)
|
---|
| 304 | {
|
---|
| 305 | if (!(m_buffer[m_ptr+1] == '/' || m_buffer[m_ptr+1] == '*')) {
|
---|
| 306 | scanOperator(kind);
|
---|
| 307 | return;
|
---|
| 308 | }
|
---|
| 309 |
|
---|
| 310 | ++m_ptr; // skip '/'
|
---|
| 311 |
|
---|
| 312 | bool multiLineComment = m_buffer[m_ptr++] == '*';
|
---|
| 313 |
|
---|
| 314 | while (m_buffer[m_ptr]) {
|
---|
| 315 | switch (m_buffer[m_ptr]) {
|
---|
| 316 | case '\r':
|
---|
| 317 | case '\n':
|
---|
| 318 | if (!multiLineComment) {
|
---|
| 319 | *kind = Token_comment;
|
---|
| 320 | return;
|
---|
| 321 | }
|
---|
| 322 |
|
---|
| 323 | (void) scanNewline(kind);
|
---|
| 324 | break;
|
---|
| 325 |
|
---|
| 326 | case '*':
|
---|
| 327 | if (multiLineComment && m_buffer[m_ptr+1] == '/') {
|
---|
| 328 | m_ptr += 2;
|
---|
| 329 | *kind = Token_comment;
|
---|
| 330 | return;
|
---|
| 331 | }
|
---|
| 332 | ++m_ptr;
|
---|
| 333 | break;
|
---|
| 334 |
|
---|
| 335 | default:
|
---|
| 336 | ++m_ptr;
|
---|
| 337 | }
|
---|
| 338 | }
|
---|
| 339 |
|
---|
| 340 | // ### error
|
---|
| 341 | *kind = Token_comment;
|
---|
| 342 | }
|
---|
| 343 |
|
---|
| 344 |
|
---|
| 345 | void Tokenizer::scanPreprocessor(int *kind)
|
---|
| 346 | {
|
---|
| 347 | ++m_ptr;
|
---|
| 348 | *kind = Token_preproc;
|
---|
| 349 | }
|
---|
| 350 |
|
---|
| 351 |
|
---|
| 352 | void Tokenizer::scanOperator(int *kind)
|
---|
| 353 | {
|
---|
| 354 | switch (m_buffer[m_ptr]) {
|
---|
| 355 | case ':':
|
---|
| 356 | if (m_buffer[m_ptr+1] == ':') {
|
---|
| 357 | m_ptr += 2;
|
---|
| 358 | *kind = Token_scope;
|
---|
| 359 | return;
|
---|
| 360 | }
|
---|
| 361 | break;
|
---|
| 362 |
|
---|
| 363 | case '*':
|
---|
| 364 | case '/':
|
---|
| 365 | case '%':
|
---|
| 366 | case '^':
|
---|
| 367 | if (m_buffer[m_ptr+1] == '=') {
|
---|
| 368 | m_ptr += 2;
|
---|
| 369 | *kind = Token_assign;
|
---|
| 370 | return;
|
---|
| 371 | }
|
---|
| 372 | break;
|
---|
| 373 |
|
---|
| 374 | case '=':
|
---|
| 375 | case '!':
|
---|
| 376 | if (m_buffer[m_ptr+1] == '=') {
|
---|
| 377 | m_ptr += 2;
|
---|
| 378 | *kind = Token_eq;
|
---|
| 379 | return;
|
---|
| 380 | }
|
---|
| 381 | break;
|
---|
| 382 |
|
---|
| 383 | case '&':
|
---|
| 384 | if (m_buffer[m_ptr+1] == '&') {
|
---|
| 385 | m_ptr += 2;
|
---|
| 386 | *kind = Token_and;
|
---|
| 387 | return;
|
---|
| 388 | } else if (m_buffer[m_ptr+1] == '=') {
|
---|
| 389 | m_ptr += 2;
|
---|
| 390 | *kind = Token_assign;
|
---|
| 391 | return;
|
---|
| 392 | }
|
---|
| 393 | break;
|
---|
| 394 |
|
---|
| 395 | case '|':
|
---|
| 396 | if (m_buffer[m_ptr+1] == '|' ) {
|
---|
| 397 | m_ptr += 2;
|
---|
| 398 | *kind = Token_or;
|
---|
| 399 | return;
|
---|
| 400 | } else if (m_buffer[m_ptr+1] == '=') {
|
---|
| 401 | m_ptr += 2;
|
---|
| 402 | *kind = Token_assign;
|
---|
| 403 | return;
|
---|
| 404 | }
|
---|
| 405 | break;
|
---|
| 406 |
|
---|
| 407 | case '+':
|
---|
| 408 | if (m_buffer[m_ptr+1] == '+' ) {
|
---|
| 409 | m_ptr += 2;
|
---|
| 410 | *kind = Token_incr;
|
---|
| 411 | return;
|
---|
| 412 | } else if (m_buffer[m_ptr+1] == '=') {
|
---|
| 413 | m_ptr += 2;
|
---|
| 414 | *kind = Token_assign;
|
---|
| 415 | return;
|
---|
| 416 | }
|
---|
| 417 | break;
|
---|
| 418 |
|
---|
| 419 | case '<':
|
---|
| 420 | if (m_buffer[m_ptr+1] == '<') {
|
---|
| 421 | if (m_buffer[m_ptr+2] == '=') {
|
---|
| 422 | m_ptr += 3;
|
---|
| 423 | *kind = Token_assign;
|
---|
| 424 | return;
|
---|
| 425 | }
|
---|
| 426 | m_ptr += 2;
|
---|
| 427 | *kind = Token_shift;
|
---|
| 428 | return;
|
---|
| 429 | } else if (m_buffer[m_ptr+1] == '=') {
|
---|
| 430 | m_ptr += 2;
|
---|
| 431 | *kind = Token_leq;
|
---|
| 432 | return;
|
---|
| 433 | }
|
---|
| 434 | break;
|
---|
| 435 |
|
---|
| 436 | case '>':
|
---|
| 437 | if (m_buffer[m_ptr+1] == '>') {
|
---|
| 438 | if (m_buffer[m_ptr+2] == '=') {
|
---|
| 439 | m_ptr += 3;
|
---|
| 440 | *kind = Token_assign;
|
---|
| 441 | return;
|
---|
| 442 | }
|
---|
| 443 | m_ptr += 2;
|
---|
| 444 | *kind = Token_shift;
|
---|
| 445 | return;
|
---|
| 446 | } else if (m_buffer[m_ptr+1] == '=') {
|
---|
| 447 | m_ptr += 2;
|
---|
| 448 | *kind = Token_geq;
|
---|
| 449 | return;
|
---|
| 450 | }
|
---|
| 451 | break;
|
---|
| 452 |
|
---|
| 453 | case '-':
|
---|
| 454 | if (m_buffer[m_ptr+1] == '>') {
|
---|
| 455 | if (m_buffer[m_ptr+2] == '*') {
|
---|
| 456 | m_ptr += 3;
|
---|
| 457 | *kind = Token_ptrmem;
|
---|
| 458 | return;
|
---|
| 459 | }
|
---|
| 460 | m_ptr += 2;
|
---|
| 461 | *kind = Token_arrow;
|
---|
| 462 | return;
|
---|
| 463 | } else if (m_buffer[m_ptr+1] == '-') {
|
---|
| 464 | m_ptr += 2;
|
---|
| 465 | *kind = Token_decr;
|
---|
| 466 | return;
|
---|
| 467 | } else if (m_buffer[m_ptr+1] == '=') {
|
---|
| 468 | m_ptr += 2;
|
---|
| 469 | *kind = Token_assign;
|
---|
| 470 | return;
|
---|
| 471 | }
|
---|
| 472 | break;
|
---|
| 473 |
|
---|
| 474 | case '.':
|
---|
| 475 | if (m_buffer[m_ptr+1] == '.' && m_buffer[m_ptr+2] == '.') {
|
---|
| 476 | m_ptr += 3;
|
---|
| 477 | *kind = Token_ellipsis;
|
---|
| 478 | return;
|
---|
| 479 | } else if (m_buffer[m_ptr+1] == '*') {
|
---|
| 480 | m_ptr += 2;
|
---|
| 481 | *kind = Token_ptrmem;
|
---|
| 482 | return;
|
---|
| 483 | }
|
---|
| 484 | break;
|
---|
| 485 |
|
---|
| 486 | }
|
---|
| 487 |
|
---|
| 488 | *kind = m_buffer[m_ptr++];
|
---|
| 489 | }
|
---|
| 490 |
|
---|
| 491 | QT_END_NAMESPACE
|
---|