source: trunk/src/corelib/tools/qregexp.cpp@ 570

Last change on this file since 570 was 561, checked in by Dmitry A. Kuminov, 15 years ago

trunk: Merged in qt 4.6.1 sources.

File size: 150.6 KB
Line 
1/****************************************************************************
2**
3** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
4** All rights reserved.
5** Contact: Nokia Corporation ([email protected])
6**
7** This file is part of the QtCore module of the Qt Toolkit.
8**
9** $QT_BEGIN_LICENSE:LGPL$
10** Commercial Usage
11** Licensees holding valid Qt Commercial licenses may use this file in
12** accordance with the Qt Commercial License Agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and Nokia.
15**
16** GNU Lesser General Public License Usage
17** Alternatively, this file may be used under the terms of the GNU Lesser
18** General Public License version 2.1 as published by the Free Software
19** Foundation and appearing in the file LICENSE.LGPL included in the
20** packaging of this file. Please review the following information to
21** ensure the GNU Lesser General Public License version 2.1 requirements
22** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
23**
24** In addition, as a special exception, Nokia gives you certain additional
25** rights. These rights are described in the Nokia Qt LGPL Exception
26** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
27**
28** GNU General Public License Usage
29** Alternatively, this file may be used under the terms of the GNU
30** General Public License version 3.0 as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL included in the
32** packaging of this file. Please review the following information to
33** ensure the GNU General Public License version 3.0 requirements will be
34** met: http://www.gnu.org/copyleft/gpl.html.
35**
36** If you have questions regarding the use of this file, please contact
37** Nokia at [email protected].
38** $QT_END_LICENSE$
39**
40****************************************************************************/
41
42#include "qregexp.h"
43
44#include "qalgorithms.h"
45#include "qbitarray.h"
46#include "qcache.h"
47#include "qdatastream.h"
48#include "qlist.h"
49#include "qmap.h"
50#include "qmutex.h"
51#include "qstring.h"
52#include "qstringlist.h"
53#include "qstringmatcher.h"
54#include "qvector.h"
55#include "private/qfunctions_p.h"
56
57#include <limits.h>
58
59QT_BEGIN_NAMESPACE
60
61int qFindString(const QChar *haystack, int haystackLen, int from,
62 const QChar *needle, int needleLen, Qt::CaseSensitivity cs);
63
64// error strings for the regexp parser
65#define RXERR_OK QT_TRANSLATE_NOOP("QRegExp", "no error occurred")
66#define RXERR_DISABLED QT_TRANSLATE_NOOP("QRegExp", "disabled feature used")
67#define RXERR_CHARCLASS QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax")
68#define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax")
69#define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax")
70#define RXERR_OCTAL QT_TRANSLATE_NOOP("QRegExp", "invalid octal value")
71#define RXERR_LEFTDELIM QT_TRANSLATE_NOOP("QRegExp", "missing left delim")
72#define RXERR_END QT_TRANSLATE_NOOP("QRegExp", "unexpected end")
73#define RXERR_LIMIT QT_TRANSLATE_NOOP("QRegExp", "met internal limit")
74#define RXERR_INTERVAL QT_TRANSLATE_NOOP("QRegExp", "invalid interval")
75#define RXERR_CATEGORY QT_TRANSLATE_NOOP("QRegExp", "invalid category")
76
77/*
78 WARNING! Be sure to read qregexp.tex before modifying this file.
79*/
80
81/*!
82 \class QRegExp
83 \reentrant
84 \brief The QRegExp class provides pattern matching using regular expressions.
85
86 \ingroup tools
87 \ingroup shared
88
89 \keyword regular expression
90
91 A regular expression, or "regexp", is a pattern for matching
92 substrings in a text. This is useful in many contexts, e.g.,
93
94 \table
95 \row \i Validation
96 \i A regexp can test whether a substring meets some criteria,
97 e.g. is an integer or contains no whitespace.
98 \row \i Searching
99 \i A regexp provides more powerful pattern matching than
100 simple substring matching, e.g., match one of the words
101 \e{mail}, \e{letter} or \e{correspondence}, but none of the
102 words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc.
103 \row \i Search and Replace
104 \i A regexp can replace all occurrences of a substring with a
105 different substring, e.g., replace all occurrences of \e{&}
106 with \e{\&amp;} except where the \e{&} is already followed by
107 an \e{amp;}.
108 \row \i String Splitting
109 \i A regexp can be used to identify where a string should be
110 split apart, e.g. splitting tab-delimited strings.
111 \endtable
112
113 A brief introduction to regexps is presented, a description of
114 Qt's regexp language, some examples, and the function
115 documentation itself. QRegExp is modeled on Perl's regexp
116 language. It fully supports Unicode. QRegExp can also be used in a
117 simpler, \e{wildcard mode} that is similar to the functionality
118 found in command shells. The syntax rules used by QRegExp can be
119 changed with setPatternSyntax(). In particular, the pattern syntax
120 can be set to QRegExp::FixedString, which means the pattern to be
121 matched is interpreted as a plain string, i.e., special characters
122 (e.g., backslash) are not escaped.
123
124 A good text on regexps is \e {Mastering Regular Expressions}
125 (Third Edition) by Jeffrey E. F. Friedl, ISBN 0-596-52812-4.
126
127 \tableofcontents
128
129 \section1 Introduction
130
131 Regexps are built up from expressions, quantifiers, and
132 assertions. The simplest expression is a character, e.g. \bold{x}
133 or \bold{5}. An expression can also be a set of characters
134 enclosed in square brackets. \bold{[ABCD]} will match an \bold{A}
135 or a \bold{B} or a \bold{C} or a \bold{D}. We can write this same
136 expression as \bold{[A-D]}, and an experession to match any
137 captital letter in the English alphabet is written as
138 \bold{[A-Z]}.
139
140 A quantifier specifies the number of occurrences of an expression
141 that must be matched. \bold{x{1,1}} means match one and only one
142 \bold{x}. \bold{x{1,5}} means match a sequence of \bold{x}
143 characters that contains at least one \bold{x} but no more than
144 five.
145
146 Note that in general regexps cannot be used to check for balanced
147 brackets or tags. For example, a regexp can be written to match an
148 opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags
149 are not nested, but if the \c{<b>} tags are nested, that same
150 regexp will match an opening \c{<b>} tag with the wrong closing
151 \c{</b>}. For the fragment \c{<b>bold <b>bolder</b></b>}, the
152 first \c{<b>} would be matched with the first \c{</b>}, which is
153 not correct. However, it is possible to write a regexp that will
154 match nested brackets or tags correctly, but only if the number of
155 nesting levels is fixed and known. If the number of nesting levels
156 is not fixed and known, it is impossible to write a regexp that
157 will not fail.
158
159 Suppose we want a regexp to match integers in the range 0 to 99.
160 At least one digit is required, so we start with the expression
161 \bold{[0-9]{1,1}}, which matches a single digit exactly once. This
162 regexp matches integers in the range 0 to 9. To match integers up
163 to 99, increase the maximum number of occurrences to 2, so the
164 regexp becomes \bold{[0-9]{1,2}}. This regexp satisfies the
165 original requirement to match integers from 0 to 99, but it will
166 also match integers that occur in the middle of strings. If we
167 want the matched integer to be the whole string, we must use the
168 anchor assertions, \bold{^} (caret) and \bold{$} (dollar). When
169 \bold{^} is the first character in a regexp, it means the regexp
170 must match from the beginning of the string. When \bold{$} is the
171 last character of the regexp, it means the regexp must match to
172 the end of the string. The regexp becomes \bold{^[0-9]{1,2}$}.
173 Note that assertions, e.g. \bold{^} and \bold{$}, do not match
174 characters but locations in the string.
175
176 If you have seen regexps described elsewhere, they may have looked
177 different from the ones shown here. This is because some sets of
178 characters and some quantifiers are so common that they have been
179 given special symbols to represent them. \bold{[0-9]} can be
180 replaced with the symbol \bold{\\d}. The quantifier to match
181 exactly one occurrence, \bold{{1,1}}, can be replaced with the
182 expression itself, i.e. \bold{x{1,1}} is the same as \bold{x}. So
183 our 0 to 99 matcher could be written as \bold{^\\d{1,2}$}. It can
184 also be written \bold{^\\d\\d{0,1}$}, i.e. \e{From the start of
185 the string, match a digit, followed immediately by 0 or 1 digits}.
186 In practice, it would be written as \bold{^\\d\\d?$}. The \bold{?}
187 is shorthand for the quantifier \bold{{0,1}}, i.e. 0 or 1
188 occurrences. \bold{?} makes an expression optional. The regexp
189 \bold{^\\d\\d?$} means \e{From the beginning of the string, match
190 one digit, followed immediately by 0 or 1 more digit, followed
191 immediately by end of string}.
192
193 To write a regexp that matches one of the words 'mail' \e or
194 'letter' \e or 'correspondence' but does not match words that
195 contain these words, e.g., 'email', 'mailman', 'mailer', and
196 'letterbox', start with a regexp that matches 'mail'. Expressed
197 fully, the regexp is \bold{m{1,1}a{1,1}i{1,1}l{1,1}}, but because
198 a character expression is automatically quantified by
199 \bold{{1,1}}, we can simplify the regexp to \bold{mail}, i.e., an
200 'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now
201 we can use the vertical bar \bold{|}, which means \bold{or}, to
202 include the other two words, so our regexp for matching any of the
203 three words becomes \bold{mail|letter|correspondence}. Match
204 'mail' \bold{or} 'letter' \bold{or} 'correspondence'. While this
205 regexp will match one of the three words we want to match, it will
206 also match words we don't want to match, e.g., 'email'. To
207 prevent the regexp from matching unwanted words, we must tell it
208 to begin and end the match at word boundaries. First we enclose
209 our regexp in parentheses, \bold{(mail|letter|correspondence)}.
210 Parentheses group expressions together, and they identify a part
211 of the regexp that we wish to \l{capturing text}{capture}.
212 Enclosing the expression in parentheses allows us to use it as a
213 component in more complex regexps. It also allows us to examine
214 which of the three words was actually matched. To force the match
215 to begin and end on word boundaries, we enclose the regexp in
216 \bold{\\b} \e{word boundary} assertions:
217 \bold{\\b(mail|letter|correspondence)\\b}. Now the regexp means:
218 \e{Match a word boundary, followed by the regexp in parentheses,
219 followed by a word boundary}. The \bold{\\b} assertion matches a
220 \e position in the regexp, not a \e character. A word boundary is
221 any non-word character, e.g., a space, newline, or the beginning
222 or ending of a string.
223
224 If we want to replace ampersand characters with the HTML entity
225 \bold{\&amp;}, the regexp to match is simply \bold{\&}. But this
226 regexp will also match ampersands that have already been converted
227 to HTML entities. We want to replace only ampersands that are not
228 already followed by \bold{amp;}. For this, we need the negative
229 lookahead assertion, \bold{(?!}__\bold{)}. The regexp can then be
230 written as \bold{\&(?!amp;)}, i.e. \e{Match an ampersand that is}
231 \bold{not} \e{followed by} \bold{amp;}.
232
233 If we want to count all the occurrences of 'Eric' and 'Eirik' in a
234 string, two valid solutions are \bold{\\b(Eric|Eirik)\\b} and
235 \bold{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is
236 required to avoid matching words that contain either name,
237 e.g. 'Ericsson'. Note that the second regexp matches more
238 spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'.
239
240 Some of the examples discussed above are implemented in the
241 \link #code-examples code examples \endlink section.
242
243 \target characters-and-abbreviations-for-sets-of-characters
244 \section1 Characters and Abbreviations for Sets of Characters
245
246 \table
247 \header \i Element \i Meaning
248 \row \i \bold{c}
249 \i A character represents itself unless it has a special
250 regexp meaning. e.g. \bold{c} matches the character \e c.
251 \row \i \bold{\\c}
252 \i A character that follows a backslash matches the character
253 itself, except as specified below. e.g., To match a literal
254 caret at the beginning of a string, write \bold{\\^}.
255 \row \i \bold{\\a}
256 \i Matches the ASCII bell (BEL, 0x07).
257 \row \i \bold{\\f}
258 \i Matches the ASCII form feed (FF, 0x0C).
259 \row \i \bold{\\n}
260 \i Matches the ASCII line feed (LF, 0x0A, Unix newline).
261 \row \i \bold{\\r}
262 \i Matches the ASCII carriage return (CR, 0x0D).
263 \row \i \bold{\\t}
264 \i Matches the ASCII horizontal tab (HT, 0x09).
265 \row \i \bold{\\v}
266 \i Matches the ASCII vertical tab (VT, 0x0B).
267 \row \i \bold{\\x\e{hhhh}}
268 \i Matches the Unicode character corresponding to the
269 hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF).
270 \row \i \bold{\\0\e{ooo}} (i.e., \\zero \e{ooo})
271 \i matches the ASCII/Latin1 character for the octal number
272 \e{ooo} (between 0 and 0377).
273 \row \i \bold{. (dot)}
274 \i Matches any character (including newline).
275 \row \i \bold{\\d}
276 \i Matches a digit (QChar::isDigit()).
277 \row \i \bold{\\D}
278 \i Matches a non-digit.
279 \row \i \bold{\\s}
280 \i Matches a whitespace character (QChar::isSpace()).
281 \row \i \bold{\\S}
282 \i Matches a non-whitespace character.
283 \row \i \bold{\\w}
284 \i Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_').
285 \row \i \bold{\\W}
286 \i Matches a non-word character.
287 \row \i \bold{\\\e{n}}
288 \i The \e{n}-th \l backreference, e.g. \\1, \\2, etc.
289 \endtable
290
291 \bold{Note:} The C++ compiler transforms backslashes in strings.
292 To include a \bold{\\} in a regexp, enter it twice, i.e. \c{\\}.
293 To match the backslash character itself, enter it four times, i.e.
294 \c{\\\\}.
295
296 \target sets-of-characters
297 \section1 Sets of Characters
298
299 Square brackets mean match any character contained in the square
300 brackets. The character set abbreviations described above can
301 appear in a character set in square brackets. Except for the
302 character set abbreviations and the following two exceptions,
303 characters do not have special meanings in square brackets.
304
305 \table
306 \row \i \bold{^}
307
308 \i The caret negates the character set if it occurs as the
309 first character (i.e. immediately after the opening square
310 bracket). \bold{[abc]} matches 'a' or 'b' or 'c', but
311 \bold{[^abc]} matches anything \e but 'a' or 'b' or 'c'.
312
313 \row \i \bold{-}
314
315 \i The dash indicates a range of characters. \bold{[W-Z]}
316 matches 'W' or 'X' or 'Y' or 'Z'.
317
318 \endtable
319
320 Using the predefined character set abbreviations is more portable
321 than using character ranges across platforms and languages. For
322 example, \bold{[0-9]} matches a digit in Western alphabets but
323 \bold{\\d} matches a digit in \e any alphabet.
324
325 Note: In other regexp documentation, sets of characters are often
326 called "character classes".
327
328 \target quantifiers
329 \section1 Quantifiers
330
331 By default, an expression is automatically quantified by
332 \bold{{1,1}}, i.e. it should occur exactly once. In the following
333 list, \bold{\e {E}} stands for expression. An expression is a
334 character, or an abbreviation for a set of characters, or a set of
335 characters in square brackets, or an expression in parentheses.
336
337 \table
338 \row \i \bold{\e {E}?}
339
340 \i Matches zero or one occurrences of \e E. This quantifier
341 means \e{The previous expression is optional}, because it
342 will match whether or not the expression is found. \bold{\e
343 {E}?} is the same as \bold{\e {E}{0,1}}. e.g., \bold{dents?}
344 matches 'dent' or 'dents'.
345
346 \row \i \bold{\e {E}+}
347
348 \i Matches one or more occurrences of \e E. \bold{\e {E}+} is
349 the same as \bold{\e {E}{1,}}. e.g., \bold{0+} matches '0',
350 '00', '000', etc.
351
352 \row \i \bold{\e {E}*}
353
354 \i Matches zero or more occurrences of \e E. It is the same
355 as \bold{\e {E}{0,}}. The \bold{*} quantifier is often used
356 in error where \bold{+} should be used. For example, if
357 \bold{\\s*$} is used in an expression to match strings that
358 end in whitespace, it will match every string because
359 \bold{\\s*$} means \e{Match zero or more whitespaces followed
360 by end of string}. The correct regexp to match strings that
361 have at least one trailing whitespace character is
362 \bold{\\s+$}.
363
364 \row \i \bold{\e {E}{n}}
365
366 \i Matches exactly \e n occurrences of \e E. \bold{\e {E}{n}}
367 is the same as repeating \e E \e n times. For example,
368 \bold{x{5}} is the same as \bold{xxxxx}. It is also the same
369 as \bold{\e {E}{n,n}}, e.g. \bold{x{5,5}}.
370
371 \row \i \bold{\e {E}{n,}}
372 \i Matches at least \e n occurrences of \e E.
373
374 \row \i \bold{\e {E}{,m}}
375 \i Matches at most \e m occurrences of \e E. \bold{\e {E}{,m}}
376 is the same as \bold{\e {E}{0,m}}.
377
378 \row \i \bold{\e {E}{n,m}}
379 \i Matches at least \e n and at most \e m occurrences of \e E.
380 \endtable
381
382 To apply a quantifier to more than just the preceding character,
383 use parentheses to group characters together in an expression. For
384 example, \bold{tag+} matches a 't' followed by an 'a' followed by
385 at least one 'g', whereas \bold{(tag)+} matches at least one
386 occurrence of 'tag'.
387
388 Note: Quantifiers are normally "greedy". They always match as much
389 text as they can. For example, \bold{0+} matches the first zero it
390 finds and all the consecutive zeros after the first zero. Applied
391 to '20005', it matches'2\underline{000}5'. Quantifiers can be made
392 non-greedy, see setMinimal().
393
394 \target capturing parentheses
395 \target backreferences
396 \section1 Capturing Text
397
398 Parentheses allow us to group elements together so that we can
399 quantify and capture them. For example if we have the expression
400 \bold{mail|letter|correspondence} that matches a string we know
401 that \e one of the words matched but not which one. Using
402 parentheses allows us to "capture" whatever is matched within
403 their bounds, so if we used \bold{(mail|letter|correspondence)}
404 and matched this regexp against the string "I sent you some email"
405 we can use the cap() or capturedTexts() functions to extract the
406 matched characters, in this case 'mail'.
407
408 We can use captured text within the regexp itself. To refer to the
409 captured text we use \e backreferences which are indexed from 1,
410 the same as for cap(). For example we could search for duplicate
411 words in a string using \bold{\\b(\\w+)\\W+\\1\\b} which means match a
412 word boundary followed by one or more word characters followed by
413 one or more non-word characters followed by the same text as the
414 first parenthesized expression followed by a word boundary.
415
416 If we want to use parentheses purely for grouping and not for
417 capturing we can use the non-capturing syntax, e.g.
418 \bold{(?:green|blue)}. Non-capturing parentheses begin '(?:' and
419 end ')'. In this example we match either 'green' or 'blue' but we
420 do not capture the match so we only know whether or not we matched
421 but not which color we actually found. Using non-capturing
422 parentheses is more efficient than using capturing parentheses
423 since the regexp engine has to do less book-keeping.
424
425 Both capturing and non-capturing parentheses may be nested.
426
427 \target greedy quantifiers
428
429 For historical reasons, quantifiers (e.g. \bold{*}) that apply to
430 capturing parentheses are more "greedy" than other quantifiers.
431 For example, \bold{a*(a)*} will match "aaa" with cap(1) == "aaa".
432 This behavior is different from what other regexp engines do
433 (notably, Perl). To obtain a more intuitive capturing behavior,
434 specify QRegExp::RegExp2 to the QRegExp constructor or call
435 setPatternSyntax(QRegExp::RegExp2).
436
437 \target cap_in_a_loop
438
439 When the number of matches cannot be determined in advance, a
440 common idiom is to use cap() in a loop. For example:
441
442 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 0
443
444 \target assertions
445 \section1 Assertions
446
447 Assertions make some statement about the text at the point where
448 they occur in the regexp but they do not match any characters. In
449 the following list \bold{\e {E}} stands for any expression.
450
451 \table
452 \row \i \bold{^}
453 \i The caret signifies the beginning of the string. If you
454 wish to match a literal \c{^} you must escape it by
455 writing \c{\\^}. For example, \bold{^#include} will only
456 match strings which \e begin with the characters '#include'.
457 (When the caret is the first character of a character set it
458 has a special meaning, see \link #sets-of-characters Sets of
459 Characters \endlink.)
460
461 \row \i \bold{$}
462 \i The dollar signifies the end of the string. For example
463 \bold{\\d\\s*$} will match strings which end with a digit
464 optionally followed by whitespace. If you wish to match a
465 literal \c{$} you must escape it by writing
466 \c{\\$}.
467
468 \row \i \bold{\\b}
469 \i A word boundary. For example the regexp
470 \bold{\\bOK\\b} means match immediately after a word
471 boundary (e.g. start of string or whitespace) the letter 'O'
472 then the letter 'K' immediately before another word boundary
473 (e.g. end of string or whitespace). But note that the
474 assertion does not actually match any whitespace so if we
475 write \bold{(\\bOK\\b)} and we have a match it will only
476 contain 'OK' even if the string is "It's \underline{OK} now".
477
478 \row \i \bold{\\B}
479 \i A non-word boundary. This assertion is true wherever
480 \bold{\\b} is false. For example if we searched for
481 \bold{\\Bon\\B} in "Left on" the match would fail (space
482 and end of string aren't non-word boundaries), but it would
483 match in "t\underline{on}ne".
484
485 \row \i \bold{(?=\e E)}
486 \i Positive lookahead. This assertion is true if the
487 expression matches at this point in the regexp. For example,
488 \bold{const(?=\\s+char)} matches 'const' whenever it is
489 followed by 'char', as in 'static \underline{const} char *'.
490 (Compare with \bold{const\\s+char}, which matches 'static
491 \underline{const char} *'.)
492
493 \row \i \bold{(?!\e E)}
494 \i Negative lookahead. This assertion is true if the
495 expression does not match at this point in the regexp. For
496 example, \bold{const(?!\\s+char)} matches 'const' \e except
497 when it is followed by 'char'.
498 \endtable
499
500 \keyword QRegExp wildcard matching
501 \section1 Wildcard Matching
502
503 Most command shells such as \e bash or \e cmd.exe support "file
504 globbing", the ability to identify a group of files by using
505 wildcards. The setPatternSyntax() function is used to switch
506 between regexp and wildcard mode. Wildcard matching is much
507 simpler than full regexps and has only four features:
508
509 \table
510 \row \i \bold{c}
511 \i Any character represents itself apart from those mentioned
512 below. Thus \bold{c} matches the character \e c.
513 \row \i \bold{?}
514 \i Matches any single character. It is the same as
515 \bold{.} in full regexps.
516 \row \i \bold{*}
517 \i Matches zero or more of any characters. It is the
518 same as \bold{.*} in full regexps.
519 \row \i \bold{[...]}
520 \i Sets of characters can be represented in square brackets,
521 similar to full regexps. Within the character class, like
522 outside, backslash has no special meaning.
523 \endtable
524
525 In the mode Wildcard, the wildcard characters cannot be
526 escaped. In the mode WildcardUnix, the character '\' escapes the
527 wildcard.
528
529 For example if we are in wildcard mode and have strings which
530 contain filenames we could identify HTML files with \bold{*.html}.
531 This will match zero or more characters followed by a dot followed
532 by 'h', 't', 'm' and 'l'.
533
534 To test a string against a wildcard expression, use exactMatch().
535 For example:
536
537 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 1
538
539 \target perl-users
540 \section1 Notes for Perl Users
541
542 Most of the character class abbreviations supported by Perl are
543 supported by QRegExp, see \link
544 #characters-and-abbreviations-for-sets-of-characters characters
545 and abbreviations for sets of characters \endlink.
546
547 In QRegExp, apart from within character classes, \c{^} always
548 signifies the start of the string, so carets must always be
549 escaped unless used for that purpose. In Perl the meaning of caret
550 varies automagically depending on where it occurs so escaping it
551 is rarely necessary. The same applies to \c{$} which in
552 QRegExp always signifies the end of the string.
553
554 QRegExp's quantifiers are the same as Perl's greedy quantifiers
555 (but see the \l{greedy quantifiers}{note above}). Non-greedy
556 matching cannot be applied to individual quantifiers, but can be
557 applied to all the quantifiers in the pattern. For example, to
558 match the Perl regexp \bold{ro+?m} requires:
559
560 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 2
561
562 The equivalent of Perl's \c{/i} option is
563 setCaseSensitivity(Qt::CaseInsensitive).
564
565 Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}.
566
567 In QRegExp \bold{.} matches any character, therefore all QRegExp
568 regexps have the equivalent of Perl's \c{/s} option. QRegExp
569 does not have an equivalent to Perl's \c{/m} option, but this
570 can be emulated in various ways for example by splitting the input
571 into lines or by looping with a regexp that searches for newlines.
572
573 Because QRegExp is string oriented, there are no \\A, \\Z, or \\z
574 assertions. The \\G assertion is not supported but can be emulated
575 in a loop.
576
577 Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
578 equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
579 ... correspond to cap(1) or capturedTexts()[1], cap(2) or
580 capturedTexts()[2], etc.
581
582 To substitute a pattern use QString::replace().
583
584 Perl's extended \c{/x} syntax is not supported, nor are
585 directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
586 the other hand, C++'s rules for literal strings can be used to
587 achieve the same:
588
589 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 3
590
591 Both zero-width positive and zero-width negative lookahead
592 assertions (?=pattern) and (?!pattern) are supported with the same
593 syntax as Perl. Perl's lookbehind assertions, "independent"
594 subexpressions and conditional expressions are not supported.
595
596 Non-capturing parentheses are also supported, with the same
597 (?:pattern) syntax.
598
599 See QString::split() and QStringList::join() for equivalents
600 to Perl's split and join functions.
601
602 Note: because C++ transforms \\'s they must be written \e twice in
603 code, e.g. \bold{\\b} must be written \bold{\\\\b}.
604
605 \target code-examples
606 \section1 Code Examples
607
608 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 4
609
610 The third string matches '\underline{6}'. This is a simple validation
611 regexp for integers in the range 0 to 99.
612
613 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 5
614
615 The second string matches '\underline{This_is-OK}'. We've used the
616 character set abbreviation '\\S' (non-whitespace) and the anchors
617 to match strings which contain no whitespace.
618
619 In the following example we match strings containing 'mail' or
620 'letter' or 'correspondence' but only match whole words i.e. not
621 'email'
622
623 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 6
624
625 The second string matches "Please write the \underline{letter}". The
626 word 'letter' is also captured (because of the parentheses). We
627 can see what text we've captured like this:
628
629 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 7
630
631 This will capture the text from the first set of capturing
632 parentheses (counting capturing left parentheses from left to
633 right). The parentheses are counted from 1 since cap(0) is the
634 whole matched regexp (equivalent to '&' in most regexp engines).
635
636 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 8
637
638 Here we've passed the QRegExp to QString's replace() function to
639 replace the matched text with new text.
640
641 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 9
642
643 We've used the indexIn() function to repeatedly match the regexp in
644 the string. Note that instead of moving forward by one character
645 at a time \c pos++ we could have written \c {pos +=
646 rx.matchedLength()} to skip over the already matched string. The
647 count will equal 3, matching 'One \underline{Eric} another
648 \underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it
649 doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
650 by non-word boundaries.
651
652 One common use of regexps is to split lines of delimited data into
653 their component fields.
654
655 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 10
656
657 In this example our input lines have the format company name, web
658 address and country. Unfortunately the regexp is rather long and
659 not very versatile -- the code will break if we add any more
660 fields. A simpler and better solution is to look for the
661 separator, '\\t' in this case, and take the surrounding text. The
662 QString::split() function can take a separator string or regexp
663 as an argument and split a string accordingly.
664
665 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 11
666
667 Here field[0] is the company, field[1] the web address and so on.
668
669 To imitate the matching of a shell we can use wildcard mode.
670
671 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 12
672
673 Wildcard matching can be convenient because of its simplicity, but
674 any wildcard regexp can be defined using full regexps, e.g.
675 \bold{.*\.html$}. Notice that we can't match both \c .html and \c
676 .htm files with a wildcard unless we use \bold{*.htm*} which will
677 also match 'test.html.bak'. A full regexp gives us the precision
678 we need, \bold{.*\\.html?$}.
679
680 QRegExp can match case insensitively using setCaseSensitivity(),
681 and can use non-greedy matching, see setMinimal(). By
682 default QRegExp uses full regexps but this can be changed with
683 setWildcard(). Searching can be forward with indexIn() or backward
684 with lastIndexIn(). Captured text can be accessed using
685 capturedTexts() which returns a string list of all captured
686 strings, or using cap() which returns the captured string for the
687 given index. The pos() function takes a match index and returns
688 the position in the string where the match was made (or -1 if
689 there was no match).
690
691 \sa QString, QStringList, QRegExpValidator, QSortFilterProxyModel,
692 {tools/regexp}{Regular Expression Example}
693*/
694
695#if defined(Q_OS_VXWORKS) && defined(EOS)
696# undef EOS
697#endif
698
699const int NumBadChars = 64;
700#define BadChar(ch) ((ch).unicode() % NumBadChars)
701
702const int NoOccurrence = INT_MAX;
703const int EmptyCapture = INT_MAX;
704const int InftyLen = INT_MAX;
705const int InftyRep = 1025;
706const int EOS = -1;
707
708static bool isWord(QChar ch)
709{
710 return ch.isLetterOrNumber() || ch.isMark() || ch == QLatin1Char('_');
711}
712
713/*
714 Merges two vectors of ints and puts the result into the first
715 one.
716*/
717static void mergeInto(QVector<int> *a, const QVector<int> &b)
718{
719 int asize = a->size();
720 int bsize = b.size();
721 if (asize == 0) {
722 *a = b;
723#ifndef QT_NO_REGEXP_OPTIM
724 } else if (bsize == 1 && a->at(asize - 1) < b.at(0)) {
725 a->resize(asize + 1);
726 (*a)[asize] = b.at(0);
727#endif
728 } else if (bsize >= 1) {
729 int csize = asize + bsize;
730 QVector<int> c(csize);
731 int i = 0, j = 0, k = 0;
732 while (i < asize) {
733 if (j < bsize) {
734 if (a->at(i) == b.at(j)) {
735 ++i;
736 --csize;
737 } else if (a->at(i) < b.at(j)) {
738 c[k++] = a->at(i++);
739 } else {
740 c[k++] = b.at(j++);
741 }
742 } else {
743 memcpy(c.data() + k, a->constData() + i, (asize - i) * sizeof(int));
744 break;
745 }
746 }
747 c.resize(csize);
748 if (j < bsize)
749 memcpy(c.data() + k, b.constData() + j, (bsize - j) * sizeof(int));
750 *a = c;
751 }
752}
753
754#ifndef QT_NO_REGEXP_WILDCARD
755/*
756 Translates a wildcard pattern to an equivalent regular expression
757 pattern (e.g., *.cpp to .*\.cpp).
758
759 If enableEscaping is true, it is possible to escape the wildcard
760 characters with \
761*/
762static QString wc2rx(const QString &wc_str, const bool enableEscaping)
763{
764 const int wclen = wc_str.length();
765 QString rx;
766 int i = 0;
767 bool isEscaping = false; // the previous character is '\'
768 const QChar *wc = wc_str.unicode();
769
770 while (i < wclen) {
771 const QChar c = wc[i++];
772 switch (c.unicode()) {
773 case '\\':
774 if (enableEscaping) {
775 if (isEscaping) {
776 rx += QLatin1String("\\\\");
777 } // we insert the \\ later if necessary
778 if (i+1 == wclen) { // the end
779 rx += QLatin1String("\\\\");
780 }
781 } else {
782 rx += QLatin1String("\\\\");
783 }
784 isEscaping = true;
785 break;
786 case '*':
787 if (isEscaping) {
788 rx += QLatin1String("\\*");
789 isEscaping = false;
790 } else {
791 rx += QLatin1String(".*");
792 }
793 break;
794 case '?':
795 if (isEscaping) {
796 rx += QLatin1String("\\?");
797 isEscaping = false;
798 } else {
799 rx += QLatin1Char('.');
800 }
801
802 break;
803 case '$':
804 case '(':
805 case ')':
806 case '+':
807 case '.':
808 case '^':
809 case '{':
810 case '|':
811 case '}':
812 if (isEscaping) {
813 isEscaping = false;
814 rx += QLatin1String("\\\\");
815 }
816 rx += QLatin1Char('\\');
817 rx += c;
818 break;
819 case '[':
820 if (isEscaping) {
821 isEscaping = false;
822 rx += QLatin1String("\\[");
823 } else {
824 rx += c;
825 if (wc[i] == QLatin1Char('^'))
826 rx += wc[i++];
827 if (i < wclen) {
828 if (rx[i] == QLatin1Char(']'))
829 rx += wc[i++];
830 while (i < wclen && wc[i] != QLatin1Char(']')) {
831 if (wc[i] == QLatin1Char('\\'))
832 rx += QLatin1Char('\\');
833 rx += wc[i++];
834 }
835 }
836 }
837 break;
838
839 case ']':
840 if(isEscaping){
841 isEscaping = false;
842 rx += QLatin1String("\\");
843 }
844 rx += c;
845 break;
846
847 default:
848 if(isEscaping){
849 isEscaping = false;
850 rx += QLatin1String("\\\\");
851 }
852 rx += c;
853 }
854 }
855 return rx;
856}
857#endif
858
859static int caretIndex(int offset, QRegExp::CaretMode caretMode)
860{
861 if (caretMode == QRegExp::CaretAtZero) {
862 return 0;
863 } else if (caretMode == QRegExp::CaretAtOffset) {
864 return offset;
865 } else { // QRegExp::CaretWontMatch
866 return -1;
867 }
868}
869
870/*
871 The QRegExpEngineKey struct uniquely identifies an engine.
872*/
873struct QRegExpEngineKey
874{
875 QString pattern;
876 QRegExp::PatternSyntax patternSyntax;
877 Qt::CaseSensitivity cs;
878
879 inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax,
880 Qt::CaseSensitivity cs)
881 : pattern(pattern), patternSyntax(patternSyntax), cs(cs) {}
882
883 inline void clear() {
884 pattern.clear();
885 patternSyntax = QRegExp::RegExp;
886 cs = Qt::CaseSensitive;
887 }
888};
889
890Q_STATIC_GLOBAL_OPERATOR bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2)
891{
892 return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax
893 && key1.cs == key2.cs;
894}
895
896class QRegExpEngine;
897
898//Q_DECLARE_TYPEINFO(QVector<int>, Q_MOVABLE_TYPE);
899
900/*
901 This is the engine state during matching.
902*/
903struct QRegExpMatchState
904{
905 const QChar *in; // a pointer to the input string data
906 int pos; // the current position in the string
907 int caretPos;
908 int len; // the length of the input string
909 bool minimal; // minimal matching?
910 int *bigArray; // big array holding the data for the next pointers
911 int *inNextStack; // is state is nextStack?
912 int *curStack; // stack of current states
913 int *nextStack; // stack of next states
914 int *curCapBegin; // start of current states' captures
915 int *nextCapBegin; // start of next states' captures
916 int *curCapEnd; // end of current states' captures
917 int *nextCapEnd; // end of next states' captures
918 int *tempCapBegin; // start of temporary captures
919 int *tempCapEnd; // end of temporary captures
920 int *capBegin; // start of captures for a next state
921 int *capEnd; // end of captures for a next state
922 int *slideTab; // bump-along slide table for bad-character heuristic
923 int *captured; // what match() returned last
924 int slideTabSize; // size of slide table
925 int capturedSize;
926#ifndef QT_NO_REGEXP_BACKREF
927 QList<QVector<int> > sleeping; // list of back-reference sleepers
928#endif
929 int matchLen; // length of match
930 int oneTestMatchedLen; // length of partial match
931
932 const QRegExpEngine *eng;
933
934 inline QRegExpMatchState() : bigArray(0), captured(0) {}
935 inline ~QRegExpMatchState() { free(bigArray); }
936
937 void drain() { free(bigArray); bigArray = 0; captured = 0; } // to save memory
938 void prepareForMatch(QRegExpEngine *eng);
939 void match(const QChar *str, int len, int pos, bool minimal,
940 bool oneTest, int caretIndex);
941 bool matchHere();
942 bool testAnchor(int i, int a, const int *capBegin);
943};
944
945/*
946 The struct QRegExpAutomatonState represents one state in a modified NFA. The
947 input characters matched are stored in the state instead of on
948 the transitions, something possible for an automaton
949 constructed from a regular expression.
950*/
951struct QRegExpAutomatonState
952{
953#ifndef QT_NO_REGEXP_CAPTURE
954 int atom; // which atom does this state belong to?
955#endif
956 int match; // what does it match? (see CharClassBit and BackRefBit)
957 QVector<int> outs; // out-transitions
958 QMap<int, int> reenter; // atoms reentered when transiting out
959 QMap<int, int> anchors; // anchors met when transiting out
960
961 inline QRegExpAutomatonState() { }
962#ifndef QT_NO_REGEXP_CAPTURE
963 inline QRegExpAutomatonState(int a, int m)
964 : atom(a), match(m) { }
965#else
966 inline QRegExpAutomatonState(int m)
967 : match(m) { }
968#endif
969};
970
971Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_MOVABLE_TYPE);
972
973/*
974 The struct QRegExpCharClassRange represents a range of characters (e.g.,
975 [0-9] denotes range 48 to 57).
976*/
977struct QRegExpCharClassRange
978{
979 ushort from; // 48
980 ushort len; // 10
981};
982
983Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE);
984
985#ifndef QT_NO_REGEXP_CAPTURE
986/*
987 The struct QRegExpAtom represents one node in the hierarchy of regular
988 expression atoms.
989*/
990struct QRegExpAtom
991{
992 enum { NoCapture = -1, OfficialCapture = -2, UnofficialCapture = -3 };
993
994 int parent; // index of parent in array of atoms
995 int capture; // index of capture, from 1 to ncap - 1
996};
997
998Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE);
999#endif
1000
1001struct QRegExpLookahead;
1002
1003#ifndef QT_NO_REGEXP_ANCHOR_ALT
1004/*
1005 The struct QRegExpAnchorAlternation represents a pair of anchors with
1006 OR semantics.
1007*/
1008struct QRegExpAnchorAlternation
1009{
1010 int a; // this anchor...
1011 int b; // ...or this one
1012};
1013
1014Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE);
1015#endif
1016
1017#ifndef QT_NO_REGEXP_CCLASS
1018/*
1019 The class QRegExpCharClass represents a set of characters, such as can
1020 be found in regular expressions (e.g., [a-z] denotes the set
1021 {a, b, ..., z}).
1022*/
1023class QRegExpCharClass
1024{
1025public:
1026 QRegExpCharClass();
1027 inline QRegExpCharClass(const QRegExpCharClass &cc) { operator=(cc); }
1028
1029 QRegExpCharClass &operator=(const QRegExpCharClass &cc);
1030
1031 void clear();
1032 bool negative() const { return n; }
1033 void setNegative(bool negative);
1034 void addCategories(int cats);
1035 void addRange(ushort from, ushort to);
1036 void addSingleton(ushort ch) { addRange(ch, ch); }
1037
1038 bool in(QChar ch) const;
1039#ifndef QT_NO_REGEXP_OPTIM
1040 const QVector<int> &firstOccurrence() const { return occ1; }
1041#endif
1042
1043#if defined(QT_DEBUG)
1044 void dump() const;
1045#endif
1046
1047private:
1048 int c; // character classes
1049 QVector<QRegExpCharClassRange> r; // character ranges
1050 bool n; // negative?
1051#ifndef QT_NO_REGEXP_OPTIM
1052 QVector<int> occ1; // first-occurrence array
1053#endif
1054};
1055#else
1056struct QRegExpCharClass
1057{
1058 int dummy;
1059
1060#ifndef QT_NO_REGEXP_OPTIM
1061 QRegExpCharClass() { occ1.fill(0, NumBadChars); }
1062
1063 const QVector<int> &firstOccurrence() const { return occ1; }
1064 QVector<int> occ1;
1065#endif
1066};
1067#endif
1068
1069Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_MOVABLE_TYPE);
1070
1071/*
1072 The QRegExpEngine class encapsulates a modified nondeterministic
1073 finite automaton (NFA).
1074*/
1075class QRegExpEngine
1076{
1077public:
1078 QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers)
1079 : cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); }
1080
1081 QRegExpEngine(const QRegExpEngineKey &key);
1082 ~QRegExpEngine();
1083
1084 bool isValid() const { return valid; }
1085 const QString &errorString() const { return yyError; }
1086 int captureCount() const { return officialncap; }
1087
1088 int createState(QChar ch);
1089 int createState(const QRegExpCharClass &cc);
1090#ifndef QT_NO_REGEXP_BACKREF
1091 int createState(int bref);
1092#endif
1093
1094 void addCatTransitions(const QVector<int> &from, const QVector<int> &to);
1095#ifndef QT_NO_REGEXP_CAPTURE
1096 void addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom);
1097#endif
1098
1099#ifndef QT_NO_REGEXP_ANCHOR_ALT
1100 int anchorAlternation(int a, int b);
1101 int anchorConcatenation(int a, int b);
1102#else
1103 int anchorAlternation(int a, int b) { return a & b; }
1104 int anchorConcatenation(int a, int b) { return a | b; }
1105#endif
1106 void addAnchors(int from, int to, int a);
1107
1108#ifndef QT_NO_REGEXP_OPTIM
1109 void heuristicallyChooseHeuristic();
1110#endif
1111
1112#if defined(QT_DEBUG)
1113 void dump() const;
1114#endif
1115
1116 QAtomicInt ref;
1117
1118private:
1119 enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
1120 enum { InitialState = 0, FinalState = 1 };
1121
1122 void setup();
1123 int setupState(int match);
1124
1125 /*
1126 Let's hope that 13 lookaheads and 14 back-references are
1127 enough.
1128 */
1129 enum { MaxLookaheads = 13, MaxBackRefs = 14 };
1130 enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004,
1131 Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010,
1132 Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
1133 Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
1134 Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs,
1135
1136 Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^
1137 ((Anchor_FirstLookahead << MaxLookaheads) - 1) };
1138#ifndef QT_NO_REGEXP_CAPTURE
1139 int startAtom(bool officialCapture);
1140 void finishAtom(int atom, bool needCapture);
1141#endif
1142
1143#ifndef QT_NO_REGEXP_LOOKAHEAD
1144 int addLookahead(QRegExpEngine *eng, bool negative);
1145#endif
1146
1147#ifndef QT_NO_REGEXP_OPTIM
1148 bool goodStringMatch(QRegExpMatchState &matchState) const;
1149 bool badCharMatch(QRegExpMatchState &matchState) const;
1150#else
1151 bool bruteMatch(QRegExpMatchState &matchState) const;
1152#endif
1153
1154 QVector<QRegExpAutomatonState> s; // array of states
1155#ifndef QT_NO_REGEXP_CAPTURE
1156 QVector<QRegExpAtom> f; // atom hierarchy
1157 int nf; // number of atoms
1158 int cf; // current atom
1159 QVector<int> captureForOfficialCapture;
1160#endif
1161 int officialncap; // number of captures, seen from the outside
1162 int ncap; // number of captures, seen from the inside
1163#ifndef QT_NO_REGEXP_CCLASS
1164 QVector<QRegExpCharClass> cl; // array of character classes
1165#endif
1166#ifndef QT_NO_REGEXP_LOOKAHEAD
1167 QVector<QRegExpLookahead *> ahead; // array of lookaheads
1168#endif
1169#ifndef QT_NO_REGEXP_ANCHOR_ALT
1170 QVector<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors
1171#endif
1172#ifndef QT_NO_REGEXP_OPTIM
1173 bool caretAnchored; // does the regexp start with ^?
1174 bool trivial; // is the good-string all that needs to match?
1175#endif
1176 bool valid; // is the regular expression valid?
1177 Qt::CaseSensitivity cs; // case sensitive?
1178 bool greedyQuantifiers; // RegExp2?
1179 bool xmlSchemaExtensions;
1180#ifndef QT_NO_REGEXP_BACKREF
1181 int nbrefs; // number of back-references
1182#endif
1183
1184#ifndef QT_NO_REGEXP_OPTIM
1185 bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
1186
1187 int goodEarlyStart; // the index where goodStr can first occur in a match
1188 int goodLateStart; // the index where goodStr can last occur in a match
1189 QString goodStr; // the string that any match has to contain
1190
1191 int minl; // the minimum length of a match
1192 QVector<int> occ1; // first-occurrence array
1193#endif
1194
1195 /*
1196 The class Box is an abstraction for a regular expression
1197 fragment. It can also be seen as one node in the syntax tree of
1198 a regular expression with synthetized attributes.
1199
1200 Its interface is ugly for performance reasons.
1201 */
1202 class Box
1203 {
1204 public:
1205 Box(QRegExpEngine *engine);
1206 Box(const Box &b) { operator=(b); }
1207
1208 Box &operator=(const Box &b);
1209
1210 void clear() { operator=(Box(eng)); }
1211 void set(QChar ch);
1212 void set(const QRegExpCharClass &cc);
1213#ifndef QT_NO_REGEXP_BACKREF
1214 void set(int bref);
1215#endif
1216
1217 void cat(const Box &b);
1218 void orx(const Box &b);
1219 void plus(int atom);
1220 void opt();
1221 void catAnchor(int a);
1222#ifndef QT_NO_REGEXP_OPTIM
1223 void setupHeuristics();
1224#endif
1225
1226#if defined(QT_DEBUG)
1227 void dump() const;
1228#endif
1229
1230 private:
1231 void addAnchorsToEngine(const Box &to) const;
1232
1233 QRegExpEngine *eng; // the automaton under construction
1234 QVector<int> ls; // the left states (firstpos)
1235 QVector<int> rs; // the right states (lastpos)
1236 QMap<int, int> lanchors; // the left anchors
1237 QMap<int, int> ranchors; // the right anchors
1238 int skipanchors; // the anchors to match if the box is skipped
1239
1240#ifndef QT_NO_REGEXP_OPTIM
1241 int earlyStart; // the index where str can first occur
1242 int lateStart; // the index where str can last occur
1243 QString str; // a string that has to occur in any match
1244 QString leftStr; // a string occurring at the left of this box
1245 QString rightStr; // a string occurring at the right of this box
1246 int maxl; // the maximum length of this box (possibly InftyLen)
1247#endif
1248
1249 int minl; // the minimum length of this box
1250#ifndef QT_NO_REGEXP_OPTIM
1251 QVector<int> occ1; // first-occurrence array
1252#endif
1253 };
1254
1255 friend class Box;
1256
1257 void setupCategoriesRangeMap();
1258
1259 /*
1260 This is the lexical analyzer for regular expressions.
1261 */
1262 enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead,
1263 Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar,
1264 Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
1265 int getChar();
1266 int getEscape();
1267#ifndef QT_NO_REGEXP_INTERVAL
1268 int getRep(int def);
1269#endif
1270#ifndef QT_NO_REGEXP_LOOKAHEAD
1271 void skipChars(int n);
1272#endif
1273 void error(const char *msg);
1274 void startTokenizer(const QChar *rx, int len);
1275 int getToken();
1276
1277 const QChar *yyIn; // a pointer to the input regular expression pattern
1278 int yyPos0; // the position of yyTok in the input pattern
1279 int yyPos; // the position of the next character to read
1280 int yyLen; // the length of yyIn
1281 int yyCh; // the last character read
1282 QScopedPointer<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens
1283 int yyMinRep; // attribute for Tok_Quantifier
1284 int yyMaxRep; // ditto
1285 QString yyError; // syntax error or overflow during parsing?
1286
1287 /*
1288 This is the syntactic analyzer for regular expressions.
1289 */
1290 int parse(const QChar *rx, int len);
1291 void parseAtom(Box *box);
1292 void parseFactor(Box *box);
1293 void parseTerm(Box *box);
1294 void parseExpression(Box *box);
1295
1296 int yyTok; // the last token read
1297 bool yyMayCapture; // set this to false to disable capturing
1298 QHash<QByteArray, QPair<int, int> > categoriesRangeMap; // fast lookup hash for xml schema extensions
1299
1300 friend struct QRegExpMatchState;
1301};
1302
1303#ifndef QT_NO_REGEXP_LOOKAHEAD
1304/*
1305 The struct QRegExpLookahead represents a lookahead a la Perl (e.g.,
1306 (?=foo) and (?!bar)).
1307*/
1308struct QRegExpLookahead
1309{
1310 QRegExpEngine *eng; // NFA representing the embedded regular expression
1311 bool neg; // negative lookahead?
1312
1313 inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0)
1314 : eng(eng0), neg(neg0) { }
1315 inline ~QRegExpLookahead() { delete eng; }
1316};
1317#endif
1318
1319/*! \internal
1320 convert the pattern string to the RegExp syntax.
1321
1322 This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan
1323 */
1324Q_CORE_EXPORT QString qt_regexp_toCanonical(const QString &pattern, QRegExp::PatternSyntax patternSyntax)
1325{
1326 switch (patternSyntax) {
1327#ifndef QT_NO_REGEXP_WILDCARD
1328 case QRegExp::Wildcard:
1329 return wc2rx(pattern, false);
1330 break;
1331 case QRegExp::WildcardUnix:
1332 return wc2rx(pattern, true);
1333 break;
1334#endif
1335 case QRegExp::FixedString:
1336 return QRegExp::escape(pattern);
1337 break;
1338 case QRegExp::W3CXmlSchema11:
1339 default:
1340 return pattern;
1341 }
1342}
1343
1344QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key)
1345 : cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2),
1346 xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11)
1347{
1348 setup();
1349
1350 QString rx = qt_regexp_toCanonical(key.pattern, key.patternSyntax);
1351
1352 valid = (parse(rx.unicode(), rx.length()) == rx.length());
1353 if (!valid) {
1354#ifndef QT_NO_REGEXP_OPTIM
1355 trivial = false;
1356#endif
1357 error(RXERR_LEFTDELIM);
1358 }
1359}
1360
1361QRegExpEngine::~QRegExpEngine()
1362{
1363#ifndef QT_NO_REGEXP_LOOKAHEAD
1364 qDeleteAll(ahead);
1365#endif
1366}
1367
1368void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng)
1369{
1370 /*
1371 We use one QVector<int> for all the big data used a lot in
1372 matchHere() and friends.
1373 */
1374 int ns = eng->s.size(); // number of states
1375 int ncap = eng->ncap;
1376#ifndef QT_NO_REGEXP_OPTIM
1377 int newSlideTabSize = qMax(eng->minl + 1, 16);
1378#else
1379 int newSlideTabSize = 0;
1380#endif
1381 int numCaptures = eng->captureCount();
1382 int newCapturedSize = 2 + 2 * numCaptures;
1383 bigArray = q_check_ptr((int *)realloc(bigArray, ((3 + 4 * ncap) * ns + 4 * ncap + newSlideTabSize + newCapturedSize)*sizeof(int)));
1384
1385 // set all internal variables only _after_ bigArray is realloc'ed
1386 // to prevent a broken regexp in oom case
1387
1388 slideTabSize = newSlideTabSize;
1389 capturedSize = newCapturedSize;
1390 inNextStack = bigArray;
1391 memset(inNextStack, -1, ns * sizeof(int));
1392 curStack = inNextStack + ns;
1393 nextStack = inNextStack + 2 * ns;
1394
1395 curCapBegin = inNextStack + 3 * ns;
1396 nextCapBegin = curCapBegin + ncap * ns;
1397 curCapEnd = curCapBegin + 2 * ncap * ns;
1398 nextCapEnd = curCapBegin + 3 * ncap * ns;
1399
1400 tempCapBegin = curCapBegin + 4 * ncap * ns;
1401 tempCapEnd = tempCapBegin + ncap;
1402 capBegin = tempCapBegin + 2 * ncap;
1403 capEnd = tempCapBegin + 3 * ncap;
1404
1405 slideTab = tempCapBegin + 4 * ncap;
1406 captured = slideTab + slideTabSize;
1407 memset(captured, -1, capturedSize*sizeof(int));
1408 this->eng = eng;
1409}
1410
1411/*
1412 Tries to match in str and returns an array of (begin, length) pairs
1413 for captured text. If there is no match, all pairs are (-1, -1).
1414*/
1415void QRegExpMatchState::match(const QChar *str0, int len0, int pos0,
1416 bool minimal0, bool oneTest, int caretIndex)
1417{
1418 bool matched = false;
1419 QChar char_null;
1420
1421#ifndef QT_NO_REGEXP_OPTIM
1422 if (eng->trivial && !oneTest) {
1423 pos = qFindString(str0, len0, pos0, eng->goodStr.unicode(), eng->goodStr.length(), eng->cs);
1424 matchLen = eng->goodStr.length();
1425 matched = (pos != -1);
1426 } else
1427#endif
1428 {
1429 in = str0;
1430 if (in == 0)
1431 in = &char_null;
1432 pos = pos0;
1433 caretPos = caretIndex;
1434 len = len0;
1435 minimal = minimal0;
1436 matchLen = 0;
1437 oneTestMatchedLen = 0;
1438
1439 if (eng->valid && pos >= 0 && pos <= len) {
1440#ifndef QT_NO_REGEXP_OPTIM
1441 if (oneTest) {
1442 matched = matchHere();
1443 } else {
1444 if (pos <= len - eng->minl) {
1445 if (eng->caretAnchored) {
1446 matched = matchHere();
1447 } else if (eng->useGoodStringHeuristic) {
1448 matched = eng->goodStringMatch(*this);
1449 } else {
1450 matched = eng->badCharMatch(*this);
1451 }
1452 }
1453 }
1454#else
1455 matched = oneTest ? matchHere() : eng->bruteMatch(*this);
1456#endif
1457 }
1458 }
1459
1460 if (matched) {
1461 int *c = captured;
1462 *c++ = pos;
1463 *c++ = matchLen;
1464
1465 int numCaptures = (capturedSize - 2) >> 1;
1466#ifndef QT_NO_REGEXP_CAPTURE
1467 for (int i = 0; i < numCaptures; ++i) {
1468 int j = eng->captureForOfficialCapture.at(i);
1469 int len = capEnd[j] - capBegin[j];
1470 *c++ = (len > 0) ? pos + capBegin[j] : 0;
1471 *c++ = len;
1472 }
1473#endif
1474 } else {
1475 // we rely on 2's complement here
1476 memset(captured, -1, capturedSize * sizeof(int));
1477 }
1478}
1479
1480/*
1481 The three following functions add one state to the automaton and
1482 return the number of the state.
1483*/
1484
1485int QRegExpEngine::createState(QChar ch)
1486{
1487 return setupState(ch.unicode());
1488}
1489
1490int QRegExpEngine::createState(const QRegExpCharClass &cc)
1491{
1492#ifndef QT_NO_REGEXP_CCLASS
1493 int n = cl.size();
1494 cl += QRegExpCharClass(cc);
1495 return setupState(CharClassBit | n);
1496#else
1497 Q_UNUSED(cc);
1498 return setupState(CharClassBit);
1499#endif
1500}
1501
1502#ifndef QT_NO_REGEXP_BACKREF
1503int QRegExpEngine::createState(int bref)
1504{
1505 if (bref > nbrefs) {
1506 nbrefs = bref;
1507 if (nbrefs > MaxBackRefs) {
1508 error(RXERR_LIMIT);
1509 return 0;
1510 }
1511 }
1512 return setupState(BackRefBit | bref);
1513}
1514#endif
1515
1516/*
1517 The two following functions add a transition between all pairs of
1518 states (i, j) where i is found in from, and j is found in to.
1519
1520 Cat-transitions are distinguished from plus-transitions for
1521 capturing.
1522*/
1523
1524void QRegExpEngine::addCatTransitions(const QVector<int> &from, const QVector<int> &to)
1525{
1526 for (int i = 0; i < from.size(); i++)
1527 mergeInto(&s[from.at(i)].outs, to);
1528}
1529
1530#ifndef QT_NO_REGEXP_CAPTURE
1531void QRegExpEngine::addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom)
1532{
1533 for (int i = 0; i < from.size(); i++) {
1534 QRegExpAutomatonState &st = s[from.at(i)];
1535 const QVector<int> oldOuts = st.outs;
1536 mergeInto(&st.outs, to);
1537 if (f.at(atom).capture != QRegExpAtom::NoCapture) {
1538 for (int j = 0; j < to.size(); j++) {
1539 // ### st.reenter.contains(to.at(j)) check looks suspicious
1540 if (!st.reenter.contains(to.at(j)) &&
1541 qBinaryFind(oldOuts.constBegin(), oldOuts.constEnd(), to.at(j)) == oldOuts.end())
1542 st.reenter.insert(to.at(j), atom);
1543 }
1544 }
1545 }
1546}
1547#endif
1548
1549#ifndef QT_NO_REGEXP_ANCHOR_ALT
1550/*
1551 Returns an anchor that means a OR b.
1552*/
1553int QRegExpEngine::anchorAlternation(int a, int b)
1554{
1555 if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0)
1556 return a & b;
1557
1558 int n = aa.size();
1559#ifndef QT_NO_REGEXP_OPTIM
1560 if (n > 0 && aa.at(n - 1).a == a && aa.at(n - 1).b == b)
1561 return Anchor_Alternation | (n - 1);
1562#endif
1563
1564 QRegExpAnchorAlternation element = {a, b};
1565 aa.append(element);
1566 return Anchor_Alternation | n;
1567}
1568
1569/*
1570 Returns an anchor that means a AND b.
1571*/
1572int QRegExpEngine::anchorConcatenation(int a, int b)
1573{
1574 if (((a | b) & Anchor_Alternation) == 0)
1575 return a | b;
1576 if ((b & Anchor_Alternation) != 0)
1577 qSwap(a, b);
1578
1579 int aprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).a, b);
1580 int bprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).b, b);
1581 return anchorAlternation(aprime, bprime);
1582}
1583#endif
1584
1585/*
1586 Adds anchor a on a transition caracterised by its from state and
1587 its to state.
1588*/
1589void QRegExpEngine::addAnchors(int from, int to, int a)
1590{
1591 QRegExpAutomatonState &st = s[from];
1592 if (st.anchors.contains(to))
1593 a = anchorAlternation(st.anchors.value(to), a);
1594 st.anchors.insert(to, a);
1595}
1596
1597#ifndef QT_NO_REGEXP_OPTIM
1598/*
1599 This function chooses between the good-string and the bad-character
1600 heuristics. It computes two scores and chooses the heuristic with
1601 the highest score.
1602
1603 Here are some common-sense constraints on the scores that should be
1604 respected if the formulas are ever modified: (1) If goodStr is
1605 empty, the good-string heuristic scores 0. (2) If the regular
1606 expression is trivial, the good-string heuristic should be used.
1607 (3) If the search is case insensitive, the good-string heuristic
1608 should be used, unless it scores 0. (Case insensitivity turns all
1609 entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
1610 big, the good-string heuristic should score less.
1611*/
1612void QRegExpEngine::heuristicallyChooseHeuristic()
1613{
1614 if (minl == 0) {
1615 useGoodStringHeuristic = false;
1616 } else if (trivial) {
1617 useGoodStringHeuristic = true;
1618 } else {
1619 /*
1620 Magic formula: The good string has to constitute a good
1621 proportion of the minimum-length string, and appear at a
1622 more-or-less known index.
1623 */
1624 int goodStringScore = (64 * goodStr.length() / minl) -
1625 (goodLateStart - goodEarlyStart);
1626 /*
1627 Less magic formula: We pick some characters at random, and
1628 check whether they are good or bad.
1629 */
1630 int badCharScore = 0;
1631 int step = qMax(1, NumBadChars / 32);
1632 for (int i = 1; i < NumBadChars; i += step) {
1633 if (occ1.at(i) == NoOccurrence)
1634 badCharScore += minl;
1635 else
1636 badCharScore += occ1.at(i);
1637 }
1638 badCharScore /= minl;
1639 useGoodStringHeuristic = (goodStringScore > badCharScore);
1640 }
1641}
1642#endif
1643
1644#if defined(QT_DEBUG)
1645void QRegExpEngine::dump() const
1646{
1647 int i, j;
1648 qDebug("Case %ssensitive engine", cs ? "" : "in");
1649 qDebug(" States");
1650 for (i = 0; i < s.size(); i++) {
1651 qDebug(" %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "");
1652#ifndef QT_NO_REGEXP_CAPTURE
1653 if (nf > 0)
1654 qDebug(" in atom %d", s[i].atom);
1655#endif
1656 int m = s[i].match;
1657 if ((m & CharClassBit) != 0) {
1658 qDebug(" match character class %d", m ^ CharClassBit);
1659#ifndef QT_NO_REGEXP_CCLASS
1660 cl[m ^ CharClassBit].dump();
1661#else
1662 qDebug(" negative character class");
1663#endif
1664 } else if ((m & BackRefBit) != 0) {
1665 qDebug(" match back-reference %d", m ^ BackRefBit);
1666 } else if (m >= 0x20 && m <= 0x7e) {
1667 qDebug(" match 0x%.4x (%c)", m, m);
1668 } else {
1669 qDebug(" match 0x%.4x", m);
1670 }
1671 for (j = 0; j < s[i].outs.size(); j++) {
1672 int next = s[i].outs[j];
1673 qDebug(" -> %d", next);
1674 if (s[i].reenter.contains(next))
1675 qDebug(" [reenter %d]", s[i].reenter[next]);
1676 if (s[i].anchors.value(next) != 0)
1677 qDebug(" [anchors 0x%.8x]", s[i].anchors[next]);
1678 }
1679 }
1680#ifndef QT_NO_REGEXP_CAPTURE
1681 if (nf > 0) {
1682 qDebug(" Atom Parent Capture");
1683 for (i = 0; i < nf; i++) {
1684 if (f[i].capture == QRegExpAtom::NoCapture) {
1685 qDebug(" %6d %6d nil", i, f[i].parent);
1686 } else {
1687 int cap = f[i].capture;
1688 bool official = captureForOfficialCapture.contains(cap);
1689 qDebug(" %6d %6d %6d %s", i, f[i].parent, f[i].capture,
1690 official ? "official" : "");
1691 }
1692 }
1693 }
1694#endif
1695#ifndef QT_NO_REGEXP_ANCHOR_ALT
1696 for (i = 0; i < aa.size(); i++)
1697 qDebug(" Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, aa[i].b);
1698#endif
1699}
1700#endif
1701
1702void QRegExpEngine::setup()
1703{
1704 ref = 1;
1705#ifndef QT_NO_REGEXP_CAPTURE
1706 f.resize(32);
1707 nf = 0;
1708 cf = -1;
1709#endif
1710 officialncap = 0;
1711 ncap = 0;
1712#ifndef QT_NO_REGEXP_OPTIM
1713 caretAnchored = true;
1714 trivial = true;
1715#endif
1716 valid = false;
1717#ifndef QT_NO_REGEXP_BACKREF
1718 nbrefs = 0;
1719#endif
1720#ifndef QT_NO_REGEXP_OPTIM
1721 useGoodStringHeuristic = true;
1722 minl = 0;
1723 occ1.fill(0, NumBadChars);
1724#endif
1725}
1726
1727int QRegExpEngine::setupState(int match)
1728{
1729#ifndef QT_NO_REGEXP_CAPTURE
1730 s += QRegExpAutomatonState(cf, match);
1731#else
1732 s += QRegExpAutomatonState(match);
1733#endif
1734 return s.size() - 1;
1735}
1736
1737#ifndef QT_NO_REGEXP_CAPTURE
1738/*
1739 Functions startAtom() and finishAtom() should be called to delimit
1740 atoms. When a state is created, it is assigned to the current atom.
1741 The information is later used for capturing.
1742*/
1743int QRegExpEngine::startAtom(bool officialCapture)
1744{
1745 if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size())
1746 f.resize((nf + 1) << 1);
1747 f[nf].parent = cf;
1748 cf = nf++;
1749 f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture;
1750 return cf;
1751}
1752
1753void QRegExpEngine::finishAtom(int atom, bool needCapture)
1754{
1755 if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture)
1756 f[atom].capture = QRegExpAtom::UnofficialCapture;
1757 cf = f.at(atom).parent;
1758}
1759#endif
1760
1761#ifndef QT_NO_REGEXP_LOOKAHEAD
1762/*
1763 Creates a lookahead anchor.
1764*/
1765int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative)
1766{
1767 int n = ahead.size();
1768 if (n == MaxLookaheads) {
1769 error(RXERR_LIMIT);
1770 return 0;
1771 }
1772 ahead += new QRegExpLookahead(eng, negative);
1773 return Anchor_FirstLookahead << n;
1774}
1775#endif
1776
1777#ifndef QT_NO_REGEXP_CAPTURE
1778/*
1779 We want the longest leftmost captures.
1780*/
1781static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
1782 const int *end2)
1783{
1784 for (int i = 0; i < ncap; i++) {
1785 int delta = begin2[i] - begin1[i]; // it has to start early...
1786 if (delta == 0)
1787 delta = end1[i] - end2[i]; // ...and end late
1788
1789 if (delta != 0)
1790 return delta > 0;
1791 }
1792 return false;
1793}
1794#endif
1795
1796/*
1797 Returns true if anchor a matches at position pos + i in the input
1798 string, otherwise false.
1799*/
1800bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin)
1801{
1802 int j;
1803
1804#ifndef QT_NO_REGEXP_ANCHOR_ALT
1805 if ((a & QRegExpEngine::Anchor_Alternation) != 0)
1806 return testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).a, capBegin)
1807 || testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).b, capBegin);
1808#endif
1809
1810 if ((a & QRegExpEngine::Anchor_Caret) != 0) {
1811 if (pos + i != caretPos)
1812 return false;
1813 }
1814 if ((a & QRegExpEngine::Anchor_Dollar) != 0) {
1815 if (pos + i != len)
1816 return false;
1817 }
1818#ifndef QT_NO_REGEXP_ESCAPE
1819 if ((a & (QRegExpEngine::Anchor_Word | QRegExpEngine::Anchor_NonWord)) != 0) {
1820 bool before = false;
1821 bool after = false;
1822 if (pos + i != 0)
1823 before = isWord(in[pos + i - 1]);
1824 if (pos + i != len)
1825 after = isWord(in[pos + i]);
1826 if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after))
1827 return false;
1828 if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after))
1829 return false;
1830 }
1831#endif
1832#ifndef QT_NO_REGEXP_LOOKAHEAD
1833 if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) {
1834 const QVector<QRegExpLookahead *> &ahead = eng->ahead;
1835 for (j = 0; j < ahead.size(); j++) {
1836 if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) {
1837 QRegExpMatchState matchState;
1838 matchState.prepareForMatch(ahead[j]->eng);
1839 matchState.match(in + pos + i, len - pos - i, 0,
1840 true, true, matchState.caretPos - matchState.pos - i);
1841 if ((matchState.captured[0] == 0) == ahead[j]->neg)
1842 return false;
1843 }
1844 }
1845 }
1846#endif
1847#ifndef QT_NO_REGEXP_CAPTURE
1848#ifndef QT_NO_REGEXP_BACKREF
1849 for (j = 0; j < eng->nbrefs; j++) {
1850 if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) {
1851 int i = eng->captureForOfficialCapture.at(j);
1852 if (capBegin[i] != EmptyCapture)
1853 return false;
1854 }
1855 }
1856#endif
1857#endif
1858 return true;
1859}
1860
1861#ifndef QT_NO_REGEXP_OPTIM
1862/*
1863 The three following functions are what Jeffrey Friedl would call
1864 transmissions (or bump-alongs). Using one or the other should make
1865 no difference except in performance.
1866*/
1867
1868bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const
1869{
1870 int k = matchState.pos + goodEarlyStart;
1871 QStringMatcher matcher(goodStr.unicode(), goodStr.length(), cs);
1872 while ((k = matcher.indexIn(matchState.in, matchState.len, k)) != -1) {
1873 int from = k - goodLateStart;
1874 int to = k - goodEarlyStart;
1875 if (from > matchState.pos)
1876 matchState.pos = from;
1877
1878 while (matchState.pos <= to) {
1879 if (matchState.matchHere())
1880 return true;
1881 ++matchState.pos;
1882 }
1883 ++k;
1884 }
1885 return false;
1886}
1887
1888bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const
1889{
1890 int slideHead = 0;
1891 int slideNext = 0;
1892 int i;
1893 int lastPos = matchState.len - minl;
1894 memset(matchState.slideTab, 0, matchState.slideTabSize * sizeof(int));
1895
1896 /*
1897 Set up the slide table, used for the bad-character heuristic,
1898 using the table of first occurrence of each character.
1899 */
1900 for (i = 0; i < minl; i++) {
1901 int sk = occ1[BadChar(matchState.in[matchState.pos + i])];
1902 if (sk == NoOccurrence)
1903 sk = i + 1;
1904 if (sk > 0) {
1905 int k = i + 1 - sk;
1906 if (k < 0) {
1907 sk = i + 1;
1908 k = 0;
1909 }
1910 if (sk > matchState.slideTab[k])
1911 matchState.slideTab[k] = sk;
1912 }
1913 }
1914
1915 if (matchState.pos > lastPos)
1916 return false;
1917
1918 for (;;) {
1919 if (++slideNext >= matchState.slideTabSize)
1920 slideNext = 0;
1921 if (matchState.slideTab[slideHead] > 0) {
1922 if (matchState.slideTab[slideHead] - 1 > matchState.slideTab[slideNext])
1923 matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - 1;
1924 matchState.slideTab[slideHead] = 0;
1925 } else {
1926 if (matchState.matchHere())
1927 return true;
1928 }
1929
1930 if (matchState.pos == lastPos)
1931 break;
1932
1933 /*
1934 Update the slide table. This code has much in common with
1935 the initialization code.
1936 */
1937 int sk = occ1[BadChar(matchState.in[matchState.pos + minl])];
1938 if (sk == NoOccurrence) {
1939 matchState.slideTab[slideNext] = minl;
1940 } else if (sk > 0) {
1941 int k = slideNext + minl - sk;
1942 if (k >= matchState.slideTabSize)
1943 k -= matchState.slideTabSize;
1944 if (sk > matchState.slideTab[k])
1945 matchState.slideTab[k] = sk;
1946 }
1947 slideHead = slideNext;
1948 ++matchState.pos;
1949 }
1950 return false;
1951}
1952#else
1953bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const
1954{
1955 while (matchState.pos <= matchState.len) {
1956 if (matchState.matchHere())
1957 return true;
1958 ++matchState.pos;
1959 }
1960 return false;
1961}
1962#endif
1963
1964/*
1965 Here's the core of the engine. It tries to do a match here and now.
1966*/
1967bool QRegExpMatchState::matchHere()
1968{
1969 int ncur = 1, nnext = 0;
1970 int i = 0, j, k, m;
1971 bool stop = false;
1972
1973 matchLen = -1;
1974 oneTestMatchedLen = -1;
1975 curStack[0] = QRegExpEngine::InitialState;
1976
1977 int ncap = eng->ncap;
1978#ifndef QT_NO_REGEXP_CAPTURE
1979 if (ncap > 0) {
1980 for (j = 0; j < ncap; j++) {
1981 curCapBegin[j] = EmptyCapture;
1982 curCapEnd[j] = EmptyCapture;
1983 }
1984 }
1985#endif
1986
1987#ifndef QT_NO_REGEXP_BACKREF
1988 while ((ncur > 0 || !sleeping.isEmpty()) && i <= len - pos && !stop)
1989#else
1990 while (ncur > 0 && i <= len - pos && !stop)
1991#endif
1992 {
1993 int ch = (i < len - pos) ? in[pos + i].unicode() : 0;
1994 for (j = 0; j < ncur; j++) {
1995 int cur = curStack[j];
1996 const QRegExpAutomatonState &scur = eng->s.at(cur);
1997 const QVector<int> &outs = scur.outs;
1998 for (k = 0; k < outs.size(); k++) {
1999 int next = outs.at(k);
2000 const QRegExpAutomatonState &snext = eng->s.at(next);
2001 bool inside = true;
2002#if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2003 int needSomeSleep = 0;
2004#endif
2005
2006 /*
2007 First, check if the anchors are anchored properly.
2008 */
2009 int a = scur.anchors.value(next);
2010 if (a != 0 && !testAnchor(i, a, curCapBegin + j * ncap))
2011 inside = false;
2012
2013 /*
2014 If indeed they are, check if the input character is
2015 correct for this transition.
2016 */
2017 if (inside) {
2018 m = snext.match;
2019 if ((m & (QRegExpEngine::CharClassBit | QRegExpEngine::BackRefBit)) == 0) {
2020 if (eng->cs)
2021 inside = (m == ch);
2022 else
2023 inside = (QChar(m).toLower() == QChar(ch).toLower());
2024 } else if (next == QRegExpEngine::FinalState) {
2025 matchLen = i;
2026 stop = minimal;
2027 inside = true;
2028 } else if ((m & QRegExpEngine::CharClassBit) != 0) {
2029#ifndef QT_NO_REGEXP_CCLASS
2030 const QRegExpCharClass &cc = eng->cl.at(m ^ QRegExpEngine::CharClassBit);
2031 if (eng->cs)
2032 inside = cc.in(ch);
2033 else if (cc.negative())
2034 inside = cc.in(QChar(ch).toLower()) &&
2035 cc.in(QChar(ch).toUpper());
2036 else
2037 inside = cc.in(QChar(ch).toLower()) ||
2038 cc.in(QChar(ch).toUpper());
2039#endif
2040#if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2041 } else { /* ((m & QRegExpEngine::BackRefBit) != 0) */
2042 int bref = m ^ QRegExpEngine::BackRefBit;
2043 int ell = j * ncap + eng->captureForOfficialCapture.at(bref - 1);
2044
2045 inside = bref <= ncap && curCapBegin[ell] != EmptyCapture;
2046 if (inside) {
2047 if (eng->cs)
2048 inside = (in[pos + curCapBegin[ell]] == QChar(ch));
2049 else
2050 inside = (in[pos + curCapBegin[ell]].toLower()
2051 == QChar(ch).toLower());
2052 }
2053
2054 if (inside) {
2055 int delta;
2056 if (curCapEnd[ell] == EmptyCapture)
2057 delta = i - curCapBegin[ell];
2058 else
2059 delta = curCapEnd[ell] - curCapBegin[ell];
2060
2061 inside = (delta <= len - (pos + i));
2062 if (inside && delta > 1) {
2063 int n = 1;
2064 if (eng->cs) {
2065 while (n < delta) {
2066 if (in[pos + curCapBegin[ell] + n]
2067 != in[pos + i + n])
2068 break;
2069 ++n;
2070 }
2071 } else {
2072 while (n < delta) {
2073 QChar a = in[pos + curCapBegin[ell] + n];
2074 QChar b = in[pos + i + n];
2075 if (a.toLower() != b.toLower())
2076 break;
2077 ++n;
2078 }
2079 }
2080 inside = (n == delta);
2081 if (inside)
2082 needSomeSleep = delta - 1;
2083 }
2084 }
2085#endif
2086 }
2087 }
2088
2089 /*
2090 We must now update our data structures.
2091 */
2092 if (inside) {
2093#ifndef QT_NO_REGEXP_CAPTURE
2094 int *capBegin, *capEnd;
2095#endif
2096 /*
2097 If the next state was not encountered yet, all
2098 is fine.
2099 */
2100 if ((m = inNextStack[next]) == -1) {
2101 m = nnext++;
2102 nextStack[m] = next;
2103 inNextStack[next] = m;
2104#ifndef QT_NO_REGEXP_CAPTURE
2105 capBegin = nextCapBegin + m * ncap;
2106 capEnd = nextCapEnd + m * ncap;
2107
2108 /*
2109 Otherwise, we'll first maintain captures in
2110 temporary arrays, and decide at the end whether
2111 it's best to keep the previous capture zones or
2112 the new ones.
2113 */
2114 } else {
2115 capBegin = tempCapBegin;
2116 capEnd = tempCapEnd;
2117#endif
2118 }
2119
2120#ifndef QT_NO_REGEXP_CAPTURE
2121 /*
2122 Updating the capture zones is much of a task.
2123 */
2124 if (ncap > 0) {
2125 memcpy(capBegin, curCapBegin + j * ncap, ncap * sizeof(int));
2126 memcpy(capEnd, curCapEnd + j * ncap, ncap * sizeof(int));
2127 int c = scur.atom, n = snext.atom;
2128 int p = -1, q = -1;
2129 int cap;
2130
2131 /*
2132 Lemma 1. For any x in the range [0..nf), we
2133 have f[x].parent < x.
2134
2135 Proof. By looking at startAtom(), it is
2136 clear that cf < nf holds all the time, and
2137 thus that f[nf].parent < nf.
2138 */
2139
2140 /*
2141 If we are reentering an atom, we empty all
2142 capture zones inside it.
2143 */
2144 if ((q = scur.reenter.value(next)) != 0) {
2145 QBitArray b(eng->nf, false);
2146 b.setBit(q, true);
2147 for (int ell = q + 1; ell < eng->nf; ell++) {
2148 if (b.testBit(eng->f.at(ell).parent)) {
2149 b.setBit(ell, true);
2150 cap = eng->f.at(ell).capture;
2151 if (cap >= 0) {
2152 capBegin[cap] = EmptyCapture;
2153 capEnd[cap] = EmptyCapture;
2154 }
2155 }
2156 }
2157 p = eng->f.at(q).parent;
2158
2159 /*
2160 Otherwise, close the capture zones we are
2161 leaving. We are leaving f[c].capture,
2162 f[f[c].parent].capture,
2163 f[f[f[c].parent].parent].capture, ...,
2164 until f[x].capture, with x such that
2165 f[x].parent is the youngest common ancestor
2166 for c and n.
2167
2168 We go up along c's and n's ancestry until
2169 we find x.
2170 */
2171 } else {
2172 p = c;
2173 q = n;
2174 while (p != q) {
2175 if (p > q) {
2176 cap = eng->f.at(p).capture;
2177 if (cap >= 0) {
2178 if (capBegin[cap] == i) {
2179 capBegin[cap] = EmptyCapture;
2180 capEnd[cap] = EmptyCapture;
2181 } else {
2182 capEnd[cap] = i;
2183 }
2184 }
2185 p = eng->f.at(p).parent;
2186 } else {
2187 q = eng->f.at(q).parent;
2188 }
2189 }
2190 }
2191
2192 /*
2193 In any case, we now open the capture zones
2194 we are entering. We work upwards from n
2195 until we reach p (the parent of the atom we
2196 reenter or the youngest common ancestor).
2197 */
2198 while (n > p) {
2199 cap = eng->f.at(n).capture;
2200 if (cap >= 0) {
2201 capBegin[cap] = i;
2202 capEnd[cap] = EmptyCapture;
2203 }
2204 n = eng->f.at(n).parent;
2205 }
2206 /*
2207 If the next state was already in
2208 nextStack, we must choose carefully which
2209 capture zones we want to keep.
2210 */
2211 if (capBegin == tempCapBegin &&
2212 isBetterCapture(ncap, capBegin, capEnd, nextCapBegin + m * ncap,
2213 nextCapEnd + m * ncap)) {
2214 memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
2215 memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
2216 }
2217 }
2218#ifndef QT_NO_REGEXP_BACKREF
2219 /*
2220 We are done with updating the capture zones.
2221 It's now time to put the next state to sleep,
2222 if it needs to, and to remove it from
2223 nextStack.
2224 */
2225 if (needSomeSleep > 0) {
2226 QVector<int> zzZ(2 + 2 * ncap);
2227 zzZ[0] = i + needSomeSleep;
2228 zzZ[1] = next;
2229 if (ncap > 0) {
2230 memcpy(zzZ.data() + 2, capBegin, ncap * sizeof(int));
2231 memcpy(zzZ.data() + 2 + ncap, capEnd, ncap * sizeof(int));
2232 }
2233 inNextStack[nextStack[--nnext]] = -1;
2234 sleeping.append(zzZ);
2235 }
2236#endif
2237#endif
2238 }
2239 }
2240 }
2241#ifndef QT_NO_REGEXP_CAPTURE
2242 /*
2243 If we reached the final state, hurray! Copy the captured
2244 zone.
2245 */
2246 if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) {
2247 memcpy(capBegin, nextCapBegin + m * ncap, ncap * sizeof(int));
2248 memcpy(capEnd, nextCapEnd + m * ncap, ncap * sizeof(int));
2249 }
2250#ifndef QT_NO_REGEXP_BACKREF
2251 /*
2252 It's time to wake up the sleepers.
2253 */
2254 j = 0;
2255 while (j < sleeping.count()) {
2256 if (sleeping.at(j)[0] == i) {
2257 const QVector<int> &zzZ = sleeping.at(j);
2258 int next = zzZ[1];
2259 const int *capBegin = zzZ.data() + 2;
2260 const int *capEnd = zzZ.data() + 2 + ncap;
2261 bool copyOver = true;
2262
2263 if ((m = inNextStack[next]) == -1) {
2264 m = nnext++;
2265 nextStack[m] = next;
2266 inNextStack[next] = m;
2267 } else {
2268 copyOver = isBetterCapture(ncap, nextCapBegin + m * ncap, nextCapEnd + m * ncap,
2269 capBegin, capEnd);
2270 }
2271 if (copyOver) {
2272 memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
2273 memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
2274 }
2275
2276 sleeping.removeAt(j);
2277 } else {
2278 ++j;
2279 }
2280 }
2281#endif
2282#endif
2283 for (j = 0; j < nnext; j++)
2284 inNextStack[nextStack[j]] = -1;
2285
2286 // avoid needless iteration that confuses oneTestMatchedLen
2287 if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState
2288#ifndef QT_NO_REGEXP_BACKREF
2289 && sleeping.isEmpty()
2290#endif
2291 )
2292 stop = true;
2293
2294 qSwap(curStack, nextStack);
2295#ifndef QT_NO_REGEXP_CAPTURE
2296 qSwap(curCapBegin, nextCapBegin);
2297 qSwap(curCapEnd, nextCapEnd);
2298#endif
2299 ncur = nnext;
2300 nnext = 0;
2301 ++i;
2302 }
2303
2304#ifndef QT_NO_REGEXP_BACKREF
2305 /*
2306 If minimal matching is enabled, we might have some sleepers
2307 left.
2308 */
2309 if (!sleeping.isEmpty())
2310 sleeping.clear();
2311#endif
2312
2313 oneTestMatchedLen = i - 1;
2314 return (matchLen >= 0);
2315}
2316
2317#ifndef QT_NO_REGEXP_CCLASS
2318
2319QRegExpCharClass::QRegExpCharClass()
2320 : c(0), n(false)
2321{
2322#ifndef QT_NO_REGEXP_OPTIM
2323 occ1.fill(NoOccurrence, NumBadChars);
2324#endif
2325}
2326
2327QRegExpCharClass &QRegExpCharClass::operator=(const QRegExpCharClass &cc)
2328{
2329 c = cc.c;
2330 r = cc.r;
2331 n = cc.n;
2332#ifndef QT_NO_REGEXP_OPTIM
2333 occ1 = cc.occ1;
2334#endif
2335 return *this;
2336}
2337
2338void QRegExpCharClass::clear()
2339{
2340 c = 0;
2341 r.resize(0);
2342 n = false;
2343}
2344
2345void QRegExpCharClass::setNegative(bool negative)
2346{
2347 n = negative;
2348#ifndef QT_NO_REGEXP_OPTIM
2349 occ1.fill(0, NumBadChars);
2350#endif
2351}
2352
2353void QRegExpCharClass::addCategories(int cats)
2354{
2355 c |= cats;
2356#ifndef QT_NO_REGEXP_OPTIM
2357 occ1.fill(0, NumBadChars);
2358#endif
2359}
2360
2361void QRegExpCharClass::addRange(ushort from, ushort to)
2362{
2363 if (from > to)
2364 qSwap(from, to);
2365 int m = r.size();
2366 r.resize(m + 1);
2367 r[m].from = from;
2368 r[m].len = to - from + 1;
2369
2370#ifndef QT_NO_REGEXP_OPTIM
2371 int i;
2372
2373 if (to - from < NumBadChars) {
2374 if (from % NumBadChars <= to % NumBadChars) {
2375 for (i = from % NumBadChars; i <= to % NumBadChars; i++)
2376 occ1[i] = 0;
2377 } else {
2378 for (i = 0; i <= to % NumBadChars; i++)
2379 occ1[i] = 0;
2380 for (i = from % NumBadChars; i < NumBadChars; i++)
2381 occ1[i] = 0;
2382 }
2383 } else {
2384 occ1.fill(0, NumBadChars);
2385 }
2386#endif
2387}
2388
2389bool QRegExpCharClass::in(QChar ch) const
2390{
2391#ifndef QT_NO_REGEXP_OPTIM
2392 if (occ1.at(BadChar(ch)) == NoOccurrence)
2393 return n;
2394#endif
2395
2396 if (c != 0 && (c & (1 << (int)ch.category())) != 0)
2397 return !n;
2398
2399 const int uc = ch.unicode();
2400 int size = r.size();
2401
2402 for (int i = 0; i < size; ++i) {
2403 const QRegExpCharClassRange &range = r.at(i);
2404 if (uint(uc - range.from) < uint(r.at(i).len))
2405 return !n;
2406 }
2407 return n;
2408}
2409
2410#if defined(QT_DEBUG)
2411void QRegExpCharClass::dump() const
2412{
2413 int i;
2414 qDebug(" %stive character class", n ? "nega" : "posi");
2415#ifndef QT_NO_REGEXP_CCLASS
2416 if (c != 0)
2417 qDebug(" categories 0x%.8x", c);
2418#endif
2419 for (i = 0; i < r.size(); i++)
2420 qDebug(" 0x%.4x through 0x%.4x", r[i].from, r[i].from + r[i].len - 1);
2421}
2422#endif
2423#endif
2424
2425QRegExpEngine::Box::Box(QRegExpEngine *engine)
2426 : eng(engine), skipanchors(0)
2427#ifndef QT_NO_REGEXP_OPTIM
2428 , earlyStart(0), lateStart(0), maxl(0)
2429#endif
2430{
2431#ifndef QT_NO_REGEXP_OPTIM
2432 occ1.fill(NoOccurrence, NumBadChars);
2433#endif
2434 minl = 0;
2435}
2436
2437QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b)
2438{
2439 eng = b.eng;
2440 ls = b.ls;
2441 rs = b.rs;
2442 lanchors = b.lanchors;
2443 ranchors = b.ranchors;
2444 skipanchors = b.skipanchors;
2445#ifndef QT_NO_REGEXP_OPTIM
2446 earlyStart = b.earlyStart;
2447 lateStart = b.lateStart;
2448 str = b.str;
2449 leftStr = b.leftStr;
2450 rightStr = b.rightStr;
2451 maxl = b.maxl;
2452 occ1 = b.occ1;
2453#endif
2454 minl = b.minl;
2455 return *this;
2456}
2457
2458void QRegExpEngine::Box::set(QChar ch)
2459{
2460 ls.resize(1);
2461 ls[0] = eng->createState(ch);
2462 rs = ls;
2463#ifndef QT_NO_REGEXP_OPTIM
2464 str = ch;
2465 leftStr = ch;
2466 rightStr = ch;
2467 maxl = 1;
2468 occ1[BadChar(ch)] = 0;
2469#endif
2470 minl = 1;
2471}
2472
2473void QRegExpEngine::Box::set(const QRegExpCharClass &cc)
2474{
2475 ls.resize(1);
2476 ls[0] = eng->createState(cc);
2477 rs = ls;
2478#ifndef QT_NO_REGEXP_OPTIM
2479 maxl = 1;
2480 occ1 = cc.firstOccurrence();
2481#endif
2482 minl = 1;
2483}
2484
2485#ifndef QT_NO_REGEXP_BACKREF
2486void QRegExpEngine::Box::set(int bref)
2487{
2488 ls.resize(1);
2489 ls[0] = eng->createState(bref);
2490 rs = ls;
2491 if (bref >= 1 && bref <= MaxBackRefs)
2492 skipanchors = Anchor_BackRef0Empty << bref;
2493#ifndef QT_NO_REGEXP_OPTIM
2494 maxl = InftyLen;
2495#endif
2496 minl = 0;
2497}
2498#endif
2499
2500void QRegExpEngine::Box::cat(const Box &b)
2501{
2502 eng->addCatTransitions(rs, b.ls);
2503 addAnchorsToEngine(b);
2504 if (minl == 0) {
2505 lanchors.unite(b.lanchors);
2506 if (skipanchors != 0) {
2507 for (int i = 0; i < b.ls.size(); i++) {
2508 int a = eng->anchorConcatenation(lanchors.value(b.ls.at(i), 0), skipanchors);
2509 lanchors.insert(b.ls.at(i), a);
2510 }
2511 }
2512 mergeInto(&ls, b.ls);
2513 }
2514 if (b.minl == 0) {
2515 ranchors.unite(b.ranchors);
2516 if (b.skipanchors != 0) {
2517 for (int i = 0; i < rs.size(); i++) {
2518 int a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), b.skipanchors);
2519 ranchors.insert(rs.at(i), a);
2520 }
2521 }
2522 mergeInto(&rs, b.rs);
2523 } else {
2524 ranchors = b.ranchors;
2525 rs = b.rs;
2526 }
2527
2528#ifndef QT_NO_REGEXP_OPTIM
2529 if (maxl != InftyLen) {
2530 if (rightStr.length() + b.leftStr.length() >
2531 qMax(str.length(), b.str.length())) {
2532 earlyStart = minl - rightStr.length();
2533 lateStart = maxl - rightStr.length();
2534 str = rightStr + b.leftStr;
2535 } else if (b.str.length() > str.length()) {
2536 earlyStart = minl + b.earlyStart;
2537 lateStart = maxl + b.lateStart;
2538 str = b.str;
2539 }
2540 }
2541
2542 if (leftStr.length() == maxl)
2543 leftStr += b.leftStr;
2544
2545 if (b.rightStr.length() == b.maxl) {
2546 rightStr += b.rightStr;
2547 } else {
2548 rightStr = b.rightStr;
2549 }
2550
2551 if (maxl == InftyLen || b.maxl == InftyLen) {
2552 maxl = InftyLen;
2553 } else {
2554 maxl += b.maxl;
2555 }
2556
2557 for (int i = 0; i < NumBadChars; i++) {
2558 if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i))
2559 occ1[i] = minl + b.occ1.at(i);
2560 }
2561#endif
2562
2563 minl += b.minl;
2564 if (minl == 0)
2565 skipanchors = eng->anchorConcatenation(skipanchors, b.skipanchors);
2566 else
2567 skipanchors = 0;
2568}
2569
2570void QRegExpEngine::Box::orx(const Box &b)
2571{
2572 mergeInto(&ls, b.ls);
2573 lanchors.unite(b.lanchors);
2574 mergeInto(&rs, b.rs);
2575 ranchors.unite(b.ranchors);
2576
2577 if (b.minl == 0) {
2578 if (minl == 0)
2579 skipanchors = eng->anchorAlternation(skipanchors, b.skipanchors);
2580 else
2581 skipanchors = b.skipanchors;
2582 }
2583
2584#ifndef QT_NO_REGEXP_OPTIM
2585 for (int i = 0; i < NumBadChars; i++) {
2586 if (occ1.at(i) > b.occ1.at(i))
2587 occ1[i] = b.occ1.at(i);
2588 }
2589 earlyStart = 0;
2590 lateStart = 0;
2591 str = QString();
2592 leftStr = QString();
2593 rightStr = QString();
2594 if (b.maxl > maxl)
2595 maxl = b.maxl;
2596#endif
2597 if (b.minl < minl)
2598 minl = b.minl;
2599}
2600
2601void QRegExpEngine::Box::plus(int atom)
2602{
2603#ifndef QT_NO_REGEXP_CAPTURE
2604 eng->addPlusTransitions(rs, ls, atom);
2605#else
2606 Q_UNUSED(atom);
2607 eng->addCatTransitions(rs, ls);
2608#endif
2609 addAnchorsToEngine(*this);
2610#ifndef QT_NO_REGEXP_OPTIM
2611 maxl = InftyLen;
2612#endif
2613}
2614
2615void QRegExpEngine::Box::opt()
2616{
2617#ifndef QT_NO_REGEXP_OPTIM
2618 earlyStart = 0;
2619 lateStart = 0;
2620 str = QString();
2621 leftStr = QString();
2622 rightStr = QString();
2623#endif
2624 skipanchors = 0;
2625 minl = 0;
2626}
2627
2628void QRegExpEngine::Box::catAnchor(int a)
2629{
2630 if (a != 0) {
2631 for (int i = 0; i < rs.size(); i++) {
2632 a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), a);
2633 ranchors.insert(rs.at(i), a);
2634 }
2635 if (minl == 0)
2636 skipanchors = eng->anchorConcatenation(skipanchors, a);
2637 }
2638}
2639
2640#ifndef QT_NO_REGEXP_OPTIM
2641void QRegExpEngine::Box::setupHeuristics()
2642{
2643 eng->goodEarlyStart = earlyStart;
2644 eng->goodLateStart = lateStart;
2645 eng->goodStr = eng->cs ? str : str.toLower();
2646
2647 eng->minl = minl;
2648 if (eng->cs) {
2649 /*
2650 A regular expression such as 112|1 has occ1['2'] = 2 and minl =
2651 1 at this point. An entry of occ1 has to be at most minl or
2652 infinity for the rest of the algorithm to go well.
2653
2654 We waited until here before normalizing these cases (instead of
2655 doing it in Box::orx()) because sometimes things improve by
2656 themselves. Consider for example (112|1)34.
2657 */
2658 for (int i = 0; i < NumBadChars; i++) {
2659 if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl)
2660 occ1[i] = minl;
2661 }
2662 eng->occ1 = occ1;
2663 } else {
2664 eng->occ1.fill(0, NumBadChars);
2665 }
2666
2667 eng->heuristicallyChooseHeuristic();
2668}
2669#endif
2670
2671#if defined(QT_DEBUG)
2672void QRegExpEngine::Box::dump() const
2673{
2674 int i;
2675 qDebug("Box of at least %d character%s", minl, minl == 1 ? "" : "s");
2676 qDebug(" Left states:");
2677 for (i = 0; i < ls.size(); i++) {
2678 if (lanchors.value(ls[i], 0) == 0)
2679 qDebug(" %d", ls[i]);
2680 else
2681 qDebug(" %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]]);
2682 }
2683 qDebug(" Right states:");
2684 for (i = 0; i < rs.size(); i++) {
2685 if (ranchors.value(rs[i], 0) == 0)
2686 qDebug(" %d", rs[i]);
2687 else
2688 qDebug(" %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]]);
2689 }
2690 qDebug(" Skip anchors: 0x%.8x", skipanchors);
2691}
2692#endif
2693
2694void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const
2695{
2696 for (int i = 0; i < to.ls.size(); i++) {
2697 for (int j = 0; j < rs.size(); j++) {
2698 int a = eng->anchorConcatenation(ranchors.value(rs.at(j), 0),
2699 to.lanchors.value(to.ls.at(i), 0));
2700 eng->addAnchors(rs[j], to.ls[i], a);
2701 }
2702 }
2703}
2704
2705void QRegExpEngine::setupCategoriesRangeMap()
2706{
2707 categoriesRangeMap.insert("IsBasicLatin", qMakePair(0x0000, 0x007F));
2708 categoriesRangeMap.insert("IsLatin-1Supplement", qMakePair(0x0080, 0x00FF));
2709 categoriesRangeMap.insert("IsLatinExtended-A", qMakePair(0x0100, 0x017F));
2710 categoriesRangeMap.insert("IsLatinExtended-B", qMakePair(0x0180, 0x024F));
2711 categoriesRangeMap.insert("IsIPAExtensions", qMakePair(0x0250, 0x02AF));
2712 categoriesRangeMap.insert("IsSpacingModifierLetters", qMakePair(0x02B0, 0x02FF));
2713 categoriesRangeMap.insert("IsCombiningDiacriticalMarks", qMakePair(0x0300, 0x036F));
2714 categoriesRangeMap.insert("IsGreek", qMakePair(0x0370, 0x03FF));
2715 categoriesRangeMap.insert("IsCyrillic", qMakePair(0x0400, 0x04FF));
2716 categoriesRangeMap.insert("IsCyrillicSupplement", qMakePair(0x0500, 0x052F));
2717 categoriesRangeMap.insert("IsArmenian", qMakePair(0x0530, 0x058F));
2718 categoriesRangeMap.insert("IsHebrew", qMakePair(0x0590, 0x05FF));
2719 categoriesRangeMap.insert("IsArabic", qMakePair(0x0600, 0x06FF));
2720 categoriesRangeMap.insert("IsSyriac", qMakePair(0x0700, 0x074F));
2721 categoriesRangeMap.insert("IsArabicSupplement", qMakePair(0x0750, 0x077F));
2722 categoriesRangeMap.insert("IsThaana", qMakePair(0x0780, 0x07BF));
2723 categoriesRangeMap.insert("IsDevanagari", qMakePair(0x0900, 0x097F));
2724 categoriesRangeMap.insert("IsBengali", qMakePair(0x0980, 0x09FF));
2725 categoriesRangeMap.insert("IsGurmukhi", qMakePair(0x0A00, 0x0A7F));
2726 categoriesRangeMap.insert("IsGujarati", qMakePair(0x0A80, 0x0AFF));
2727 categoriesRangeMap.insert("IsOriya", qMakePair(0x0B00, 0x0B7F));
2728 categoriesRangeMap.insert("IsTamil", qMakePair(0x0B80, 0x0BFF));
2729 categoriesRangeMap.insert("IsTelugu", qMakePair(0x0C00, 0x0C7F));
2730 categoriesRangeMap.insert("IsKannada", qMakePair(0x0C80, 0x0CFF));
2731 categoriesRangeMap.insert("IsMalayalam", qMakePair(0x0D00, 0x0D7F));
2732 categoriesRangeMap.insert("IsSinhala", qMakePair(0x0D80, 0x0DFF));
2733 categoriesRangeMap.insert("IsThai", qMakePair(0x0E00, 0x0E7F));
2734 categoriesRangeMap.insert("IsLao", qMakePair(0x0E80, 0x0EFF));
2735 categoriesRangeMap.insert("IsTibetan", qMakePair(0x0F00, 0x0FFF));
2736 categoriesRangeMap.insert("IsMyanmar", qMakePair(0x1000, 0x109F));
2737 categoriesRangeMap.insert("IsGeorgian", qMakePair(0x10A0, 0x10FF));
2738 categoriesRangeMap.insert("IsHangulJamo", qMakePair(0x1100, 0x11FF));
2739 categoriesRangeMap.insert("IsEthiopic", qMakePair(0x1200, 0x137F));
2740 categoriesRangeMap.insert("IsEthiopicSupplement", qMakePair(0x1380, 0x139F));
2741 categoriesRangeMap.insert("IsCherokee", qMakePair(0x13A0, 0x13FF));
2742 categoriesRangeMap.insert("IsUnifiedCanadianAboriginalSyllabics", qMakePair(0x1400, 0x167F));
2743 categoriesRangeMap.insert("IsOgham", qMakePair(0x1680, 0x169F));
2744 categoriesRangeMap.insert("IsRunic", qMakePair(0x16A0, 0x16FF));
2745 categoriesRangeMap.insert("IsTagalog", qMakePair(0x1700, 0x171F));
2746 categoriesRangeMap.insert("IsHanunoo", qMakePair(0x1720, 0x173F));
2747 categoriesRangeMap.insert("IsBuhid", qMakePair(0x1740, 0x175F));
2748 categoriesRangeMap.insert("IsTagbanwa", qMakePair(0x1760, 0x177F));
2749 categoriesRangeMap.insert("IsKhmer", qMakePair(0x1780, 0x17FF));
2750 categoriesRangeMap.insert("IsMongolian", qMakePair(0x1800, 0x18AF));
2751 categoriesRangeMap.insert("IsLimbu", qMakePair(0x1900, 0x194F));
2752 categoriesRangeMap.insert("IsTaiLe", qMakePair(0x1950, 0x197F));
2753 categoriesRangeMap.insert("IsNewTaiLue", qMakePair(0x1980, 0x19DF));
2754 categoriesRangeMap.insert("IsKhmerSymbols", qMakePair(0x19E0, 0x19FF));
2755 categoriesRangeMap.insert("IsBuginese", qMakePair(0x1A00, 0x1A1F));
2756 categoriesRangeMap.insert("IsPhoneticExtensions", qMakePair(0x1D00, 0x1D7F));
2757 categoriesRangeMap.insert("IsPhoneticExtensionsSupplement", qMakePair(0x1D80, 0x1DBF));
2758 categoriesRangeMap.insert("IsCombiningDiacriticalMarksSupplement", qMakePair(0x1DC0, 0x1DFF));
2759 categoriesRangeMap.insert("IsLatinExtendedAdditional", qMakePair(0x1E00, 0x1EFF));
2760 categoriesRangeMap.insert("IsGreekExtended", qMakePair(0x1F00, 0x1FFF));
2761 categoriesRangeMap.insert("IsGeneralPunctuation", qMakePair(0x2000, 0x206F));
2762 categoriesRangeMap.insert("IsSuperscriptsandSubscripts", qMakePair(0x2070, 0x209F));
2763 categoriesRangeMap.insert("IsCurrencySymbols", qMakePair(0x20A0, 0x20CF));
2764 categoriesRangeMap.insert("IsCombiningMarksforSymbols", qMakePair(0x20D0, 0x20FF));
2765 categoriesRangeMap.insert("IsLetterlikeSymbols", qMakePair(0x2100, 0x214F));
2766 categoriesRangeMap.insert("IsNumberForms", qMakePair(0x2150, 0x218F));
2767 categoriesRangeMap.insert("IsArrows", qMakePair(0x2190, 0x21FF));
2768 categoriesRangeMap.insert("IsMathematicalOperators", qMakePair(0x2200, 0x22FF));
2769 categoriesRangeMap.insert("IsMiscellaneousTechnical", qMakePair(0x2300, 0x23FF));
2770 categoriesRangeMap.insert("IsControlPictures", qMakePair(0x2400, 0x243F));
2771 categoriesRangeMap.insert("IsOpticalCharacterRecognition", qMakePair(0x2440, 0x245F));
2772 categoriesRangeMap.insert("IsEnclosedAlphanumerics", qMakePair(0x2460, 0x24FF));
2773 categoriesRangeMap.insert("IsBoxDrawing", qMakePair(0x2500, 0x257F));
2774 categoriesRangeMap.insert("IsBlockElements", qMakePair(0x2580, 0x259F));
2775 categoriesRangeMap.insert("IsGeometricShapes", qMakePair(0x25A0, 0x25FF));
2776 categoriesRangeMap.insert("IsMiscellaneousSymbols", qMakePair(0x2600, 0x26FF));
2777 categoriesRangeMap.insert("IsDingbats", qMakePair(0x2700, 0x27BF));
2778 categoriesRangeMap.insert("IsMiscellaneousMathematicalSymbols-A", qMakePair(0x27C0, 0x27EF));
2779 categoriesRangeMap.insert("IsSupplementalArrows-A", qMakePair(0x27F0, 0x27FF));
2780 categoriesRangeMap.insert("IsBraillePatterns", qMakePair(0x2800, 0x28FF));
2781 categoriesRangeMap.insert("IsSupplementalArrows-B", qMakePair(0x2900, 0x297F));
2782 categoriesRangeMap.insert("IsMiscellaneousMathematicalSymbols-B", qMakePair(0x2980, 0x29FF));
2783 categoriesRangeMap.insert("IsSupplementalMathematicalOperators", qMakePair(0x2A00, 0x2AFF));
2784 categoriesRangeMap.insert("IsMiscellaneousSymbolsandArrows", qMakePair(0x2B00, 0x2BFF));
2785 categoriesRangeMap.insert("IsGlagolitic", qMakePair(0x2C00, 0x2C5F));
2786 categoriesRangeMap.insert("IsCoptic", qMakePair(0x2C80, 0x2CFF));
2787 categoriesRangeMap.insert("IsGeorgianSupplement", qMakePair(0x2D00, 0x2D2F));
2788 categoriesRangeMap.insert("IsTifinagh", qMakePair(0x2D30, 0x2D7F));
2789 categoriesRangeMap.insert("IsEthiopicExtended", qMakePair(0x2D80, 0x2DDF));
2790 categoriesRangeMap.insert("IsSupplementalPunctuation", qMakePair(0x2E00, 0x2E7F));
2791 categoriesRangeMap.insert("IsCJKRadicalsSupplement", qMakePair(0x2E80, 0x2EFF));
2792 categoriesRangeMap.insert("IsKangxiRadicals", qMakePair(0x2F00, 0x2FDF));
2793 categoriesRangeMap.insert("IsIdeographicDescriptionCharacters", qMakePair(0x2FF0, 0x2FFF));
2794 categoriesRangeMap.insert("IsCJKSymbolsandPunctuation", qMakePair(0x3000, 0x303F));
2795 categoriesRangeMap.insert("IsHiragana", qMakePair(0x3040, 0x309F));
2796 categoriesRangeMap.insert("IsKatakana", qMakePair(0x30A0, 0x30FF));
2797 categoriesRangeMap.insert("IsBopomofo", qMakePair(0x3100, 0x312F));
2798 categoriesRangeMap.insert("IsHangulCompatibilityJamo", qMakePair(0x3130, 0x318F));
2799 categoriesRangeMap.insert("IsKanbun", qMakePair(0x3190, 0x319F));
2800 categoriesRangeMap.insert("IsBopomofoExtended", qMakePair(0x31A0, 0x31BF));
2801 categoriesRangeMap.insert("IsCJKStrokes", qMakePair(0x31C0, 0x31EF));
2802 categoriesRangeMap.insert("IsKatakanaPhoneticExtensions", qMakePair(0x31F0, 0x31FF));
2803 categoriesRangeMap.insert("IsEnclosedCJKLettersandMonths", qMakePair(0x3200, 0x32FF));
2804 categoriesRangeMap.insert("IsCJKCompatibility", qMakePair(0x3300, 0x33FF));
2805 categoriesRangeMap.insert("IsCJKUnifiedIdeographsExtensionA", qMakePair(0x3400, 0x4DB5));
2806 categoriesRangeMap.insert("IsYijingHexagramSymbols", qMakePair(0x4DC0, 0x4DFF));
2807 categoriesRangeMap.insert("IsCJKUnifiedIdeographs", qMakePair(0x4E00, 0x9FFF));
2808 categoriesRangeMap.insert("IsYiSyllables", qMakePair(0xA000, 0xA48F));
2809 categoriesRangeMap.insert("IsYiRadicals", qMakePair(0xA490, 0xA4CF));
2810 categoriesRangeMap.insert("IsModifierToneLetters", qMakePair(0xA700, 0xA71F));
2811 categoriesRangeMap.insert("IsSylotiNagri", qMakePair(0xA800, 0xA82F));
2812 categoriesRangeMap.insert("IsHangulSyllables", qMakePair(0xAC00, 0xD7A3));
2813 categoriesRangeMap.insert("IsPrivateUse", qMakePair(0xE000, 0xF8FF));
2814 categoriesRangeMap.insert("IsCJKCompatibilityIdeographs", qMakePair(0xF900, 0xFAFF));
2815 categoriesRangeMap.insert("IsAlphabeticPresentationForms", qMakePair(0xFB00, 0xFB4F));
2816 categoriesRangeMap.insert("IsArabicPresentationForms-A", qMakePair(0xFB50, 0xFDFF));
2817 categoriesRangeMap.insert("IsVariationSelectors", qMakePair(0xFE00, 0xFE0F));
2818 categoriesRangeMap.insert("IsVerticalForms", qMakePair(0xFE10, 0xFE1F));
2819 categoriesRangeMap.insert("IsCombiningHalfMarks", qMakePair(0xFE20, 0xFE2F));
2820 categoriesRangeMap.insert("IsCJKCompatibilityForms", qMakePair(0xFE30, 0xFE4F));
2821 categoriesRangeMap.insert("IsSmallFormVariants", qMakePair(0xFE50, 0xFE6F));
2822 categoriesRangeMap.insert("IsArabicPresentationForms-B", qMakePair(0xFE70, 0xFEFF));
2823 categoriesRangeMap.insert("IsHalfwidthandFullwidthForms", qMakePair(0xFF00, 0xFFEF));
2824 categoriesRangeMap.insert("IsSpecials", qMakePair(0xFFF0, 0xFFFF));
2825 categoriesRangeMap.insert("IsLinearBSyllabary", qMakePair(0x10000, 0x1007F));
2826 categoriesRangeMap.insert("IsLinearBIdeograms", qMakePair(0x10080, 0x100FF));
2827 categoriesRangeMap.insert("IsAegeanNumbers", qMakePair(0x10100, 0x1013F));
2828 categoriesRangeMap.insert("IsAncientGreekNumbers", qMakePair(0x10140, 0x1018F));
2829 categoriesRangeMap.insert("IsOldItalic", qMakePair(0x10300, 0x1032F));
2830 categoriesRangeMap.insert("IsGothic", qMakePair(0x10330, 0x1034F));
2831 categoriesRangeMap.insert("IsUgaritic", qMakePair(0x10380, 0x1039F));
2832 categoriesRangeMap.insert("IsOldPersian", qMakePair(0x103A0, 0x103DF));
2833 categoriesRangeMap.insert("IsDeseret", qMakePair(0x10400, 0x1044F));
2834 categoriesRangeMap.insert("IsShavian", qMakePair(0x10450, 0x1047F));
2835 categoriesRangeMap.insert("IsOsmanya", qMakePair(0x10480, 0x104AF));
2836 categoriesRangeMap.insert("IsCypriotSyllabary", qMakePair(0x10800, 0x1083F));
2837 categoriesRangeMap.insert("IsKharoshthi", qMakePair(0x10A00, 0x10A5F));
2838 categoriesRangeMap.insert("IsByzantineMusicalSymbols", qMakePair(0x1D000, 0x1D0FF));
2839 categoriesRangeMap.insert("IsMusicalSymbols", qMakePair(0x1D100, 0x1D1FF));
2840 categoriesRangeMap.insert("IsAncientGreekMusicalNotation", qMakePair(0x1D200, 0x1D24F));
2841 categoriesRangeMap.insert("IsTaiXuanJingSymbols", qMakePair(0x1D300, 0x1D35F));
2842 categoriesRangeMap.insert("IsMathematicalAlphanumericSymbols", qMakePair(0x1D400, 0x1D7FF));
2843 categoriesRangeMap.insert("IsCJKUnifiedIdeographsExtensionB", qMakePair(0x20000, 0x2A6DF));
2844 categoriesRangeMap.insert("IsCJKCompatibilityIdeographsSupplement", qMakePair(0x2F800, 0x2FA1F));
2845 categoriesRangeMap.insert("IsTags", qMakePair(0xE0000, 0xE007F));
2846 categoriesRangeMap.insert("IsVariationSelectorsSupplement", qMakePair(0xE0100, 0xE01EF));
2847 categoriesRangeMap.insert("IsSupplementaryPrivateUseArea-A", qMakePair(0xF0000, 0xFFFFF));
2848 categoriesRangeMap.insert("IsSupplementaryPrivateUseArea-B", qMakePair(0x100000, 0x10FFFF));
2849}
2850
2851int QRegExpEngine::getChar()
2852{
2853 return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode();
2854}
2855
2856int QRegExpEngine::getEscape()
2857{
2858#ifndef QT_NO_REGEXP_ESCAPE
2859 const char tab[] = "afnrtv"; // no b, as \b means word boundary
2860 const char backTab[] = "\a\f\n\r\t\v";
2861 ushort low;
2862 int i;
2863#endif
2864 ushort val;
2865 int prevCh = yyCh;
2866
2867 if (prevCh == EOS) {
2868 error(RXERR_END);
2869 return Tok_Char | '\\';
2870 }
2871 yyCh = getChar();
2872#ifndef QT_NO_REGEXP_ESCAPE
2873 if ((prevCh & ~0xff) == 0) {
2874 const char *p = strchr(tab, prevCh);
2875 if (p != 0)
2876 return Tok_Char | backTab[p - tab];
2877 }
2878#endif
2879
2880 switch (prevCh) {
2881#ifndef QT_NO_REGEXP_ESCAPE
2882 case '0':
2883 val = 0;
2884 for (i = 0; i < 3; i++) {
2885 if (yyCh >= '0' && yyCh <= '7')
2886 val = (val << 3) | (yyCh - '0');
2887 else
2888 break;
2889 yyCh = getChar();
2890 }
2891 if ((val & ~0377) != 0)
2892 error(RXERR_OCTAL);
2893 return Tok_Char | val;
2894#endif
2895#ifndef QT_NO_REGEXP_ESCAPE
2896 case 'B':
2897 return Tok_NonWord;
2898#endif
2899#ifndef QT_NO_REGEXP_CCLASS
2900 case 'D':
2901 // see QChar::isDigit()
2902 yyCharClass->addCategories(0x7fffffef);
2903 return Tok_CharClass;
2904 case 'S':
2905 // see QChar::isSpace()
2906 yyCharClass->addCategories(0x7ffff87f);
2907 yyCharClass->addRange(0x0000, 0x0008);
2908 yyCharClass->addRange(0x000e, 0x001f);
2909 yyCharClass->addRange(0x007f, 0x009f);
2910 return Tok_CharClass;
2911 case 'W':
2912 // see QChar::isLetterOrNumber() and QChar::isMark()
2913 yyCharClass->addCategories(0x7fe07f81);
2914 yyCharClass->addRange(0x203f, 0x2040);
2915 yyCharClass->addSingleton(0x2040);
2916 yyCharClass->addSingleton(0x2054);
2917 yyCharClass->addSingleton(0x30fb);
2918 yyCharClass->addRange(0xfe33, 0xfe34);
2919 yyCharClass->addRange(0xfe4d, 0xfe4f);
2920 yyCharClass->addSingleton(0xff3f);
2921 yyCharClass->addSingleton(0xff65);
2922 return Tok_CharClass;
2923#endif
2924#ifndef QT_NO_REGEXP_ESCAPE
2925 case 'b':
2926 return Tok_Word;
2927#endif
2928#ifndef QT_NO_REGEXP_CCLASS
2929 case 'd':
2930 // see QChar::isDigit()
2931 yyCharClass->addCategories(0x00000010);
2932 return Tok_CharClass;
2933 case 's':
2934 // see QChar::isSpace()
2935 yyCharClass->addCategories(0x00000380);
2936 yyCharClass->addRange(0x0009, 0x000d);
2937 return Tok_CharClass;
2938 case 'w':
2939 // see QChar::isLetterOrNumber() and QChar::isMark()
2940 yyCharClass->addCategories(0x000f807e);
2941 yyCharClass->addSingleton(0x005f); // '_'
2942 return Tok_CharClass;
2943 case 'I':
2944 if (xmlSchemaExtensions) {
2945 yyCharClass->setNegative(!yyCharClass->negative());
2946 // fall through
2947 }
2948 case 'i':
2949 if (xmlSchemaExtensions) {
2950 yyCharClass->addCategories(0x000f807e);
2951 yyCharClass->addSingleton(0x003a); // ':'
2952 yyCharClass->addSingleton(0x005f); // '_'
2953 yyCharClass->addRange(0x0041, 0x005a); // [A-Z]
2954 yyCharClass->addRange(0x0061, 0x007a); // [a-z]
2955 yyCharClass->addRange(0xc0, 0xd6);
2956 yyCharClass->addRange(0xd8, 0xf6);
2957 yyCharClass->addRange(0xf8, 0x2ff);
2958 yyCharClass->addRange(0x370, 0x37d);
2959 yyCharClass->addRange(0x37f, 0x1fff);
2960 yyCharClass->addRange(0x200c, 0x200d);
2961 yyCharClass->addRange(0x2070, 0x218f);
2962 yyCharClass->addRange(0x2c00, 0x2fef);
2963 yyCharClass->addRange(0x3001, 0xd7ff);
2964 yyCharClass->addRange(0xf900, 0xfdcf);
2965 yyCharClass->addRange(0xfdf0, 0xfffd);
2966 yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);
2967 }
2968 return Tok_CharClass;
2969 case 'C':
2970 if (xmlSchemaExtensions) {
2971 yyCharClass->setNegative(!yyCharClass->negative());
2972 // fall through
2973 }
2974 case 'c':
2975 if (xmlSchemaExtensions) {
2976 yyCharClass->addCategories(0x000f807e);
2977 yyCharClass->addSingleton(0x002d); // '-'
2978 yyCharClass->addSingleton(0x002e); // '.'
2979 yyCharClass->addSingleton(0x003a); // ':'
2980 yyCharClass->addSingleton(0x005f); // '_'
2981 yyCharClass->addSingleton(0xb7);
2982 yyCharClass->addRange(0x0030, 0x0039); // [0-9]
2983 yyCharClass->addRange(0x0041, 0x005a); // [A-Z]
2984 yyCharClass->addRange(0x0061, 0x007a); // [a-z]
2985 yyCharClass->addRange(0xc0, 0xd6);
2986 yyCharClass->addRange(0xd8, 0xf6);
2987 yyCharClass->addRange(0xf8, 0x2ff);
2988 yyCharClass->addRange(0x370, 0x37d);
2989 yyCharClass->addRange(0x37f, 0x1fff);
2990 yyCharClass->addRange(0x200c, 0x200d);
2991 yyCharClass->addRange(0x2070, 0x218f);
2992 yyCharClass->addRange(0x2c00, 0x2fef);
2993 yyCharClass->addRange(0x3001, 0xd7ff);
2994 yyCharClass->addRange(0xf900, 0xfdcf);
2995 yyCharClass->addRange(0xfdf0, 0xfffd);
2996 yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);
2997 yyCharClass->addRange(0x0300, 0x036f);
2998 yyCharClass->addRange(0x203f, 0x2040);
2999 }
3000 return Tok_CharClass;
3001 case 'P':
3002 if (xmlSchemaExtensions) {
3003 yyCharClass->setNegative(!yyCharClass->negative());
3004 // fall through
3005 }
3006 case 'p':
3007 if (xmlSchemaExtensions) {
3008 if (yyCh != '{') {
3009 error(RXERR_CHARCLASS);
3010 return Tok_CharClass;
3011 }
3012
3013 QByteArray category;
3014 yyCh = getChar();
3015 while (yyCh != '}') {
3016 if (yyCh == EOS) {
3017 error(RXERR_END);
3018 return Tok_CharClass;
3019 }
3020 category.append(yyCh);
3021 yyCh = getChar();
3022 }
3023 yyCh = getChar(); // skip closing '}'
3024
3025 if (category == "M") {
3026 yyCharClass->addCategories(0x0000000e);
3027 } else if (category == "Mn") {
3028 yyCharClass->addCategories(0x00000002);
3029 } else if (category == "Mc") {
3030 yyCharClass->addCategories(0x00000004);
3031 } else if (category == "Me") {
3032 yyCharClass->addCategories(0x00000008);
3033 } else if (category == "N") {
3034 yyCharClass->addCategories(0x00000070);
3035 } else if (category == "Nd") {
3036 yyCharClass->addCategories(0x00000010);
3037 } else if (category == "Nl") {
3038 yyCharClass->addCategories(0x00000020);
3039 } else if (category == "No") {
3040 yyCharClass->addCategories(0x00000040);
3041 } else if (category == "Z") {
3042 yyCharClass->addCategories(0x00000380);
3043 } else if (category == "Zs") {
3044 yyCharClass->addCategories(0x00000080);
3045 } else if (category == "Zl") {
3046 yyCharClass->addCategories(0x00000100);
3047 } else if (category == "Zp") {
3048 yyCharClass->addCategories(0x00000200);
3049 } else if (category == "C") {
3050 yyCharClass->addCategories(0x00006c00);
3051 } else if (category == "Cc") {
3052 yyCharClass->addCategories(0x00000400);
3053 } else if (category == "Cf") {
3054 yyCharClass->addCategories(0x00000800);
3055 } else if (category == "Cs") {
3056 yyCharClass->addCategories(0x00001000);
3057 } else if (category == "Co") {
3058 yyCharClass->addCategories(0x00002000);
3059 } else if (category == "Cn") {
3060 yyCharClass->addCategories(0x00004000);
3061 } else if (category == "L") {
3062 yyCharClass->addCategories(0x000f8000);
3063 } else if (category == "Lu") {
3064 yyCharClass->addCategories(0x00008000);
3065 } else if (category == "Ll") {
3066 yyCharClass->addCategories(0x00010000);
3067 } else if (category == "Lt") {
3068 yyCharClass->addCategories(0x00020000);
3069 } else if (category == "Lm") {
3070 yyCharClass->addCategories(0x00040000);
3071 } else if (category == "Lo") {
3072 yyCharClass->addCategories(0x00080000);
3073 } else if (category == "P") {
3074 yyCharClass->addCategories(0x4f580780);
3075 } else if (category == "Pc") {
3076 yyCharClass->addCategories(0x00100000);
3077 } else if (category == "Pd") {
3078 yyCharClass->addCategories(0x00200000);
3079 } else if (category == "Ps") {
3080 yyCharClass->addCategories(0x00400000);
3081 } else if (category == "Pe") {
3082 yyCharClass->addCategories(0x00800000);
3083 } else if (category == "Pi") {
3084 yyCharClass->addCategories(0x01000000);
3085 } else if (category == "Pf") {
3086 yyCharClass->addCategories(0x02000000);
3087 } else if (category == "Po") {
3088 yyCharClass->addCategories(0x04000000);
3089 } else if (category == "S") {
3090 yyCharClass->addCategories(0x78000000);
3091 } else if (category == "Sm") {
3092 yyCharClass->addCategories(0x08000000);
3093 } else if (category == "Sc") {
3094 yyCharClass->addCategories(0x10000000);
3095 } else if (category == "Sk") {
3096 yyCharClass->addCategories(0x20000000);
3097 } else if (category == "So") {
3098 yyCharClass->addCategories(0x40000000);
3099 } else if (category.startsWith("Is")) {
3100 if (categoriesRangeMap.isEmpty())
3101 setupCategoriesRangeMap();
3102
3103 if (categoriesRangeMap.contains(category)) {
3104 const QPair<int, int> range = categoriesRangeMap.value(category);
3105 yyCharClass->addRange(range.first, range.second);
3106 } else {
3107 error(RXERR_CATEGORY);
3108 }
3109 } else {
3110 error(RXERR_CATEGORY);
3111 }
3112 }
3113 return Tok_CharClass;
3114#endif
3115#ifndef QT_NO_REGEXP_ESCAPE
3116 case 'x':
3117 val = 0;
3118 for (i = 0; i < 4; i++) {
3119 low = QChar(yyCh).toLower().unicode();
3120 if (low >= '0' && low <= '9')
3121 val = (val << 4) | (low - '0');
3122 else if (low >= 'a' && low <= 'f')
3123 val = (val << 4) | (low - 'a' + 10);
3124 else
3125 break;
3126 yyCh = getChar();
3127 }
3128 return Tok_Char | val;
3129#endif
3130 default:
3131 if (prevCh >= '1' && prevCh <= '9') {
3132#ifndef QT_NO_REGEXP_BACKREF
3133 val = prevCh - '0';
3134 while (yyCh >= '0' && yyCh <= '9') {
3135 val = (val * 10) + (yyCh - '0');
3136 yyCh = getChar();
3137 }
3138 return Tok_BackRef | val;
3139#else
3140 error(RXERR_DISABLED);
3141#endif
3142 }
3143 return Tok_Char | prevCh;
3144 }
3145}
3146
3147#ifndef QT_NO_REGEXP_INTERVAL
3148int QRegExpEngine::getRep(int def)
3149{
3150 if (yyCh >= '0' && yyCh <= '9') {
3151 int rep = 0;
3152 do {
3153 rep = 10 * rep + yyCh - '0';
3154 if (rep >= InftyRep) {
3155 error(RXERR_REPETITION);
3156 rep = def;
3157 }
3158 yyCh = getChar();
3159 } while (yyCh >= '0' && yyCh <= '9');
3160 return rep;
3161 } else {
3162 return def;
3163 }
3164}
3165#endif
3166
3167#ifndef QT_NO_REGEXP_LOOKAHEAD
3168void QRegExpEngine::skipChars(int n)
3169{
3170 if (n > 0) {
3171 yyPos += n - 1;
3172 yyCh = getChar();
3173 }
3174}
3175#endif
3176
3177void QRegExpEngine::error(const char *msg)
3178{
3179 if (yyError.isEmpty())
3180 yyError = QLatin1String(msg);
3181}
3182
3183void QRegExpEngine::startTokenizer(const QChar *rx, int len)
3184{
3185 yyIn = rx;
3186 yyPos0 = 0;
3187 yyPos = 0;
3188 yyLen = len;
3189 yyCh = getChar();
3190 yyCharClass.reset(new QRegExpCharClass);
3191 yyMinRep = 0;
3192 yyMaxRep = 0;
3193 yyError = QString();
3194}
3195
3196int QRegExpEngine::getToken()
3197{
3198#ifndef QT_NO_REGEXP_CCLASS
3199 ushort pendingCh = 0;
3200 bool charPending;
3201 bool rangePending;
3202 int tok;
3203#endif
3204 int prevCh = yyCh;
3205
3206 yyPos0 = yyPos - 1;
3207#ifndef QT_NO_REGEXP_CCLASS
3208 yyCharClass->clear();
3209#endif
3210 yyMinRep = 0;
3211 yyMaxRep = 0;
3212 yyCh = getChar();
3213
3214 switch (prevCh) {
3215 case EOS:
3216 yyPos0 = yyPos;
3217 return Tok_Eos;
3218 case '$':
3219 return Tok_Dollar;
3220 case '(':
3221 if (yyCh == '?') {
3222 prevCh = getChar();
3223 yyCh = getChar();
3224 switch (prevCh) {
3225#ifndef QT_NO_REGEXP_LOOKAHEAD
3226 case '!':
3227 return Tok_NegLookahead;
3228 case '=':
3229 return Tok_PosLookahead;
3230#endif
3231 case ':':
3232 return Tok_MagicLeftParen;
3233 default:
3234 error(RXERR_LOOKAHEAD);
3235 return Tok_MagicLeftParen;
3236 }
3237 } else {
3238 return Tok_LeftParen;
3239 }
3240 case ')':
3241 return Tok_RightParen;
3242 case '*':
3243 yyMinRep = 0;
3244 yyMaxRep = InftyRep;
3245 return Tok_Quantifier;
3246 case '+':
3247 yyMinRep = 1;
3248 yyMaxRep = InftyRep;
3249 return Tok_Quantifier;
3250 case '.':
3251#ifndef QT_NO_REGEXP_CCLASS
3252 yyCharClass->setNegative(true);
3253#endif
3254 return Tok_CharClass;
3255 case '?':
3256 yyMinRep = 0;
3257 yyMaxRep = 1;
3258 return Tok_Quantifier;
3259 case '[':
3260#ifndef QT_NO_REGEXP_CCLASS
3261 if (yyCh == '^') {
3262 yyCharClass->setNegative(true);
3263 yyCh = getChar();
3264 }
3265 charPending = false;
3266 rangePending = false;
3267 do {
3268 if (yyCh == '-' && charPending && !rangePending) {
3269 rangePending = true;
3270 yyCh = getChar();
3271 } else {
3272 if (charPending && !rangePending) {
3273 yyCharClass->addSingleton(pendingCh);
3274 charPending = false;
3275 }
3276 if (yyCh == '\\') {
3277 yyCh = getChar();
3278 tok = getEscape();
3279 if (tok == Tok_Word)
3280 tok = '\b';
3281 } else {
3282 tok = Tok_Char | yyCh;
3283 yyCh = getChar();
3284 }
3285 if (tok == Tok_CharClass) {
3286 if (rangePending) {
3287 yyCharClass->addSingleton('-');
3288 yyCharClass->addSingleton(pendingCh);
3289 charPending = false;
3290 rangePending = false;
3291 }
3292 } else if ((tok & Tok_Char) != 0) {
3293 if (rangePending) {
3294 yyCharClass->addRange(pendingCh, tok ^ Tok_Char);
3295 charPending = false;
3296 rangePending = false;
3297 } else {
3298 pendingCh = tok ^ Tok_Char;
3299 charPending = true;
3300 }
3301 } else {
3302 error(RXERR_CHARCLASS);
3303 }
3304 }
3305 } while (yyCh != ']' && yyCh != EOS);
3306 if (rangePending)
3307 yyCharClass->addSingleton('-');
3308 if (charPending)
3309 yyCharClass->addSingleton(pendingCh);
3310 if (yyCh == EOS)
3311 error(RXERR_END);
3312 else
3313 yyCh = getChar();
3314 return Tok_CharClass;
3315#else
3316 error(RXERR_END);
3317 return Tok_Char | '[';
3318#endif
3319 case '\\':
3320 return getEscape();
3321 case ']':
3322 error(RXERR_LEFTDELIM);
3323 return Tok_Char | ']';
3324 case '^':
3325 return Tok_Caret;
3326 case '{':
3327#ifndef QT_NO_REGEXP_INTERVAL
3328 yyMinRep = getRep(0);
3329 yyMaxRep = yyMinRep;
3330 if (yyCh == ',') {
3331 yyCh = getChar();
3332 yyMaxRep = getRep(InftyRep);
3333 }
3334 if (yyMaxRep < yyMinRep)
3335 error(RXERR_INTERVAL);
3336 if (yyCh != '}')
3337 error(RXERR_REPETITION);
3338 yyCh = getChar();
3339 return Tok_Quantifier;
3340#else
3341 error(RXERR_DISABLED);
3342 return Tok_Char | '{';
3343#endif
3344 case '|':
3345 return Tok_Bar;
3346 case '}':
3347 error(RXERR_LEFTDELIM);
3348 return Tok_Char | '}';
3349 default:
3350 return Tok_Char | prevCh;
3351 }
3352}
3353
3354int QRegExpEngine::parse(const QChar *pattern, int len)
3355{
3356 valid = true;
3357 startTokenizer(pattern, len);
3358 yyTok = getToken();
3359#ifndef QT_NO_REGEXP_CAPTURE
3360 yyMayCapture = true;
3361#else
3362 yyMayCapture = false;
3363#endif
3364
3365#ifndef QT_NO_REGEXP_CAPTURE
3366 int atom = startAtom(false);
3367#endif
3368 QRegExpCharClass anything;
3369 Box box(this); // create InitialState
3370 box.set(anything);
3371 Box rightBox(this); // create FinalState
3372 rightBox.set(anything);
3373
3374 Box middleBox(this);
3375 parseExpression(&middleBox);
3376#ifndef QT_NO_REGEXP_CAPTURE
3377 finishAtom(atom, false);
3378#endif
3379#ifndef QT_NO_REGEXP_OPTIM
3380 middleBox.setupHeuristics();
3381#endif
3382 box.cat(middleBox);
3383 box.cat(rightBox);
3384 yyCharClass.reset(0);
3385
3386#ifndef QT_NO_REGEXP_CAPTURE
3387 for (int i = 0; i < nf; ++i) {
3388 switch (f[i].capture) {
3389 case QRegExpAtom::NoCapture:
3390 break;
3391 case QRegExpAtom::OfficialCapture:
3392 f[i].capture = ncap;
3393 captureForOfficialCapture.append(ncap);
3394 ++ncap;
3395 ++officialncap;
3396 break;
3397 case QRegExpAtom::UnofficialCapture:
3398 f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture;
3399 }
3400 }
3401
3402#ifndef QT_NO_REGEXP_BACKREF
3403#ifndef QT_NO_REGEXP_OPTIM
3404 if (officialncap == 0 && nbrefs == 0) {
3405 ncap = nf = 0;
3406 f.clear();
3407 }
3408#endif
3409 // handle the case where there's a \5 with no corresponding capture
3410 // (captureForOfficialCapture.size() != officialncap)
3411 for (int i = 0; i < nbrefs - officialncap; ++i) {
3412 captureForOfficialCapture.append(ncap);
3413 ++ncap;
3414 }
3415#endif
3416#endif
3417
3418 if (!yyError.isEmpty())
3419 return -1;
3420
3421#ifndef QT_NO_REGEXP_OPTIM
3422 const QRegExpAutomatonState &sinit = s.at(InitialState);
3423 caretAnchored = !sinit.anchors.isEmpty();
3424 if (caretAnchored) {
3425 const QMap<int, int> &anchors = sinit.anchors;
3426 QMap<int, int>::const_iterator a;
3427 for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) {
3428 if (
3429#ifndef QT_NO_REGEXP_ANCHOR_ALT
3430 (*a & Anchor_Alternation) != 0 ||
3431#endif
3432 (*a & Anchor_Caret) == 0)
3433 {
3434 caretAnchored = false;
3435 break;
3436 }
3437 }
3438 }
3439#endif
3440
3441 // cleanup anchors
3442 int numStates = s.count();
3443 for (int i = 0; i < numStates; ++i) {
3444 QRegExpAutomatonState &state = s[i];
3445 if (!state.anchors.isEmpty()) {
3446 QMap<int, int>::iterator a = state.anchors.begin();
3447 while (a != state.anchors.end()) {
3448 if (a.value() == 0)
3449 a = state.anchors.erase(a);
3450 else
3451 ++a;
3452 }
3453 }
3454 }
3455
3456 return yyPos0;
3457}
3458
3459void QRegExpEngine::parseAtom(Box *box)
3460{
3461#ifndef QT_NO_REGEXP_LOOKAHEAD
3462 QRegExpEngine *eng = 0;
3463 bool neg;
3464 int len;
3465#endif
3466
3467 if ((yyTok & Tok_Char) != 0) {
3468 box->set(QChar(yyTok ^ Tok_Char));
3469 } else {
3470#ifndef QT_NO_REGEXP_OPTIM
3471 trivial = false;
3472#endif
3473 switch (yyTok) {
3474 case Tok_Dollar:
3475 box->catAnchor(Anchor_Dollar);
3476 break;
3477 case Tok_Caret:
3478 box->catAnchor(Anchor_Caret);
3479 break;
3480#ifndef QT_NO_REGEXP_LOOKAHEAD
3481 case Tok_PosLookahead:
3482 case Tok_NegLookahead:
3483 neg = (yyTok == Tok_NegLookahead);
3484 eng = new QRegExpEngine(cs, greedyQuantifiers);
3485 len = eng->parse(yyIn + yyPos - 1, yyLen - yyPos + 1);
3486 if (len >= 0)
3487 skipChars(len);
3488 else
3489 error(RXERR_LOOKAHEAD);
3490 box->catAnchor(addLookahead(eng, neg));
3491 yyTok = getToken();
3492 if (yyTok != Tok_RightParen)
3493 error(RXERR_LOOKAHEAD);
3494 break;
3495#endif
3496#ifndef QT_NO_REGEXP_ESCAPE
3497 case Tok_Word:
3498 box->catAnchor(Anchor_Word);
3499 break;
3500 case Tok_NonWord:
3501 box->catAnchor(Anchor_NonWord);
3502 break;
3503#endif
3504 case Tok_LeftParen:
3505 case Tok_MagicLeftParen:
3506 yyTok = getToken();
3507 parseExpression(box);
3508 if (yyTok != Tok_RightParen)
3509 error(RXERR_END);
3510 break;
3511 case Tok_CharClass:
3512 box->set(*yyCharClass);
3513 break;
3514 case Tok_Quantifier:
3515 error(RXERR_REPETITION);
3516 break;
3517 default:
3518#ifndef QT_NO_REGEXP_BACKREF
3519 if ((yyTok & Tok_BackRef) != 0)
3520 box->set(yyTok ^ Tok_BackRef);
3521 else
3522#endif
3523 error(RXERR_DISABLED);
3524 }
3525 }
3526 yyTok = getToken();
3527}
3528
3529void QRegExpEngine::parseFactor(Box *box)
3530{
3531#ifndef QT_NO_REGEXP_CAPTURE
3532 int outerAtom = greedyQuantifiers ? startAtom(false) : -1;
3533 int innerAtom = startAtom(yyMayCapture && yyTok == Tok_LeftParen);
3534 bool magicLeftParen = (yyTok == Tok_MagicLeftParen);
3535#else
3536 const int innerAtom = -1;
3537#endif
3538
3539#ifndef QT_NO_REGEXP_INTERVAL
3540#define YYREDO() \
3541 yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
3542 *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
3543
3544 const QChar *in = yyIn;
3545 int pos0 = yyPos0;
3546 int pos = yyPos;
3547 int len = yyLen;
3548 int ch = yyCh;
3549 QRegExpCharClass charClass;
3550 if (yyTok == Tok_CharClass)
3551 charClass = *yyCharClass;
3552 int tok = yyTok;
3553 bool mayCapture = yyMayCapture;
3554#endif
3555
3556 parseAtom(box);
3557#ifndef QT_NO_REGEXP_CAPTURE
3558 finishAtom(innerAtom, magicLeftParen);
3559#endif
3560
3561 bool hasQuantifier = (yyTok == Tok_Quantifier);
3562 if (hasQuantifier) {
3563#ifndef QT_NO_REGEXP_OPTIM
3564 trivial = false;
3565#endif
3566 if (yyMaxRep == InftyRep) {
3567 box->plus(innerAtom);
3568#ifndef QT_NO_REGEXP_INTERVAL
3569 } else if (yyMaxRep == 0) {
3570 box->clear();
3571#endif
3572 }
3573 if (yyMinRep == 0)
3574 box->opt();
3575
3576#ifndef QT_NO_REGEXP_INTERVAL
3577 yyMayCapture = false;
3578 int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1;
3579 int beta = (yyMaxRep == InftyRep) ? 0 : yyMaxRep - (alpha + 1);
3580
3581 Box rightBox(this);
3582 int i;
3583
3584 for (i = 0; i < beta; i++) {
3585 YYREDO();
3586 Box leftBox(this);
3587 parseAtom(&leftBox);
3588 leftBox.cat(rightBox);
3589 leftBox.opt();
3590 rightBox = leftBox;
3591 }
3592 for (i = 0; i < alpha; i++) {
3593 YYREDO();
3594 Box leftBox(this);
3595 parseAtom(&leftBox);
3596 leftBox.cat(rightBox);
3597 rightBox = leftBox;
3598 }
3599 rightBox.cat(*box);
3600 *box = rightBox;
3601#endif
3602 yyTok = getToken();
3603#ifndef QT_NO_REGEXP_INTERVAL
3604 yyMayCapture = mayCapture;
3605#endif
3606 }
3607#undef YYREDO
3608#ifndef QT_NO_REGEXP_CAPTURE
3609 if (greedyQuantifiers)
3610 finishAtom(outerAtom, hasQuantifier);
3611#endif
3612}
3613
3614void QRegExpEngine::parseTerm(Box *box)
3615{
3616#ifndef QT_NO_REGEXP_OPTIM
3617 if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar)
3618 parseFactor(box);
3619#endif
3620 while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) {
3621 Box rightBox(this);
3622 parseFactor(&rightBox);
3623 box->cat(rightBox);
3624 }
3625}
3626
3627void QRegExpEngine::parseExpression(Box *box)
3628{
3629 parseTerm(box);
3630 while (yyTok == Tok_Bar) {
3631#ifndef QT_NO_REGEXP_OPTIM
3632 trivial = false;
3633#endif
3634 Box rightBox(this);
3635 yyTok = getToken();
3636 parseTerm(&rightBox);
3637 box->orx(rightBox);
3638 }
3639}
3640
3641/*
3642 The struct QRegExpPrivate contains the private data of a regular
3643 expression other than the automaton. It makes it possible for many
3644 QRegExp objects to use the same QRegExpEngine object with different
3645 QRegExpPrivate objects.
3646*/
3647struct QRegExpPrivate
3648{
3649 QRegExpEngine *eng;
3650 QRegExpEngineKey engineKey;
3651 bool minimal;
3652#ifndef QT_NO_REGEXP_CAPTURE
3653 QString t; // last string passed to QRegExp::indexIn() or lastIndexIn()
3654 QStringList capturedCache; // what QRegExp::capturedTexts() returned last
3655#endif
3656 QRegExpMatchState matchState;
3657
3658 inline QRegExpPrivate()
3659 : eng(0), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { }
3660 inline QRegExpPrivate(const QRegExpEngineKey &key)
3661 : eng(0), engineKey(key), minimal(false) {}
3662};
3663
3664#if !defined(QT_NO_REGEXP_OPTIM)
3665uint qHash(const QRegExpEngineKey &key)
3666{
3667 return qHash(key.pattern);
3668}
3669
3670typedef QCache<QRegExpEngineKey, QRegExpEngine> EngineCache;
3671Q_GLOBAL_STATIC(EngineCache, globalEngineCache)
3672Q_GLOBAL_STATIC(QMutex, mutex)
3673#endif // QT_NO_REGEXP_OPTIM
3674
3675static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key)
3676{
3677 if (!eng->ref.deref()) {
3678#if !defined(QT_NO_REGEXP_OPTIM)
3679 if (globalEngineCache()) {
3680 QMutexLocker locker(mutex());
3681 QT_TRY {
3682 globalEngineCache()->insert(key, eng, 4 + key.pattern.length() / 4);
3683 } QT_CATCH(const std::bad_alloc &) {
3684 // in case of an exception (e.g. oom), just delete the engine
3685 delete eng;
3686 }
3687 } else {
3688 delete eng;
3689 }
3690#else
3691 Q_UNUSED(key);
3692 delete eng;
3693#endif
3694 }
3695}
3696
3697static void prepareEngine_helper(QRegExpPrivate *priv)
3698{
3699 bool initMatchState = !priv->eng;
3700#if !defined(QT_NO_REGEXP_OPTIM)
3701 if (!priv->eng && globalEngineCache()) {
3702 QMutexLocker locker(mutex());
3703 priv->eng = globalEngineCache()->take(priv->engineKey);
3704 if (priv->eng != 0)
3705 priv->eng->ref.ref();
3706 }
3707#endif // QT_NO_REGEXP_OPTIM
3708
3709 if (!priv->eng)
3710 priv->eng = new QRegExpEngine(priv->engineKey);
3711
3712 if (initMatchState)
3713 priv->matchState.prepareForMatch(priv->eng);
3714}
3715
3716inline static void prepareEngine(QRegExpPrivate *priv)
3717{
3718 if (priv->eng)
3719 return;
3720 prepareEngine_helper(priv);
3721}
3722
3723static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str)
3724{
3725 prepareEngine(priv);
3726 priv->matchState.prepareForMatch(priv->eng);
3727#ifndef QT_NO_REGEXP_CAPTURE
3728 priv->t = str;
3729 priv->capturedCache.clear();
3730#else
3731 Q_UNUSED(str);
3732#endif
3733}
3734
3735static void invalidateEngine(QRegExpPrivate *priv)
3736{
3737 if (priv->eng != 0) {
3738 derefEngine(priv->eng, priv->engineKey);
3739 priv->eng = 0;
3740 priv->matchState.drain();
3741 }
3742}
3743
3744/*!
3745 \enum QRegExp::CaretMode
3746
3747 The CaretMode enum defines the different meanings of the caret
3748 (\bold{^}) in a regular expression. The possible values are:
3749
3750 \value CaretAtZero
3751 The caret corresponds to index 0 in the searched string.
3752
3753 \value CaretAtOffset
3754 The caret corresponds to the start offset of the search.
3755
3756 \value CaretWontMatch
3757 The caret never matches.
3758*/
3759
3760/*!
3761 \enum QRegExp::PatternSyntax
3762
3763 The syntax used to interpret the meaning of the pattern.
3764
3765 \value RegExp A rich Perl-like pattern matching syntax. This is
3766 the default.
3767
3768 \value RegExp2 Like RegExp, but with \l{greedy quantifiers}. This
3769 will be the default in Qt 5. (Introduced in Qt 4.2.)
3770
3771 \value Wildcard This provides a simple pattern matching syntax
3772 similar to that used by shells (command interpreters) for "file
3773 globbing". See \l{Wildcard Matching}.
3774
3775 \value WildcardUnix This is similar to Wildcard but with the
3776 behavior of a Unix shell. The wildcard characters can be escaped
3777 with the character "\".
3778
3779 \value FixedString The pattern is a fixed string. This is
3780 equivalent to using the RegExp pattern on a string in
3781 which all metacharacters are escaped using escape().
3782
3783 \value W3CXmlSchema11 The pattern is a regular expression as
3784 defined by the W3C XML Schema 1.1 specification.
3785
3786 \sa setPatternSyntax()
3787*/
3788
3789/*!
3790 Constructs an empty regexp.
3791
3792 \sa isValid(), errorString()
3793*/
3794QRegExp::QRegExp()
3795{
3796 priv = new QRegExpPrivate;
3797}
3798
3799/*!
3800 Constructs a regular expression object for the given \a pattern
3801 string. The pattern must be given using wildcard notation if \a
3802 syntax is \l Wildcard; the default is \l RegExp. The pattern is
3803 case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is
3804 greedy (maximal), but can be changed by calling
3805 setMinimal().
3806
3807 \sa setPattern(), setCaseSensitivity(), setPatternSyntax()
3808*/
3809QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax)
3810{
3811 priv = new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs));
3812}
3813
3814/*!
3815 Constructs a regular expression as a copy of \a rx.
3816
3817 \sa operator=()
3818*/
3819QRegExp::QRegExp(const QRegExp &rx)
3820{
3821 priv = new QRegExpPrivate;
3822 operator=(rx);
3823}
3824
3825/*!
3826 Destroys the regular expression and cleans up its internal data.
3827*/
3828QRegExp::~QRegExp()
3829{
3830 invalidateEngine(priv);
3831 delete priv;
3832}
3833
3834/*!
3835 Copies the regular expression \a rx and returns a reference to the
3836 copy. The case sensitivity, wildcard, and minimal matching options
3837 are also copied.
3838*/
3839QRegExp &QRegExp::operator=(const QRegExp &rx)
3840{
3841 prepareEngine(rx.priv); // to allow sharing
3842 QRegExpEngine *otherEng = rx.priv->eng;
3843 if (otherEng)
3844 otherEng->ref.ref();
3845 invalidateEngine(priv);
3846 priv->eng = otherEng;
3847 priv->engineKey = rx.priv->engineKey;
3848 priv->minimal = rx.priv->minimal;
3849#ifndef QT_NO_REGEXP_CAPTURE
3850 priv->t = rx.priv->t;
3851 priv->capturedCache = rx.priv->capturedCache;
3852#endif
3853 if (priv->eng)
3854 priv->matchState.prepareForMatch(priv->eng);
3855 priv->matchState.captured = rx.priv->matchState.captured;
3856 return *this;
3857}
3858
3859/*!
3860 Returns true if this regular expression is equal to \a rx;
3861 otherwise returns false.
3862
3863 Two QRegExp objects are equal if they have the same pattern
3864 strings and the same settings for case sensitivity, wildcard and
3865 minimal matching.
3866*/
3867bool QRegExp::operator==(const QRegExp &rx) const
3868{
3869 return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal;
3870}
3871
3872/*!
3873 \fn bool QRegExp::operator!=(const QRegExp &rx) const
3874
3875 Returns true if this regular expression is not equal to \a rx;
3876 otherwise returns false.
3877
3878 \sa operator==()
3879*/
3880
3881/*!
3882 Returns true if the pattern string is empty; otherwise returns
3883 false.
3884
3885 If you call exactMatch() with an empty pattern on an empty string
3886 it will return true; otherwise it returns false since it operates
3887 over the whole string. If you call indexIn() with an empty pattern
3888 on \e any string it will return the start offset (0 by default)
3889 because the empty pattern matches the 'emptiness' at the start of
3890 the string. In this case the length of the match returned by
3891 matchedLength() will be 0.
3892
3893 See QString::isEmpty().
3894*/
3895
3896bool QRegExp::isEmpty() const
3897{
3898 return priv->engineKey.pattern.isEmpty();
3899}
3900
3901/*!
3902 Returns true if the regular expression is valid; otherwise returns
3903 false. An invalid regular expression never matches.
3904
3905 The pattern \bold{[a-z} is an example of an invalid pattern, since
3906 it lacks a closing square bracket.
3907
3908 Note that the validity of a regexp may also depend on the setting
3909 of the wildcard flag, for example \bold{*.html} is a valid
3910 wildcard regexp but an invalid full regexp.
3911
3912 \sa errorString()
3913*/
3914bool QRegExp::isValid() const
3915{
3916 if (priv->engineKey.pattern.isEmpty()) {
3917 return true;
3918 } else {
3919 prepareEngine(priv);
3920 return priv->eng->isValid();
3921 }
3922}
3923
3924/*!
3925 Returns the pattern string of the regular expression. The pattern
3926 has either regular expression syntax or wildcard syntax, depending
3927 on patternSyntax().
3928
3929 \sa patternSyntax(), caseSensitivity()
3930*/
3931QString QRegExp::pattern() const
3932{
3933 return priv->engineKey.pattern;
3934}
3935
3936/*!
3937 Sets the pattern string to \a pattern. The case sensitivity,
3938 wildcard, and minimal matching options are not changed.
3939
3940 \sa setPatternSyntax(), setCaseSensitivity()
3941*/
3942void QRegExp::setPattern(const QString &pattern)
3943{
3944 if (priv->engineKey.pattern != pattern) {
3945 invalidateEngine(priv);
3946 priv->engineKey.pattern = pattern;
3947 }
3948}
3949
3950/*!
3951 Returns Qt::CaseSensitive if the regexp is matched case
3952 sensitively; otherwise returns Qt::CaseInsensitive.
3953
3954 \sa patternSyntax(), pattern(), isMinimal()
3955*/
3956Qt::CaseSensitivity QRegExp::caseSensitivity() const
3957{
3958 return priv->engineKey.cs;
3959}
3960
3961/*!
3962 Sets case sensitive matching to \a cs.
3963
3964 If \a cs is Qt::CaseSensitive, \bold{\\.txt$} matches
3965 \c{readme.txt} but not \c{README.TXT}.
3966
3967 \sa setPatternSyntax(), setPattern(), setMinimal()
3968*/
3969void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs)
3970{
3971 if ((bool)cs != (bool)priv->engineKey.cs) {
3972 invalidateEngine(priv);
3973 priv->engineKey.cs = cs;
3974 }
3975}
3976
3977/*!
3978 Returns the syntax used by the regular expression. The default is
3979 QRegExp::RegExp.
3980
3981 \sa pattern(), caseSensitivity()
3982*/
3983QRegExp::PatternSyntax QRegExp::patternSyntax() const
3984{
3985 return priv->engineKey.patternSyntax;
3986}
3987
3988/*!
3989 Sets the syntax mode for the regular expression. The default is
3990 QRegExp::RegExp.
3991
3992 Setting \a syntax to QRegExp::Wildcard enables simple shell-like
3993 \l{wildcard matching}. For example, \bold{r*.txt} matches the
3994 string \c{readme.txt} in wildcard mode, but does not match
3995 \c{readme}.
3996
3997 Setting \a syntax to QRegExp::FixedString means that the pattern
3998 is interpreted as a plain string. Special characters (e.g.,
3999 backslash) don't need to be escaped then.
4000
4001 \sa setPattern(), setCaseSensitivity(), escape()
4002*/
4003void QRegExp::setPatternSyntax(PatternSyntax syntax)
4004{
4005 if (syntax != priv->engineKey.patternSyntax) {
4006 invalidateEngine(priv);
4007 priv->engineKey.patternSyntax = syntax;
4008 }
4009}
4010
4011/*!
4012 Returns true if minimal (non-greedy) matching is enabled;
4013 otherwise returns false.
4014
4015 \sa caseSensitivity(), setMinimal()
4016*/
4017bool QRegExp::isMinimal() const
4018{
4019 return priv->minimal;
4020}
4021
4022/*!
4023 Enables or disables minimal matching. If \a minimal is false,
4024 matching is greedy (maximal) which is the default.
4025
4026 For example, suppose we have the input string "We must be
4027 <b>bold</b>, very <b>bold</b>!" and the pattern
4028 \bold{<b>.*</b>}. With the default greedy (maximal) matching,
4029 the match is "We must be \underline{<b>bold</b>, very
4030 <b>bold</b>}!". But with minimal (non-greedy) matching, the
4031 first match is: "We must be \underline{<b>bold</b>}, very
4032 <b>bold</b>!" and the second match is "We must be <b>bold</b>,
4033 very \underline{<b>bold</b>}!". In practice we might use the pattern
4034 \bold{<b>[^<]*\</b>} instead, although this will still fail for
4035 nested tags.
4036
4037 \sa setCaseSensitivity()
4038*/
4039void QRegExp::setMinimal(bool minimal)
4040{
4041 priv->minimal = minimal;
4042}
4043
4044// ### Qt 5: make non-const
4045/*!
4046 Returns true if \a str is matched exactly by this regular
4047 expression; otherwise returns false. You can determine how much of
4048 the string was matched by calling matchedLength().
4049
4050 For a given regexp string R, exactMatch("R") is the equivalent of
4051 indexIn("^R$") since exactMatch() effectively encloses the regexp
4052 in the start of string and end of string anchors, except that it
4053 sets matchedLength() differently.
4054
4055 For example, if the regular expression is \bold{blue}, then
4056 exactMatch() returns true only for input \c blue. For inputs \c
4057 bluebell, \c blutak and \c lightblue, exactMatch() returns false
4058 and matchedLength() will return 4, 3 and 0 respectively.
4059
4060 Although const, this function sets matchedLength(),
4061 capturedTexts(), and pos().
4062
4063 \sa indexIn(), lastIndexIn()
4064*/
4065bool QRegExp::exactMatch(const QString &str) const
4066{
4067 prepareEngineForMatch(priv, str);
4068 priv->matchState.match(str.unicode(), str.length(), 0, priv->minimal, true, 0);
4069 if (priv->matchState.captured[1] == str.length()) {
4070 return true;
4071 } else {
4072 priv->matchState.captured[0] = 0;
4073 priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen;
4074 return false;
4075 }
4076}
4077
4078// ### Qt 5: make non-const
4079/*!
4080 Attempts to find a match in \a str from position \a offset (0 by
4081 default). If \a offset is -1, the search starts at the last
4082 character; if -2, at the next to last character; etc.
4083
4084 Returns the position of the first match, or -1 if there was no
4085 match.
4086
4087 The \a caretMode parameter can be used to instruct whether \bold{^}
4088 should match at index 0 or at \a offset.
4089
4090 You might prefer to use QString::indexOf(), QString::contains(),
4091 or even QStringList::filter(). To replace matches use
4092 QString::replace().
4093
4094 Example:
4095 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 13
4096
4097 Although const, this function sets matchedLength(),
4098 capturedTexts() and pos().
4099
4100 If the QRegExp is a wildcard expression (see setPatternSyntax())
4101 and want to test a string against the whole wildcard expression,
4102 use exactMatch() instead of this function.
4103
4104 \sa lastIndexIn(), exactMatch()
4105*/
4106
4107int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const
4108{
4109 prepareEngineForMatch(priv, str);
4110 if (offset < 0)
4111 offset += str.length();
4112 priv->matchState.match(str.unicode(), str.length(), offset,
4113 priv->minimal, false, caretIndex(offset, caretMode));
4114 return priv->matchState.captured[0];
4115}
4116
4117// ### Qt 5: make non-const
4118/*!
4119 Attempts to find a match backwards in \a str from position \a
4120 offset. If \a offset is -1 (the default), the search starts at the
4121 last character; if -2, at the next to last character; etc.
4122
4123 Returns the position of the first match, or -1 if there was no
4124 match.
4125
4126 The \a caretMode parameter can be used to instruct whether \bold{^}
4127 should match at index 0 or at \a offset.
4128
4129 Although const, this function sets matchedLength(),
4130 capturedTexts() and pos().
4131
4132 \warning Searching backwards is much slower than searching
4133 forwards.
4134
4135 \sa indexIn(), exactMatch()
4136*/
4137
4138int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const
4139{
4140 prepareEngineForMatch(priv, str);
4141 if (offset < 0)
4142 offset += str.length();
4143 if (offset < 0 || offset > str.length()) {
4144 memset(priv->matchState.captured, -1, priv->matchState.capturedSize*sizeof(int));
4145 return -1;
4146 }
4147
4148 while (offset >= 0) {
4149 priv->matchState.match(str.unicode(), str.length(), offset,
4150 priv->minimal, true, caretIndex(offset, caretMode));
4151 if (priv->matchState.captured[0] == offset)
4152 return offset;
4153 --offset;
4154 }
4155 return -1;
4156}
4157
4158/*!
4159 Returns the length of the last matched string, or -1 if there was
4160 no match.
4161
4162 \sa exactMatch(), indexIn(), lastIndexIn()
4163*/
4164int QRegExp::matchedLength() const
4165{
4166 return priv->matchState.captured[1];
4167}
4168
4169#ifndef QT_NO_REGEXP_CAPTURE
4170/*!
4171 \obsolete
4172 Returns the number of captures contained in the regular expression.
4173
4174 \sa captureCount()
4175 */
4176int QRegExp::numCaptures() const
4177{
4178 return captureCount();
4179}
4180
4181/*!
4182 \since 4.6
4183 Returns the number of captures contained in the regular expression.
4184 */
4185int QRegExp::captureCount() const
4186{
4187 prepareEngine(priv);
4188 return priv->eng->captureCount();
4189}
4190
4191/*!
4192 Returns a list of the captured text strings.
4193
4194 The first string in the list is the entire matched string. Each
4195 subsequent list element contains a string that matched a
4196 (capturing) subexpression of the regexp.
4197
4198 For example:
4199 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 14
4200
4201 The above example also captures elements that may be present but
4202 which we have no interest in. This problem can be solved by using
4203 non-capturing parentheses:
4204
4205 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 15
4206
4207 Note that if you want to iterate over the list, you should iterate
4208 over a copy, e.g.
4209 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 16
4210
4211 Some regexps can match an indeterminate number of times. For
4212 example if the input string is "Offsets: 12 14 99 231 7" and the
4213 regexp, \c{rx}, is \bold{(\\d+)+}, we would hope to get a list of
4214 all the numbers matched. However, after calling
4215 \c{rx.indexIn(str)}, capturedTexts() will return the list ("12",
4216 "12"), i.e. the entire match was "12" and the first subexpression
4217 matched was "12". The correct approach is to use cap() in a
4218 \l{QRegExp#cap_in_a_loop}{loop}.
4219
4220 The order of elements in the string list is as follows. The first
4221 element is the entire matching string. Each subsequent element
4222 corresponds to the next capturing open left parentheses. Thus
4223 capturedTexts()[1] is the text of the first capturing parentheses,
4224 capturedTexts()[2] is the text of the second and so on
4225 (corresponding to $1, $2, etc., in some other regexp languages).
4226
4227 \sa cap(), pos()
4228*/
4229QStringList QRegExp::capturedTexts() const
4230{
4231 if (priv->capturedCache.isEmpty()) {
4232 prepareEngine(priv);
4233 const int *captured = priv->matchState.captured;
4234 int n = priv->matchState.capturedSize;
4235
4236 for (int i = 0; i < n; i += 2) {
4237 QString m;
4238 if (captured[i + 1] == 0)
4239 m = QLatin1String(""); // ### Qt 5: don't distinguish between null and empty
4240 else if (captured[i] >= 0)
4241 m = priv->t.mid(captured[i], captured[i + 1]);
4242 priv->capturedCache.append(m);
4243 }
4244 priv->t.clear();
4245 }
4246 return priv->capturedCache;
4247}
4248
4249/*!
4250 \internal
4251*/
4252QStringList QRegExp::capturedTexts()
4253{
4254 return const_cast<const QRegExp *>(this)->capturedTexts();
4255}
4256
4257/*!
4258 Returns the text captured by the \a nth subexpression. The entire
4259 match has index 0 and the parenthesized subexpressions have
4260 indexes starting from 1 (excluding non-capturing parentheses).
4261
4262 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 17
4263
4264 The order of elements matched by cap() is as follows. The first
4265 element, cap(0), is the entire matching string. Each subsequent
4266 element corresponds to the next capturing open left parentheses.
4267 Thus cap(1) is the text of the first capturing parentheses, cap(2)
4268 is the text of the second, and so on.
4269
4270 \sa capturedTexts(), pos()
4271*/
4272QString QRegExp::cap(int nth) const
4273{
4274 return capturedTexts().value(nth);
4275}
4276
4277/*!
4278 \internal
4279*/
4280QString QRegExp::cap(int nth)
4281{
4282 return const_cast<const QRegExp *>(this)->cap(nth);
4283}
4284
4285/*!
4286 Returns the position of the \a nth captured text in the searched
4287 string. If \a nth is 0 (the default), pos() returns the position
4288 of the whole match.
4289
4290 Example:
4291 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 18
4292
4293 For zero-length matches, pos() always returns -1. (For example, if
4294 cap(4) would return an empty string, pos(4) returns -1.) This is
4295 a feature of the implementation.
4296
4297 \sa cap(), capturedTexts()
4298*/
4299int QRegExp::pos(int nth) const
4300{
4301 if (nth < 0 || nth >= priv->matchState.capturedSize / 2)
4302 return -1;
4303 else
4304 return priv->matchState.captured[2 * nth];
4305}
4306
4307/*!
4308 \internal
4309*/
4310int QRegExp::pos(int nth)
4311{
4312 return const_cast<const QRegExp *>(this)->pos(nth);
4313}
4314
4315/*!
4316 Returns a text string that explains why a regexp pattern is
4317 invalid the case being; otherwise returns "no error occurred".
4318
4319 \sa isValid()
4320*/
4321QString QRegExp::errorString() const
4322{
4323 if (isValid()) {
4324 return QString::fromLatin1(RXERR_OK);
4325 } else {
4326 return priv->eng->errorString();
4327 }
4328}
4329
4330/*!
4331 \internal
4332*/
4333QString QRegExp::errorString()
4334{
4335 return const_cast<const QRegExp *>(this)->errorString();
4336}
4337#endif
4338
4339/*!
4340 Returns the string \a str with every regexp special character
4341 escaped with a backslash. The special characters are $, (,), *, +,
4342 ., ?, [, \,], ^, {, | and }.
4343
4344 Example:
4345
4346 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 19
4347
4348 This function is useful to construct regexp patterns dynamically:
4349
4350 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 20
4351
4352 \sa setPatternSyntax()
4353*/
4354QString QRegExp::escape(const QString &str)
4355{
4356 QString quoted;
4357 const int count = str.count();
4358 quoted.reserve(count * 2);
4359 const QLatin1Char backslash('\\');
4360 for (int i = 0; i < count; i++) {
4361 switch (str.at(i).toLatin1()) {
4362 case '$':
4363 case '(':
4364 case ')':
4365 case '*':
4366 case '+':
4367 case '.':
4368 case '?':
4369 case '[':
4370 case '\\':
4371 case ']':
4372 case '^':
4373 case '{':
4374 case '|':
4375 case '}':
4376 quoted.append(backslash);
4377 }
4378 quoted.append(str.at(i));
4379 }
4380 return quoted;
4381}
4382
4383/*!
4384 \fn bool QRegExp::caseSensitive() const
4385
4386 Use \l caseSensitivity() instead.
4387*/
4388
4389/*!
4390 \fn void QRegExp::setCaseSensitive(bool sensitive)
4391
4392 Use \l setCaseSensitivity() instead.
4393*/
4394
4395/*!
4396 \fn bool QRegExp::wildcard() const
4397
4398 Use \l patternSyntax() instead.
4399
4400 \oldcode
4401 bool wc = rx.wildcard();
4402 \newcode
4403 bool wc = (rx.patternSyntax() == QRegExp::Wildcard);
4404 \endcode
4405*/
4406
4407/*!
4408 \fn void QRegExp::setWildcard(bool wildcard)
4409
4410 Use \l setPatternSyntax() instead.
4411
4412 \oldcode
4413 rx.setWildcard(wc);
4414 \newcode
4415 rx.setPatternSyntax(wc ? QRegExp::Wildcard : QRegExp::RegExp);
4416 \endcode
4417*/
4418
4419/*!
4420 \fn bool QRegExp::minimal() const
4421
4422 Use \l isMinimal() instead.
4423*/
4424
4425/*!
4426 \fn int QRegExp::search(const QString &str, int from = 0,
4427 CaretMode caretMode = CaretAtZero) const
4428
4429 Use \l indexIn() instead.
4430*/
4431
4432/*!
4433 \fn int QRegExp::searchRev(const QString &str, int from = -1, \
4434 CaretMode caretMode = CaretAtZero) const
4435
4436 Use \l lastIndexIn() instead.
4437*/
4438
4439/*!
4440 \fn QRegExp::QRegExp(const QString &pattern, bool cs, bool wildcard = false)
4441
4442 Use another constructor instead.
4443
4444 \oldcode
4445 QRegExp rx("*.txt", false, true);
4446 \newcode
4447 QRegExp rx("*.txt", Qt::CaseInsensitive, QRegExp::Wildcard);
4448 \endcode
4449*/
4450
4451#ifndef QT_NO_DATASTREAM
4452/*!
4453 \relates QRegExp
4454
4455 Writes the regular expression \a regExp to stream \a out.
4456
4457 \sa {Format of the QDataStream Operators}
4458*/
4459QDataStream &operator<<(QDataStream &out, const QRegExp &regExp)
4460{
4461 return out << regExp.pattern() << (quint8)regExp.caseSensitivity()
4462 << (quint8)regExp.patternSyntax()
4463 << (quint8)!!regExp.isMinimal();
4464}
4465
4466/*!
4467 \relates QRegExp
4468
4469 Reads a regular expression from stream \a in into \a regExp.
4470
4471 \sa {Format of the QDataStream Operators}
4472*/
4473QDataStream &operator>>(QDataStream &in, QRegExp &regExp)
4474{
4475 QString pattern;
4476 quint8 cs;
4477 quint8 patternSyntax;
4478 quint8 isMinimal;
4479
4480 in >> pattern >> cs >> patternSyntax >> isMinimal;
4481
4482 QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs),
4483 QRegExp::PatternSyntax(patternSyntax));
4484
4485 newRegExp.setMinimal(isMinimal);
4486 regExp = newRegExp;
4487 return in;
4488}
4489#endif // QT_NO_DATASTREAM
4490
4491QT_END_NAMESPACE
Note: See TracBrowser for help on using the repository browser.